/*
 *   (C) Copyright IBM Corp. 2001, 2003
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Module: mdregmgr
 * File: md_discover.c
 *
 * Description: This file contains all functions related to the initial
 *              discovery of MD physical volumes, volume groups, and logical
 *              volumes.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <plugin.h>

#include "md.h"
#include "linear_discover.h"
#include "raid0_discover.h"
#include "raid1_discover.h"
#include "raid5_discover.h"

#define my_plugin_record my_plugin

/*
 * Function: md_uuids_equal
 *
 *	Compare the four set_uuid words of two superblocks.
 *	Returns 1 when all four words match, 0 otherwise.
 */
int md_uuids_equal(mdp_super_t * sb1, mdp_super_t * sb2)
{
	int same;

	LOG_ENTRY();

	same = (sb1->set_uuid0 == sb2->set_uuid0) &&
	       (sb1->set_uuid1 == sb2->set_uuid1) &&
	       (sb1->set_uuid2 == sb2->set_uuid2) &&
	       (sb1->set_uuid3 == sb2->set_uuid3);

	LOG_EXIT_INT(same);
	return same;
}

/*
 * Function: md_get_event
 *
 *	Combine events_hi (upper 32 bits) and events_lo (lower 32 bits) of a
 *	superblock into a single 64 bit event count.  Note that the exit trace
 *	only logs the value truncated to an int.
 */
static u_int64_t md_get_event(mdp_super_t * sb)
{
	u_int64_t count;

	LOG_ENTRY();

	count = ((u_int64_t) sb->events_hi << 32) + (u_int64_t) sb->events_lo;

	LOG_EXIT_INT((int)count);
	return count;
}


/*
 * Function: calc_sb_csum
 *
 *	Compute the checksum of a superblock: the sum of its MD_SB_BYTES/4
 *	32-bit words, taken with sb_csum treated as zero, with the carry out
 *	of the low 32 bits folded back in.
 *
 *	super->sb_csum is temporarily zeroed for the computation and restored
 *	before returning, so the superblock is left unchanged.
 */
static int calc_sb_csum(mdp_super_t *super)
{
	unsigned int  oldcsum = super->sb_csum;
	unsigned long long newcsum = 0;
	unsigned long csum;
	int i;
	/* View the superblock as unsigned words (cast now matches the pointer type). */
	unsigned int *superc = (unsigned int *) super;

	super->sb_csum = 0;

	for (i=0; i<MD_SB_BYTES/4; i++)
		newcsum+= superc[i];

	/* Fold the upper half of the 64 bit sum into the lower half. */
	csum = (newcsum& 0xffffffff) + (newcsum>>32);
	super->sb_csum = oldcsum;
	return csum;
}

/*
 * Function: calc_saved_sb_csum
 *
 *	Compute the checksum of a saved SB info block: the sum of its first
 *	MD_SAVED_SB_INFO_WORDS 32-bit words, taken with sb_csum treated as
 *	zero, with the carry out of the low 32 bits folded back in.
 *
 *	saved_sb->sb_csum is temporarily zeroed for the computation and
 *	restored before returning, so the block is left unchanged.
 */
static int calc_saved_sb_csum(mdp_saved_super_t *saved_sb)
{
	unsigned int  oldcsum = saved_sb->sb_csum;
	unsigned long long newcsum = 0;
	unsigned long csum;
	int i;
	/* View the block as unsigned words (cast now matches the pointer type). */
	unsigned int *superc = (unsigned int *) saved_sb;

	saved_sb->sb_csum = 0;

	for (i=0; i<MD_SAVED_SB_INFO_WORDS; i++)
		newcsum+= superc[i];

	/* Fold the upper half of the 64 bit sum into the lower half. */
	csum = (newcsum& 0xffffffff) + (newcsum>>32);
	saved_sb->sb_csum = oldcsum;
	return csum;
}

/* Function: md_validate_disk_sb
 *
 *	Sanity check one MD superblock read from disk: the magic number must
 *	be MD_SB_MAGIC, the version must be 0.90, and the stored checksum must
 *	match the one recomputed by calc_sb_csum().
 *
 *	Returns 0 if the superblock looks valid, ENXIO otherwise.
 */

static int md_validate_disk_sb(mdp_super_t *sb)
{
	int stored_csum, computed_csum;
	int rc = 0;

	LOG_ENTRY();

	if (sb->md_magic != MD_SB_MAGIC ||
	    sb->major_version != 0 ||
	    sb->minor_version != 90) {
		LOG_EXTRA("Bad signature or version\n");
		rc = ENXIO;
	} else {
		stored_csum = sb->sb_csum;
		computed_csum = calc_sb_csum(sb);

		if (computed_csum != stored_csum) {
			LOG_DEFAULT("Bad Checksum\n");
			rc = ENXIO;
		}
	}

	LOG_EXIT_INT(rc);
	return rc;
}

/* Function: md_validate_saved_sb
 *
 *	Sanity check one saved SB info block read from disk: the magic number
 *	must be MD_SB_MAGIC, the version must be 0.90, and the stored checksum
 *	must match the one recomputed by calc_saved_sb_csum().
 *
 *	Returns 0 if the block looks valid, ENXIO otherwise.
 */

static int md_validate_saved_sb(mdp_saved_super_t *saved_sb)
{
	int stored_csum, computed_csum;
	int rc = 0;

	LOG_ENTRY();

	if (saved_sb->md_magic != MD_SB_MAGIC ||
	    saved_sb->major_version != 0 ||
	    saved_sb->minor_version != 90) {
		LOG_EXTRA("Bad signature or version\n");
		rc = ENXIO;
	} else {
		stored_csum = saved_sb->sb_csum;
		computed_csum = calc_saved_sb_csum(saved_sb);

		if (computed_csum != stored_csum) {
			LOG_DEFAULT("Bad Checksum\n");
			rc = ENXIO;
		}
	}

	LOG_EXIT_INT(rc);
	return rc;
}

/* Function:  md_read_metadata_from_disk
 *
 *	Read "sects" sectors of MD metadata (a superblock or the saved SB
 *	info) starting at sector "location" of "object" into "buffer".
 *
 *	Objects that are not DATA_TYPE, or that are not larger than
 *	MD_RESERVED_SECTORS, are skipped: nothing is read, the buffer is left
 *	untouched and 0 is returned.
 *
 *	Returns 0 on success (or skip), EIO if the read fails.
 */
static int  md_read_metadata_from_disk(
	storage_object_t *object,
	u_int64_t location,
	void * buffer,
	u_int32_t sects)
{
	int rc = 0;

	LOG_ENTRY();

	if (object->data_type != DATA_TYPE) {
		LOG_DETAILS("Object not data type, skipping %s\n", object->name);
	} else if (object->size <= MD_RESERVED_SECTORS) {
		LOG_DETAILS("Object too small for MD, skipping %s\n", object->name);
	} else {
		LOG_DEBUG("Looking for MD metadata at %"PRIu64" on %s\n",location, object->name);

		/* Make sure that we read what's on disk */
		md_ioctl_flush_buffer_cache(object);

		if ( READ(object, location, sects, (char*)buffer )) {
			LOG_SERIOUS("Error reading MD superblock from object %s\n", object->name);
			rc = EIO;
		}
	}

	LOG_EXIT_INT(rc);
	return rc;
}

/* function: md_save_sb
 *
 *	Save some information from the original superblock of child "i",
 *	using the last MD_SAVED_SB_INFO_SECTS of the reserved MD sectors.
 *
 *	Nothing is written (and 0 is returned) when:
 *	  - MD_COMMIT_SAVE_SB is not set in volume->commit_flag,
 *	  - the disk descriptor for slot i is marked MD_DISK_NEW, or
 *	  - a saved SB info block was already loaded for slot i.
 *
 *	The temporary block built here is only used for the write; it is
 *	released on every path (the original code leaked it after a
 *	successful write).  volume->saved_super_array[i] is not modified.
 *
 *	Returns 0 on success or skip, ENOMEM if the temporary block can not
 *	be allocated, EIO if the write fails.
 */
static int md_save_sb(md_volume_t * volume, int i)
{
	u_int64_t location;
	mdp_super_t *sb;
	mdp_saved_super_t *saved_sb;
	storage_object_t *object = volume->child_object[i];
	int rc = 0;

	LOG_ENTRY();

	sb = volume->super_array[i];

	//must have MD_COMMIT_SAVE_SB flag set to continue
	if ( !(volume->commit_flag & MD_COMMIT_SAVE_SB) ||
	     (sb->disks[i].state & (1<<MD_DISK_NEW)) ) {
		LOG_EXIT_INT(0);
		return 0;
	}

	if (volume->saved_super_array[i]) {
		LOG_DEBUG("skipping %s, saved SB info already exists.\n", object->name);
		LOG_EXIT_INT(0);
		return 0;
	}

	saved_sb = EngFncs->engine_alloc(MD_SB_BYTES);
	if (!saved_sb) {
		LOG_EXIT_INT(ENOMEM);
		return ENOMEM;
	}

	// Identification and geometry copied from the current superblock
	saved_sb->md_magic = sb->md_magic;
	saved_sb->major_version = sb->major_version;
	saved_sb->minor_version = sb->minor_version;
	saved_sb->patch_version = sb->patch_version;
	saved_sb->set_uuid0 = sb->set_uuid0;
	saved_sb->set_uuid1 = sb->set_uuid1;
	saved_sb->set_uuid2 = sb->set_uuid2;
	saved_sb->set_uuid3 = sb->set_uuid3;
	saved_sb->ctime = sb->ctime;
	saved_sb->level = sb->level;
	saved_sb->size = sb->size;
	saved_sb->md_minor = sb->md_minor;

	// Device identity of this child as recorded in its descriptor
	saved_sb->this_disk_number = sb->disks[i].number;
	saved_sb->this_disk_major = sb->disks[i].major;
	saved_sb->this_disk_minor = sb->disks[i].minor;

	saved_sb->sb_csum = calc_saved_sb_csum(saved_sb);

	// The saved info lives at a fixed offset from the MD superblock
	location = MD_NEW_SIZE_SECTORS(object->size);
	location += MD_SAVED_SB_SECTOR_OFFSET;

	LOG_DEBUG("Writing MD backup SB at %"PRIu64" on %s\n", location, object->name);
	if ( WRITE(object, location, MD_SAVED_SB_INFO_SECTS,
		   (char*)saved_sb)) {
		LOG_SERIOUS("Error writing saved SB info to %s\n", object->name);
		rc = EIO;
	}

	// The block has served its purpose once the write was attempted
	EngFncs->engine_free(saved_sb);
	LOG_EXIT_INT(rc);
	return rc;
}

/*
 * Function: md_write_sbs_to_disk
 *
 *	Write the superblocks of every child of an MD region to disk.
 *
 *	The master superblock gets a fresh utime and an incremented event
 *	count.  For each slot that has both a child object and a superblock,
 *	the saved SB info is written if requested (md_save_sb), the master
 *	superblock is copied over the child's superblock, this_disk is set
 *	from the slot's descriptor, the checksum is recomputed and the
 *	superblock is written.
 *
 *	Returns:
 *	  EFAULT  the volume has no region
 *	  EINVAL  the volume is not dirty, or the region is still active and
 *		  MD_COMMIT_DONT_CHECK_ACTIVE is not set
 *	  EIO     a superblock write failed
 *	  other   error from md_save_sb
 *	  0       success; MD_DIRTY is cleared and SOFLAG_NEEDS_ACTIVATE is
 *		  set on the region
 *
 *	On the early error returns taken inside the loop the
 *	MD_COMMIT_SAVE_SB / MD_COMMIT_RESTORE_SAVED_SB flags are left set.
 */
int  md_write_sbs_to_disk(md_volume_t * volume) {
	int rc = 0;
	u_int64_t location;
	int i,j;
	time_t utime;
	mdp_super_t *sb;
	LOG_ENTRY();

	if (!volume->region) {
		LOG_EXIT_INT(EFAULT);
		return EFAULT;
	}
	if (!(volume->flags & MD_DIRTY)) {
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}

	if (!(volume->commit_flag & MD_COMMIT_DONT_CHECK_ACTIVE) &&
	    md_is_region_active(volume->region)) {
		LOG_ERROR("Region %s is still active, skip writting superblocks\n", volume->region->name);
		/*
		 * The exit trace used to log EBUSY while EINVAL was returned.
		 * The return value is kept for existing callers and the trace
		 * now reports what is actually returned.
		 * NOTE(review): EBUSY may have been the intended return code.
		 */
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}

	// update utime
	utime = time(NULL);
	volume->super_block->utime = utime;
	// 64 bit event counter kept as two 32 bit halves: carry into events_hi
	volume->super_block->events_lo++;
	if (volume->super_block->events_lo == 0) {
		volume->super_block->events_hi++;
	}
	
	//if ( md_is_region_active(volume->region) &&
	//     !(volume->super_block->state & (1 << MD_SB_CLEAN)) ) {
	//
	//	/* The array was active when we read the superblock.
	//	 * If we did not find any problems, mark the array clean.
	//	 */
	//
	//	if (!(volume->flags & MD_CORRUPT)) {
	//		volume->super_block->state |= (1 << MD_SB_CLEAN);
	//	}
	//}

	//if (volume->commit_flag & MD_COMMIT_RESTORE_SAVED_SB) {
	//	if (volume->region)
	//		md_deactivate_region(volume->region);
	//	else
	//		LOG_ERROR("No region for %s to deactivate\n", volume->name);
	//}
	
	md_print_array_sb(message_buffer, MD_MESSAGE_BUF_SIZE, volume->super_block);
	LOG_DEBUG("%s\n", message_buffer);

	for (i = 0; i < MAX_MD_DEVICES; i++) {

		if (!volume->child_object[i])
			continue;

		sb = volume->super_array[i];
		if (!sb)
			continue;
			
		// First, we might have to write some saved superblock
		rc = md_save_sb(volume,i);
		if (rc) {
			LOG_EXIT_INT(rc);
			return rc;
		}

		// copy superblock from master, and fix up this_disk
		memcpy(sb, volume->super_block, MD_SB_BYTES);

		if (sb->disks[i].state & (1<<MD_DISK_NEW)) {
			sb->events_lo = 0; // null out event counter
			sb->events_hi = 0; // as signal to kernel on new devices
		}

		// turn off the new/pending bits in every descriptor of the copy
		// about to be written ...
		for (j = 0; j < MAX_MD_DEVICES; j++) {
			sb->disks[j].state &= ~(1<<MD_DISK_NEW | 1<<MD_DISK_PENDING_ACTIVE);
		}
		// ... and, in the master SB, only for the slot just handled
		volume->super_block->disks[i].state &= ~(1<<MD_DISK_NEW | 1<<MD_DISK_PENDING_ACTIVE);

		sb->this_disk = sb->disks[i];

		if (volume->commit_flag & MD_COMMIT_RESTORE_SAVED_SB) {
			LOG_DEFAULT("%s should have old dev number: (%d:%d)\n",
			    volume->child_object[i]->name,
			    sb->this_disk.major,
			    sb->this_disk.minor);
		}

		location = MD_NEW_SIZE_SECTORS(volume->child_object[i]->size);

		LOG_DEFAULT("Writing MD Superblock at %"PRIu64" on %s\n",
		    location, volume->child_object[i]->name);

		// checksum must be computed last, after all fix ups above
		sb->sb_csum = calc_sb_csum(volume->super_array[i]);

		if ( WRITE(volume->child_object[i], location, MD_SB_SECTORS, (char*)sb)) {
			LOG_SERIOUS("Error writing MD SUperBlock from object %s\n", volume->child_object[i]->name);
			LOG_EXIT_INT(EIO);
			return EIO;
		}
	}

	if (!rc) {
		volume->region->flags |= SOFLAG_NEEDS_ACTIVATE;
		volume->flags &= ~MD_DIRTY;
	}

	volume->commit_flag &= ~MD_COMMIT_SAVE_SB; //only write MD saved SB info once
	volume->commit_flag &= ~MD_COMMIT_RESTORE_SAVED_SB;
	LOG_EXIT_INT(rc);
	return rc;

}

/*
 * Function: md_free_volume
 *
 *	Release an md_volume_t: the per-slot superblocks and saved SB info of
 *	every slot that holds a child or stale object (stopping once
 *	volume->nr_disks such slots were seen), the setup/ioctl lists, and
 *	finally the volume structure itself after removing it from the volume
 *	list.
 *
 *	Returns EFAULT when volume is NULL, 0 otherwise.
 */
int md_free_volume(md_volume_t *volume)
{
	int slot;
	int seen = 0;

	LOG_ENTRY();

	if (volume == NULL) {
		LOG_EXIT_INT(EFAULT);
		return EFAULT;
	}

	for (slot = 0; (slot < MAX_MD_DEVICES) && (seen < volume->nr_disks); slot++) {
		if (!volume->child_object[slot] && !volume->stale_object[slot])
			continue;

		seen++;
		EngFncs->engine_free(volume->super_array[slot]);
		if (volume->saved_super_array[slot])
			EngFncs->engine_free(volume->saved_super_array[slot]);
	}

	/* Each queue is drained before its list anchor is destroyed. */
	if (volume->setup_funcs) {
		empty_setup_funcs_queue(volume);
		EngFncs->destroy_list(volume->setup_funcs);
		volume->setup_funcs = NULL;
	}

	if (volume->ioctl_pkgs) {
		empty_ioctl_queue(volume);
		EngFncs->destroy_list(volume->ioctl_pkgs);
		volume->ioctl_pkgs = NULL;
	}

	if (volume->ioctl_cleanup) {
		free_ioctl_pkgs(volume);
		EngFncs->destroy_list(volume->ioctl_cleanup);
		volume->ioctl_cleanup = NULL;
	}

	md_remove_volume_from_list(volume);
	EngFncs->engine_free(volume);

	LOG_EXIT_INT(0);
	return 0;
}


/*
 * Function: find_slot_for_multipath_object
 *
 *	Pick the slot in the volume's super_array/child_object arrays for a
 *	multipath object.
 *
 *	The first descriptor in the superblock whose major/minor equals the
 *	object's dev_major/dev_minor wins.  If that slot is already occupied,
 *	the current occupant is copied to the first empty slot so that the
 *	caller can overwrite the matching slot (if there is no empty slot the
 *	occupant is not relocated).  When no descriptor matches, the first
 *	empty slot is used.
 *
 *	The descriptor search is bounded by MAX_MD_DEVICES as well as by the
 *	on-disk nr_disks, so a bogus nr_disks can not index past the arrays.
 *
 *	Returns the slot index, or -1 if no slot could be found.
 */
static int find_slot_for_multipath_object( storage_object_t * object,
					   mdp_super_t *md_super_buffer,
					   md_volume_t *volume )
{
	int i, j, slot_index=-1;
	LOG_ENTRY();
	
	LOG_DEBUG("Finding slot for multipath device %s\n", object->name);
	for (i=0; (slot_index==-1) && (i<MAX_MD_DEVICES) &&
		  (i<md_super_buffer->nr_disks); i++) {
		if ( md_super_buffer->disks[i].major == object->dev_major &&
		     md_super_buffer->disks[i].minor == object->dev_minor ) {
			LOG_DEBUG("Exact match with objects major minor ... slot %d\n", i);
			slot_index = i;
			if ( volume->super_array[i] != NULL ){
				LOG_DEBUG("Somebody is sleeping in my bed.\n");
				// Relocate the occupant; a separate index keeps the
				// outer search index intact.
				for (j = 0; j < MAX_MD_DEVICES; j++) {
					if ( volume->super_array[j] == NULL ) {
						volume->super_array[j] = volume->super_array[slot_index];
						volume->child_object[j] = volume->child_object[slot_index];
						break;
					}
				}
			}

		}
	}
	if (slot_index==-1) {
		LOG_DEBUG("Object not found in SB so using 1st available slot\n");
		for (i = 0; (slot_index==-1) && i< MAX_MD_DEVICES; i++) {
			if (!volume->super_array[i]) {
				LOG_DEBUG("Slot %d is first available\n", i);
				slot_index=i;
			}
		}
	}
	
	LOG_EXIT_INT(slot_index);
	return slot_index;
}


// Function: md_find_volume_for_object
//
// The input object has been validated as a PV for this personality.
// Search the volume list for a volume with the same set UUID.  If one
// exists, add this superblock to it; otherwise create a new volume
// structure for it and add that to the volume list.  Finally, try to load
// the object's saved SB info.
//
// On a 0 return the volume owns md_super_buffer.  On any non-zero return
// the buffer has NOT been stored anywhere and the caller still owns it.
//
// Returns:
//   0       object was added to an existing or a new volume
//   EEXIST  the matching volume has already been discovered
//   EINVAL  the superblock's disk number is outside the slot arrays, or
//           no slot could be found for a new multipath volume
//   ENOMEM  allocation failure
//
// NOTE(review): for an existing multipath volume with no free slot the
// object is only reported via MESSAGE; nr_disks is still incremented and
// 0 is returned (behaviour kept from the original code).

int md_find_volume_for_object(storage_object_t * object, mdp_super_t *md_super_buffer) {

	int i;
	md_volume_t * volume = volume_list_head;
	mdp_super_t * tmp_super = NULL;
	storage_object_t * tmp_object = NULL;
	mdp_saved_super_t      * saved_sb = NULL;
	int slot_index;

	LOG_ENTRY();
	while (volume) {
		if (md_uuids_equal(volume->super_block, md_super_buffer)) {
			// Non-multipath volumes index the slot arrays directly with the
			// on-disk disk number: reject it before touching the volume.
			if ((volume->personality != MULTIPATH) &&
			    (md_super_buffer->this_disk.number >= MAX_MD_DEVICES)) {
				LOG_ERROR("Object %s has an invalid disk number (%d) in its superblock\n",
					  object->name, md_super_buffer->this_disk.number);
				LOG_EXIT_INT(EINVAL);
				return EINVAL;
			}
			if (md_get_event(md_super_buffer) > md_get_event(volume->super_block)) {
				LOG_WARNING("MD volume (%s) : Replacing master SB with the SB of object %s\n",
					    volume->name, object->name);
				memcpy(volume->super_block,
					   md_super_buffer,
					   MD_SB_BYTES);
			}
			if (volume->flags & MD_DISCOVERED) {
				// The volume has been discovered.  The
				// volume may have been exported in degrade
				// mode and this object is late for the show,
				// or this may be a stale object that has
				// been resurrected.
				LOG_EXIT_INT(EEXIST);
				return EEXIST;
			}
			// check to make sure slot is empty
			if (volume->personality == MULTIPATH) {
				LOG_DEBUG("Finding slot for multipath device %s\n", object->name);
				slot_index = find_slot_for_multipath_object(object,md_super_buffer,volume);
				if (slot_index>=0) {
					LOG_DEBUG("Placing object in slot %d\n", slot_index);
					volume->super_array[slot_index]  = md_super_buffer;
					volume->child_object[slot_index] = object;
					memcpy( &md_super_buffer->this_disk,
					        &md_super_buffer->disks[slot_index],
						sizeof(md_super_buffer->this_disk));
				}
				else {
					MESSAGE("Unable to find slot for device %s\n",object->name);
				}
			}
			else if (!volume->super_array[md_super_buffer->this_disk.number]) {
				volume->super_array[md_super_buffer->this_disk.number] = md_super_buffer;
				volume->child_object[md_super_buffer->this_disk.number] = object;
			} else {
				// Two objects claim the same slot: the one with the higher
				// event count keeps it, the other is parked in a free slot.
				if (md_get_event(md_super_buffer) > md_get_event(volume->super_array[md_super_buffer->this_disk.number])) {
					// new buffer is newer, use it.
					// save off the old super and object
		 //			volume->flags |= MD_DIRTY;
					tmp_super = volume->super_array[md_super_buffer->this_disk.number];
					tmp_object = volume->child_object[md_super_buffer->this_disk.number];
					// fill in the new info
					volume->super_array[md_super_buffer->this_disk.number] = md_super_buffer;
					volume->child_object[md_super_buffer->this_disk.number] = object;
				} else if (md_get_event(md_super_buffer) < md_get_event(volume->super_array[md_super_buffer->this_disk.number])) {
					//existing buffer the correct one, reassign this one into tmp
			//		volume->flags |= MD_DIRTY;
					tmp_super = md_super_buffer;
					tmp_object = object;
				} else {
					// event counters are equal, bad news.
					MESSAGE("Multiple superblocks found for region %s index %d\n",volume->name,md_super_buffer->this_disk.number);
					MESSAGE("Object 1 %s, Object 2 %s\n",object->name,volume->child_object[md_super_buffer->this_disk.number]->name);
					volume->flags |= MD_CORRUPT;
					// don't want to write this out on commit.
					tmp_super = md_super_buffer;
					tmp_object = object;
				}
				// stick in next available slot, figure out later
				for (i = 0; i< MAX_MD_DEVICES; i++) {
					if (volume->super_array[i] == NULL) {
						volume->super_array[i] = tmp_super;
						volume->child_object[i] = tmp_object;
						// fix up the this_disk field
						memcpy(&tmp_super->this_disk, &tmp_super->disks[i], sizeof(tmp_super->this_disk));
						break;
					}
				}
			}

			volume->nr_disks++;
			break;
		}
		volume = volume->next;

	}
	if (!volume) {
		// add new entry
		if ( !(volume = EngFncs->engine_alloc(sizeof(md_volume_t) ) )) {
			LOG_CRITICAL("Memory error creating buffer to read super block.\n");
			LOG_EXIT_INT(ENOMEM);
			return ENOMEM;
		}

		if (level_to_pers(md_super_buffer->level) == MULTIPATH) {
			LOG_DEBUG("Finding slot for multipath device %s\n", object->name);
			slot_index = find_slot_for_multipath_object(object,md_super_buffer,volume);
			if (slot_index < 0) {
				// Never index the slot arrays with -1.
				MESSAGE("Unable to find slot for device %s\n",object->name);
				EngFncs->engine_free(volume);
				LOG_EXIT_INT(EINVAL);
				return EINVAL;
			}
			memcpy( &md_super_buffer->this_disk,
			        &md_super_buffer->disks[slot_index],
				sizeof(md_super_buffer->this_disk));
		}
		else {
			if (md_super_buffer->this_disk.number >= MAX_MD_DEVICES) {
				LOG_ERROR("Object %s has an invalid disk number (%d) in its superblock\n",
					  object->name, md_super_buffer->this_disk.number);
				EngFncs->engine_free(volume);
				LOG_EXIT_INT(EINVAL);
				return EINVAL;
			}
			slot_index = md_super_buffer->this_disk.number;
		}

		// Allocate the master SB before anything else is hooked up, so
		// that a failure only has the bare volume structure to undo.
		volume->super_block = EngFncs->engine_alloc(MD_SB_BYTES);
		if (!volume->super_block) {
			LOG_CRITICAL("Memory error creating buffer for master super block.\n");
			EngFncs->engine_free(volume);
			LOG_EXIT_INT(ENOMEM);
			return ENOMEM;
		}
		memcpy(volume->super_block,
			   md_super_buffer,
			   MD_SB_BYTES);

		volume->super_array[slot_index]  = md_super_buffer;
		volume->child_object[slot_index] = object;

		volume->personality = level_to_pers(md_super_buffer->level);
		volume->nr_disks = 1;
		volume->setup_funcs = EngFncs->allocate_list();
		volume->ioctl_pkgs = EngFncs->allocate_list();
		volume->ioctl_cleanup = EngFncs->allocate_list();
		volume->commit_flag = MD_COMMIT_USE_DISK; // use on disk state for 'discovered' volumes
		md_add_volume_to_list(volume);

		// implement cluster naming for all md personalities
		if (object->disk_group) {
			sprintf(volume->name, "%s/md/md%d",object->disk_group->name,md_super_buffer->md_minor);
		}
		else {
			sprintf(volume->name, "md/md%d",md_super_buffer->md_minor);
		}

		LOG_DEBUG("MD volume [%s] has been created with object (%s)\n", volume->name, object->name);
	}
	
	//Now try to get the saved SB info
	if (!md_check_for_saved_sb(object, &saved_sb)) {
		// this_disk may have been rewritten from an on-disk descriptor
		// above, so its number is checked again before it is used as index.
		if (md_super_buffer->this_disk.number < MAX_MD_DEVICES) {
			volume->saved_super_array[md_super_buffer->this_disk.number] = saved_sb;
		} else {
			LOG_WARNING("Ignoring saved SB info of %s: invalid disk number (%d)\n",
				    object->name, md_super_buffer->this_disk.number);
			EngFncs->engine_free(saved_sb);
		}
	}

	LOG_EXIT_INT(0);
	return 0;
}

/* Function: md_check_for_pv
 *
 *	This function is used during discovery.  One object is passed in; a
 *	buffer of MD_SB_BYTES is allocated and the MD superblock location of
 *	the object (MD_NEW_SIZE_SECTORS(object->size)) is read into it and
 *	validated.
 *
 *	If the object carries a valid MD superblock, 0 is returned and
 *	*md_super_buffer points to the allocated superblock, which the caller
 *	then owns.  On any failure the buffer is released and
 *	*md_super_buffer is set to NULL, so the caller is never left with a
 *	dangling pointer.
 *
 *	Returns 0, ENOMEM, EIO, or the error from md_validate_disk_sb.
 */
int  md_check_for_pv(storage_object_t * object,mdp_super_t ** md_super_buffer) {
	int rc = 0;
	LOG_ENTRY();


	if ( !(*md_super_buffer = EngFncs->engine_alloc(MD_SB_BYTES))) {
		LOG_CRITICAL("Memory error creating buffer to read super block.\n");
		LOG_EXIT_INT(ENOMEM);
		return ENOMEM;
	}

	// Read the superblock area and look for a PV signature.
	if ( md_read_metadata_from_disk(object,
					MD_NEW_SIZE_SECTORS(object->size),
					*md_super_buffer,
					MD_SB_SECTORS) ) {
		LOG_SERIOUS("I/O error on object %s.\n", object->name);
		EngFncs->engine_free(*md_super_buffer);
		*md_super_buffer = NULL;
		LOG_EXIT_INT(EIO);
		return EIO;
	}
	
	rc = md_validate_disk_sb(*md_super_buffer);
	if (rc) {
		LOG_DEBUG("(%s) does not have MD superblock.\n", object->name);
		EngFncs->engine_free(*md_super_buffer);
		*md_super_buffer = NULL;
	} else {
		LOG_DEBUG("(%s) has MD superblock.\n", object->name);
	}
	LOG_EXIT_INT(rc);
	return rc;
}

/* Function: md_check_for_saved_sb
 *
 *	Used during discovery, once the object is known to be an MD device.
 *	Allocates a buffer of MD_SAVED_SB_INFO_BYTES, reads the saved SB info
 *	area of the object into it and validates it.
 *
 *	On success 0 is returned and *saved_sb points to the allocated block.
 *	On failure the block is released, *saved_sb is set to NULL and ENOMEM,
 *	EIO or the validation error is returned.
 */
int md_check_for_saved_sb(storage_object_t * object, mdp_saved_super_t ** saved_sb) {
	int rc;
	u_int64_t where;

	LOG_ENTRY();

	*saved_sb = EngFncs->engine_alloc(MD_SAVED_SB_INFO_BYTES);
	if (*saved_sb == NULL) {
		LOG_CRITICAL("No memory to read saved SB info.\n");
		LOG_EXIT_INT(ENOMEM);
		return ENOMEM;
	}

	where = MD_NEW_SIZE_SECTORS(object->size) + MD_SAVED_SB_SECTOR_OFFSET;

	if (md_read_metadata_from_disk(object, where, *saved_sb, MD_SAVED_SB_INFO_SECTS)) {
		LOG_SERIOUS("I/O error on object %s.\n", object->name);
		rc = EIO;
	} else {
		rc = md_validate_saved_sb(*saved_sb);
		if (rc == 0) {
			LOG_DEBUG("Found saved SB info on %s\n", object->name);
		} else {
			LOG_EXTRA("%s does not have a valid MD saved SB info\n", object->name);
		}
	}

	/* Anything but a validated block is thrown away. */
	if (rc != 0) {
		EngFncs->engine_free(*saved_sb);
		*saved_sb = NULL;
	}

	LOG_EXIT_INT(rc);
	return rc;
}

/*
 * Function: md_can_restore_saved_sb
 *
 *	Decide whether the saved SB info of an MD region can be restored.
 *
 *	FALSE is returned right away when the region, its volume or its
 *	private MD data is missing, when the volume is not a compatibility
 *	volume, or when MD_USE_OLD_DEV is already set.
 *
 *	Otherwise every child must have a saved SB info block whose major
 *	number differs from the one in the master superblock and whose
 *	constant fields (set UUID, ctime, level, size, md_minor) match the
 *	child's superblock.  MD_SAVED_SB_EXISTS is set in the MD volume flags
 *	when all children qualify and cleared otherwise.
 */
int md_can_restore_saved_sb(storage_object_t *region)
{
	int i;
	logical_volume_t *volume;
	md_volume_t * md_volume;
	mdp_saved_super_t *saved_sb;
	mdp_super_t *sb;
	boolean restorable = TRUE;

	LOG_ENTRY();

	if (region == NULL)
		goto reject;

	volume = region->volume;
	if (volume == NULL || (volume->flags & VOLFLAG_COMPATIBILITY) == 0)
		goto reject;

	md_volume = (md_volume_t *) region->private_data;
	if (md_volume == NULL || (md_volume->flags & MD_USE_OLD_DEV))
		goto reject;

	/*
	 * If we found backup superblock in all devices of the MD array,
	 * we will provide an option to convert it back.
	 */
	for (i = 0; restorable && (i < MAX_MD_DEVICES); i++) {
		if (!md_volume->child_object[i])
			continue;

		sb = md_volume->super_array[i];
		saved_sb = md_volume->saved_super_array[i];

		if (saved_sb == NULL) {
			// All devices must contain a saved SB info block
			restorable = FALSE;
		} else if (md_volume->super_block->disks[i].major == saved_sb->this_disk_major) {
			// Same major as the master SB records
			restorable = FALSE;
		} else if ((sb->set_uuid0 != saved_sb->set_uuid0) ||
			   (sb->set_uuid1 != saved_sb->set_uuid1) ||
			   (sb->set_uuid2 != saved_sb->set_uuid2) ||
			   (sb->set_uuid3 != saved_sb->set_uuid3) ||
			   (sb->ctime != saved_sb->ctime) ||
			   (sb->level != saved_sb->level) ||
			   (sb->size != saved_sb->size) ||
			   (sb->md_minor != saved_sb->md_minor)) {
			// The constant part of the MD superblock does not match
			restorable = FALSE;
		} else {
			LOG_DEBUG("%s contains a valid MD saved info block\n",
				  md_volume->child_object[i]->name);
		}
	}

	if (restorable) {
		md_volume->flags |= MD_SAVED_SB_EXISTS;
		LOG_EXIT_BOOL(TRUE);
		return TRUE;
	}
	md_volume->flags &= ~MD_SAVED_SB_EXISTS;

reject:
	LOG_EXIT_BOOL(FALSE);
	return FALSE;
}

/* Function: md_restore_saved_sb
 *
 *	Restore saved superblock info into the volume's master superblock:
 *	the major/minor of every slot that has a child, a superblock and a
 *	saved SB info block, plus the version numbers and this_disk
 *	major/minor taken from the saved block that belongs to the master's
 *	this_disk.number.
 *
 *	Sets MD_COMMIT_RESTORE_SAVED_SB in commit_flag and MD_USE_OLD_DEV in
 *	flags.
 *
 *	Returns 0 on success.  Returns EINVAL, without modifying the volume,
 *	when the master's this_disk.number is outside the slot arrays or has
 *	no saved SB info block (the original code dereferenced it unchecked).
 */
int md_restore_saved_sb(md_volume_t *volume)
{
	int i;
	u_int32_t number;
	mdp_saved_super_t *saved_sb;
	mdp_super_t *sb = volume->super_block;

	LOG_ENTRY();

	// Validate first so that a failure leaves the master SB untouched.
	number = sb->this_disk.number;
	if ((number >= MAX_MD_DEVICES) ||
	    (volume->saved_super_array[number] == NULL)) {
		LOG_ERROR("No saved SB info for disk number %u of %s\n", number, volume->name);
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}
	saved_sb = volume->saved_super_array[number];

	for (i = 0; i < MAX_MD_DEVICES; i++) {
		if ((volume->child_object[i] != NULL) &&
		    (volume->super_array[i] != NULL) &&
		    (volume->saved_super_array[i] != NULL)) {
			sb->disks[i].major = volume->saved_super_array[i]->this_disk_major;
			sb->disks[i].minor = volume->saved_super_array[i]->this_disk_minor;
		}
	}

	sb->major_version = saved_sb->major_version;
	sb->minor_version = saved_sb->minor_version;
	sb->patch_version = saved_sb->patch_version;
	sb->this_disk.major = saved_sb->this_disk_major;
	sb->this_disk.minor = saved_sb->this_disk_minor;

	volume->commit_flag |= MD_COMMIT_RESTORE_SAVED_SB;
	volume->flags |= MD_USE_OLD_DEV;
	LOG_EXIT_INT(0);
	return 0;
}


/* Function: md_discover_volumes
 *
 *	This function is the entry point into the first half of discovery.
 *	Every DATA_TYPE object on the input list is checked for an MD
 *	superblock (md_check_for_pv) and, if it has one, handed to
 *	md_find_volume_for_object to be assigned to a volume.
 *
 *	Objects without an MD superblock, and objects that could not be
 *	assigned to a volume, are put on the output list.  Objects that are
 *	not DATA_TYPE are skipped and are NOT put on the output list.
 */
void md_discover_volumes( list_anchor_t input_list, list_anchor_t output_list) {
	storage_object_t * object;
	mdp_super_t      * md_super_buffer;
	list_element_t iter1 = NULL;

	LOG_ENTRY();
	LOG_DETAILS("Searching for MD Super Blocks.\n");

	LIST_FOR_EACH(input_list, iter1, object) {
		if (object->data_type == DATA_TYPE) {
			// md_check_for_pv allocates the superblock buffer on success.
			if (md_check_for_pv(object, &md_super_buffer)) {
				// wasn't ours, put it on the output list now.
				EngFncs->insert_thing(output_list, object, INSERT_AFTER, NULL);
			} else {
				if ( md_find_volume_for_object(object, md_super_buffer)) {
					// Log while the superblock is still valid, then
					// release it (it used to be read after the free).
					LOG_WARNING("Error finding volume minor %d for PV %s\n", md_super_buffer->md_minor, object->name);
					EngFncs->engine_free(md_super_buffer);
					// Object was not claimed for a volume.
					// Put it on the output list.
					EngFncs->insert_thing(output_list, object, INSERT_AFTER, NULL);
				}
			}
		} else {
			LOG_DETAILS("Skipping object %s because not DATA_TYPE\n",object->name);
		}

	}

	LOG_EXIT_VOID();
}



/* Set once MD_NAME_SPACE has been registered successfully. */
static boolean  md_namespace_registered = FALSE;

/*
 * Function: md_register_name_space
 *
 *	Register MD_NAME_SPACE with the engine, once.  After a successful
 *	registration later calls return 0 without registering again; after a
 *	failed one the next call retries.
 *
 *	Returns 0 on success, otherwise the error from register_name.
 */
int md_register_name_space(void) {

	int rc;

	LOG_ENTRY();

	if (md_namespace_registered) {
		LOG_EXIT_INT(0);
		return 0;
	}

	rc = EngFncs->register_name(MD_NAME_SPACE);
	if (rc != 0) {
		LOG_SERIOUS("Error registering the MD name space \"%s\".\n", MD_NAME_SPACE);
	} else {
		md_namespace_registered = TRUE;
	}

	LOG_EXIT_INT(rc);
	return rc;
}


/* Set once the final discovery pass has run. */
static boolean md_final_call = FALSE;

/*
 * Function: md_discover_final_call
 *
 *	Final discovery pass.  The first time it is called, PV discovery is
 *	run followed by region discovery for RAID4/5, RAID1, RAID0 and LINEAR
 *	in that order; between personalities the input list is emptied and
 *	the output list is merged back into it, so each pass works on the
 *	output of the previous one.
 *
 *	On any later call the input list is simply concatenated onto the
 *	output list.
 */
void md_discover_final_call( list_anchor_t input_list, list_anchor_t output_list, int *count ) {

	LOG_ENTRY();

	if (md_final_call) {
		/*
		 * Final discovery has been done.  Just move the
		 * objects from the input list to the output list.
		 */
		EngFncs->concatenate_lists(output_list, input_list);
		LOG_EXIT_VOID();
		return;
	}

	/* Pass 1: RAID4/5 */
	md_discover_volumes(input_list, output_list);
	LOG_DETAILS("PV discovery complete.\n");
	raid5_discover_regions(output_list, count, TRUE);
	LOG_DETAILS("RAID4/5 volume discovery complete.\n");

	EngFncs->delete_all_elements(input_list);
	EngFncs->merge_lists(input_list, output_list, NULL);

	/* Pass 2: RAID1 */
	md_discover_volumes(input_list, output_list);
	LOG_DETAILS("PV discovery complete.\n");
	raid1_discover_regions(output_list, count, TRUE);
	LOG_DETAILS("RAID1 volume discovery complete.\n");

	EngFncs->delete_all_elements(input_list);
	EngFncs->merge_lists(input_list, output_list, NULL);

	/* Pass 3: RAID0 */
	md_discover_volumes(input_list, output_list);
	LOG_DETAILS("PV discovery complete.\n");
	raid0_discover_regions(output_list, count, TRUE);
	LOG_DETAILS("RAID0 volume discovery complete.\n");

	EngFncs->delete_all_elements(input_list);
	EngFncs->merge_lists(input_list, output_list, NULL);

	/* Pass 4: LINEAR */
	md_discover_volumes(input_list, output_list);
	LOG_DETAILS("PV discovery complete.\n");
	linear_discover_regions(output_list, count, TRUE);
	LOG_DETAILS("LINEAR volume discovery complete.\n");

	md_final_call = TRUE;

	LOG_EXIT_VOID();
}

/*
 * Function: md_count_children
 *
 *	Count the slots that have both a child object and a superblock.
 */
static int md_count_children(md_volume_t *volume)
{
	int slot;
	int total = 0;

	LOG_ENTRY();

	for (slot = 0; slot < MAX_MD_DEVICES; slot++) {
		if (!volume->child_object[slot] || !volume->super_array[slot])
			continue;
		total++;
	}

	LOG_EXIT_INT(total);
	return total;
}

/*
 * Function: md_count_stale_disks
 *
 *	Count the slots that have both a stale object and a superblock.
 */
static int md_count_stale_disks(md_volume_t *volume)
{
	int slot;
	int total = 0;

	LOG_ENTRY();

	for (slot = 0; slot < MAX_MD_DEVICES; slot++) {
		if (!volume->stale_object[slot] || !volume->super_array[slot])
			continue;
		total++;
	}

	LOG_EXIT_INT(total);
	return total;
}

/*
 * Move the child object in slot i to the stale object array.  If the stale
 * slot is already occupied an internal bug is logged and nothing is moved.
 */
static void move_object_to_stale_list(md_volume_t *volume, int i)
{
	if (!volume->stale_object[i]) {
		volume->stale_object[i] = volume->child_object[i];
		volume->child_object[i] = NULL;
	} else {
		md_log_internal_bug(__FILE__, __FUNCTION__, __LINE__);
	}
}

/*
 * Move the stale object in slot i back to the child object array.  If the
 * child slot is already occupied an internal bug is logged and nothing is
 * moved.
 */
static void move_stale_object_to_child_list(md_volume_t *volume, int i)
{
	if (!volume->child_object[i]) {
		volume->child_object[i] = volume->stale_object[i];
		volume->stale_object[i] = NULL;
	} else {
		md_log_internal_bug(__FILE__, __FUNCTION__, __LINE__);
	}
}


/* Dump one disk descriptor (number, major, minor, raid_disk, state) to the debug log. */
static inline void debug_print_desc(mdp_disk_t *d)
{
	LOG_DEBUG("*** number(%d) major(%d) minor(%d) raid_disk(%d) state(0x%08X)\n",
		  d->number,
		  d->major,
		  d->minor,
		  d->raid_disk,
		  d->state);
}

/*
 * Function: md_rebuild_array_from_master_sb
 *
 *	This function rebuilds the MD array from the master superblock:
 *	every stale object is moved back to the child list, every slot's
 *	superblock is re-cloned via md_clone_superblock() and the volume is
 *	analyzed again.
 *
 *	Nothing is rebuilt when the master superblock records failed disks,
 *	or when its nr_disks differs from the number of children plus stale
 *	objects found.
 *
 *	Returns EINVAL on the disk count mismatch; in all other cases the
 *	volume flags (vol->flags) are returned, not an errno value.
 */
int md_rebuild_array_from_master_sb(md_volume_t *vol)
{
	int i, count;

	LOG_ENTRY();

	// Report both kinds of trouble; testing MD_CORRUPT alone made the
	// "inconsistent" text below unreachable.
	if (vol->flags & (MD_CORRUPT | MD_PROBLEMATIC_SB)) {
		LOG_DEFAULT("MD region %s is %s.\n", vol->name,
			    (vol->flags & MD_CORRUPT) ? "corrupt" :
			    (vol->flags & MD_PROBLEMATIC_SB) ? "inconsistent" : "???");
	}

	if (vol->super_block->failed_disks) {
		LOG_ERROR("Master superblock has %d failed disks.\n",
			  vol->super_block->failed_disks);
		LOG_EXIT_INT(vol->flags);
		return vol->flags;
	}

	count = md_count_children(vol) + md_count_stale_disks(vol);

	if (vol->super_block->nr_disks != count) {
		LOG_ERROR("Master superblock has %d disks, but we found only %d children.\n",
			  vol->super_block->nr_disks, count);
		LOG_EXIT_INT(EINVAL);
		return EINVAL;
	}

	for (i=0; i < MAX_MD_DEVICES; i++) {
		// Only slots that really hold a stale object are moved; calling
		// the helper on a slot with a live child logged a bogus
		// "internal bug" for every healthy disk.
		if (vol->stale_object[i])
			move_stale_object_to_child_list(vol, i);
		md_clone_superblock(vol, i);
	}

	md_analyze_volume(vol);

	LOG_EXIT_INT(vol->flags);
	return vol->flags;
}

/*
 * Function: md_analyze_volume
 *
 *	Re-evaluate the state of an MD volume against its master superblock.
 *
 *	The MD_CORRUPT, MD_DEGRADED and MD_PROBLEMATIC_SB flags and all disk
 *	counters are reset, then every slot is classified using the master
 *	superblock's descriptor for that slot:
 *	  - a child whose event count is lower than the master's is counted
 *	    as failed (if its descriptor is faulty and not removed) or moved
 *	    to the stale list,
 *	  - an active but unsynced child is moved to the stale list,
 *	  - active+synced, spare and faulty children update the counters.
 *	The resulting counters are compared with the master superblock;
 *	mismatches set MD_PROBLEMATIC_SB, and too few active disks marks the
 *	volume MD_DEGRADED or MD_CORRUPT depending on the personality.
 *
 *	Returns the resulting vol->flags (not an errno value).
 *
 *	Note:  This function can be called before the region is allocated and initialized.
 */
int md_analyze_volume(md_volume_t *vol)
{
	int i;
	int nr_disks;		/* children counted on entry (child object AND superblock) */
	int found_disks;	/* non-NULL child objects seen while walking the slots */
	mdp_super_t *sb = vol->super_block;

	LOG_ENTRY();

	vol->flags &= ~(MD_CORRUPT | MD_DEGRADED | MD_PROBLEMATIC_SB); //clear flags

	vol->active_disks = vol->working_disks = vol->spare_disks = vol->failed_disks = vol->stale_disks = 0;

	nr_disks = vol->nr_disks = md_count_children(vol);

    LOG_DEBUG("walking child object array, number of child objects = %d\n", nr_disks);

	for (i=0, found_disks=0; (i < MAX_MD_DEVICES); i++ ) {
		
		/* Slot without a child object: only the descriptor can be inspected */
		if (vol->child_object[i] == NULL) {

            LOG_DEBUG("object[%d]= NULL\n", i);

			if (vol->stale_object[i])
				continue; //already moved it to list of stale objects

			if ( disk_removed(&sb->disks[i]) ) {
				LOG_WARNING("Disk (index=%d) had been removed\n", i);
				if ( (found_disks < nr_disks) && disk_faulty(&sb->disks[i]) ) {
					LOG_WARNING("... and disk (index=%d) was faulty\n", i);
				}
			} else {
				if (found_disks < sb->nr_disks) {
					debug_print_desc(&sb->disks[i]);	
					if ( disk_faulty(&sb->disks[i])) {
						LOG_WARNING("... and disk (index=%d) was faulty, but not yet removed.\n", i);
						vol->failed_disks++;
					}
				}
			}
			continue;
		}
        else {
            LOG_DEBUG("object[%d]=  %s\n", i, vol->child_object[i]->name);
        }

		found_disks++;
		
		/* This child's superblock is older than the master superblock */
		if (md_get_event(vol->super_array[i]) < md_get_event(sb)) {
			
			if (disk_faulty(&sb->disks[i]) && !disk_removed(&sb->disks[i])) {
				vol->failed_disks++;
				LOG_WARNING("Disk (index=%d) is faulty.\n", i);
			} else {
				move_object_to_stale_list(vol, i);
				LOG_WARNING("Disk (index=%d) is stale\n", i);

				/* Did the kernel mark it removed ? */
				if (descriptor_removed(&sb->disks[i])) {
					vol->nr_disks--;
					LOG_WARNING("... and was marked removed.\n");
				}
			}
			continue;
		}

		if (disk_active(&sb->disks[i])) {
			if (!disk_sync(&sb->disks[i])) {
				LOG_WARNING("Disk (index=%d) is active, but not synced!\n", i);
				debug_print_desc(&sb->disks[i]);
                                move_object_to_stale_list(vol, i);
				vol->nr_disks--;
			} else {
				vol->active_disks++;
				vol->working_disks++;
			}
			continue;
		}

		/* Must be a spare */
		if (disk_spare(&sb->disks[i])) {
			vol->spare_disks++;
			vol->working_disks++;
			continue;
		}

		if (disk_removed(&sb->disks[i])) {
			continue;
		}
		
		if (disk_faulty(&sb->disks[i])) {
			vol->failed_disks++;
			continue;
		}

		/* None of the known descriptor states matched */
		vol->flags |= MD_CORRUPT;
		md_log_internal_bug(__FILE__, __FUNCTION__, __LINE__);
		debug_print_desc(&sb->disks[i]);
	}
	
	vol->stale_disks = md_count_stale_disks(vol);
	
	/* found_disks counts every non-NULL child, nr_disks only those that also have a superblock */
	if (found_disks != nr_disks) {
		LOG_ERROR("We found %d descriptors for %d objects\n", found_disks, nr_disks);
		vol->flags |= MD_PROBLEMATIC_SB;
	}

	LOG_DETAILS("Array %s: We found nr_disks(%d) active_disks(%d)"
		    " working_disks(%d) spare_disks(%d) failed_disks(%d)"
		    " stale_disks(%d)\n",
		    vol->name, vol->nr_disks, vol->active_disks,
		    vol->working_disks, vol->spare_disks, vol->failed_disks,
		    vol->stale_disks);

	/* Compare what was found with what the master superblock claims */
	if (vol->nr_disks != sb->nr_disks) {
		LOG_WARNING("We found nr_disks(%d), the superblock has nr_disks(%d)\n",
			    vol->nr_disks, sb->nr_disks);
		/* 
		 * Note: Kernel MD driver allows creating an MD array with missing disk(s).
		 * If this is the case, ignore the difference between sb->nr_disks and 
		 * volume->nr_disks.
		 */
		if ((sb->nr_disks > vol->nr_disks) && 
		    (sb->failed_disks == (sb->nr_disks - vol->nr_disks))) {
			LOG_WARNING("But we are ignoring the difference.\n");
		} else {
			vol->flags |= MD_PROBLEMATIC_SB;
		}
	}
	
	if (vol->working_disks != sb->working_disks) {
		LOG_WARNING("We found working_disks(%d), the superblock has working_disks(%d)\n",
			    vol->working_disks, sb->working_disks);
		vol->flags |= MD_PROBLEMATIC_SB;
	}
	
	if (vol->spare_disks != sb->spare_disks) {
		LOG_WARNING("We found spare_disks(%d), the superblock has spare_disks(%d)\n",
			    vol->spare_disks, sb->spare_disks);
		vol->flags |= MD_PROBLEMATIC_SB;
	}
	
	if (vol->failed_disks != sb->failed_disks) {
		LOG_WARNING("We found failed_disks(%d), the superblock has failed_disks(%d)\n",
			    vol->failed_disks, sb->failed_disks);
		vol->flags |= MD_PROBLEMATIC_SB;
	}

	/* No active disk at all: the volume is marked corrupt */
	if (!vol->active_disks) {
		LOG_CRITICAL("We found active_disks(%d), the superblock has active_disks(%d)\n",
			    vol->active_disks, sb->active_disks);
		vol->flags |= MD_CORRUPT;
	}

	if (vol->active_disks != sb->active_disks) {
		LOG_WARNING("We found active_disks(%d), the superblock has active_disks(%d)\n",
			    vol->active_disks, sb->active_disks);
		vol->flags |= MD_PROBLEMATIC_SB;
	}

	/* Fewer active disks than raid_disks: degraded or corrupt, depending on the personality */
	if (!(vol->flags & MD_CORRUPT) && (vol->active_disks < sb->raid_disks)) {
		LOG_WARNING("We found active_disks(%d), the superblock has raid_disks(%d)."
			    "  The array is either corrupt or degraded\n",
			    vol->active_disks, sb->raid_disks);

		switch (level_to_pers(sb->level)) {
		case MULTIPATH:
		case RAID1:
			/* degraded as long as at least one disk is active */
			if (vol->active_disks >= 1)
				vol->flags |= MD_DEGRADED;
			else
				vol->flags |= (MD_CORRUPT | MD_PROBLEMATIC_SB);
			break;
		case RAID5:
			/* degraded only when exactly one disk is missing */
			if ( (sb->raid_disks - vol->active_disks) == 1 )
				vol->flags |= MD_DEGRADED;
			else
				vol->flags |= (MD_CORRUPT | MD_PROBLEMATIC_SB);
			break;
		default:
			/* every other personality is treated as corrupt here */
			vol->flags |= (MD_CORRUPT | MD_PROBLEMATIC_SB);
		}
	}
	if (vol->flags & MD_CORRUPT) {
		LOG_CRITICAL("MD region %s is corrupt\n", vol->name);
	}
	if (vol->flags & MD_DEGRADED) {
		LOG_WARNING("MD region %s is degraded\n", vol->name);
	}

	LOG_EXIT_INT(vol->flags);
	return vol->flags;
}

/*
 * Walk every one of the MAX_MD_DEVICES slots of the volume and, for each
 * slot that has no child object attached, call remove_descriptor() on the
 * disk descriptor at the same index in the volume's superblock.
 *
 * vol - MD volume whose super_block disk table is to be pruned.
 */
void md_remove_missing_disk_entries(md_volume_t *vol)
{
	mdp_super_t *super = vol->super_block;
	int slot;

	for (slot = 0; slot < MAX_MD_DEVICES; slot++) {
		/* Slots that still have a child object keep their descriptor. */
		if (vol->child_object[slot])
			continue;

		remove_descriptor(&super->disks[slot]);
	}
}

/*
 * Bounded string copy used when building up a text buffer piece by piece.
 *
 * target      - destination buffer
 * target_size - number of bytes available at target
 * src         - NUL-terminated string to copy
 *
 * The copy is performed only when src and its terminating NUL both fit in
 * target_size bytes.  Unlike a strncpy() limited to strlen(src), the
 * terminator is copied as well, so target is a valid C string after every
 * successful call; a caller that appends the next piece at
 * target + <return value> simply overwrites that terminator.
 *
 * Returns the number of characters copied (terminator not counted), or 0
 * when src does not fit.  Note that an empty src also yields 0.  When
 * nothing is copied and target has room for at least one byte, target is
 * set to the empty string so it is never left unterminated.
 */
static u_int32_t cpy_str(char *target, u_int32_t target_size, const char *src)
{
	size_t src_len = strlen(src);

	if (src_len >= target_size) {
		/* Does not fit: leave a terminated (empty) string behind. */
		if (target_size > 0)
			target[0] = '\0';
		return 0;
	}

	/* +1 copies the terminating NUL along with the characters. */
	memcpy(target, src, src_len + 1);
	return (u_int32_t) src_len;
}

/*
 * Append one already-formatted line at *cur using cpy_str() and, on
 * success, advance *cur and shrink *remaining by the number of characters
 * copied.
 *
 * Returns 1 when the line was appended, 0 when cpy_str() reported that it
 * did not fit (in which case *cur and *remaining are left unchanged).
 */
static int md_sb_append_line(char **cur, u_int32_t *remaining, char *line)
{
	u_int32_t copied = cpy_str(*cur, *remaining, line);

	if (!copied)
		return 0;

	*cur += copied;
	*remaining -= copied;
	return 1;
}

/*
 * Write a human-readable description of an MD superblock into buf.
 *
 * buf      - destination text buffer
 * buf_size - number of bytes available at buf
 * sb       - superblock to describe
 *
 * The output consists of a header line, one line per summary field of the
 * superblock, and one line for each entry of sb->disks[] that is neither
 * removed nor empty.  Lines are appended one at a time; output stops at the
 * first line that no longer fits.  Whatever was written is always
 * NUL-terminated, including when the output had to be cut short.  Nothing
 * is written when buf is NULL or buf_size is 0.
 *
 * Each line is formatted into a fixed-size scratch buffer with snprintf(),
 * so an unexpectedly long line is truncated instead of overrunning it.
 */
void md_print_array_sb(char *buf, u_int32_t buf_size, mdp_super_t *sb)
{
	char temp[256];
	char *cur = buf;
	int i;

	if (!buf || buf_size == 0)
		return;

	snprintf(temp, sizeof(temp), "Detail description:\n");
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), "       MD minor : %d\n", sb->md_minor);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), "        Version : %02d.%02d.%02d\n",
		 sb->major_version, sb->minor_version, sb->patch_version);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), "     Raid Level : %s\n", level_to_string(sb->level));
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), "     Array Size : %d sectors\n", sb->size * 2);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), "  Array Devices : %d\n", sb->nr_disks);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), "   RAID Devices : %d\n", sb->raid_disks);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), " Active Devices : %d\n", sb->active_disks);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), "Working Devices : %d\n", sb->working_disks);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), "  Spare Devices : %d\n", sb->spare_disks);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	snprintf(temp, sizeof(temp), " Failed Devices : %d\n", sb->failed_disks);
	if (!md_sb_append_line(&cur, &buf_size, temp))
		goto out;

	for (i=0; i<MAX_MD_DEVICES; i++) {
		mdp_disk_t *d = &sb->disks[i];

		/* Only descriptors that are in use get a line. */
		if (descriptor_removed(d) || descriptor_empty(d))
			continue;

		snprintf(temp, sizeof(temp),
			 "Disk %d : Number(%d) Raid Device(%d) Major(%d) Minor(%d) State(%s%s%s%s)\n",
			 i, d->number, d->raid_disk, d->major, d->minor,
			 disk_active(d) ? " active" : " ",
			 disk_sync(d) ? " sync" : " ",
			 disk_spare(d) ? " spare" : " ",
			 disk_faulty(d) ? " faulty" : " ");
		if (!md_sb_append_line(&cur, &buf_size, temp))
			goto out;
	}

out:
	/*
	 * buf_size was non-zero on entry and every successful append leaves at
	 * least one byte free (cpy_str() only copies when the string is strictly
	 * shorter than the space left), so cur always points inside buf here.
	 */
	*cur = '\0';
}

