/*
 * Copyright (c) 2001-2003 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 * $HEADER$
 *
 * $Id: ssi_crlam_blcr.c,v 1.5.2.3 2003/11/11 13:13:03 jsquyres Exp $
 *
 *	Function:	- BLCR crlam module 
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>

#include <lam_config.h>
#include <app_mgmt.h>
#include <etc_misc.h>
#include <priority.h>
#include <preq.h>
#include <sfh.h>

#include <lam-ssi-cr.h>
#include <lam-ssi-crlam-blcr.h>
#include <lam-ssi-crlam-blcr-config.h>

#include <libcr.h>

/*
 * Write the NUL-terminated string S to standard output with a single
 * write() call.  S is expanded twice (once as the data pointer, once as
 * the strlen() argument), so it must not have side effects.  The result
 * of write() is not checked.
 * NOTE(review): the name suggests it is meant for use from signal
 * context; no use of this macro is visible in the reviewed portion of
 * the file -- confirm before relying on or removing it.
 */
#define signal_puts(S)  write(STDOUT_FILENO, (S), strlen(S))


/*
 * Forward declarations of the file-private helpers defined below:
 * the two BLCR callbacks first, then the utilities used while taking
 * a checkpoint and preparing the restart schema.
 */
static int crlam_thread_callback(void *arg);
static int crlam_signal_callback(void *arg);
static int cr_signal_app(struct _gps *cr_world, int cr_world_n);
static char *get_checkpoint_file(int node, int pid);
static char *get_cr_util(const char *util);
static char *create_app_schema(struct _gps *cr_world, int cr_world_n);


/*
 * local variables
 */
/*
 * Table of this module's entry points; lam_ssi_crlam_blcr_query()
 * returns its address.  The initializer is positional, so the entries
 * must stay in the member order of lam_ssi_crlam_actions_t.
 */
static lam_ssi_crlam_actions_t module_actions = {
  lam_ssi_crlam_blcr_checkpoint,
  lam_ssi_crlam_blcr_continue,
  lam_ssi_crlam_blcr_disable_checkpoint,
  lam_ssi_crlam_blcr_enable_checkpoint,
  lam_ssi_crlam_blcr_finalize,
  lam_ssi_crlam_blcr_init,
  /* use the base utility function for restart action */
  lam_ssi_crlam_base_do_exec
};

/* process table and its length, as handed to lam_ssi_crlam_blcr_init() */
static struct _gps *cr_world = NULL;
static int cr_world_n = -1;
/* set from lam_getpid() at each checkpoint; part of every context file name */
static int app_session = 0;
/* restart application schema built by create_app_schema() */
static char *chkpnt_app_schema = NULL;
/* strdup()'ed copy of the path given to lam_ssi_crlam_blcr_init() */
static char *executable;
/* value returned by cr_init(); argument to cr_enter_cs()/cr_leave_cs() */
static cr_client_id_t client_id;


/*
 * lam_ssi_crlam_blcr_query
 *
 * Function: decide whether this module wants to run.
 *
 * The priority check is delegated to lam_ssi_cr_base_check_priority(),
 * which receives the caller's priority pointer.  When that check
 * reports a negative result the module declines by returning NULL;
 * otherwise the address of the module's action table is returned.
 */
const lam_ssi_crlam_actions_t *
lam_ssi_crlam_blcr_query(int *priority)
{
  const lam_ssi_crlam_actions_t *actions = NULL;

  /* The priority check comes before anything else */
  if (lam_ssi_cr_base_check_priority("blcr", 50, LAM_SSI_CRLAM_BLCR_DEFAULT,
                                      priority) >= 0) {

    /* Only chatter when the verbosity level asks for it */
    if (lam_ssi_cr_verbose >= 5) {
      lam_debug(lam_ssi_cr_did, "blcr: module initializing");
      lam_debug(lam_ssi_cr_did, "blcr:verbose: %d",
                lam_ssi_cr_verbose);
      lam_debug(lam_ssi_cr_did, "blcr:priority: %d", *priority);
    }

    /* Hand back the struct holding the function pointers for all APIs */
    actions = &module_actions;
  }

  return actions;
}


/*
 *	lam_ssi_crlam_blcr_init
 *
 *	Function:	- primary initialization of CR subsystem
 *			- initializes BLCR, enters the checkpoint critical
 *			  section, registers the thread-context and
 *			  signal-context callbacks, and records the world
 *			  and the executable path for later use
 *	Accepts:	- path of the executable (copied with strdup())
 *			- argv and parsed options, forwarded to
 *			  lam_ssi_crlam_base_create_restart_argv()
 *			- process table and its length (pointer is stored,
 *			  not copied)
 *	Returns		- 0 or LAMERROR
 */
int
lam_ssi_crlam_blcr_init(char *path, char **argv, OPT *ad, 
                        struct _gps *world, int world_n)
{
  cr_callback_id_t cr_thread_callback_id, cr_signal_callback_id;

  client_id = cr_init();
  if (client_id < 0)
    return LAMERROR;

  /* 
   * Disable checkpoints till the caller explicitly invokes
   * _enable_checkpoint(). 
   */
  lam_ssi_crlam_blcr_disable_checkpoint();

  /*
   * Register the checkpoint handlers.  Neither callback uses its
   * argument, so NULL is passed for both.
   */
  cr_thread_callback_id = cr_register_callback(crlam_thread_callback, 
                                               NULL, 
                                               CR_THREAD_CONTEXT);
  cr_signal_callback_id = cr_register_callback(crlam_signal_callback, 
                                               NULL, 
                                               CR_SIGNAL_CONTEXT);

  /*
   * Without both handlers a checkpoint request could not be serviced
   * correctly, so a failed registration is a failed initialization.
   */
  if (cr_thread_callback_id < 0 || cr_signal_callback_id < 0) {
    lam_debug(lam_ssi_cr_did, "blcr: failed to register C/R handlers");
    return LAMERROR;
  }

  if (lam_ssi_cr_verbose >= 40)
    lam_debug(lam_ssi_cr_did, "Registered C/R handlers\n");

  cr_world = world;
  cr_world_n = world_n;

  /* the copy is handed to the base restart function at restart time */
  executable = strdup(path);
  if (executable == NULL) {
    lam_perror("strdup of executable path");
    return LAMERROR;
  }

  /* setup the argv to be used at restart */
  if (lam_ssi_crlam_base_create_restart_argv(argv, ad) < 0)
    return LAMERROR;

  return 0;
}


/*
 *	lam_ssi_crlam_blcr_finalize
 *
 *	Function:	- crlam cleanup hook; performs no work
 *	Returns:	- always 0
 */
int
lam_ssi_crlam_blcr_finalize(void)
{
  /* No cleanup is performed here, so success is reported every time. */
  const int status = 0;

  return status;
}


/*
 *      lam_ssi_crlam_blcr_checkpoint
 *
 *      Function:       - crlam checkpoint: fork a helper that asks every
 *                        process in cr_world to checkpoint, wait for the
 *                        helper, then build the application schema that
 *                        is used at restart
 *      Returns:        - 0 or LAMERROR
 */
int lam_ssi_crlam_blcr_checkpoint(void)
{
  pid_t pid;
  pid_t reaped;
  int status;
  int saved_errno;

  /* set the session id; it becomes part of every context file name */
  app_session = lam_getpid();

  /* 
   * fork a child to propagate the checkpoint request to all the MPI
   * processes. 
   */

  if ((pid = fork()) < 0) {
    lam_perror("fork() in async handler");
    return LAMERROR;

  } else if (0 == pid) {
    /* Child process */

    /* Need separate pid so that we can attach to the lamd */
    lam_reset_pid_cache();
    if (kinit(PRCMD)) {
      /* well, we can't recover from this - abort */
      lam_perror("kinit failed");
      kexit(1);
    }

    if (cr_signal_app(cr_world, cr_world_n) != 0)
      kexit(1);

    lam_debug(lam_ssi_cr_did, "Child exiting\n");
    kexit(0);

  } 

  /* Parent process */

  lam_debug(lam_ssi_cr_did, "Parent waiting for pid %d\n", (int) pid);

  /* an interrupted wait is not a failure: just wait again */
  do {
    reaped = waitpid(pid, &status, 0);
  } while (reaped < 0 && errno == EINTR);

  if (reaped != pid) {
    /* keep errno: the reporting call below may change it */
    saved_errno = errno;
    lam_perror("async_checkpoint waitpid");
    lam_debug(lam_ssi_cr_did, "waitpid(%d, ...) failed: %d",
              (int) pid, saved_errno);
    return LAMERROR;
  }
  lam_debug(lam_ssi_cr_did, "Parent reaped pid %d\n", (int) pid);

  /*
   * Only a normal exit with status 0 means the checkpoint request was
   * delivered.  A child that was killed by a signal never reports an
   * exit status, so it has to be treated as a failure as well.
   */
  if (!WIFEXITED(status)) {
    lam_debug(lam_ssi_cr_did,
              "Child %d did not exit normally (wait status 0x%x). "
              "Aborting...", (int) pid, (unsigned int) status);
    return LAMERROR;
  }
  if (WEXITSTATUS(status) != 0) {
    lam_debug(lam_ssi_cr_did, "Bad exit status from child: %d. Aborting...",
              WEXITSTATUS(status));
    return LAMERROR;
  }
  
  /*
   * create an app_schema to be used at restart, and store it in memory.  At
   * restart, this will be written to file and passed to mpirun.
   */
  chkpnt_app_schema = create_app_schema(cr_world, cr_world_n);
  if (chkpnt_app_schema == NULL) {
    perror("create_app_schema");
    lam_debug(lam_ssi_cr_did, "Couldn't make app schema!");
    return LAMERROR;
  }
  
  return 0; 
}


/*
 * lam_ssi_crlam_blcr_continue
 *
 * Function:        - crlam continue hook; there is nothing to do
 * Returns:         - always 0
 */
int
lam_ssi_crlam_blcr_continue(void)
{
  /* Intentionally empty: report success straight away. */
  const int status = 0;

  return status;
}


/*
 * lam_ssi_crlam_blcr_disable_checkpoint
 *
 * Function:        - hold off checkpoints until the caller re-enables
 *                    them with _enable_checkpoint
 */
void
lam_ssi_crlam_blcr_disable_checkpoint(void)
{
  /* the argument is the non-negative id obtained from cr_init() */
  cr_enter_cs(client_id);
}


/*
 * lam_ssi_crlam_blcr_enable_checkpoint
 *
 * Function:        - allow checkpoints again
 */
void
lam_ssi_crlam_blcr_enable_checkpoint(void)
{
  /* must be the same id that was given to cr_enter_cs() */
  cr_leave_cs(client_id);
}

  
/*
 * crlam_thread_callback
 *
 * Function:
 *  Callback registered with CR_THREAD_CONTEXT.  Runs the base checkpoint
 *  preparation, calls cr_checkpoint(), and on the continue path
 *  (cr_checkpoint() returned 0) runs the base continue step.  Any
 *  failure ends the process through kexit(1).
 *
 * Returns: 0
 */
static int
crlam_thread_callback(void *arg)
{
  int cr_rc;

  lam_debug(lam_ssi_cr_did, "Started async C/R handler");

  /* the glue checkpoint function has to succeed before we go on */
  if (0 != lam_ssi_crlam_base_checkpoint(cr_world, cr_world_n)) {
    lam_debug(lam_ssi_cr_did, "mpirun_ASYNC%d: prepare for chkpt failed.",
              getpid());
    kexit(1);
  }

  lam_debug(lam_ssi_cr_did, "mpirun_async calling cr_checkpoint()");
  cr_rc = cr_checkpoint(0);

  if (cr_rc < 0) {
    /* BWB: error handling */
    /* SS: we should probably continue, instead of aborting. */
    lam_debug(lam_ssi_cr_did, "mpirun_async failed checkpoint - %d", cr_rc);
    kexit(1);
  } else if (0 == cr_rc) {
    /* continuing after the checkpoint */
    if (0 != lam_ssi_crlam_base_continue()) {
      lam_debug(lam_ssi_cr_did, "mpirun_ASYNC%d: prepare for continue "
                "failed.", getpid());
      kexit(1);
    }
  }
  /*
   * A positive value needs no action here: restart-work is done only in
   * the signal-handler context.
   */

  lam_debug(lam_ssi_cr_did, "Ending mpirun_async C/R handler");
  return 0;
}


/*
 * crlam_signal_callback
 *
 * Function:
 *  Callback registered with CR_SIGNAL_CONTEXT.  Nothing is prepared
 *  before the checkpoint; cr_checkpoint() is called right away and its
 *  result selects the path: positive runs the base restart function
 *  with the saved executable and app schema, zero just logs that the
 *  process continues, negative ends the process through kexit(1).
 *
 * Returns: 0
 */
static int
crlam_signal_callback(void *arg)
{
  /* nothing to be done before checkpointing. */
  const int cr_rc = cr_checkpoint(0);

  if (cr_rc > 0) {
    /* the glue function does all the Right Things at restart. */
    if (0 != lam_ssi_crlam_base_restart(executable, chkpnt_app_schema))
      lam_debug(lam_ssi_cr_did, "mpirun_SYNC: restart failed.");
  } else if (0 == cr_rc) {
    /* no-op. */
    lam_debug(lam_ssi_cr_did, "mpirun_SYNC: continuing...");
  } else {
    /* BWB: error handling */
    /* SS: we should probably continue, instead of aborting. */
    kexit(1);
  }

  return 0;
}


/*
 * cr_signal_app
 *
 * Function:
 *  Signal all processes running under itself to checkpoint.  For every
 *  entry of cr_world the cr_checkpoint utility is launched on that
 *  entry's node with the arguments "<pid> -f <context file>", and then
 *  one rpwait() is done per launched utility.
 *
 * Returns:
 *  0 when every utility was launched and reported status 0, LAMERROR
 *  otherwise.  All buffers owned by this function are released on every
 *  path.
 */
static int
cr_signal_app(struct _gps *cr_world, int cr_world_n)
{
  int i;
  int ret = LAMERROR;
  char **cr_argv = NULL;
  int cr_argc = 0;
  struct penv cr_env;
  char pid_arg[sizeof(int) * 8];
  char *cr_checkpoint = NULL;
  char *cr_file = NULL;
  struct _gps       *p;
  int cr_pid, cr_idx;
  int nid, pid, status;

  cr_checkpoint = get_cr_util("cr_checkpoint");
  if (cr_checkpoint == NULL) {
    lam_perror("Getting cr_checkpoint");
    return LAMERROR;
  }

  for (i = 0, p = cr_world; i < cr_world_n; ++i, ++p) {
    /* setup kenya request */
    cr_env.pe_rank = p->gps_node;
    strcpy(cr_env.pe_wrkdir, "");
    cr_env.pe_envv = 0;

    /* Set up program arguments */
    sfh_argv_add(&cr_argc, &cr_argv, cr_checkpoint);

    snprintf(pid_arg, sizeof(pid_arg), "%d", p->gps_pid);
    sfh_argv_add(&cr_argc, &cr_argv, pid_arg);

    cr_file = get_checkpoint_file(p->gps_node, p->gps_pid);
    if (cr_file == NULL) {
      lam_perror("Getting checkpoint file");
      goto cleanup;
    }
    lam_debug(lam_ssi_cr_did, "file: %s\n", cr_file);
    sfh_argv_add(&cr_argc, &cr_argv, "-f");
    sfh_argv_add(&cr_argc, &cr_argv, cr_file);

    /*
     * It is safe to expect cr kernel module to be present on all nodes, so
     * src_node can be the same as dst_node when invoking rploadgo
     */
    lam_debug(lam_ssi_cr_did, "Invoking %s %d -f %s on node:%d\n",
              cr_checkpoint, p->gps_pid, cr_file, p->gps_node);

    if (rploadgov(p->gps_node, p->gps_node, cr_checkpoint, RTF_WAIT, cr_argv,
                  &cr_env, &cr_pid, &cr_idx)) {
      /* We can't do anything more than abort... */
      lam_perror("rploadgov failed.");
      goto cleanup;
    }

    /*
     * Reset the per-process state so the next iteration starts clean and
     * the cleanup code below never sees a stale pointer.
     * NOTE(review): only the argument vector itself is released, as
     * before; if sfh_argv_add() stores copies of the strings, those
     * copies are not freed here -- confirm against the sfh API.
     */
    free(cr_file);
    cr_file = NULL;
    free(cr_argv);
    cr_argv = NULL;
    cr_argc = 0;
  }

  lam_debug(lam_ssi_cr_did, "Child rpwaiting\n");

  /* a stripped down version of the pwait() function used in  mpirun. */
  for (i = 0; i < cr_world_n; ++i) {
    if (rpwait(&nid, &pid, &status)) {
      lam_perror("rpwait failed");
      goto cleanup;
    }

    /*
     * if return status of the child is non-zero, then cr_checkpoint did not
     * complete successfully.
     */
    if (status != 0) {
      lam_debug(lam_ssi_cr_did,
                "cr_checkpoint pid %d on node %d returned status %d",
                pid, nid, status);
      lam_perror("rpwait failed");
      goto cleanup;
    }
  }

  lam_debug(lam_ssi_cr_did, "Child done pwaiting\n");
  ret = 0;

cleanup:
  /* free(NULL) is a no-op, so this is safe from every exit point */
  free(cr_file);
  free(cr_argv);
  free(cr_checkpoint);

  return ret;
}


/*
 * get_checkpoint_file
 *
 * Function:
 *  Build the name of the context file for process "pid" on node "node":
 *  "<dir>/context.<app_session>-n<node>-<pid>".  <dir> is chosen from:
 *   1) Specified on mpirun command line (handled in MPIRUN)
 *   2) LAM_MPI_SSI_cr_base_dir env variable
 *   3) LAM_CHECKPOINT_FILE_DIR, set at configure time
 *   4) $HOME
 *  Choices 2 and 3 are only taken when they name an existing directory.
 *
 * Returns:
 *  the file name, or NULL when memory cannot be allocated or when no
 *  directory is available (none of the above usable and $HOME unset).
 *
 * Notes:
 *  caller must free() returned string
 */
static char*
get_checkpoint_file(int node, int pid)
{
  char *tmp = NULL;
  char *crdir = NULL;
  int str_len = 0;
  struct stat crd;

  crdir = getenv("LAM_MPI_SSI_cr_base_dir");

  if ((crdir != NULL) && (strlen(crdir) > 0)
      && (0 == stat(crdir, &crd)) && (S_ISDIR(crd.st_mode))) {

    /* Empty conditional here makes it easier to make the env variable
       / SSI parameter the default choice */
  }

#ifdef LAM_CHECKPOINT_FILE_DIR
  else if ((strlen(LAM_CHECKPOINT_FILE_DIR) != 0)
             && (0 == stat(LAM_CHECKPOINT_FILE_DIR, &crd))
             && (S_ISDIR(crd.st_mode))) {
    crdir = LAM_CHECKPOINT_FILE_DIR;
  } 
#endif

  /* Otherwise, use the $HOME directory */

  else
    crdir = getenv("HOME");

  /*
   * $HOME may be unset.  Without a directory there is nothing sensible
   * to build, and passing NULL on to strlen()/"%s" below would be
   * undefined behavior, so report failure to the caller instead.
   */
  if (crdir == NULL) {
    lam_debug(lam_ssi_cr_did,
              "no usable checkpoint directory: HOME is not set");
    return NULL;
  }

  if (lam_ssi_cr_verbose >= 10)
    lam_debug(lam_ssi_cr_did, "setting checkpoint_file_dir to %s", crdir);

  /* crdir now points to the base directory where to store the context
     files.  Reserve sizeof(int) * 8 characters for each of the three
     integers printed below, plus 4 for the "-n" and "-" separators and
     the terminating NUL */

  str_len = strlen(crdir) + strlen("/context.") + sizeof(int) * 8 * 3 + 4 ;
  tmp = malloc(sizeof(char) * str_len);
  if (tmp == NULL)
    return NULL;

  snprintf(tmp, str_len, "%s/context.%d-n%d-%d",
           crdir, app_session, node, pid);
  if (lam_ssi_cr_verbose >= 10)
    lam_debug(lam_ssi_cr_did, "setting checkpoint_file to %s", tmp);

  return tmp;
}


/*
 * get_cr_util
 *
 * Function:
 *  Return the name under which the BLCR utility "util" is to be
 *  invoked.  When LAM_LBNL_LIBCR_PATH is defined the result is
 *  "<LAM_LBNL_LIBCR_PATH>/bin/<util>"; otherwise it is a plain copy of
 *  "util".
 *
 * Returns:
 *  a newly allocated string, or NULL when the allocation fails
 *
 * Notes:
 *  caller must free() returned string
 */
static char *
get_cr_util(const char *util)
{
#ifdef LAM_LBNL_LIBCR_PATH
  /* Hard-code directory where to find cr_checkpoint binary.  BWB: This
   * should be removed after development
   */
  char *full_name;
  int needed;

  /* prefix + "/bin/" + utility name + terminating NUL */
  needed = strlen(LAM_LBNL_LIBCR_PATH) + strlen("/bin/") + strlen(util) + 1;

  full_name = malloc(sizeof(char) * needed);
  if (full_name != NULL)
    snprintf(full_name, needed, "%s%s%s", LAM_LBNL_LIBCR_PATH, "/bin/", util);

  return full_name;
#else
  /* no configured prefix: the bare utility name is handed back */
  char *name_copy = strdup(util);

  return name_copy;
#endif
}


/*
 * create_app_schema
 *
 * Function:
 *  Build, in memory, the application schema used at restart.  It holds
 *  one line per entry of cr_world, of the form
 *  "n<node> <cr_restart utility> <context file>\n".
 *
 * Returns:
 *  the schema string, which is also stored in chkpnt_app_schema, or
 *  NULL when memory cannot be allocated or the world is empty.  A
 *  failure to obtain the utility name or a context file name ends the
 *  process through kexit(1).
 *
 * Notes:
 *  every temporary buffer is released on all return paths
 */
static char *
create_app_schema(struct _gps *cr_world, int cr_world_n)
{
  int i;
  char *cr_file = NULL;
  char *cr_restart = NULL;
  char *asc_line = NULL;
  int asc_line_len = 0;
  struct _gps       *p;

  char *tmp_as = NULL;
  char *grown = NULL;
  int tmp_as_len = 0;

  cr_restart = get_cr_util("cr_restart");
  if (cr_restart == NULL) {
    lam_perror("Getting cr_restart");
    kexit(1);
  }

  for (i = 0, p = cr_world; i < cr_world_n; ++i, ++p) {
    /* get file name */
    cr_file = get_checkpoint_file(p->gps_node, p->gps_pid);
    if (cr_file == NULL) {
      lam_perror("Getting checkpoint file name");
      kexit(1);
    }

    /* allocate space for this line */
    asc_line_len = strlen(cr_file) + strlen(cr_restart) + 5 +
      sizeof(int) * 8 + 1;
    asc_line = malloc(sizeof(char) * asc_line_len);
    if (asc_line == NULL)
      goto fail;

    /* make the line */
    snprintf(asc_line, asc_line_len, "n%d %s %s\n",
             p->gps_node, cr_restart, cr_file);

    /*
     * add line to schema.  The result of realloc() goes into a separate
     * pointer so that the schema built so far is still reachable (and
     * can be freed) when the allocation fails.
     */
    grown = realloc(tmp_as, sizeof(char) * (tmp_as_len + asc_line_len));
    if (grown == NULL)
      goto fail;
    if (tmp_as == NULL) {
      /* very first line: start from an empty string for strcat() */
      grown[0] = '\0';
    }
    tmp_as = grown;
    tmp_as_len += asc_line_len;

    strcat(tmp_as, asc_line);

    free(cr_file);
    cr_file = NULL;
    free(asc_line);
    asc_line = NULL;
    asc_line_len = 0;
  }

  /* not needed any more, whatever the outcome */
  free(cr_restart);

  chkpnt_app_schema = tmp_as;

  if (chkpnt_app_schema == NULL) {
    /* empty world: no line was ever added */
    lam_debug(lam_ssi_cr_did, "checkpoint app schema build failed");
    return NULL;
  }

  return tmp_as;

fail:
  /* out of memory: drop everything gathered so far */
  free(asc_line);
  free(cr_file);
  free(tmp_as);
  free(cr_restart);

  return NULL;
}
