PTL Logo

Fault Tolerance Research @ Open Systems Laboratory

Application Level Checkpoint/Restart Interfaces

  •  

Examples

The following examples show how the library can support step-based applications.

Blocking C/R:

Application that only uses the blocking checkpoint/restart API without the help of an asynchronous checkpointing thread.

Tools Used

API Used

Compiling

shell$ mpicc blocking.c -o blocking

Job Submission Script:

#!/bin/bash
#PBS no-kill=true

# Job script for the blocking C/R example: run the application once and,
# if mpirun reports a failure, restart it a single time from the last
# checkpoint. All paths are quoted so that a temporary directory, user
# name, or handle containing whitespace does not break the script.

# A local /tmp available on each node (Default: /tmp)
export APPCR_TMPDIR=/tmp/

# General Reference for this application
export APPCR_HANDLE=example

# Suggested path to store the checkpoint
export APPCR_STORAGE_DIR="/ckpt-fs/$USER"

# Setup the CR System
cr-setup
# Source the generated script to load environment variables
source "$APPCR_TMPDIR/$USER-$APPCR_HANDLE.sh"

# Run the application
mpirun blocking <args>

# Check to see if mpirun succeeded (numeric comparison of the exit status)
if [ $? -ne 0 ] ; then
  # Synchronize the file system before restart
  cr-sync

  # Setup restart of the application
  cr-setup --restart
  # Source the generated script to load environment variables
  source "$APPCR_TMPDIR/$USER-$APPCR_HANDLE.sh"

  # Relaunch the application
  mpirun blocking <args>
fi

# Synchronize the file system before exiting
cr-sync

# Be sure to remove this script before exiting
rm "$APPCR_TMPDIR/$USER-$APPCR_HANDLE.sh"

blocking.c: [Download]

/* asprintf() is a GNU/BSD extension; request it before any system header. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <mpi.h>
#include <mpi-ext.h>

/* Step range of the example: main() starts cur_step at INIT_STEPS (unless
 * restarting), advances it by INC_STEPS per loop iteration, and stops once
 * it reaches MAX_STEPS.
 */
#define INIT_STEPS  10
#define INC_STEPS   10
#define MAX_STEPS  100

/* Callbacks registered with the CR library in main(). */
int my_checkpoint_cb(char * dir, int *seq, MPI_Comm ckpt_comm);
int my_restart_cb(char * dir, int seq, MPI_Comm ckpt_comm);

/* Internal helpers that model one unit of application work. */
static int setup_work(int step);
static int do_work(int step);
static int update_checkpoint_structure(int step);

/* Step of the current work unit: written by setup_work() and by
 * my_restart_cb() when it reads a checkpoint file back in.
 */
int global_step = 0;
/* Step saved by the checkpoint callback: written by
 * update_checkpoint_structure() and read by my_checkpoint_cb().
 */
int ckpt_step = 0;

/* Rank and size in MPI_COMM_WORLD, filled in by main(). */
int mpi_rank, mpi_size;

/*
 * Blocking checkpoint/restart example.
 *
 * Initializes MPI and the CR library, registers the checkpoint and restart
 * callbacks, then iterates from the starting step up to MAX_STEPS in
 * increments of INC_STEPS, taking one blocking checkpoint per iteration.
 * When the "am_i_restarting" attribute is "1", the loop resumes from
 * global_step instead of INIT_STEPS.
 *
 * Returns 0.
 */
int main(int argc, char **argv) {
  char *str_value = NULL;
  int cur_step;

  /* Initialize the MPI Library (MPI_Init takes pointers to argc/argv) */
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

  /* Initialize the CR Library */
  CR_Init(MPI_COMM_WORLD);

  /* Register callback functions */
  CR_Register_checkpoint_cb(my_checkpoint_cb);
  CR_Register_restart_cb(my_restart_cb);

  /* Start from the first step unless the library reports a restart */
  cur_step = INIT_STEPS;
  CR_Attr_get("am_i_restarting", &str_value);
  if( NULL != str_value && 1 == atoi(str_value) ) {
    printf("%3d of %3d) I am restarting from step %2d\n",
           mpi_rank, mpi_size, global_step);
    cur_step = global_step;
    /* Clear the flag so the restart branch is not taken again */
    CR_Attr_set("am_i_restarting", "0");
  }

  /* Setup initial work unit */
  setup_work(cur_step);

  for( ; cur_step < MAX_STEPS; cur_step += INC_STEPS) {
    /* Do some computation */
    do_work(cur_step);

    /* Protect and update the checkpoint structure */
    CR_Protect_enter();
    update_checkpoint_structure(cur_step);
    CR_Protect_leave();

    /* Take a blocking checkpoint */
    CR_Checkpoint();

    /* Setup next work unit */
    setup_work(cur_step+INC_STEPS);
  }

  /* Finalize the CR Library */
  CR_Finalize();

  /* Finalize the MPI Library */
  MPI_Finalize();

  /* NOTE(review): the example frees the string returned by CR_Attr_get,
   * i.e. it treats it as caller-owned -- confirm against the API docs.
   * free(NULL) is a no-op, so no guard is required.
   */
  free(str_value);
  str_value = NULL;

  return 0;
}

/*
 * Checkpoint callback: writes ckpt_step to "<dir>/my-ckpt.txt".
 *
 * dir       - directory in which the checkpoint file is created
 * seq       - out: set to ckpt_step, the sequence number of this checkpoint
 * ckpt_comm - communicator to use for any communication in the callback
 *
 * Returns 0 on success and -1 if the path cannot be built or the file
 * cannot be opened, written, or closed.
 */
int my_checkpoint_cb(char * dir, int *seq, MPI_Comm ckpt_comm)
{
  FILE *ckpt_fd = NULL;
  char *loc_path = NULL;
  int ret = -1;

  /* Optionally: Synchronize all processes so that they are writing
   * similar sequence numbers. You can only use the ckpt_comm
   * provided in order to avoid deadlock conditions.
   */
  MPI_Barrier(ckpt_comm);

  printf("%3d of %3d) Checkpoint CB: <%s> <%d> -> %d\n",
         mpi_rank, mpi_size, dir, *seq, ckpt_step);

  *seq = ckpt_step;

  if( asprintf(&loc_path, "%s/my-ckpt.txt", dir) < 0 ) {
    /* The pointer is unspecified after a failed asprintf; do not free it */
    loc_path = NULL;
    goto cleanup;
  }

  if( NULL == (ckpt_fd = fopen(loc_path, "w")) ) {
    goto cleanup;
  }

  if( fprintf(ckpt_fd, "%d\n", ckpt_step) >= 0 ) {
    ret = 0;
  }

  /* fclose flushes buffered data, so a failure here means a bad checkpoint */
  if( 0 != fclose(ckpt_fd) ) {
    ret = -1;
  }

 cleanup:
  /* Release the path on every exit, including the fopen failure path */
  free(loc_path);

  /* Successful checkpoint (return non-zero if failure) */
  return ret;
}

/*
 * Restart callback: reads the saved step from "<dir>/my-ckpt.txt" into
 * global_step.
 *
 * dir       - directory that holds the checkpoint file
 * seq       - sequence number passed in by the library (only printed here)
 * ckpt_comm - communicator to use for any communication in the callback
 *
 * Returns 0 on success and -1 if the path cannot be built, the file cannot
 * be opened, or it does not contain an integer. global_step is left
 * untouched on failure.
 */
int my_restart_cb(char * dir, int seq, MPI_Comm ckpt_comm)
{
  FILE *ckpt_fd = NULL;
  char *loc_path = NULL;
  int saved_step = 0;
  int ret = -1;

  /* Optionally: Synchronize all processes so that they are restarting
   * from similar sequence numbers. You can only use the ckpt_comm
   * provided in order to avoid deadlock conditions.
   */
  MPI_Barrier(ckpt_comm);

  printf("%3d of %3d) Restart CB: <%s> <%d>\n",
         mpi_rank, mpi_size, dir, seq);

  if( asprintf(&loc_path, "%s/my-ckpt.txt", dir) < 0 ) {
    /* The pointer is unspecified after a failed asprintf; do not free it */
    loc_path = NULL;
    goto cleanup;
  }

  if( NULL == (ckpt_fd = fopen(loc_path, "r")) ) {
    goto cleanup;
  }

  /* Only accept the checkpoint if exactly one integer was parsed */
  if( 1 == fscanf(ckpt_fd, "%d", &saved_step) ) {
    global_step = saved_step;
    ret = 0;
  }

  fclose(ckpt_fd);

  if( 0 == ret ) {
    printf("%3d of %3d) Restart CB: <%s> <%d> -> %d\n",
           mpi_rank, mpi_size, dir, seq, global_step);
  }

 cleanup:
  /* Release the path on every exit, including the fopen failure path */
  free(loc_path);

  /* Successful restart (return non-zero if failure) */
  return ret;
}

/* Record `step` as the step of the current work unit. Always returns 0. */
static int setup_work(int step)
{
    const int next_unit = step;

    global_step = next_unit;

    return 0;
}

/*
 * Simulate one unit of work: synchronize every rank in MPI_COMM_WORLD,
 * let rank 0 report the step, then sleep for one second.
 * Always returns 0.
 */
static int do_work(int step)
{
    const int is_reporter = (mpi_rank == 0);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Only one rank prints the progress line */
    if( is_reporter ) {
        printf("%3d of %3d) Step: %3d / %3d\n",
               mpi_rank, mpi_size, step, MAX_STEPS);
    }

    /* Stand-in for the actual computation */
    sleep(1);

    return 0;
}

/* Store `step` as the value the checkpoint callback writes out.
 * Always returns 0.
 */
static int update_checkpoint_structure(int step)
{
    const int completed_step = step;

    ckpt_step = completed_step;

    return 0;
}

Non-Blocking C/R:

Application that uses the non-blocking checkpoint/restart API. If the checkpoint thread is available then it is used to make concurrent progress, otherwise most/all of the work is done during the CR_Icheckpoint operation.
Only showing the difference from the Blocking C/R example.

Tools Used

API Used

non-blocking.c: [Download]

#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h>

/* Excerpt of the non-blocking variant: only the parts that differ from the
 * blocking example are shown; "..." marks elided code.
 */
int main(int argc, char **argv) {
  CR_Request_t ckpt_req;  /* request filled in by CR_Icheckpoint, passed to CR_Wait */
  int status;             /* filled in by CR_Wait */

  ...

  /* Enable the checkpointing thread */
  CR_Attr_set("enable_cr_thread", "1");

  /* Setup the initial work unit */
  setup_work(cur_step);

  for( ; cur_step < MAX_STEPS; cur_step += INC_STEPS) {
    /* Do some computation */
    do_work(cur_step);

    /* Protect and update the checkpoint structure */
    CR_Protect_enter();
    update_checkpoint_structure(cur_step);
    CR_Protect_leave();

    /* Take a non-blocking checkpoint */
    CR_Icheckpoint(&ckpt_req);

    /* Setup the next work unit (runs between starting the checkpoint
     * and waiting for it)
     */
    setup_work(cur_step+INC_STEPS);

    /* Wait for the checkpoint to finish */
    CR_Wait(&ckpt_req, &status);
  }

  ...
}

Timer Triggered C/R:

Application that uses the concurrent checkpointing thread to start a checkpoint as triggered by a timer event. The timer is activated per process without any coordination by the library. If the application requires coordination, then it is the responsibility of the application to do so outside of the Application C/R Library (e.g., by calling MPI_Barrier from the checkpoint callback).
Only showing the difference from the Blocking C/R and Non-Blocking C/R examples below.

Tools Used

API Used

timer.c: [Download]

#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h>

/* Excerpt of the timer-triggered variant: only the parts that differ from
 * the earlier examples are shown; "..." marks elided code. The loop contains
 * no explicit checkpoint call.
 */
int main(int argc, char **argv) {
  ...

  /* Enable the checkpointing thread */
  CR_Attr_set("enable_cr_thread", "1");

  /* Checkpoint approximately every minute
   * NOTE(review): assumes the "checkpoint_freq" value is expressed in
   * minutes -- confirm against the library documentation.
   */
  CR_Attr_set("checkpoint_freq", "1");

  /* Setup initial work unit */
  setup_work(cur_step);

  for( ; cur_step < MAX_STEPS; cur_step += INC_STEPS) {
    /* Do some computation */
    do_work(cur_step);

    /* Protect and update the checkpoint structure */
    CR_Protect_enter();
    update_checkpoint_structure(cur_step);
    CR_Protect_leave();

    /* Setup next work unit */
    setup_work(cur_step+INC_STEPS);
  }

  ...
}

CIFTS FTB Triggered C/R:

An example of how to use the CHKPT_REQ CIFTS FTB event to trigger a checkpoint in a simple application.
For this application, we will assume that the CIFTS FTB is the only mechanism to request a checkpoint from the application. The application can combine the CIFTS FTB mechanism with any of the other checkpoint request mechanisms available in the library.

This section assumes that you have correctly installed the CIFTS FTB library and have it running on your machine. For instructions on how to set up the FTB, see the following [link].
You will also need to make sure you configure the Application C/R library with support for the CIFTS FTB.

Only showing the difference from the Blocking C/R and Non-Blocking C/R examples.

Tools Used

API Used

counter.c: [Download]

#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h>

/* Excerpt of the FTB-triggered variant: only the parts that differ from the
 * earlier examples are shown; "..." marks elided code. The loop contains no
 * explicit checkpoint call.
 */
int main(int argc, char **argv) {
  ...

  /* Enable the checkpointing thread */
  CR_Attr_set("enable_cr_thread", "1");

  /* Setup initial work unit */
  setup_work(cur_step);

  for( ; cur_step < MAX_STEPS; cur_step += INC_STEPS) {
    /* Do some computation */
    do_work(cur_step);

    /* Protect and update the checkpoint structure */
    CR_Protect_enter();
    update_checkpoint_structure(cur_step);
    CR_Protect_leave();

    /* Setup next work unit */
    setup_work(cur_step+INC_STEPS);
  }

  ...
}

Below is a small program that throws the CHKPT_REQ CIFTS FTB event to trigger a checkpoint in a simple application.

ftb_throw_ckpt_req.c: [Download]

#include <stdio.h>
#include <libftb.h>

/* Excerpt of the event publisher; "..." marks elided code. The variables
 * ret, ftb_client_handle, event_handle and exit_status, and the cleanup
 * label, are defined in the elided parts.
 */
int main(int argc, char **argv) {
  ...

  /* Publish the checkpoint-request event; on failure report the event
   * name, record exit status -3 and jump to cleanup.
   */
  if( FTB_SUCCESS != (ret = FTB_Publish(ftb_client_handle,
                                        APPCR_EVENT_CKPT_REQ,
                                        NULL, &event_handle)) ) {
      fprintf(stderr, "Error: FTB_Publish(%s) failed!\n", APPCR_EVENT_CKPT_REQ);
      exit_status = -3;
      goto cleanup;
  }

  ...
}