Some examples showing how to support step-based applications.
Application that only uses the blocking checkpoint/restart API without the help of an asynchronous checkpointing thread.
Attributes used: am_i_restarting
shell$ mpicc blocking.c -o blocking
#!/bin/bash #PBS no-kill=true # A local /tmp available on each node (Default: /tmp) export APPCR_TMPDIR=/tmp/ # General Reference for this application export APPCR_HANDLE=example # Suggested path to store the checkpoint export APPCR_STORAGE_DIR=/ckpt-fs/$USER # Setup the CR System cr-setup # Source the generated script to load environment variables source $APPCR_TMPDIR/$USER-$APPCR_HANDLE.sh # Run the application mpirun blocking <args> # Check to see if mpirun succeeded if [ $? != 0 ] ; then # Synchronize the file system before restart cr-sync # Setup restart of the application cr-setup --restart # Source the generated script to load environment variables source $APPCR_TMPDIR/$USER-$APPCR_HANDLE.sh # Relaunch the application mpirun blocking <args> fi # Synchronize the file system before exiting cr-sync # Be sure to remove this script before exiting rm $APPCR_TMPDIR/$USER-$APPCR_HANDLE.sh
#define _GNU_SOURCE /* asprintf() is a GNU/BSD extension */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <mpi.h>
#include <mpi-ext.h>
/* Step at which a fresh (non-restarted) run begins. */
#define INIT_STEPS 10
/* Amount the step counter advances on each loop iteration in main(). */
#define INC_STEPS 10
/* The main loop runs while the current step is below this bound. */
#define MAX_STEPS 100
/*
 * Checkpoint/restart callbacks registered with the CR library in main().
 * Both return 0 on success and non-zero on failure.
 */
int my_checkpoint_cb(char * dir, int *seq, MPI_Comm ckpt_comm);
int my_restart_cb(char * dir, int seq, MPI_Comm ckpt_comm);
/* Application helpers; each one always returns 0. */
static int setup_work(int step);
static int do_work(int step);
static int update_checkpoint_structure(int step);
/* Step being worked on: written by setup_work() and by my_restart_cb(). */
int global_step = 0;
/* Step saved by my_checkpoint_cb(): written by update_checkpoint_structure(). */
int ckpt_step = 0;
/* Rank and size in MPI_COMM_WORLD, filled in by main(). */
int mpi_rank, mpi_size;
/*
 * Blocking checkpoint/restart example.
 *
 * Initializes MPI and the CR library, registers the checkpoint and restart
 * callbacks, then iterates from the starting step up to MAX_STEPS in
 * increments of INC_STEPS, taking a blocking checkpoint after every step.
 *
 * If the "am_i_restarting" attribute reads as 1, the loop starts from
 * global_step (the value my_restart_cb() reads back from the checkpoint
 * file; presumably the CR library has invoked that callback by this point)
 * instead of INIT_STEPS, and the attribute is reset to "0".
 *
 * Returns 0.
 */
int main(int argc, char **argv) {
    char *str_value = NULL;
    int cur_step = INIT_STEPS;

    /* Initialize the MPI Library; MPI_Init takes POINTERS to argc/argv */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

    /* Initialize the CR Library */
    CR_Init(MPI_COMM_WORLD);

    /* Register callback functions */
    CR_Register_checkpoint_cb(my_checkpoint_cb);
    CR_Register_restart_cb(my_restart_cb);

    /* Resume from the restored step when this run is a restart */
    CR_Attr_get("am_i_restarting", &str_value);
    if( NULL != str_value && 1 == atoi(str_value) ) {
        printf("%3d of %3d) I am restarting from step %2d\n",
               mpi_rank, mpi_size, global_step);
        cur_step = global_step;
        CR_Attr_set("am_i_restarting", "0");
    }

    /* Setup initial work unit */
    setup_work(cur_step);

    for( ; cur_step < MAX_STEPS; cur_step += INC_STEPS) {
        /* Do some computation */
        do_work(cur_step);

        /* Protect and update the checkpoint structure */
        CR_Protect_enter();
        update_checkpoint_structure(cur_step);
        CR_Protect_leave();

        /* Take a blocking checkpoint */
        CR_Checkpoint();

        /* Setup next work unit */
        setup_work(cur_step+INC_STEPS);
    }

    /* Finalize the CR Library */
    CR_Finalize();

    /* Finalize the MPI Library */
    MPI_Finalize();

    /* free(NULL) is a no-op, so no guard is needed */
    free(str_value);
    str_value = NULL;

    return 0;
}
/*
 * Checkpoint callback: writes ckpt_step to "<dir>/my-ckpt.txt".
 *
 * dir:       directory in which the checkpoint file is created.
 * seq:       in/out sequence number; overwritten with ckpt_step.
 * ckpt_comm: the only communicator that may be used inside this callback.
 *
 * Returns 0 on success and -1 if the path cannot be built, the file cannot
 * be opened, or the write/close fails.
 */
int my_checkpoint_cb(char * dir, int *seq, MPI_Comm ckpt_comm)
{
    FILE *ckpt_fd = NULL;
    char *loc_path = NULL;
    int write_ok;
    int ret = -1;

    /* Optionally: Synchronize all processes so that they are writing
     * similar sequence numbers. You can only use the ckpt_comm
     * provided in order to avoid deadlock conditions.
     */
    MPI_Barrier(ckpt_comm);

    printf("%3d of %3d) Checkpoint CB: <%s> <%d> -> %d\n",
           mpi_rank, mpi_size, dir, *seq, ckpt_step);
    *seq = ckpt_step;

    if( asprintf(&loc_path, "%s/my-ckpt.txt", dir) < 0 ) {
        /* The pointer's contents are undefined after a failed asprintf */
        loc_path = NULL;
        goto cleanup;
    }

    if( NULL == (ckpt_fd = fopen(loc_path, "w")) ) {
        goto cleanup;
    }

    write_ok = (fprintf(ckpt_fd, "%d\n", ckpt_step) >= 0);
    /* fclose flushes buffered data, so its result matters for a write stream */
    if( 0 != fclose(ckpt_fd) ) {
        write_ok = 0;
    }
    if( write_ok ) {
        /* Successful checkpoint */
        ret = 0;
    }

 cleanup:
    free(loc_path);
    return ret;
}
/*
 * Restart callback: reads the saved step from "<dir>/my-ckpt.txt" into
 * global_step.
 *
 * dir:       directory containing the checkpoint file.
 * seq:       sequence number being restarted from (only printed here).
 * ckpt_comm: the only communicator that may be used inside this callback.
 *
 * Returns 0 on success and -1 if the path cannot be built, the file cannot
 * be opened, or no integer can be read from it. global_step is left
 * untouched on failure.
 */
int my_restart_cb(char * dir, int seq, MPI_Comm ckpt_comm)
{
    FILE *ckpt_fd = NULL;
    char *loc_path = NULL;
    int restored_step = 0;
    int items_read;
    int ret = -1;

    /* Optionally: Synchronize all processes so that they are restarting
     * from similar sequence numbers. You can only use the ckpt_comm
     * provided in order to avoid deadlock conditions.
     */
    MPI_Barrier(ckpt_comm);

    if( asprintf(&loc_path, "%s/my-ckpt.txt", dir) < 0 ) {
        /* The pointer's contents are undefined after a failed asprintf */
        loc_path = NULL;
        goto cleanup;
    }

    printf("%3d of %3d) Restart CB: <%s> <%d>\n",
           mpi_rank, mpi_size, dir, seq);

    if( NULL == (ckpt_fd = fopen(loc_path, "r")) ) {
        goto cleanup;
    }

    items_read = fscanf(ckpt_fd, "%d", &restored_step);
    fclose(ckpt_fd);
    if( 1 != items_read ) {
        /* Empty or malformed checkpoint file: do not report success */
        goto cleanup;
    }
    global_step = restored_step;

    printf("%3d of %3d) Restart CB: <%s> <%d> -> %d\n",
           mpi_rank, mpi_size, dir, seq, global_step);

    /* Successful restart */
    ret = 0;

 cleanup:
    free(loc_path);
    return ret;
}
/*
 * Record the step the application is about to work on.
 *
 * step: value to store in the file-level global_step.
 *
 * Always returns 0.
 */
static int setup_work(int step)
{
    const int upcoming_step = step;

    global_step = upcoming_step;

    return 0;
}
/*
 * Perform one unit of (simulated) work for the given step.
 *
 * All ranks meet at a barrier on MPI_COMM_WORLD, rank 0 prints a progress
 * line, and then every rank sleeps for one second.
 *
 * step: step number shown in the progress line.
 *
 * Always returns 0.
 */
static int do_work(int step)
{
    const int is_root = (mpi_rank == 0);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Only one rank reports progress */
    if (is_root) {
        printf("%3d of %3d) Step: %3d / %3d\n",
               mpi_rank, mpi_size, step, MAX_STEPS);
    }

    sleep(1);

    return 0;
}
/*
 * Store the step that the next checkpoint should save.
 *
 * step: value to store in the file-level ckpt_step, which
 *       my_checkpoint_cb() writes to the checkpoint file.
 *
 * Always returns 0.
 */
static int update_checkpoint_structure(int step)
{
    const int completed_step = step;

    ckpt_step = completed_step;

    return 0;
}
Application that uses the non-blocking checkpoint/restart API.
If the checkpoint thread is available then it is used to make concurrent
progress, otherwise most/all of the work is done during the
CR_Icheckpoint operation.
Only showing the difference from the Blocking C/R
example.
Attributes used: am_i_restarting, enable_cr_thread
#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h>
/*
 * Non-blocking checkpoint/restart example (fragment).
 *
 * Only the parts that differ from the blocking example are shown; the
 * "..." lines stand for code elided from this listing. Each iteration
 * starts a checkpoint with CR_Icheckpoint(), sets up the next work unit,
 * and only then waits for the checkpoint with CR_Wait().
 */
int main(int argc, char **argv) {
/* Handle identifying the outstanding non-blocking checkpoint */
CR_Request_t ckpt_req;
/* Filled in by CR_Wait(); presumably the checkpoint's completion status */
int status;
...
/* Enable the checkpointing thread */
CR_Attr_set("enable_cr_thread", "1");
/* Setup the initial work unit */
setup_work(cur_step);
for( ; cur_step < MAX_STEPS; cur_step += INC_STEPS) {
/* Do some computation */
do_work(cur_step);
/* Protect and update the checkpoint structure */
CR_Protect_enter();
update_checkpoint_structure(cur_step);
CR_Protect_leave();
/* Take a non-blocking checkpoint */
CR_Icheckpoint(&ckpt_req);
/* Setup the next work unit (issued before waiting on the checkpoint) */
setup_work(cur_step+INC_STEPS);
/* Wait for the checkpoint to finish */
/* NOTE(review): status is never inspected in the visible code */
CR_Wait(&ckpt_req, &status);
}
...
}
Application that uses the concurrent checkpointing thread to start a checkpoint
as triggered by a timer event. The timer is activated per process without any
coordination by the library. If the application requires coordination, then it is
the responsibility of the application to do so outside of the Application C/R
Library (e.g., by calling MPI_Barrier from the checkpoint
callback).
Only showing the difference from the Blocking C/R
and Non-Blocking C/R
examples below.
Attributes used: am_i_restarting, checkpoint_freq, enable_cr_thread
#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h>
/*
 * Timer-triggered checkpoint example (fragment).
 *
 * Only the parts that differ from the blocking and non-blocking examples
 * are shown; the "..." lines stand for code elided from this listing.
 * The loop contains no explicit checkpoint call: the checkpointing thread
 * is enabled and a checkpoint frequency is set through attributes.
 */
int main(int argc, char **argv) {
...
/* Enable the checkpointing thread */
CR_Attr_set("enable_cr_thread", "1");
/* Checkpoint approximately every minute */
/* NOTE(review): the unit of "checkpoint_freq" is not visible here;
 * confirm that the value "1" means one minute. */
CR_Attr_set("checkpoint_freq", "1");
/* Setup initial work unit */
setup_work(cur_step);
for( ; cur_step < MAX_STEPS; cur_step += INC_STEPS) {
/* Do some computation */
do_work(cur_step);
/* Protect and update the checkpoint structure */
CR_Protect_enter();
update_checkpoint_structure(cur_step);
CR_Protect_leave();
/* Setup next work unit */
setup_work(cur_step+INC_STEPS);
}
...
}
An example of how to use the CHKPT_REQ CIFTS FTB
event to trigger a checkpoint in a simple application.
For this application, we will assume that the CIFTS FTB is the only mechanism
to request a checkpoint from the application. The application can combine
the CIFTS FTB mechanism with any of the other checkpoint request mechanisms
available in the library.
This section assumes that you have correctly installed the CIFTS FTB library,
and have it running on your machine. For instructions on how to set up the
FTB see the following [link].
You will also need to make sure you configure
the Application C/R library with support for the CIFTS FTB.
Only showing the difference from the Blocking C/R and Non-Blocking C/R examples.
Attributes used: am_i_restarting, enable_cr_thread
#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h>
/*
 * FTB-triggered checkpoint example (fragment).
 *
 * Only the parts that differ from the blocking and non-blocking examples
 * are shown; the "..." lines stand for code elided from this listing.
 * The loop contains no explicit checkpoint call: per the surrounding text,
 * checkpoints are requested from outside via the CHKPT_REQ CIFTS FTB event.
 */
int main(int argc, char **argv) {
...
/* Enable the checkpointing thread */
CR_Attr_set("enable_cr_thread", "1");
/* Setup initial work unit */
setup_work(cur_step);
for( ; cur_step < MAX_STEPS; cur_step += INC_STEPS) {
/* Do some computation */
do_work(cur_step);
/* Protect and update the checkpoint structure */
CR_Protect_enter();
update_checkpoint_structure(cur_step);
CR_Protect_leave();
/* Setup next work unit */
setup_work(cur_step+INC_STEPS);
}
...
}
Below is a small program that throws the CHKPT_REQ CIFTS FTB event to trigger a checkpoint in a simple application.
#include <stdio.h>
#include <libftb.h>
/*
 * Publisher side of the FTB-triggered checkpoint example (fragment).
 *
 * The "..." lines stand for code elided from this listing; ret,
 * ftb_client_handle, event_handle, exit_status and the cleanup label are
 * all declared in those elided parts.
 */
int main(int argc, char **argv) {
...
/* Publish the checkpoint-request event; on failure report the error,
 * record exit status -3 and jump to the cleanup label. */
if( FTB_SUCCESS != (ret = FTB_Publish(ftb_client_handle,
APPCR_EVENT_CKPT_REQ,
NULL, &event_handle)) ) {
fprintf(stderr, "Error: FTB_Publish(%s) failed!\n", APPCR_EVENT_CKPT_REQ);
exit_status = -3;
goto cleanup;
}
...
}