Tuesday, September 29, 2015

MAG ecflow jobs

MAG ecflow job:
1. admin  ( /ecfmag/ecfnets/scripts/admin/mag )
JMAG_CKQUOTA @ wcoss (mag_ckquota.ecf @ ecf1)
export job="jmag_ckquota"
export pid=$$
export DATA=${DATAROOT}/${job}.${pid}

rm -rf $DATA
mkdir -p $DATA

cd $DATA
$SCRIPTSmag/exmag_checkquota.sh.ecf
If run on gyre :  
        $  /usr/lpp/mmfs/bin/mmlsquota    -j    nco-sib    gpfs-gd2
If run on tide :
        $  /usr/lpp/mmfs/bin/mmlsquota    -j   nco-sib    gpfs-td2
cd /tmpnwprd1
rm -rf $DATA
log file : /nco/sib/magdev/com/output/prod/today/mag_ckquota.o%J

JMAG_CLEANUP @ wcoss (mag_cleanup.ecf @ ecf1)

export job="jmag_cleanup"
export pid=$$
export DATA=${DATAROOT}/${job}.${pid}

rm -rf $DATA
mkdir -p $DATA

cd $DATA
#1) clean up wcoss
$SCRIPTSmag/exmag_cleanup.sh.ecf

# Clean up GIF files : /nco/sib/magdev/com/mag/test/gifs
    find ./${i} -mindepth 1 -maxdepth 1 -mmin +1560 -type d
    find ./${i} -mindepth 1 -maxdepth 1 -mmin +1560 -type d -exec rm -rf '{}' ';'

# Clean up status files : /nco/sib/magdev/com/mag/test/status
    find ${LOG_DIR} -mindepth 1 -maxdepth 1 -mtime +3 -type d
    find ${LOG_DIR} -mindepth 1 -maxdepth 1 -mtime +3 -type d -exec rm -rf '{}' ';'

# Removing  log files : /nco/sib/magdev/com/output/prod/today
    find ${LOG_DIR} -mindepth 1 -maxdepth 1 -mtime +3 -type d
    find ${LOG_DIR} -mindepth 1 -maxdepth 1 -mtime +3 -type d -exec rm -rf '{}' ';'

# Clean up tmp dirs : /nco/sib/magdev/tmpnwprd1
    find ${TMP_DIR} -mindepth 1 -maxdepth 1 -mtime +3 -type d
    find ${TMP_DIR} -mindepth 1 -maxdepth 1 -mtime +3 -type d -exec rm -rf '{}' ';'

#2) clean up rzdm
/usr/bin/ssh -t -t  mag@ncorzdm.ncep.noaa.gov "bash /home/people/nco/mag/bin/exmag_cleanup_test.sh.ecf"

cd /tmpnwprd1
rm -rf $DATA

log file : /nco/sib/magdev/com/output/prod/today/mag_cleanup.o%J


JMAG_SETUP @ wcoss (ecmag_maintain.ecf  @ ecf1)
Purpose : Create a dated sub-directory (e.g. 20150915) and point the symbolic link "today" at the current day's directory.
$ cd  /nco/sib/magdev/com/output/prod
$ ln   -vTsf   $out_dir/prod/$PDY $out_dir/prod/today

ln     [OPTION]...   [-T]     TARGET    LINK_NAME
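-v : verbose; print the name of each linked file
-T : always treat LINK_NAME as a normal file (never as a directory to create the link inside)
-s : make a symbolic link instead of a hard link
-f : force; remove an existing destination file (here, the previous "today" link)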

export DATA=${NWROOT}/tmpnwprd1/JMAG_SETUP.${pid}

mkdir -p $DATA
cd $DATA
env
${NWROOT}/mag.${mag_ver}/scripts/exmag_setup.sh.ecf
${NWROOT}/mag.${mag_ver}/scripts/exMagDirClean.sh.ecf

cd ${NWROOT}/tmpnwprd1
rm -rf $DATA
exit

Log file :  /nco/sib/magdev/com/output/prod/today/ecmag_maintain.o%J
2. mag processor
( /ecfmag/ecfnets/scripts/mag/mag_processor/${model}/mag_${model}_processor.ecf  @ ecf1)

JMAG @ wcoss
export job=${job:-jmag.${MODEL}}
export pid=$$
export DATA=${DATAROOT}/${job}.${pid}
rm -rf $DATA
mkdir -p $DATA

cd $DATA

export jlogfile=${jlogfile:-$COMROOT/logs/jlogfiles/jlogfile.${job}.${pid}}

if [ "$hurr_model" == "yes" ]; then
mag_script=exmag_processor_hurr.pl
else
mag_script=exmag_processor.pl
fi

${SCRIPTSmag}/${mag_script}      ${MAGPROCPL_FLAGS}

msg="JOB $job ENDED NORMALLY."
postmsg $jlogfile "$msg"

rm -rf $DATA
date


Eg: /ecfmag/ecfnets/scripts/mag/mag_processor/gfs/mag_gfs_processor.ecf
export MODEL=gfs
export MP_CMDFILE=poe_script
export HOMEmag=$NWROOT/mag.$mag_ver
$HOMEmag/jobs/JMAG


3.  send2web
JSNDMAG2WEB
export job=${job:-jmag.${MODEL}}
export pid=$$
export DATA=${DATAROOT}/${job}.${pid}
rm -rf $DATA
mkdir -p $DATA
cd $DATA

export jlogfile=${jlogfile:-$COMROOT/logs/jlogfiles/jlogfile.${job}.${pid}}
$SCRIPTSmag/exsendmag2web.sh.ecf   $MODEL   $TABLEDIR/$TABLE   $transfer_file   $yyyymmdd $cycle
if [ "${MAG_TRANSFER}" = "ON" ]; then        # If this is not running in development...
$HOMEmag/scripts/exmag_status_sync.sh.ecf
fi

cd ${BASE_DIR}/tmpnwprd1
rm -rf $DATA

Eg: /ecfmag/ecfnets/scripts/mag/mag_send2web/gfs/ecmag_sync_gfs.ecf
export MODEL=GFS
export TABLE=MAG_sync_table.tbl
export ncorzdm_username=mag;
export HOMEmag=$NWROOT/mag.$mag_ver;
$HOMEmag/jobs/JSNDMAG2WEB

rsync in exsendmag2web.sh.ecf :

/usr/bin/rsync -v -v   --rsh="/usr/bin/ssh -c arcfour128,aes128-cbc,arcfour,aes128-ctr,blowfish-cbc -o macs=hmac-md5,hmac-md5-96 -o compression=yes"     --stats   --timeout=30     --update  /nco/sib/magdev/com/mag/test/gifs/gfs/20151001/12/{allow.cfg,index.php}     mag@ncorzdm:/home/www/nco_mag/test_data/gfs/20151001/12

rsync flag description:
-v : increase verbosity; the more -v flags given, the more information is printed.
--rsh : specify the remote shell command to use (here, ssh with explicit cipher, MAC and compression options).
--stats : print file-transfer statistics.
--timeout : set the I/O timeout in seconds (30 here).
--update : skip any file that already exists on the destination with a modification time newer than the source file.

/nco/sib/magdev/nwtest/mag.trunk_726/scripts/exsendmag2web.sh.ecf    

GFS
/nco/sib/magdev/nwtest/mag.trunk_726/fix/MAG_sync_table.tbl
/nco/sib/magdev/com/mag/test/status/transfer/gfs_2015100112_69_133_153.transfer
20151001
12


How to switch the host that ecflow submits jobs to?
  1. $  ssh -Y   Deyong.Xu@cpecflow1.ncep.noaa.gov
  2. $  ecflowview &
  3. from the ecFlow viewer:
    1. halt   (right-click on MAG, choose halt)
    2. wait for jobs to bleed off, then skip ahead to the "sudo -u mag.dev -i" step on cpecflow1 below; or, instead of waiting, proceed with sub-steps 3 and 4
    3. set to complete (right-click on each, choose set complete):
      1. ecmaintain
      2. rsync_requeue
      3. mag_cleanup
    4. Kill all the jobs on tide / gyre as mag.dev account.
      1. $  sudo  -u mag.dev -i
      2. $  bjobs  -u mag.dev
      3. $  bjobs  -u mag.dev | grep devhigh | awk '{print "bkill", $1}'
      4. (Now, cut and paste the output of the previous command, the "bjobs | grep | awk" line, to actually kill the jobs)

  4. $  sudo -u mag.dev -i  ( sudo to mag.dev account on cpecflow1 server )
  5. $  vi  /home/mag.dev/etc/prodmachinefile    and switch the order of the wcoss host names (the development machine should be the first one)
  6. from the ecFlow viewer:
    1. restart (right-click on MAG, choose restart)
    2. check each task, verify that it is queued, and that the next time to run is after the current time (click on task, click on “i” icon on upper right bar, choose tab “Why”, and examine TASK line)

How to make the hidden transfer file visible and trigger the processing event in ecflow?
-----------------------------------------
 “exmag_processor.pl”    
-----------------------------------------
$transfer_file_hidden="${Config::status_dir}/transfer/.${model}_${date}${cycle}_${job}_${main::transfer_seq}_${random}.transfer";

if (! open ($trans_filehandle, "> $transfer_file_hidden")) {
    logmsg($Config::fatal, "Error opening transfer file: $transfer_file_hidden: $!");
    return 1;
}

 logmsg($Config::info, "Poe job completed successfully, so make the transfer file ${transfer_file} visible:");

 if (! rename    $transfer_file_hidden,    $transfer_file) {
      logmsg($Config::fatal, "Error renaming the hidden transfer file to visible name:  
                  ${transfer_file}");
        return 1;
 }

if (! $main::development_mode) {   # when not running at the command prompt
       # dxu : set the event “processing” to be true in the corresponding model *.ecf .
        `ecflow_client    --event     processing`;  
 }

Event and Trigger
Event:
The event keyword assigns an event to the task currently being defined. Only tasks can have events and they can be considered as an attribute of a task. There can be many events and they are displayed as nodes.

An event has a number and possibly a name. If it is only defined as a number, its name is the text representation of the number without leading zeroes. For example, event 007 can be accessed as event 7 or as event 007. Event numbers must be positive, and event names can contain only letters and digits. The use of letters is optional; the event name can consist simply of digits.

eg:
task x
event 1                 # Can only be referred to as x:1
event 2  prepok  # Can only be referred to as x:prepok
event 3  99           # This is asking for trouble!

real MAG example:
family ghm
task mag_ghm_full_processor
 event   1    processing         # dxu:  1 is useless ;   processing = false
 time 00:02 23:52 00:10

Trigger:
The purpose of an event is to signal partial completion of a task, so that another job which is waiting for this partial completion can be triggered.

eg:
suite x
family f
task t1
event  foo
task t2
trigger  t1:foo == set   # foo = set ; foo=true
                                    #t1:foo is relative path.  


Three steps to combine event and trigger :
1) Assign an event to a task.
task   create_a_file
    event      event_A         # event name : event_A ,    event_A = false .

2) Modify the task to change the event value to true.
 In  task  create_a_file,
        ecflow_client    --event    event_A   # command "ecflow_client  --event" sets  
                                                                 # event_A = true

3) Task copy_a_file checks the value of event_A on task create_a_file and runs once event_A = true.
task copy_a_file
       trigger   create_a_file: event_A          # check event_A’s value.




ecflow system:  cpecflow1.ncep.noaa.gov

/ecfmag/ecfnets/defs

1. Directory structure:
(note : use “admin” for demo. )


admin.def
Purpose:
  1. set ENV variables using keyword edit.
  2. set cron schedule.
suite admin
 repeat day 1
 edit ECF_HOME '/ecfmag/ecfnets/scripts'
 edit ECF_INCLUDE '/ecfmag/ecfnets/scripts'
 edit ECF_OUT '/ecfmag/ecfnets/output'
 edit ECF_TRIES '1'
 edit ECF_FILES '/ecfmag/ecfnets/scripts'
 edit ECF_JOB_CMD '%ECF_JOB% 1> %ECF_JOBOUT% 2>&1'
 family mag
    task mag_requeue
     cron 00:14 23:49 00:10
##    task rsync_requeue
##      time 00:14 23:49 00:10
    task mag_cleanup
     edit ECF_JOB_CMD '/ecfmag/ecfutils/unixsubmit %ECF_JOB% %ECF_JOBOUT% ibmsp'
     edit ECF_PASS 'FREE'
     time 00:14
    task ecmag_maintain
    edit ECF_JOB_CMD '/ecfmag/ecfutils/unixsubmit %ECF_JOB% %ECF_JOBOUT% ibmsp'
    edit ECF_PASS 'FREE'
    time 00:07
    task mag_ckquota
     edit ECF_JOB_CMD '/ecfmag/ecfutils/unixsubmit %ECF_JOB% %ECF_JOBOUT% ibmsp'
     edit ECF_PASS 'FREE'
     cron 00:05 23:49 01:00
 endfamily
endsuite

mag_requeue.ecf
…..
python ${pythondir}/node_status.py | grep complete  >${magout}/mag_list

for model in `cat ${magout}/mag_list | awk -F" " '{print $1}'`
do
  echo
  echo "JOB IS COMPLETE - REQUEUE-ING TASK: $model"
  echo
  ecflow_client --requeue force $model
  export err=$?
  if [ $err -ne '0' ]
  then
    ecflow_client --abort="ecflow_client --force FAILED: $model NOT requeued"
  fi
done

if [ -s ${magout}/mag_list ]
then
  echo
  echo "THE FOLLOWING HAVE BEEN REQUEUED:"
  cat ${magout}/mag_list
  echo
fi

rm -f ${magout}/mag_list

date
if test $err -eq '0'
then
  ecflow_client --complete
else
  ecflow_client --abort
fi

No comments:

Post a Comment