#!/bin/ksh93
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# 61haes_r714 src/43haes/usr/sbin/cluster/events/rep_disk_notify.sh 1.10
#
# Licensed Materials - Property of IBM
#
# COPYRIGHT International Business Machines Corp. 2011,2014
# All Rights Reserved
#
# US Government Users Restricted Rights - Use, duplication or
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
# IBM_PROLOG_END_TAG
# @(#)20 1.10 src/43haes/usr/sbin/cluster/events/rep_disk_notify.sh, hacmp, 61haes_r714, 1445A_hacmp714 8/20/14 14:21:39
#########################################################################
#                                                                       #
#       Name:           rep_disk_notify                                 #
#                                                                       #
#       Description:    When the cluster has lost access to the         #
#                       repository disk, script is called to display    #
#                       the message periodically.                       #
#                                                                       #
#       Called by:      clevmgrd                                        #
#                                                                       #
#       Calls to:       lscluster                                       #
#                                                                       #
#       Arguments:      $1 - the failed disk name                       #
#                       $2 - node uid (read but not used)               #
#                       $3 - node number of the node which noticed      #
#                            the failure (mapped to a node name         #
#                            through the HACMPnode ODM class)           #
#                       $4 - REP_UP or REP_DOWN                         #
#                                                                       #
#       Returns:        REP_DOWN: exits 0 on signal or when the repos   #
#                       disk state shows as UP, 255 when the state      #
#                       cannot be determined.                           #
#                       REP_UP: exits 0 after printing the message.     #
#                       Any other $4: exits 1.                          #
#                                                                       #
#########################################################################

#########################################################################
#                                                                       #
#       Name:           check_repos_state                               #
#                                                                       #
#       Description:    Check the repository state, then write the      #
#                       matching message to stdout, /dev/console and    #
#                       $LOG.                                           #
#                                                                       #
#       Called by:      repository_down and the signal handler          #
#                                                                       #
#       Calls to:       lscluster (through cl_rsh on $NODENAME)         #
#                                                                       #
#       Globals:        reads  NODENAME REPDISK CLUSTER PROGNAME LOG    #
#                       writes TIMESTR REPQUERY REPSTATE MSG            #
#                                                                       #
#       Returns:        REPOS_STATE_UP if up                            #
#                       REPOS_STATE_DOWN if down                        #
#                       REPOS_STATE_ERROR if unknown / error occurred   #
#                                                                       #
#########################################################################
check_repos_state()
{
    [[ "$VERBOSE_LOGGING" == "high" ]] && set -x
    typeset PS4_FUNC="check_repos_state"

    TIMESTR=$(date)

    #
    #  check the repos state on the event node
    #
    #  NODENAME and REPDISK are quoted so that an empty value cannot
    #  shift the remaining words into the wrong argument position.
    #
    REPQUERY=$(LC_ALL=C cl_rsh "$NODENAME" /usr/sbin/lscluster -d "$REPDISK" 2>&1)
    typeset -i rc=$?
    if (( rc != 0 ))
    then
        #
        #  give one more try
        #
        sleep 5
        TIMESTR=$(date)
        REPQUERY=$(LC_ALL=C cl_rsh "$NODENAME" /usr/sbin/lscluster -d "$REPDISK" 2>&1)
        rc=$?
        if (( rc != 0 ))
        then
            #
            #  tried twice to query - there is no point in
            #  continuing the notification process because we cannot
            #  determine the state
            #
            MSG=$(dspmsg -s 32 scripts.cat 11 "$PROGNAME: $TIMESTR : Unable to determine repository disk state.\n\
Please check CAA cluster status to verify repository disk state.\n" "$PROGNAME" "$TIMESTR")
            rc=$REPOS_STATE_ERROR
        fi
    fi

    if (( rc == 0 ))
    then
        #
        #  lscluster succeeded, now see what it says about this disk:
        #  take the third field of the "State :" line
        #
        REPSTATE=$(echo "$REPQUERY" | grep -w "State :" | awk '{print $3}')
        if [[ -z "$REPSTATE" || ($REPSTATE != "UP" && $REPSTATE != "DOWN") ]]
        then
            #
            #  could not determine the state from the data returned
            #
            MSG=$(dspmsg -s 32 scripts.cat 11 "$PROGNAME: $TIMESTR : Unable to determine repository disk state.\n\
Please check CAA cluster status to verify repository disk state.\n" "$PROGNAME" "$TIMESTR")
            rc=$REPOS_STATE_ERROR

        elif [[ $REPSTATE == "DOWN" ]]
        then
            #
            #  repos down
            #
            MSG=$(dspmsg -s 32 scripts.cat 9 "ERROR: $PROGNAME : $TIMESTR : Node $NODENAME on Cluster $CLUSTER has lost access to repository disk $REPDISK.\nPlease recover from this error or replace the repository disk using smitty." \
                "$PROGNAME" "$TIMESTR" "$NODENAME" "$CLUSTER" "$REPDISK")
            rc=$REPOS_STATE_DOWN

        else
            #
            #  if here, repository is up
            #
            MSG=$(dspmsg -s 32 scripts.cat 10 "$PROGNAME: $TIMESTR : Access to repository disk has been restored on Node $NODENAME" "$PROGNAME" "$TIMESTR" "$NODENAME")
            rc=$REPOS_STATE_UP
        fi
    fi

    #
    #  print the message and return the state to the caller
    #
    #  $MSG is left unquoted, exactly as before, so a multi-line message
    #  is still written as a single line.
    #
    echo $MSG
    echo $MSG >>/dev/console
    echo $MSG >>"$LOG"

    return $rc
}

#########################################################################
#                                                                       #
#       Name:           sigquit_handler                                 #
#                                                                       #
#       Description:    signal handler; registered for INT in the       #
#                       main section below. Reports the repository      #
#                       state one last time and terminates the script.  #
#                                                                       #
#       Calls to:       check_repos_state                               #
#                                                                       #
#       Returns:        does not return - always exits 0                #
#                                                                       #
#########################################################################
sigquit_handler ()
{
    [[ "$VERBOSE_LOGGING" == "high" ]] && set -x
    typeset PS4_FUNC="sigquit_handler"

    #
    #  check one last time, ignoring the return
    #
    check_repos_state

    #
    #  even if we get here, we are a signal handler, and we do have to
    #  exit
    #
    exit 0
}

#########################################################################
#                                                                       #
#       Name:           repository_down                                 #
#                                                                       #
#       Description:    Called for repository down notification, this   #
#                       function loops, checking the repos state, until #
#                       the repos is found to be up                     #
#                                                                       #
#       Called by:      main                                            #
#                                                                       #
#       Calls to:       check_repos_state                               #
#                                                                       #
#       Returns:        does not return - exits 0 when the repos is     #
#                       up, 255 when the state cannot be determined     #
#                                                                       #
#########################################################################
repository_down()
{
    #
    #  sleep 30 seconds at a time to begin with; each time the loop
    #  count reaches a multiple of the threshold, both the threshold
    #  and the sleep time grow by half, the sleep time to a maximum
    #  of one hour
    #
    typeset -i sleep_time=30
    typeset -i threshold=5
    typeset -i max_sleep=3600
    typeset -i loop_cnt
    typeset -i state

    #
    #  infinite loop - exit on repos up, error, or signal
    #
    for (( loop_cnt=1 ; ; loop_cnt++ ))
    do
        #
        #  check the state
        #
        check_repos_state
        state=$?

        if (( state == REPOS_STATE_ERROR ))
        then
            #  fatal error, cannot continue
            #  (255 is the status the former "exit -1" produced)
            exit 255

        elif (( state == REPOS_STATE_UP ))
        then
            #  repos up - all done
            exit 0
        fi

        #
        #  if here, state is REPOS_STATE_DOWN - continue to loop
        #

        #
        #  Use a slowly increasing wait time
        #
        if (( loop_cnt % threshold == 0 ))
        then
            #
            #  adjust threshold
            #
            threshold=$(( threshold + threshold/2 ))

            #
            #  Every $threshold cycles through the notification loop,
            #  the sleep time is increased by half, up to a maximum of
            #  $max_sleep seconds
            #
            sleep_time=$(( sleep_time + sleep_time/2 ))
            sleep_time=$(( sleep_time < max_sleep ? sleep_time : max_sleep ))
        fi

        #
        #  do the actual sleep
        #
        sleep $sleep_time
    done
}

#########################################################################
#                                                                       #
#       Name:           repository_up                                   #
#                                                                       #
#       Description:    Called for repository up notifications, this    #
#                       function prints a message to stdout,            #
#                       /dev/console and $LOG, and returns.             #
#                                                                       #
#       Called by:      main                                            #
#                                                                       #
#       Returns:        0                                               #
#                                                                       #
#########################################################################
repository_up()
{
    TIMESTR=$(date)
    MSG=$(dspmsg -s 32 scripts.cat 10 "$PROGNAME: $TIMESTR : Access to repository disk has been restored on Node $NODENAME" "$PROGNAME" "$TIMESTR" "$NODENAME")

    #  $MSG is left unquoted, exactly as before, so the output format
    #  does not change.
    echo $MSG
    echo $MSG >>/dev/console
    echo $MSG >>"$LOG"

    return 0
}

#########################################################################
#
#   Main Starts Here
#
#########################################################################

VERBOSE_LOGGING=${VERBOSE_LOGGING:-"low"}
[[ "$VERBOSE_LOGGING" == "high" ]] && set -x
[[ "$VERBOSE_LOGGING" == "high" ]] && version='1.10'

#
#  register the signal handler
#
trap sigquit_handler INT

PROGNAME=${0##*/}
export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"
export LOCAL_NODE=$(/usr/es/sbin/cluster/utilities/get_local_nodename)

#
#  positional arguments are read before "set -u" below, so a missing
#  argument yields an empty value here instead of aborting the script
#
REPDISK=$1
NODEUID=$2      # not used
NODENUM=$3
UPORDOWN=$4
export REPDISK NODENUM UPORDOWN

#
#  return codes for check_repos_state
#
export REPOS_STATE_UP=0
export REPOS_STATE_DOWN=1
export REPOS_STATE_ERROR=2

set -u

CLUSTER=$(cllsclstr -c | grep -v "#cid:cname" | cut -d ':' -f2)
export CLUSTER

#
#  find log directory - fall back to /var/hacmp/log when the ODM has
#  no entry for hacmp.out
#
LOG_DIRECTORY=$(clodmget -n -q "name=hacmp.out" -f value HACMPlogs)
if [[ -z $LOG_DIRECTORY ]]
then
    LOG=/var/hacmp/log/hacmp.out
else
    LOG=$LOG_DIRECTORY/hacmp.out
fi
export LOG

#
#  convert node number to name
#
NODENAME=$(clodmget -q "node_id = $NODENUM and object=COMMUNICATION_PATH" -f name -n HACMPnode )
export NODENAME

#
#  run the appropriate subroutine
#
case "$UPORDOWN" in
    REP_UP )
        repository_up
        exit $?
        ;;

    REP_DOWN )
        repository_down
        exit $?
        ;;

    * )
        echo "$PROGNAME: An internal error occured."
        dspmsg -s 56 cluster_hlp.msg 54 "Please report this error and the following information to IBM support."
        echo "$*"
        ;;
esac

#  reached only when $4 was neither REP_UP nor REP_DOWN
exit 1;