#!/bin/ksh93
# IBM_PROLOG_BEGIN_TAG 
# This is an automatically generated prolog. 
#  
# 61haes_r714 src/43haes/usr/sbin/cluster/events/rep_disk_notify.sh 1.10 
#  
# Licensed Materials - Property of IBM 
#  
# COPYRIGHT International Business Machines Corp. 2011,2014 
# All Rights Reserved 
#  
# US Government Users Restricted Rights - Use, duplication or 
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp. 
#  
# IBM_PROLOG_END_TAG 
# @(#)20        1.10  src/43haes/usr/sbin/cluster/events/rep_disk_notify.sh, hacmp, 61haes_r714, 1445A_hacmp714 8/20/14 14:21:39


#########################################################################
#                                                                       #
#       Name:           rep_disk_notify                                 #
#                                                                       #
#       Description:    When the cluster has lost access to the         #
#                       repository disk, this script is called to       #
#                       display a message periodically until access is  #
#                       restored.                                       #
#                                                                       #
#       Called by:      clevmgrd					#
#                                                                       # 
#       Calls to:       lscluster                                       # 
#                                                                       # 
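#       Arguments:      $1 - the failed repository disk name            #
#                       $2 - node UID (not used)                        #
#                       $3 - number (node_id) of the node which noticed #
#                            the failure                                #
#                       $4 - REP_UP or REP_DOWN                         #
#                                                                       #
#       Returns:        exits on signal, when repos disk state shows    #
#                       as UP, or when the state cannot be determined   #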
#                                                                       #
#########################################################################

#########################################################################
#                                                                       #
#       Name:           check_repos_state                               #
#                                                                       #
#       Description:    Query the repository disk state on the event    #
#                       node, report it on stdout, the console and the  #
#                       log file, and return the state                  #
#                                                                       #
#       Called by:      repository_down() and sigquit_handler()         #
#                                                                       #
#       Calls to:       cl_rsh (to run lscluster), dspmsg               #
#                                                                       #
#       Globals read:   VERBOSE_LOGGING, PROGNAME, NODENAME, REPDISK,   #
#                       CLUSTER, LOG, REPOS_STATE_UP,                   #
#                       REPOS_STATE_DOWN, REPOS_STATE_ERROR             #
#                                                                       #
#       Globals set:    TIMESTR, REPQUERY, REPSTATE, MSG                #
#                                                                       #
#       Returns:        REPOS_STATE_UP if up                            #
#                       REPOS_STATE_DOWN if down                        #
#                       REPOS_STATE_ERROR if unknown / error occurred   #
#                                                                       #
#########################################################################
check_repos_state()
{

    [[ "$VERBOSE_LOGGING" == "high" ]] && set -x
    typeset PS4_FUNC="check_repos_state"

    typeset -i rc=1
    typeset -i attempt

    #
    # query the repos state on the event node; a failed query gets
    # exactly one more try after a 5 second pause
    #
    for (( attempt = 1 ; attempt <= 2 && rc != 0 ; attempt++ ))
    do
        (( attempt > 1 )) && sleep 5
        TIMESTR=$(date)
        REPQUERY=$(LC_ALL=C cl_rsh "$NODENAME" /usr/sbin/lscluster -d "$REPDISK" 2>&1)
        rc=$?
    done

    #
    # if lscluster succeeded, pull the third field of the "State :" line
    # out of what it said; otherwise leave the state empty so that it is
    # reported as undeterminable below
    #
    REPSTATE=""
    if (( rc == 0 ))
    then
        REPSTATE=$(printf '%s\n' "$REPQUERY" | grep -w "State :" | awk '{print $3}')
    fi

    #
    # build the message for the state found.  Every insert is quoted so
    # that an empty value (for example an unknown cluster name) keeps
    # its position instead of shifting the following inserts.
    #
    case "$REPSTATE" in
        UP )
            #
            # repository is up
            #
            MSG=$(dspmsg -s 32 scripts.cat 10 "$PROGNAME: $TIMESTR : Access to repository disk has been restored on Node $NODENAME" "$PROGNAME" "$TIMESTR" "$NODENAME")
            rc=$REPOS_STATE_UP
            ;;

        DOWN )
            #
            # repos down
            #
            MSG=$(dspmsg -s 32 scripts.cat 9 "ERROR: $PROGNAME : $TIMESTR : Node $NODENAME on Cluster $CLUSTER has lost access to repository disk $REPDISK.\nPlease recover from this error or replace the repository disk using smitty." "$PROGNAME" "$TIMESTR" "$NODENAME" "$CLUSTER" "$REPDISK")
            rc=$REPOS_STATE_DOWN
            ;;

        * )
            #
            # either both queries failed or the data returned did not
            # contain a single UP / DOWN state - there is no point in
            # continuing the notification process because we cannot
            # determine the state
            #
            MSG=$(dspmsg -s 32 scripts.cat 11 "$PROGNAME: $TIMESTR : Unable to determine repository disk state.\n\
Please check CAA cluster status to verify repository disk state.\n" "$PROGNAME" "$TIMESTR")
            rc=$REPOS_STATE_ERROR
            ;;
    esac

    #
    # print the message to stdout, the console and the log.  The message
    # is written quoted so that its line breaks and spacing are kept and
    # no character in it is treated as a file name pattern.
    #
    printf '%s\n' "$MSG"
    printf '%s\n' "$MSG" >>/dev/console
    printf '%s\n' "$MSG" >>"$LOG"

    return $rc
}

#########################################################################
#                                                                       #
#       Name:           sigquit_handler                                 #
#                                                                       #
#       Description:    Signal handler: reports the repository state    #
#                       one final time and then ends the script         #
#                                                                       #
#       Called by:      the trap registered in the main section         #
#                                                                       #
#       Calls to:       check_repos_state                               #
#                                                                       #
#       Returns:        does not return - always exits with 0           #
#                                                                       #
#########################################################################
sigquit_handler ()
{
    if [[ "$VERBOSE_LOGGING" == "high" ]]
    then
        set -x
    fi
    typeset PS4_FUNC="sigquit_handler"

    #
    # one last state report; what it returns is deliberately not
    # examined
    #
    check_repos_state

    #
    # a signal brought us here, so the script ends now whatever the
    # check reported
    #
    exit 0
}

#########################################################################
#                                                                       #
#       Name:           repository_down                                 #
#                                                                       #
#       Description:    Called for repository down notification, this   #
#                       function loops, checking (and so reporting)     #
#                       the repos state, pausing between checks, until  #
#                       the repos is found to be up or its state can    #
#                       no longer be determined                         #
#                                                                       #
#       Called by:      main()                                          #
#                                                                       #
#       Calls to:       check_repos_state, sleep                        #
#                                                                       #
#       Returns:        does not return - exits 0 when the repos is up  #
#                       and exits -1 when the state is unknown          #
#                                                                       #
#########################################################################
repository_down()
{

    #
    # The pause between checks starts at 30 seconds.  Whenever the pass
    # count is a multiple of grow_at, both grow_at and the pause are
    # increased by half, the pause never exceeding one hour.
    #
    typeset -i nap=30
    typeset -i grow_at=5
    typeset -i nap_cap=3600
    typeset -i pass=0
    typeset -i verdict

    #
    # endless loop - left only by exiting (repos up, error or signal)
    #
    while :
    do
        (( pass += 1 ))

        check_repos_state
        verdict=$?

        # state could not be determined - fatal, cannot continue
        (( verdict == REPOS_STATE_ERROR )) && exit -1

        # repos up - all done
        (( verdict == REPOS_STATE_UP )) && exit 0

        #
        # still here, so the repos is down: lengthen the pause if this
        # pass is due for it, then wait before checking again
        #
        if (( pass % grow_at == 0 ))
        then
            (( grow_at += grow_at / 2 ))
            (( nap += nap / 2 ))
            if (( nap > nap_cap ))
            then
                (( nap = nap_cap ))
            fi
        fi

        sleep $nap

    done

}

#########################################################################
#                                                                       #
#       Name:           repository_up                                   #
#                                                                       #
#       Description:    Called for repository up notifications, this    #
#                       function prints the "access restored" message   #
#                       to stdout, the console and the log file, and    #
#                       returns.                                        #
#                                                                       #
#       Called by:      main()                                          #
#                                                                       #
#       Calls to:       dspmsg                                          #
#                                                                       #
#       Globals read:   PROGNAME, NODENAME, LOG                         #
#       Globals set:    TIMESTR, MSG                                    #
#                                                                       #
#       Returns:        0                                               #
#                                                                       #
#########################################################################
repository_up()
{

    TIMESTR=$(date)
    MSG=$(dspmsg -s 32 scripts.cat 10 "$PROGNAME: $TIMESTR : Access to repository disk has been restored on Node $NODENAME" "$PROGNAME" "$TIMESTR" "$NODENAME")

    #
    # write the message quoted so that its spacing is kept (date pads
    # single digit days with an extra blank) and no character in it is
    # treated as a file name pattern
    #
    printf '%s\n' "$MSG"
    printf '%s\n' "$MSG" >>/dev/console
    printf '%s\n' "$MSG" >>"$LOG"

    return 0
}


#########################################################################
#
:   Main Starts Here
#
#########################################################################

#
# logging defaults to "low"; "high" switches on tracing and puts the
# script version into the trace
#
: "${VERBOSE_LOGGING:=low}"
if [[ "$VERBOSE_LOGGING" == "high" ]]
then
    set -x
    version='1.10'
fi

#
# on SIGINT, report the state one last time and exit
#
trap sigquit_handler INT

PROGNAME=${0##*/}
export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"
export LOCAL_NODE=$(/usr/es/sbin/cluster/utilities/get_local_nodename)

#
# positional arguments - read before "set -u" so that a missing
# argument yields an empty value rather than an abort
#
export REPDISK=$1       # name of the failed repository disk
NODEUID=$2              # not used
export NODENUM=$3       # node_id of the node which noticed the failure
export UPORDOWN=$4      # REP_UP or REP_DOWN

#
# return codes for check_repos_state
#
export REPOS_STATE_UP=0 REPOS_STATE_DOWN=1 REPOS_STATE_ERROR=2

set -u

#
# cluster name: second ':' separated field of the cllsclstr output,
# header line excluded
#
export CLUSTER=$(cllsclstr -c | grep -v "#cid:cname" | cut -d ':' -f2)

#
# hacmp.out lives in the directory recorded in HACMPlogs, or in
# /var/hacmp/log when none is recorded
#
LOG_DIRECTORY=$(clodmget -n -q "name=hacmp.out"  -f value HACMPlogs)
export LOG=${LOG_DIRECTORY:-/var/hacmp/log}/hacmp.out

#
# convert node number to name
#
export NODENAME=$(clodmget -q "node_id = $NODENUM and object=COMMUNICATION_PATH" -f name -n HACMPnode )

#
# run the subroutine matching the notification type and exit with its
# return code
#
if [[ $UPORDOWN == "REP_UP" ]]
then
    repository_up
    exit $?
elif [[ $UPORDOWN == "REP_DOWN" ]]
then
    repository_down
    exit $?
fi

#
# neither REP_UP nor REP_DOWN was passed - report the arguments we were
# given and fail
#
echo "$PROGNAME: An internal error occured."
dspmsg -s 56 cluster_hlp.msg 54 "Please report this error and the following information to IBM support."
echo "$*"
exit 1;