#!/bin/ksh93
#  ALTRAN_PROLOG_BEGIN_TAG
#  This is an automatically generated prolog.
#
#  Copyright (C) Altran ACT S.A.S. 2017,2018,2019,2021.  All rights reserved.
#
#  ALTRAN_PROLOG_END_TAG
#
# IBM_PROLOG_BEGIN_TAG 
# This is an automatically generated prolog. 
#  
# 61haes_r721 src/43haes/usr/sbin/cluster/events/config_too_long.sh 1.17.1.1 
#  
# Licensed Materials - Property of IBM 
#  
# COPYRIGHT International Business Machines Corp. 1990,2016 
# All Rights Reserved 
#  
# US Government Users Restricted Rights - Use, duplication or 
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp. 
#  
# IBM_PROLOG_END_TAG
# @(#)  7d4c34b 43haes/usr/sbin/cluster/events/config_too_long.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM

#########################################################################
#                                                                       #
#       Name:           config_too_long                                 #
#                                                                       #
#       Description:    PowerHA runs events to recover cluster          #
#                       resources after a failure. Since these events   #
#                       are implemented as shell scripts, there is no   #
#                       programmatic way for the clstrmgr to determine  #
#                       if a script is hung or is just taking a long    #
#                       time. When a cluster event runs for longer than #
#                       a predefined time, this event script is called  #
#                       to display a message periodically to alert the  #
#                       customer that they need to check for failures.  #
#                                                                       #
#       Called by:      cluster manager                                 # 
#                                                                       # 
#       Calls to:       None                                            # 
#                                                                       # 
#       Arguments:      NUM_SECS since the event (which may have 	#
#			failed, but may just be lengthy) was called.    #
#                                                                       #
#                       EVENT name and associated arguments             #
#                                                                       #
#                                                                       #
#       Returns:        Never returns - killed by clstrmgr when         #
#                       event processing resumes.                       #
#                                                                       #
#########################################################################


# Including Availability metrics library file
. /usr/es/lib/ksh93/availability/cl_amlib

#########################################################################
#                                                                       #
#       Name:           sigquit_handler                                 #
#                                                                       #
#       Description:    Signal handler to catch the interrupt from      #
#                       clstrmgr                                        #
#                                                                       #
#       Called by:      shell signal processing                         #
#                                                                       #
#       Arguments:      None                                            #
#                                                                       #
#       Returns:        Never returns - exit after printing a message   #
#                                                                       #
#########################################################################
sigquit_handler ()
{
    # Installed by the main script as the INT trap.  Prints the
    # "Completed Successfully" message for $EVENT, writes the
    # config_too_long end entry, and exits 0 - it never returns.
    # Reads the globals PROGNAME, EVENT, CLUSTER and AM_CONFIGTL_END.
    typeset PS4_FUNC="sigquit_handler"

    # Each insert is quoted so that an empty value still occupies its own
    # argument position; unquoted, an empty EVENT would vanish and CLUSTER
    # would be substituted into the event slot of the message.
    dspmsg scripts.cat 327 "$PROGNAME: Event '$EVENT' on Cluster $CLUSTER Completed Successfully.\n" "$PROGNAME" "$EVENT" "$CLUSTER"

    #Logging the config_too_long end entry along with timestamp
    amlog_trace $AM_CONFIGTL_END "ConfigTooLong_End|$EVENT"
    exit 0
}

###############################################################################
# Start Main
###############################################################################

# Script name with any leading directory stripped; used as a message insert.
PROGNAME=${0##*/}
export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"

# Evaluate the output of cllsparam with allexport switched on, so every
# variable it assigns is also exported.
set -a
eval $(cllsparam -n $LOCALNODENAME)
set +a

[[ "$VERBOSE_LOGGING" == "high" ]] && {
    set -x
    version='%I%'
}

#
# Only called by clstrmgr so there is no input checking
#
# The arguments are captured, and CLUSTER is given an empty placeholder,
# before the trap is installed: sigquit_handler expands both $EVENT and
# $CLUSTER, and once "set -u" is in effect below, an unset CLUSTER would
# make the handler fail on the unset parameter instead of logging the
# end entry.
#
NUM_SECS=$1
EVENT=$2
CLUSTER=""

#
# The clstrmgr will kill this script when the event processing is resumed -
# setup a signal handler to catch it
#
trap sigquit_handler INT

#Logging the config_too_long begin entry along with timestamp
amlog_trace $AM_CONFIGTL_BEGIN "ConfigTooLong_Begin|$EVENT"

integer HOUR=3600           #In seconds; upper limit for PERIOD
integer THRESHOLD=5         #Error Time Doubles every THRESHOLD times
integer SLEEP_INTERVAL=1    #Seconds passed to each sleep call in the wait loop

#
# Set the PERIOD to the number of seconds to wait before 
# redisplaying message.  Default is 30 seconds.
#
integer PERIOD=30

# From here on, expanding an unset variable is an error
set -u

#
# Initialize counters
#
integer LOOPCNT=0
integer MESSAGECNT=0
# Cluster name: second ':'-separated field of the last line of cllsclstr -c
CLUSTER=$(cllsclstr -c | tail -1 | cut -d ':' -f2)
# Elapsed-time figure shown in the warning; starts at the value passed in
integer TIME=$NUM_SECS
integer sleep_cntr=0

#
# display list of current processes
#

: ## begin ps -edf
ps -edf
: ## end ps -edf

#
# config too long loops continuously, displaying messages with decreasing
# frequency so as not to overrun the logs, so we turn off tracing on
# purpose.  clstrmgr will kill this process when event processing resumes
#
set +x

# Loop forever: show the warning, report nodes in a failed state, stretch
# the repeat period, then wait.  ":" is a builtin, so it serves as the
# loop test without the subshell that "(:)" used.
while :
do

    # The inserts are quoted so that an empty value (for example a cluster
    # name that could not be determined) keeps its own argument position
    # and does not shift the remaining inserts into the wrong slots.
    MSG=$(dspmsg scripts.cat 326 "WARNING: Cluster $CLUSTER has been running recovery program '$EVENT' for $TIME seconds.\n  Please check cluster status." "$CLUSTER" "$EVENT" "$TIME")
    # $MSG is expanded unquoted, so word splitting collapses any newlines
    # and runs of blanks in the message into single spaces.
    echo $MSG >/dev/console
    echo $MSG

    # CTL can occur if there is an event script failure or if the scripts
    # are just running slow - highlight failed nodes here since that is
    # where the customer will have to take action.
    #
    # Only the "NODE ..." and "Current state: ..." lines of the lssrc
    # output are kept.  A NODE line supplies the node name (its second
    # blank-separated field), which is remembered for the state line
    # that follows it.
    typeset NODE="" STRING=""
    clcmd lssrc -ls clstrmgrES | egrep "^NODE|^Current state:" | 
    while read -r STRING
    do
        if [[ $STRING == NODE* ]]
        then
            NODE=$(echo $STRING | cut -f2 -d" ")
            continue;
        else
            if [[ $STRING == "Current state: ST_RP_FAILED" ]]
            then
                dspmsg -s 46 scripts.cat 17 "WARNING: node $NODE has encountered a fatal event script error.\nManual intervention is required.\n" "$NODE"
                NODE=""
            fi
        fi
    done


    # Back off: after every THRESHOLD messages the period doubles, until
    # it reaches the HOUR ceiling, after which it stays fixed.
    if (( PERIOD < HOUR ))
    then
        (( MESSAGECNT++ ))
        if (( MESSAGECNT % THRESHOLD == 0 ))
        then
            MESSAGECNT=0;
            (( PERIOD *= 2 ))
            if (( PERIOD > HOUR ))
            then
                PERIOD=$HOUR
            fi
        fi
    fi

    (( LOOPCNT++ ))
    # TIME is advanced by the period about to be slept, so the next
    # message reports the time elapsed by then.
    (( TIME += PERIOD ))

    # Wait PERIOD seconds in SLEEP_INTERVAL-second slices rather than one
    # long sleep - presumably so the INT trap is serviced promptly.  The
    # counter advances by the slice length, so the total wait stays at
    # PERIOD seconds whatever SLEEP_INTERVAL is set to.
    sleep_cntr=0
    while (( sleep_cntr < PERIOD ))
    do
        sleep $SLEEP_INTERVAL
        (( sleep_cntr += SLEEP_INTERVAL ))
    done
done

# we expect to be killed by clstrmgr, so there is no formal exit value here
