#!/bin/ksh93
# ALTRAN_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# Copyright (C) Altran ACT S.A.S. 2017,2018,2019,2021. All rights reserved.
#
# ALTRAN_PROLOG_END_TAG
#
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# 61haes_r721 src/43haes/usr/sbin/cluster/events/config_too_long.sh 1.17.1.1
#
# Licensed Materials - Property of IBM
#
# COPYRIGHT International Business Machines Corp. 1990,2016
# All Rights Reserved
#
# US Government Users Restricted Rights - Use, duplication or
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
# IBM_PROLOG_END_TAG
# @(#) 7d4c34b 43haes/usr/sbin/cluster/events/config_too_long.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM

#########################################################################
#                                                                       #
#       Name:           config_too_long                                 #
#                                                                       #
#       Description:    PowerHA runs events to recover cluster          #
#                       resources after a failure.  Since these events  #
#                       are implemented as shell scripts, there is no   #
#                       programmatic way for the clstrmgr to determine  #
#                       if a script is hung or if it is just taking a   #
#                       long time.  When a cluster event runs for       #
#                       longer than a predefined time, this event       #
#                       script is called to display a message           #
#                       periodically to alert the customer that they    #
#                       need to check for failures.                     #
#                                                                       #
#       Called by:      cluster manager                                 #
#                                                                       #
#       Calls to:       None                                            #
#                                                                       #
#       Arguments:      NUM_SECS since the event (which may have        #
#                       failed, but may just be lengthy) was called.    #
#                                                                       #
#                       EVENT name and associated arguments             #
#                                                                       #
#       Returns:        Never returns - killed by clstrmgr when         #
#                       event processing resumes.                       #
#                                                                       #
#########################################################################

# Including Availability metrics library file
. /usr/es/lib/ksh93/availability/cl_amlib

#########################################################################
#                                                                       #
#       Name:           sigquit_handler                                 #
#                                                                       #
#       Description:    Signal handler to catch the interrupt from      #
#                       clstrmgr.  Prints the "completed" message,      #
#                       logs the config_too_long end record and exits.  #
#                                                                       #
#       Called by:      shell signal processing (trap on INT)           #
#                                                                       #
#       Arguments:      None.  Reads the globals PROGNAME, EVENT and    #
#                       CLUSTER set by the main body below.             #
#                                                                       #
#       Returns:        Never returns - exit 0 after printing a message #
#                                                                       #
#########################################################################
sigquit_handler ()
{
    typeset PS4_FUNC="sigquit_handler"

    # The substitution arguments are quoted so that an empty or
    # space-containing value cannot shift the positional inserts.
    dspmsg scripts.cat 327 "$PROGNAME: Event '$EVENT' on Cluster $CLUSTER Completed Successfully.\n" "$PROGNAME" "$EVENT" "$CLUSTER"

    # Logging the config_too_long end entry along with timestamp
    amlog_trace $AM_CONFIGTL_END "ConfigTooLong_End|$EVENT"

    exit 0
}

###############################################################################
# Start Main
###############################################################################

PROGNAME=${0##*/}
export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"

# Export every variable assigned while evaluating the cllsparam output.
set -a
eval $(cllsparam -n $LOCALNODENAME)
set +a

if [[ "$VERBOSE_LOGGING" == "high" ]]
then
    set -x
    version='%I%'
fi

#
# Only called by clstrmgr so there is no input checking
#
NUM_SECS=$1
EVENT=$2

#
# CLUSTER gets its real value further down, after 'set -u' is in effect.
# Give it a defined (empty) value now so that sigquit_handler cannot fail
# on an unset parameter if the signal arrives before that assignment.
#
CLUSTER=""

#
# The clstrmgr will kill this script when the event processing is resumed -
# setup a signal handler to catch it
#
trap sigquit_handler INT

# Logging the config_too_long begin entry along with timestamp
amlog_trace $AM_CONFIGTL_BEGIN "ConfigTooLong_Begin|$EVENT"

integer HOUR=3600           # In seconds
integer THRESHOLD=5         # Message period doubles every THRESHOLD messages
integer SLEEP_INTERVAL=1    # Length, in seconds, of each individual sleep

#
# Set the PERIOD to the number of seconds to wait before
# redisplaying message.  Default is 30 seconds.
#
integer PERIOD=30

set -u

#
# Initialize counters
#
integer LOOPCNT=0           # iterations completed; not read elsewhere in this file
integer MESSAGECNT=0        # messages issued since PERIOD was last doubled
CLUSTER=$(cllsclstr -c | tail -1 | cut -d ':' -f2)
integer TIME=$NUM_SECS      # running total of seconds reported in the warning
integer sleep_cntr=0

#
# display list of current processes
#
: ## begin ps -edf
ps -edf
: ## end ps -edf

#
# config too long loops continuously, displaying messages with decreasing
# frequency so as not to overrun the logs, so we turn off tracing on
# purpose.  clstrmgr will kill this process when event processing resumes
#
set +x

while :
do
    MSG=$(dspmsg scripts.cat 326 "WARNING: Cluster $CLUSTER has been running recovery program '$EVENT' for $TIME seconds.\n Please check cluster status." "$CLUSTER" "$EVENT" "$TIME")

    # NOTE(review): $MSG is expanded unquoted, which flattens the message
    # onto a single line.  Left as-is because log consumers may depend on
    # the one-line form - confirm before quoting.
    echo $MSG >/dev/console
    echo $MSG

    # CTL can occur if there is an event script failure or if the scripts
    # are just running slow - highlight failed nodes here since that is
    # where the customer will have to take action.
    typeset NODE="" STRING=""
    clcmd lssrc -ls clstrmgrES | egrep "^NODE|^Current state:" | while read -r STRING
    do
        if [[ $STRING == NODE* ]]
        then
            # Remember which node the following "Current state:" line is for.
            NODE=$(echo $STRING | cut -f2 -d" ")
            continue
        elif [[ $STRING == "Current state: ST_RP_FAILED" ]]
        then
            dspmsg -s 46 scripts.cat 17 "WARNING: node $NODE has encountered a fatal event script error.\nManual intervention is required.\n" "$NODE"
            NODE=""
        fi
    done

    #
    # Back off: after every THRESHOLD messages double the period,
    # capping it at one hour.
    #
    if (( PERIOD < HOUR ))
    then
        (( MESSAGECNT++ ))
        if (( MESSAGECNT % THRESHOLD == 0 ))
        then
            MESSAGECNT=0
            (( PERIOD *= 2 ))
            if (( PERIOD > HOUR ))
            then
                PERIOD=$HOUR
            fi
        fi
    fi

    (( LOOPCNT++ ))
    (( TIME += PERIOD ))

    #
    # Wait PERIOD seconds in SLEEP_INTERVAL-sized slices (presumably so a
    # pending INT is serviced promptly rather than after one long sleep).
    # The counter advances by the slice length so the total wait stays
    # PERIOD seconds whatever SLEEP_INTERVAL is set to.
    #
    sleep_cntr=0
    while (( sleep_cntr < PERIOD ))
    do
        sleep $SLEEP_INTERVAL
        (( sleep_cntr += SLEEP_INTERVAL ))
    done
done

# we expect to be killed by clstrmgr, so there is no formal exit value here