#!/bin/ksh93 # ALTRAN_PROLOG_BEGIN_TAG # This is an automatically generated prolog. # # Copyright (C) Altran ACT S.A.S. 2017,2018,2020,2021. All rights reserved. # # ALTRAN_PROLOG_END_TAG # # IBM_PROLOG_BEGIN_TAG # This is an automatically generated prolog. # # 61haes_r714 src/43haes/usr/sbin/cluster/utilities/clstop.sh 1.2.17.7 # # Licensed Materials - Property of IBM # # COPYRIGHT International Business Machines Corp. 1990,2014 # All Rights Reserved # # US Government Users Restricted Rights - Use, duplication or # disclosure restricted by GSA ADP Schedule Contract with IBM Corp. # # IBM_PROLOG_END_TAG # @(#) 20c8e9d 43haes/usr/sbin/cluster/utilities/clstop.sh, 726, 2147A_aha726, Oct 30 2021 04:20 PM ############################################################################### # COMPONENT_NAME: UTILITIES # # FUNCTIONS: usage # # Name: clstop # # This program stops cluster daemons by using the SRC facility, and places # options in the HACMPdaemons ODM. # # Arguments: # -f forced stop # -g graceful dwon, no takeover by other node. # -g[r] graceful down, release resources. # -s Don't broadcast the shutdown message via /usr/sbin/wall # -y Don't ask for confirmation of process-shutdown # -N stop now # -R stop on subsequent system restart (remove inittab entry) # -B stop now and on subsequent system restart # -S called from rc.shutdown ( system shutdown ) # # Returns: 0 - program stopped # 1 - failed to stop the cluster daemons # # Environment: # ############################################################################### ############################################################################### # # Name: usage # # This routine displays the usage message for the clstop utility then exits. # # Arguments: none # Usage: usage # Returns: 1 - exit with error # Environment: # ############################################################################### usage() { typeset PS4_FUNC="usage" cl_log 223 \ "Usage: ${PROGNAME} -f | -g[r] [ -s ] [ -y ] [ -N | -R | -B ]\n" ${PROGNAME} exit 1 } ############################################################################### # # Name: VerifyEventError # # This function verifies whether any of the cluster nodes is in event # error (other than the local node), and, if so, exit the utility, thereby # preventing a node being forced down while another node is in event error. # # Arguments: none # Usage: VerifyEventError # Returns: none # Environment: # ############################################################################### VerifyEventError() { typeset PS4_FUNC="VerifyEventError" [[ $VERBOSE_LOGGING == "high" ]] && set -x # : check if local node is in event error. if so, allow forced down. # LC_ALL=C lssrc -ls $CLSTRMGR_DAEMON |grep "Current state" | read skip CURRENT_STATE if [[ $CURRENT_STATE == "ST_RP_FAILED" ]] then dspmsg scripts.cat 9207 "\n$PROGNAME: Node ($LOCALNODENAME) had Event Script Failure previously.\n\ Stopping cluster services with \"Unmanage Resource Groups\" option may\n\ put some resource groups into unmanaged state, after which event processing\n\ will continue.\n" $PROGNAME $LOCALNODENAME return 0 fi CM_ACTIVE_NODES=$(/usr/es/sbin/cluster/utilities/clgetactivenodes -n $LOCALNODENAME -o $DCD 2>/dev/null) if [[ -z $CM_ACTIVE_NODES ]] then # : try the ACD, just in case the DCD was blown away, but the ACD should still be fine : because the local node is up # CM_ACTIVE_NODES=$(/usr/es/sbin/cluster/utilities/clgetactivenodes -n $LOCALNODENAME -o $ACD 2>/dev/null) if [[ -z $CM_ACTIVE_NODES ]]; then # : we could not determine an active node list. Let forced-down continue # return 0 fi fi typeset -i HasOutputHeading=0 typeset -i SomeNodesInRPFailedState=0 for node in $CM_ACTIVE_NODES; do LC_ALL=C /usr/es/sbin/cluster/utilities/cl_rsh $node lssrc -ls $CLSTRMGR_DAEMON |grep "Current state" | read skip CURRENT_STATE if [[ $CURRENT_STATE == "ST_RP_FAILED" ]] then if (( HasOutputHeading == 0 )) then HasOutputHeading=1 dspmsg scripts.cat 9205 "\n$PROGNAME: The following node(s) had Event Script Failure:\n" $PROGNAME 1>&2 fi echo "\t$node" 1>&2 SomeNodesInRPFailedState=1 fi done if (( SomeNodesInRPFailedState == 1 )) then dspmsg scripts.cat 9206 "$PROGNAME: The node(s) listed above should be recovered from Event Script\n\ failure before cluster services on node ($LOCALNODENAME) can be brought down\n\ with \"Unmanage Resource Groups\" option.\n" $PROGNAME $LOCALNODENAME 1>&2 # : exit the script from here, we do not continue with the forced down : request. # exit 1 fi return 0 } ############################################################################### # # Name: shutdown_rsct # # This function waits for cluter and resource managers to go inoperative and # then shuts down event management, topology and group services. # # Arguments: none # Usage: shutdown_rsct # Returns: none # Environment: # ############################################################################### shutdown_rsct() { typeset PS4_FUNC="shutdown_rsct" [[ $VERBOSE_LOGGING == "high" ]] && set -x if (( $IN_MIGRATION == 0 )); then # : We dont stop RSCT services in Cayenne. # return 0 fi # : trap SIGHUP so that script will : not be killed before it is done # trap "" 1 LOGFILE=/var/hacmp/log/shutdown_rsct.out # LOGFILE is where all stdout goes. integer retries=0 SMUX_FORCED=64 # exit code when clstrmgr has been forced down SMUX_SHUTDOWN=512 # exit code when clstrmgr stopped by aix shutdown RD_EXIT_STATUS_PATH="/usr/es/sbin/cluster/.clstrmgr.exit" echo "shutdown_rsct() Called on $(date)" >>$LOGFILE echo "waiting for clstrmgr to become inoperative...">>$LOGFILE # : lssrc is known to sometimes have temporary problems which give non-zero : return codes. When this happens here, STATE is left blank, which we then : interpret to be "clstrmgr is down" and then stop the rsct daemons, which : essentially kills the node. So, we check clstrmgrES with clcheck_server # /usr/es/sbin/cluster/utilities/clcheck_server $CLSTRMGR_DAEMON lssrc_rc=$? if (( $lssrc_rc != 1 )) then # : then might as well continue trying to shutdown rsct properly, after logging : at least we ensured it was not a temporary problem that was quickly cleared # echo "$PROGNAME has encountered successive lssrc problems. Attempting to continue." >>$LOGFILE fi TEMP_STATE=$(LC_ALL=C lssrc -ls $CLSTRMGR_DAEMON) STATE=$(printf "$TEMP_STATE" | awk -F: '($1 == "Current state") { print $2 }' | sed 's/ //g') echo "$(date) ... clstrmgr state is ($STATE)" >>$LOGFILE # : If \$STATE is any value other than ST_INIT or NOT_CONFIGURED, including : null in the case where lssrc failed, then we want to loop and try again. # while [[ $STATE != "ST_INIT" && $STATE != "NOT_CONFIGURED" ]] do sleep 5 /usr/es/sbin/cluster/utilities/clcheck_server $CLSTRMGR_DAEMON lssrc_rc=$? if (( $lssrc_rc != 1 )) then # : Check whether the clstrmgr is in the process of being exited. The exit status will be written to a file : This status will be read and the fill will be removed inside clexit.rc. A rare timing window exists where we compete with : SRC->clexit.rc and halt the node prematurely. Hence We will parse this file and fetch the exit code of clstrmgr. : If its a normal termination of clstrmgr, we will skip node halt. # if [[ -r $RD_EXIT_STATUS_PATH ]] then CLSTRMGR_EXIT=$(cat $RD_EXIT_STATUS_PATH) if [[ $CLSTRMGR_EXIT == 0 || $CLSTRMGR_EXIT == $SMUX_FORCED || $CLSTRMGR_EXIT == $SMUX_SHUTDOWN ]] then # clcheck_server indicates the clstrmgr is inactive, however $RD_EXIT_STATUS_PATH still exists, # indicating the clstrmgr is in the process of exiting with code $CLSTRMGR_EXIT. Hence, continue # stopping RSCT services and do not call clexit.rc break fi fi # : There is a chance that we reached this point because the \$RD_EXIT_STATUS_PATH status file : has been removed, and the clstrmgr has restarted, but has not yet entered the loop where : it checks for incoming SRC messages. If \$STATE is null, but clstrmgr is running, it : is virtually guaranteed we have hit this problem. But the window is so small that : simply trying one more time will also guarantee the clstrmgr will then be accepting : SRC requests. # ps -eo 'args' | grep -vw grep | grep -qw clstrmgr ps_rc=$? if [[ -z $STATE && $ps_rc == 0 ]] then continue fi # : if we got here, we have a serious lssrc issue, and subsequent code to bring : rsct down cleanly will not succeed cleanly. So, issue clexit.rc directly. # echo "$PROGNAME has encountered successive lssrc problems. Halting the node." >>$LOGFILE clexit.rc $CLSTRMGR_DAEMON fi TEMP_STATE=$(LC_ALL=C lssrc -ls $CLSTRMGR_DAEMON) STATE=$(printf "$TEMP_STATE" | awk -F: '($1 == "Current state") { print $2 }' | sed 's/ //g') echo "$(date) ... State is ($STATE)" >>$LOGFILE done echo "clstrmgr inoperative at $(date)">>$LOGFILE # : make sure our log gets written # sync; sync; sync LPPNAME="HACMP for AIX" # : rc.cluster will have created a link from /etc/rc.shutdown to the hacmp : version of rc.shutdown. It may also have renamed the users version of : /etc/rc.shutdown. Undo either of those changes here # if [[ -f /etc/rc.shutdown ]]; then # file exists grep -q "${LPPNAME}" /etc/rc.shutdown # look for our pattern rc=$? if [[ -L /etc/rc.shutdown ]] && (( $rc == 0 )); then # its the hacmp version echo "Removing rc.shutdown link">>$LOGFILE rm -f /etc/rc.shutdown # remove the _link_ if [[ -f /etc/rc.shutdown.hacmp.orig ]] # user version exists then cp -f /etc/rc.shutdown.hacmp.orig /etc/rc.shutdown fi fi fi ############################################################################ : If the group services based CLVM daemon is active, shut it down now : before group services itself is shut down ############################################################################ if [[ $STOP_GSCLVMD == "true" ]] then # : Check and see if there are any enhanced concurrent volume groups : hanging around in passive mode. If so, get rid of them before : shuting down gsclvmd. # INACTIVE_VGS=$(lsvg | grep -v -E $(lsvg -o | paste -s -d'|' - ) ) for vg in $INACTIVE_VGS ; do if LC_ALL=C lsvg $vg 2>/dev/null | grep -i -q 'passive-only' then # : Reset any read only fence height prior to vary off # cl_set_vg_fence_height -c $vg rw RC=$? if (( $RC != 0 )) then # : cl_set_vg_fence_height -c $vg rw return code is $RC : Log any error, but continue. If this is a real problem, the varyoffvg will fail # rw=$(dspmsg -s 103 cspoc.cat 350 'read/write' | cut -f2 -d,) cl_log 10511 "$PROGNAME: Volume group $vg fence height could not be set to read/write" $PROGNAME $vg $rw fi # : 'lsvg ' will show if a volume group is varied : on in passive mode. Any such are varied off # varyoffvg $vg fi done rm -f /usr/es/sbin/cluster/.gsclvmd echo "$(date) stopping gsclvmd..." >> $LOGFILE stopsrc -s gsclvmd echo "$(date) stopped gsclvmd..." >> $LOGFILE fi echo "$(date) stopping clinfoES...">>$LOGFILE stopsrc -c -s clinfoES echo "$(date) stopped clinfoES">>$LOGFILE echo "$(date) stopping emsvcs...">>$LOGFILE if [[ $EMSVCS_DAEMON == "emsvcsAS" ]] then stopsrc -cs $EMSVCS_DAEMON fi echo "$(date) stopped emsvcs">>$LOGFILE echo "$(date) stopping grpsvcs...">>$LOGFILE if [[ $GRPSVCS_DAEMON == "grpsvcsAS" ]] then stopsrc -cs $GRPSVCS_DAEMON fi echo "$(date) stopped grpsvcs">>$LOGFILE echo "$(date) stopping topsvcs..." >>$LOGFILE if [[ $TOPSVCS_DAEMON == "topsvcsAS" ]] then stopsrc -cs $TOPSVCS_DAEMON fi echo "$(date) stopped topsvcs..." >>$LOGFILE ################################################################### : Manage the aliases for heartbeat addresses ################################################################### NETWORKS=$(cllsnw -Sc | cut -d":" -f1) for NETWORK in $NETWORKS do if cllsnw -Scn $NETWORK | cut -d":" -f4 | grep -q hb_over_alias then VERBOSE_LOGGING=low cl_hb_alias_network $NETWORK force >>$LOGFILE 2>&1 fi done ################################################################### : Remove the registry entries for IBM.HacmpRgRm so it is unable : to report stale state information when cluster services is down ################################################################### echo "$(date) Removing IBM.HacmpRgRm registry entries..." >>$LOGFILE echo "$(date) The following registry entries will be removed:" >>$LOGFILE REGFILES=$(ls /var/ct/IW/registry/local_tree/*Hacmp*) echo "$REGFILES" >>$LOGFILE rm -f /var/ct/IW/registry/local_tree/*Hacmp* echo "$(date) IBM.HacmpRgRm registry entries removed..." >>$LOGFILE echo "$(date) shutdown_rsct() Exiting" >>$LOGFILE } ############################################################################### # Main ############################################################################### PROGNAME=${0##*/} RSCT_BIN=/usr/sbin/rsct/bin export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)":${RSCT_BIN} eval export $(cllsparam -x) if [[ $VERBOSE_LOGGING == "high" ]] then set -x version='1.2.17.7' fi HA_DIR="$(cl_get_path)" HOSTNAME=$(hostname) STOP_GSCLVMD="false" ACTIVE_ODM_IS_INVALID=0 CURRENT_STATE="" ODMDIR=/usr/${HA_DIR}/sbin/cluster/etc/objrepos/active DCD="/etc/${HA_DIR}/objrepos" ACD="/usr/${HA_DIR}/sbin/cluster/etc/objrepos/active" /usr/es/sbin/cluster/utilities/cl_migcheck HAESTOHAES IN_MIGRATION=$? CL_MIGCHECK_FILE="/usr/${HA_DIR}/sbin/cluster/utilities/cl_migcheck" CLUSTER_START_FILE="/usr/es/sbin/cluster/etc/.clusterStarted" TOPSVCS_DAEMON="" GRPSVCS_DAEMON="" EMSVCS_DAEMON="" if [[ -f /usr/es/sbin/cluster/clstrmgr ]] then # : Normal daemon names # CLSTRMGR_DAEMON="clstrmgrES" else # : Application Availability # CLSTRMGR_DAEMON="clstrmgrAS" TOPSVCS_DAEMON="topsvcsAS" GRPSVCS_DAEMON="grpsvcsAS" EMSVCS_DAEMON="emsvcsAS" fi # : Check the cluster configuration # CLUSTER_CONFIGURED=false if [[ -n $(ODMDIR=$DCD clodmget -f id -n HACMPcluster) ]] then CLUSTER_CONFIGURED=true elif [[ -n $(ODMDIR=$ACD clodmget -f id -n HACMPcluster) ]] then CLUSTER_CONFIGURED=true fi # : Save local node name. # LOCALNODENAME=$(ODMDIR=$DCD get_local_nodename 2>/dev/null) if [[ -z $LOCALNODENAME ]] then # : if could not determine nodename from DCD, try ACD: # LOCALNODENAME=$(ODMDIR=$ACD get_local_nodename 2>/dev/null) if [[ -z $LOCALNODENAME ]] then if [[ $CLUSTER_CONFIGURED == "true" ]] then dspmsg scripts.cat 2961 "\n$PROGNAME: Not able to discover the name of the local node. Please check the\n\ cluster configuration.\n" $PROGNAME 1>&2 fi exit 1 fi fi # : see if the cluster manager is operational # odmget HACMPcluster > /dev/null 2>&1 ACTIVE_ODM_IS_INVALID=$? if (( $ACTIVE_ODM_IS_INVALID != 0 )) then export ODMDIR=$DCD fi RUN_CLRMUPDATE=FALSE # : Run the clRMupdate only when the clstrmgr is active # CLSTRMGR_ACTIVE=$(LC_ALL=C lssrc -s $CLSTRMGR_DAEMON | grep $CLSTRMGR_DAEMON | grep -w active) if [[ -n $CLSTRMGR_ACTIVE ]] then CURRENT_STATE=$(LC_ALL=C lssrc -ls $CLSTRMGR_DAEMON | while read line do if [[ $line == "Current state: "* ]] then echo "${line##*: }" fi done) if [[ $CURRENT_STATE != "ST_INIT" && $CURRENT_STATE != "NOT_CONFIGURED" ]] then RUN_CLRMUPDATE=TRUE fi fi mode="" when="now" broadcast="true" sys_shutdown="false" # : Normalize the command arguments # cmdargs="$*" if [[ -n $CLSTRMGR_ACTIVE ]] then cl_echo 208 "${0}: called with flags $cmdargs\n" "${0}" "$cmdargs" fi # : Parse the command line # while getopts ':fgrsyNRBS' option do case $option in f ) : Forced shutdown ; mode="forced" ;; g ) : Graceful shutdown ; mode="graceful" ;; r ) : graceful with takeover ; mode="takeover" ;; s ) : Do not broadcast ; broadcast="false" ;; y ) : Do not seek confirmation ; no_prompt="yes" ;; N ) : stop now ; when="now" ;; R ) : stop on system restart ; when="restart" ;; B ) : stop now and on restart ; when="both" ;; S ) : called from sytem shutdown ; sys_shutdown="true" ;; * ) : anything else invalid ; usage ;; esac done shift $((OPTIND - 1)) # : we need shutdown mode to be specified # if [[ $mode == "" ]] then usage fi # : Add options to the file if clchdaemons is present: # CLCHDAEMONS_FILE="/usr/${HA_DIR}/sbin/cluster/utilities/clchdaemons" if [[ -e $CLCHDAEMONS_FILE ]] then NODENAME=$(clodmget -f nodename -n HACMPcluster 2>/dev/null) fi TYPE="stop" if [[ -e $CLCHDAEMONS_FILE && $RUN_CLRMUPDATE == "TRUE" ]] then clchdaemons -n "$NODENAME" -d "$CLSTRMGR_DAEMON" -t "$TYPE" -o "time" -v "$when" >/dev/null 2>&1 clchdaemons -n "$NODENAME" -d "$CLSTRMGR_DAEMON" -t "$TYPE" -o "broadcast" -v "$broadcast" >/dev/null 2>&1 clchdaemons -n "$NODENAME" -d "$CLSTRMGR_DAEMON" -t "$TYPE" -o "mode" -v "$mode" >/dev/null 2>&1 fi # end of clchdaemons manipulation # : Remove inittab entry # if [[ $when == "restart" || $when == "both" ]] then cl_rmitab hacmp6000 > /dev/null 2>&1 /usr/bin/errlogger "$PROGNAME : The PowerHA SystemMirror 'hacmp6000' inittab entry has been removed, if present." if [[ $when == "restart" ]] then exit 0 fi fi # : See if this node is forced down # FORCEDOWN_NODES=$(LC_ALL=C lssrc -ls $CLSTRMGR_DAEMON | while read line do if [[ $line == "Forced down node list: "* ]] then echo "${line##*: }" break; fi done) for forced_node in $FORCEDOWN_NODES; do # : Allow clstop to handle system shutdown case, when cluster is in unmanaged state too. # if [[ $forced_node == "$LOCALNODENAME" && $sys_shutdown == "false" ]] then dspmsg scripts.cat 9208 "$PROGNAME: Node $LOCALNODENAME is already stopped using the \"Unmanage Resource \n\ Groups\" stop option. No action will be taken on $LOCALNODENAME.\n" $PROGNAME $LOCALNODENAME $LOCALNODENAME 1>&2 exit -1; fi done # : Broadcast shutdown. # if [[ $broadcast != "false" ]] then dspmsg scripts.cat 961 "PowerHA SystemMirror on ${HOSTNAME} shutting down. \ Please exit any cluster applications..." ${HOSTNAME} | wall fi # : Status of cluster. # CURRENT_STATE=$(LC_ALL=C lssrc -ls $CLSTRMGR_DAEMON | grep "Current state:" | cut -f2 -d: | sed -e 's/^ *//g') if [[ $CURRENT_STATE == "ST_INIT" || $CURRENT_STATE == "NOT_CONFIGURED" ]] then dspmsg scripts.cat 10703 "Cluster services are already down on ${LOCALNODENAME}.\n" $LOCALNODENAME 1>&2 fi # : check for a migration in progress anywhere # migration_in_progress="false" if [[ -e $CL_MIGCHECK_FILE ]] then cl_migcheck "ANY" RC=$? if (( $RC == 1 )) then migration_in_progress="true" fi fi case $mode in forced) VerifyEventError # : Send a message to clstrmgr and clinfo # if [[ $sys_shutdown == "false" ]] then if [[ $RUN_CLRMUPDATE == "TRUE" ]] then clRMupdate clrm_stop_request force >/dev/null 2>&1 fi # : wait to stop clinfo # else if [[ $RUN_CLRMUPDATE == "TRUE" ]] then clRMupdate clrm_stop_request shutdown >/dev/null 2>&1 fi stopsrc -c -s clinfoES fi ;; takeover) # : Stop cluster manager and clinfo gracefully and release resources. # if [[ $RUN_CLRMUPDATE == "TRUE" ]] then clRMupdate clrm_stop_request graceful_with_takeover fi stopsrc -f -s clinfoES rm -f /usr/es/sbin/cluster/etc/ClSm 2>/dev/null ;; graceful) # : Stop cluster manager and clinfo gracefully and hold on to resources. # if [[ $RUN_CLRMUPDATE == "TRUE" ]] then clRMupdate clrm_stop_request graceful fi stopsrc -s clinfoES rm -f /usr/es/sbin/cluster/etc/ClSm 2>/dev/null ;; esac # : stop CLEVMGRD if active # if LC_ALL=C lssrc -a | grep -w clevmgrdES | grep -q -w active then stopsrc -s clevmgrdES fi #################################################################### : Remove the cluster start file so we dont perform FFDC collection #################################################################### rm -f $CLUSTER_START_FILE if [[ $mode != "forced" ]] then if (( $IN_MIGRATION == 1 )) then # : We dont stop RSCT services in Cayenne. # LOGFILE=/tmp/shutdown_rsct.out # LOGFILE is where all stdout goes. >$LOGFILE # truncate LOGFILE. shutdown_rsct >> $LOGFILE 2>&1 & sleep 5 fi fi # : verify the grace period for node level and cluster level, if grace period is set, : CriticalMode need to set to 1 to avoid the rebooting of the node, when cthags daemon got killed. # typeset grace_period typeset node_grace_period typeset local_node grace_period=$(clodmget -f crit_daemon_restart_grace_period HACMPcluster) local_node=$(get_local_nodename) node_grace_period=$(clodmget -n -q "object=CRIT_DAEMON_RESTART_GRACE_PERIOD and name=$local_node" -f value HACMPnode) # Get node level value, if it is set, verify the value is set to more than zero. # if node level is not set, check cluster level value, if it is more than zero if [[ $mode == "forced" ]];then if [[ -n $node_grace_period ]] && (( $node_grace_period > 0 )) || (( $grace_period != 0 )) then export CT_MANAGEMENT_SCOPE=2 chrsrc -c IBM.PeerNode CriticalMode=1 unset CT_MANAGEMENT_SCOPE vglist=$(clodmget -n -q "name=VOLUME_GROUP" -f value HACMPresource) if [[ -n $vglist ]] then for vg in $vglist do lsvg_out=$(LC_ALL=C lsvg $vg 2>/dev/null) if print -- "$lsvg_out" | grep -i -q 'VG STATE: * active ' then varyonvg -S $vg >/dev/null 2>&1 if [[ $? == 0 ]];then cl_dspmsg -s 19 cspoc.cat 5 "cl_clstop: Successfully ran varyonvg -S for volume group %1\$s" $vg fi fi done fi fi fi return 0