#!/bin/ksh93
# ALTRAN_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# Copyright (C) Altran ACT S.A.S. 2019,2020,2021. All rights reserved.
#
# ALTRAN_PROLOG_END_TAG
#
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# 61haes_r721 src/43haes/lib/ksh93/hacmp/KLIB_HACMP_offline_node.sh 1.17.1.3
#
# Licensed Materials - Property of IBM
#
# Restricted Materials of IBM
#
# COPYRIGHT International Business Machines Corp. 2006,2016
# All Rights Reserved
#
# US Government Users Restricted Rights - Use, duplication or
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
# IBM_PROLOG_END_TAG
# @(#) 7d4c34b 43haes/lib/ksh93/hacmp/KLIB_HACMP_offline_node.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM

# Start of POD-formatted documentation. Viewing suggestions:
#       perldoc <file>
#       pod2text -c <file>
#       pod2text -c --code <file>
#       pod2html <file>

#----------------------------------------------------------------------------
# devDoc: documentation carrier only. The POD text is fed to the ":" no-op
# through a quoted here-document (output discarded), so calling this function
# has no effect; POD tools read the text straight from this file.
#----------------------------------------------------------------------------
function devDoc {
    : <<'=cut' >/dev/null 2>&1

=head1 NAME

 KLIB_HACMP_offline_node

=head1 SYNOPSIS

 clmgr offline node <node>[,<node#2>,<node#n>,...] \
         [ WHEN={now|restart|both} ] \
         [ MANAGE={offline|move|unmanage} ] \
         [ BROADCAST={true|false} ] \
         [ TIMEOUT=<seconds_to_wait> ] \
         [ STOP_CAA={no|yes} ]

 NOTE: the "TIMEOUT" attribute defaults to 120 seconds.

 NOTE: the alias for "node" is "no".

=head1 DESCRIPTION

 Attempts to bring a PowerHA node down, and performs one of the
 following actions with resource groups that are running on that
 node:

     offline  - brings the RGs offline
     move     - moves the resource groups to another node in the cluster
     unmanage - does not stop the resources, but HACMP no longer
                monitors any of the resource groups on the node

 as dictated by the MANAGE attribute.

=head1 ARGUMENTS

 1. node      [OPTIONAL] [string]
              The name/label of the node that is to be brought offline.
              Defaults to the local node.
 2. when      [OPTIONAL] [string]
              An indicator of when the action should take place.
              Valid values include: {now|restart|both}
 3. manage    [OPTIONAL] [string]
              An indicator of the manner in which any resource groups
              managed on the specified node should be handled.
              Valid values include: {offline|move|unmanage}
 4. broadcast [OPTIONAL] [string]
              A Boolean-like indicator of whether or not to broadcast
              a message announcing the shutdown on each affected node.
 5. timeout   [OPTIONAL] [integer]
              The amount of time to allow the shutdown to continue before
              giving up and reporting a failure. Of course, just because
              clmgr gives up does not mean that PowerHA does, and the
              shutdown may occur anyway, simply taking longer than the
              timeout to complete!
              If user does not specify a timeout value, clmgr cognitive
              mechanism predicts a timeout value based on previous similar
              operations or cluster size.
              When cluster services are stopped with the unmanage option,
              the default timeout is used.
              The default timeout is 120 seconds.
 6. stop_caa  [OPTIONAL] [boolean]
              A Boolean-like indicator for specifying whether or not to
              explicitly send a stop request to the CAA cluster nodes for
              the specified node(s).

=head1 RETURN

 0: no errors were detected; the operation appears to have been successful
 1: a general error has occurred
 2: a specified resource does not exist, or could not be found
 3: some required input was missing
 4: some detected input was incorrect in some way
 5: a required dependency does not exist
 6: a specified search failed to match any data
 7: given operation was not really necessary
 8: chosen timeout value is exhausted

=cut
} # End of POD-formatted documentation.


#----------------------------------------------------------------------------
# on_exit_offline_node: EXIT trap handler installed by
# KLIB_HACMP_offline_node().
#
# Reads the global "rc". If it still holds RC_UNKNOWN when the trap fires,
# the operation ended before a real result was recorded: a newline is
# printed (terminating any partially written output line) and the process
# exits with RC_ERROR. Otherwise the handler does nothing.
#----------------------------------------------------------------------------
function on_exit_offline_node {
    # An empty/unset "rc" is treated as "unknown"; expanding it bare inside
    # (( )) would instead produce an arithmetic syntax error.
    if (( ${rc:-$RC_UNKNOWN} == RC_UNKNOWN )); then
        print
        exit $RC_ERROR
    fi
}


#============================================================================
#
# Name:        KLIB_HACMP_offline_node
#
# Description: This is the main, FPATH function that is invoked by clmgr
#              to bring cluster services offline on one or more specified
#              nodes (defaulting to the local node). Each node is first
#              checked to see if it is active. Only active nodes are sent
#              a stop command. If the nodes are already all offline, then
#              the desired end state already exists, and a success code
#              is returned.
#              Otherwise, the appropriate stop command is
#              issued for the active nodes, and the results are returned.
#              It is worth mentioning that this command employs a user-
#              configurable timeout to avoid hangs.
#
#              If the CAA cluster services are also being brought offline,
#              that is done *after* the SystemMirror cluster services are
#              *successfully* brought offline. One limitation to be aware
#              of is that it is only possible for one node to stop CAA on
#              another node if the local node currently has active CAA
#              services. Put another way, if in one clmgr operation, you
#              stop CAA on the *local* node, you will not be able to
#              manage CAA on any of the remote nodes from this node until
#              CAA services are restored.
#                 Workaround 1: use clrsh to run clmgr on the remote nodes.
#                 Workaround 2: log in to the remote nodes and run clmgr.
#                 Workaround 3: when working with a subset of the cluster
#                               nodes, always manipulate them all at once.
#                 Workaround 4: always manage the local node last.
#
#              The "STOP_CAA" option is really intended for use only
#              during specific, exceptional conditions. Normally, it is
#              neither recommended nor needed to directly manipulate the
#              CAA state.
#
# Inputs:      See the "devDoc()" function, above.
#              Positional: 1=nodes (comma list), 2=when, 3=manage,
#              4=broadcast, 5=timeout (seconds), 6=stop_caa.
#
# Outputs:     The only outputs are any messages, error or otherwise,
#              that might be needed.
#
# Returns:     Zero if no errors are detected. Otherwise, an appropriate
#              non-zero value is returned. Refer to the "RETURN" section
#              of the "devDoc()" function, above, for the standard return
#              code values/meanings for clmgr.
#
#============================================================================
function KLIB_HACMP_offline_node {
    trap 'on_exit_offline_node' EXIT

    . $HALIBROOT/log_entry "$0()" "$CL"
    : version="@(#) 7d4c34b 43haes/lib/ksh93/hacmp/KLIB_HACMP_offline_node.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM"
    : INPUTS: $*

    # Strip any embedded double quotes from the positional inputs
    typeset nodes=${1//\"/}
    typeset when=${2//\"/}
    typeset manage=${3//\"/}
    typeset broadcast=${4//\"/}
    integer timeout=${5//\"/}
    typeset stop_caa=${6//\"/}

    [[ $CLMGR_LOGGING == 'med' ]] && set +x    # Only trace param values

    #===================================
    : Declare and initialize variables
    #===================================
    typeset -i used=0 interval=5 user_timeout_flag=1
    if (( $timeout <= 0 )); then
        user_timeout_flag=0     # User has not provided timeout
        timeout=120             # Set a default value
    fi
    rc=$RC_UNKNOWN  # Declare globally, so it is visible in the exit function
    typeset cmd_rc=$RC_UNKNOWN
    typeset -i i=0
    typeset state= node=
    typeset check_nodes=""      # Nodes still being polled after the stop request

    #================================================================
    : Assuming an object was specified, see if it is a known object
    #================================================================
    if [[ -n $nodes ]]; then
        for node in ${nodes//,/ }; do
            if [[ $node != *([[:space:]]) ]]; then
                CL=$LINENO KLIB_HACMP_is_known_node "$node" >/dev/null 2>&1
                if (( $? != RC_SUCCESS )); then
                    # "$node" is left holding the unknown name for the message below
                    rc=$RC_NOT_FOUND
                    break
                fi
            fi
        done
    else
        nodes=$LOCAL_NODE
    fi

    #=================
    : Validate input
    #=================
    if [[ -z $nodes ]]; then
        /usr/bin/dspmsg -s $CLMGR_SET $CLMGR_MSGS 505 "\nERROR: at least one node must be specified.\n\n" 1>&2
        rc=$RC_MISSING_INPUT
    elif (( $rc == RC_NOT_FOUND )); then
        /usr/bin/dspmsg -s $CLMGR_SET $CLMGR_MSGS 102 "\nERROR: \"%1\$s\" does not appear to exist!\n\n" "$node" 1>&2
        /usr/bin/dspmsg -s $CLMGR_SET $CLMGR_MSGS 151 "Available Nodes:\n\n" 1>&2
        typeset available
        CL=$LINENO KLIB_HACMP_list_nodes available
        for (( i=0; i<${#available[*]}; i++ )); do
            if [[ ${available[$i]} != *([[:space:]]) ]]; then
                print -u2 "\t${available[$i]}"
            fi
        done
        print -u2 ""
    else
        # WHEN is translated into the matching stop flag: -N, -R or -B
        if [[ -z $when ]]; then
            cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 114 "\nWarning: \"%1\$s\" must be specified. Since it was not,\n a default of \"%2\$s\" will be used.\n\n" WHEN now
            when="-N"
        else
            CL=$LINENO verify_in_set WHEN "$when" "NOW RESTART BOTH" when
            if (( $? == RC_SUCCESS )); then
                when="-${when:0:1}"
            else
                rc=$RC_INCORRECT_INPUT
            fi
        fi

        # MANAGE is translated into: -g (offline), -gr (move), -f (unmanage)
        if [[ -z $manage ]]; then
            cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 114 "\nWarning: \"%1\$s\" must be specified. Since it was not,\n a default of \"%2\$s\" will be used.\n\n" MANAGE offline
            manage="-g"
        else
            CL=$LINENO verify_in_set MANAGE "$manage" "offline move unmanage" manage
            if (( $? == RC_SUCCESS )); then
                case $manage in
                    o*) manage="-g"  ;;
                    m*) manage="-gr" ;;
                    u*) manage="-f"  ;;
                esac
            else
                rc=$RC_INCORRECT_INPUT
            fi
        fi

        # BROADCAST: a "true" value means no flag; anything else passes "-s"
        if [[ -n $broadcast ]]; then
            CL=$LINENO verify_in_set BROADCAST "$broadcast" "yes true enable 1 no false disable 0" broadcast
            if (( $? == RC_SUCCESS )); then
                [[ $broadcast == @(y|t|e|1)* ]] && broadcast="" || broadcast="-s"
            else
                rc=$RC_INCORRECT_INPUT
            fi
        fi

        # STOP_CAA is normalized to "yes" or the empty string
        if [[ -n $stop_caa ]]; then
            CL=$LINENO verify_in_set STOP_CAA "$stop_caa" "yes true enable 1 no false disable 0" stop_caa
            if (( $? == RC_SUCCESS )); then
                [[ $stop_caa == @(y|t|e|1)* ]] && stop_caa="yes" || stop_caa=""
            else
                rc=$RC_INCORRECT_INPUT
            fi
        fi

        if [[ $stop_caa == "yes" && $manage == "-f" ]]; then
            /usr/bin/dspmsg -s $CLMGR_SET $CLMGR_MSGS 534 "\nCombination of manage=unmanage and STOP_CAA=yes is not allowed,\n since you must stop SystemMirror cluster services before stopping CAA\n\n"
            rc=$RC_INCORRECT_INPUT
        fi

        #==============================================================
        : If the current operation is unmanaging a node, we do what
        : we can here to avoid allowing customers to get themselves
        : into trouble with that. It is a really bad idea to unmanage
        : a node when there are unsynchronized changes in the cluster.
        : It is very difficult to recover from, and will certainly
        : result in an outage, and possibly a crash of the LPAR.
        #==============================================================
        if [[ $manage == "-f" ]]; then
            typeset NEED_SYNC=$(clcmd /bin/ksh93 -c "$HAUTILS/clodmget -f handle HACMPcluster" 2>/dev/null |\
                                grep -p '^0$' | grep -w "^NODE")
            if [[ -z $NEED_SYNC ]]; then
                : If we get nothing from clcmd, check locally, too... just in case...
                if clodmget -f handle HACMPcluster | grep -qw 0
                then
                    NEED_SYNC=$(clodmget -q "object=COMMUNICATION_PATH AND name=$(get_local_nodename)" -n -f value HACMPnode)
                fi
            fi

            if [[ -n $NEED_SYNC ]]; then
                #
                : Collect the list of node names that need to be synced
                #
                typeset US_LIST="" NODE_NAME="" LINE=""
                print -- "$NEED_SYNC" |\
                while read LINE
                do
                    LINE=${LINE#*[[:space:]]}   # Strips off "NODE"
                    NODE_NAME=$(clodmget -q "object=COMMUNICATION_PATH AND value=$LINE" -n -f name HACMPnode)
                    if [[ -n $NODE_NAME ]]
                    then
                        US_LIST=${US_LIST:+$US_LIST,}$NODE_NAME
                    else
                        US_LIST=${US_LIST:+$US_LIST,}$LINE
                    fi
                done

                # US_LIST is already comma separated. It must not be run
                # through ${US_LIST//\n/,}: in a shell pattern "\n" matches
                # the letter "n", which would corrupt the node names.
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 1311 '\nERROR: One or more cluster nodes (%1$s) currently have unsychronized\nconfiguration changes. Unmanaging a cluster node is not allowed at this time.\n' "$US_LIST" 1>&2

                # Log the same code that is actually returned
                rc=$RC_INCORRECT_INPUT
                log_return_msg "$rc" "$0()" "$LINENO"
                return $rc
            fi
        fi
    fi

    if (( $rc == RC_UNKNOWN )); then
        #==================================================================
        : First, check to see if the specified nodes already have cluster
        : services inactive. If so, then this command is not needed for
        : those nodes, and will result in an erroneous exit code of 1.
        : Further, make sure active nodes are "ST_STABLE", or clstop will
        : fail because of that.
        #==================================================================
        typeset new_nodelist= state= orig_nodes=$nodes
        for (( used=0; used < timeout; )); do
            typeset START=$SECONDS   # Do not declare as an integer!
            typeset -i node_count=0 ready_count=0

            # Rebuild the list on every pass. A list carried over from the
            # previous pass would keep nodes that have since gone offline,
            # and would prevent a node that has since become stable from
            # ever being counted as ready.
            new_nodelist=""

            for node in ${nodes//,/ }; do
                (( node_count++ ))
                CL=$LINENO KLIB_HACMP_get_node_state "$node" state
                cmd_rc=$?
                if [[ $state == *@(INIT|NOT_CONFIGURED|UNMANAGED|INOPERATIVE) ]]; then
                    cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 500 "Warning: cluster services are already offline on node\n \"%1\$s\" (state is \"%2\$s\").\n Removing that node from the shutdown list..\n" "$node" "$state"
                    (( ready_count++ ))
                elif [[ $state == "ST_RP_FAILED" ]]; then  # Node is broken!
                    cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 507 "\nERROR: cluster services have experienced a problem on node\n \"%1\$s\" which has left the node in an error state.\n Manual intervention will be required to recover from this problem.\n\n" "$node" 1>&2
                    rc=$RC_ERROR
                elif (( cmd_rc == RC_ERROR )) && \
                     [[ $state == "UNKNOWN" ]]
                then    # clstrmgrES is down!
                    rc=$RC_ERROR
                elif [[ $state != "ST_STABLE" ]]; then
                    # Active but not yet stable: keep it, but it is not ready
                    [[ ",$new_nodelist," != *,$node,* ]] && new_nodelist="$new_nodelist,$node"
                else
                    # Active and stable: keep it (once), and count it ready
                    [[ ",$new_nodelist," != *,$node,* ]] && new_nodelist="$new_nodelist,$node"
                    (( ready_count++ ))
                fi
            done

            nodes=${new_nodelist#,}
            [[ $nodes == *([[:space:]]) ]] && break
            (( $rc != RC_UNKNOWN )) && break
            (( $node_count == $ready_count )) && break

            # Pace the polling: each pass consumes at least "interval" seconds
            if (( used < (timeout - interval) )); then
                typeset DIFF=$SECONDS
                (( DIFF -= $START ))
                if (( $DIFF < interval )); then
                    (( DIFF = interval - $DIFF ))
                    sleep $DIFF
                    (( used += interval ))
                else
                    (( used += $DIFF ))
                fi
            else
                (( used += interval ))
            fi
        done
    fi

    if (( $rc == RC_UNKNOWN )) && [[ $nodes != *([[:space:]]) ]]; then
        #============================================================
        : If user has not specified a timeout value,
        : Predict best possible timeout based on previous operations
        : For unmanage down, let us go with default timeout
        #============================================================
        if (( $user_timeout_flag == 0 )) && [[ $manage != "-f" ]]; then
            typeset -i predicted_timeout=0
            print "$0()[$LINENO]($SECONDS): Using availability metrics to predict a timeout value" >>$CLMGR_TMPLOG
            for node in ${nodes//,/ }; do
                # catch average time taken for stop cluster services for each node
                # cl_availability gives average time for STOP operation at the end
                typeset DATA=$(LC_ALL=C cl_availability -n $node | grep 'Average time taken' | awk '{print $NF}' | tail -1)
                if [[ -n $DATA && $DATA == *:*:* ]]; then
                    # convert HH:MM:SS format into seconds
                    typeset -i seconds=$(echo $DATA | awk -F: '{print $1*60*60+$2*60+$3}')
                    print "$0()[$LINENO]($SECONDS): Average time taken for previous cluster services stop operations on node $node is: $seconds seconds" >>$CLMGR_TMPLOG
                    # consider the maximum value among all nodes
                    (( $seconds > $predicted_timeout )) && predicted_timeout=$seconds
                fi
            done

            # If we do not get any prediction from cl_availability, let us depend on cluster resource count
            # Based on analysis, we assume each resource takes around 10 seconds to process
            if (( $predicted_timeout == 0 )); then
                print "$0()[$LINENO]($SECONDS): We have not got any prediction from availability metrics" >>$CLMGR_TMPLOG
                typeset -i res_count=$(clodmget HACMPresource | wc -l)
                (( predicted_timeout=res_count*10 ))
                print "$0()[$LINENO]($SECONDS): Predicted value based on cluster resource count(=$res_count) is: $predicted_timeout seconds" >>$CLMGR_TMPLOG
            else
                print "$0()[$LINENO]($SECONDS): Predicted value from availability metrics is: $predicted_timeout seconds" >>$CLMGR_TMPLOG
            fi

            # For a better value, pad it with 1 minute more.
            (( predicted_timeout=predicted_timeout+60 ))

            # By now, if we get a good prediction, let us go with it.
            # Else, default timeout will be used.
            if (( $predicted_timeout > $timeout )); then
                timeout=$predicted_timeout
            fi
            print "$0()[$LINENO]($SECONDS): Final TIMEOUT value is: $timeout seconds" >>$CLMGR_TMPLOG
        fi

        (( $user_timeout_flag == 0 )) && cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 1459 "\nINFO: As %1\$s is not specified, clmgr cognitive mechanism has predicted %1\$s=%2\$d.\n" TIMEOUT $timeout

        #=============================================
        : Attempt to stop any remaining active nodes
        #=============================================
        print "$0()[$LINENO]($SECONDS): _SPOC_FORCE=Y $HACSPOC/fix_args nop cl_clstop $when -cspoc-n \"$nodes\" $broadcast $manage" >>$CLMGR_TMPLOG # Always log commands
        VERBOSE_LOGGING="" _SPOC_FORCE=Y $HACSPOC/fix_args nop cl_clstop $when -cspoc-n "$nodes" $broadcast $manage
        rc=$?
        print "cl_clstop RC: $rc" >>$CLMGR_TMPLOG # Always log command result
        (( $rc != RC_SUCCESS )) && rc=$RC_ERROR

        #============================================================
        : The command executed above is asynchronous. So in order
        : for the correct status to be returned, it is necessary
        : to loop here, and poll the cluster manager. Of course,
        : a visual indication is provided for the customer, so
        : they can see that something is happening. And eventually,
        : we will give up, and report a problem message.
        #============================================================
        if (( $rc == RC_SUCCESS )); then
            node=""
            # Space-delimited (with leading/trailing space) so that a
            # finished node can be removed with a " name " substitution.
            check_nodes=" ${nodes//,/ } "
            rc=$RC_TIMEOUT  # Assume timeout expiry, until proven otherwise
            for (( used=0; used < timeout; used += interval )); do
                sleep $interval
                for node in $check_nodes; do
                    CL=$LINENO KLIB_HACMP_get_node_state "$node" state 2>>$CLMGR_TMPLOG
                    if [[ $state == @(ST_INIT|NOT_CONFIGURED|UNMANAGED|INOPERATIVE) ]]; then
                        (( used > 0 )) && print     # Terminate the line of progress dots
                        if [[ $state == "UNMANAGED" ]]; then
                            /usr/bin/dspmsg -s $CLMGR_SET $CLMGR_MSGS 525 "\"%1\$s\" is now unmanaged.\n" "$node"
                        else
                            /usr/bin/dspmsg -s $CLMGR_SET $CLMGR_MSGS 506 "\"%1\$s\" is now offline.\n" "$node"
                        fi
                        check_nodes=${check_nodes// $node / }
                        if [[ $check_nodes == *([[:space:]]) ]]; then
                            check_nodes=""
                            rc=$RC_SUCCESS
                            break
                        fi
                    elif [[ $state == "ST_RP_FAILED" ]]; then  # No recovering from this
                        (( used > 0 )) && print
                        cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 507 "\nERROR: cluster services have experienced a problem on node\n \"%1\$s\" which has left the node in an error state.\n Manual intervention will be required to recover from this problem.\n\n" "$node" 1>&2
                        check_nodes=""
                        rc=$RC_ERROR
                        break
                    fi
                done
                [[ -n $check_nodes ]] && print -n "." || break
            done
        elif [[ ! -z $nodes ]]; then
            rc=$RC_ERROR
        fi
    elif (( $rc == $RC_UNKNOWN )) && [[ $nodes == *([[:space:]]) ]]; then
        : All nodes are down, so we have the desired end state
        rc=$RC_SUCCESS
    fi

    # Only the polling loop above can leave rc at RC_TIMEOUT. Testing "used"
    # instead would also fire when the pre-check loop had run out its time
    # and the stop request then failed for some other reason.
    if (( $rc == RC_TIMEOUT )); then
        # Report the nodes that are still pending, not the last loop value
        typeset pending_nodes=""
        for node in $check_nodes; do
            pending_nodes=${pending_nodes:+$pending_nodes,}$node
        done
        print
        cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 508 "\nWarning: unable to determine if node \"%1\$s\" has gone offline\n (the process is still not complete). Wait a few minutes, then\n manually check the node's state using \"clmgr query node %1\$s\".\n\n" "${pending_nodes:-$node}" "$CLMGR_PROGNAME"
    fi

    #=======================================================================
    : If a user input error was detected, provide some helpful suggestions
    #=======================================================================
    if (( $rc == RC_MISSING_INPUT || $rc == RC_INCORRECT_INPUT )) && \
       [[ $CLMGR_GUI == *([[:space:]]) ]]
    then
        cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 104 "For more information about available options and syntax, try\n\"$HAUTILS/clmgr %1\$s\". As an\nalternative, if the PowerHA SystemMirror man pages have been installed, invoke\n\"$HAUTILS/clmgr -hv\" (or \"/usr/bin/man clmgr\"),\nsearching for \"%2\$s\" in the displayed ext.\n\n" \
                  "offline node -h" "NODE:" "$CLMGR_PROGNAME" 1>&2
    fi

    if (( $rc == RC_SUCCESS )) && [[ -n $stop_caa && -n $orig_nodes ]]; then
        #====================================================
        : The customer has requested that a stop request be
        : sent to the CAA cluster on nodes "$orig_nodes".
        #====================================================
        typeset CNAME=$(LC_ALL=C lscluster -c 2>>$CLMGR_TMPLOG | grep "^Cluster Name: ")
        CNAME=${CNAME#*: }
        [[ -z $CNAME ]] && CNAME=$(clodmget -n -f name HACMPcluster)
        if [[ -n $CNAME ]]; then
            rm -f $HAETC/clmgr.$$
            typeset CAA_NODE="" CAA_NODES=""
            for node in ${orig_nodes//,/ }; do
                CAA_NODE=$(VERBOSE_LOGGING="" cl_nn2hn $node)
                if [[ -n $CAA_NODE ]]; then
                    #==============================================
                    : Create the "license" file needed to enabled
                    : the smcaactrl SystemMirror "plugin" for CAA.
                    #==============================================
                    # POS is the ordinal of this node among the "Node name:"
                    # lines; it is reused to pick the matching UUID line.
                    typeset POS=$(LC_ALL=C lscluster -m 2>>$CLMGR_TMPLOG |\
                                  grep "Node name: " |\
                                  grep -wn "$CAA_NODE")
                    POS=${POS%%:*}
                    typeset NODE_UUID=""
                    if [[ $POS == +([0-9]) ]] && (( POS )); then
                        NODE_UUID=$(LC_ALL=C lscluster -m |\
                                    grep "UUID for node: " |\
                                    head -n $POS | tail -1)
                    fi
                    print -- "$node $CAA_NODE $NODE_UUID" >> $HAETC/clmgr.$$

                    [[ -n $CAA_NODES ]] && CAA_NODES="$CAA_NODES,"
                    CAA_NODES="$CAA_NODES$CAA_NODE"
                fi
            done

            if [[ -n $CAA_NODES ]]; then
                print "$0()[$LINENO]($SECONDS): clctrl -stop -n $CNAME -m $CAA_NODES" >>$CLMGR_TMPLOG
                clctrl -stop -n $CNAME -m $CAA_NODES
                rc=$?
                print "$0()[$LINENO]($SECONDS): clctrl RC: $rc" >>$CLMGR_TMPLOG
            fi

            rm -f $HAETC/clmgr.$$
        else
            cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 191 "\nERROR: unable to determine the name of the cluster. Knowing the name of the cluster is needed in order to stop or start Cluster Aware AIX cluster services. Verify that this cluster is fully configured and has been successfully synchronized before attempting this operation again.\n\n" 1>&2
            # The requested CAA stop could not be attempted; do not report success
            rc=$RC_ERROR
        fi
    fi

    log_return_msg "$rc" "$0()" "$LINENO"
    return $?
} # End of "KLIB_HACMP_offline_node()"


#==============================================================================
# The following, comment block attempts to enforce coding standards when this
# file is edited via emacs or vim.  This block _must_ appear at the very end
# of the file, or the editor will not find it, and it will be ignored.
#==============================================================================
# Local Variables:
# indent-tabs-mode: nil
# tab-width: 4
# End:
#==============================================================================
# vim: tabstop=4 shiftwidth=4 expandtab
#==============================================================================