#!/bin/ksh93
# ALTRAN_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# Copyright (C) Altran ACT S.A.S. 2019,2021. All rights reserved.
#
# ALTRAN_PROLOG_END_TAG
#
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# 61haes_r721 src/43haes/lib/ksh93/hacmp/KLIB_HACMP_online_site.sh 1.12.1.4
#
# Licensed Materials - Property of IBM
#
# COPYRIGHT International Business Machines Corp. 1990,2010
# All Rights Reserved
#
# US Government Users Restricted Rights - Use, duplication or
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
# IBM_PROLOG_END_TAG
# @(#) 0772e43 43haes/lib/ksh93/hacmp/KLIB_HACMP_online_site.sh, 726, 2147A_aha726, Jul 29 2021 09:46 PM
#============================================================================
#
# Name:        KLIB_HACMP_online_site
#
# Description: This is the main, FPATH function that is invoked by clmgr
#              to bring cluster services online on one or more specified
#              sites (on all nodes within each site). Each node is first
#              checked to see if it is inactive. Only inactive nodes are
#              sent a start command. If the nodes are already all online,
#              then the desired end state already exists, and a success
#              code is returned. Otherwise, the appropriate start command
#              is issued for the inactive nodes, and the results are
#              returned. It is worth mentioning that this command employs
#              a user-configurable timeout to avoid hangs.
#
#              If the CAA cluster services are also being brought online,
#              that is done *before* the SystemMirror cluster services are
#              brought online. One limitation to be aware of is that it
#              is only possible for one node to start CAA on another node
#              if the local node currently has active CAA services. Put
#              another way, if CAA is not active on the *local* node, you
#              will not be able to manage CAA on any remote nodes from
#              this node until CAA services are restored.
#                 Workaround 1: use clrsh to run clmgr on the remote nodes.
#                 Workaround 2: log in to the remote nodes and run clmgr.
#                 Workaround 3: when working with a subset of the cluster
#                               nodes, always manipulate them all at once.
#                 Workaround 4: always manage the local node last.
#
#              The "START_CAA" option is really intended for use only
#              during specific, exceptional conditions. Normally, it is
#              neither recommended nor needed to directly manipulate the
#              CAA state.
#
# Inputs:      See the "devDoc()" function, below.
#
# Outputs:     The only outputs are any messages, error or otherwise,
#              that might be needed.
#
# Returns:     Zero if no errors are detected. Otherwise, an appropriate
#              non-zero value is returned. Refer to the "RETURN" section
#              of the "devDoc()" function, below, for the standard return
#              code values/meanings for clmgr.
#
#============================================================================
function KLIB_HACMP_online_site
{
    trap 'on_exit_online_site' EXIT

    . $HALIBROOT/log_entry "$0()" "$CL"
    : version="@(#) 0772e43 43haes/lib/ksh93/hacmp/KLIB_HACMP_online_site.sh, 726, 2147A_aha726, Jul 29 2021 09:46 PM"
    : INPUTS: $*

    # Positional inputs; embedded double quotes are stripped from each one.
    typeset site=${1//\"/}
    typeset when=${2//\"/}
    typeset manage=${3//\"/}
    typeset broadcast=${4//\"/}
    typeset clinfo=${5//\"/}
    typeset force=${6//\"/}
    typeset fix=${7//\"/}
    integer timeout=${8//\"/}
    # An integer variable always expands to a number (an empty input becomes
    # 0), so the raw text is kept to tell whether TIMEOUT was really given.
    typeset timeout_arg=${8//\"/}
    typeset start_caa=${9//\"/}
    [[ $CLMGR_LOGGING == 'med' ]] && set +x    # Only trace param values

    #===================================
    : Declare and initialize variables
    #===================================
    # "rc" is the value that on_exit_online_site() compares to RC_UNKNOWN
    typeset -i used=0 interval=5 rc=$RC_UNKNOWN
    typeset -i i=0
    typeset -i nc=0    # Node counter used when sizing the default timeout
    typeset state= start_clinfo=

    # NOTE(review): "nodes" is not declared with typeset, so it is shared
    # with the calling environment. It is at least reset here, so that a
    # blank site can never act on a list left behind by an earlier call.
    nodes=""
    if [[ $site != *([[:space:]]) ]]; then
        nodes=$(cllssite -c "$site" 2>/dev/null | grep -v "^#" | grep -w "$site" | cut -d: -f2 2>/dev/null)
        nodes=${nodes#+([[:space:]])}
        nodes=${nodes%+([[:space:]])}
        nodes=${nodes//+([[:space:]])/,}    # Comma-separated from here on
    fi

    #================================================================
    : Assuming an object was specified, see if it is a known object
    #================================================================
    if [[ $site != *([[:space:]]) ]]; then
        CL=$LINENO KLIB_HACMP_is_known_site "$site" >/dev/null 2>&1
        (( $? != RC_SUCCESS )) && rc=$RC_NOT_FOUND
    fi

    #=================
    : Validate input
    #=================
    if [[ -z $site ]]; then
        dspmsg -s $CLMGR_SET $CLMGR_MSGS 511 "\nERROR: at least one site must be specified.\n\n" 1>&2
        rc=$RC_MISSING_INPUT
    elif (( $rc == RC_NOT_FOUND )); then
        dspmsg -s $CLMGR_SET $CLMGR_MSGS 102 '\nERROR: "%1$s" does not appear to exist!\n\n' "$site" 1>&2
        dspmsg -s $CLMGR_SET $CLMGR_MSGS 157 "Available Sites:\n\n" 1>&2
        typeset available
        CL=$LINENO KLIB_HACMP_list_sites available
        for (( i=0; i<${#available[*]}; i++ )); do
            if [[ ${available[$i]} != *([[:space:]]) ]]; then
                print -u2 "\t${available[$i]}"
            fi
        done
        print -u2 ""
    elif [[ -z $nodes ]]; then
        dspmsg -s $CLMGR_SET $CLMGR_MSGS 512 '\nERROR: "%1$s" contains no nodes.\n\n' "$site" 1>&2
        rc=$RC_MISSING_DEPENDENCY
    else
        if [[ -n $start_caa ]]; then
            CL=$LINENO verify_in_set START_CAA "$start_caa" "only yes true enable 1 no false disable 0" start_caa
            if (( $? == RC_SUCCESS )); then
                # Collapse the accepted synonyms to "only", "yes", or empty
                if [[ $start_caa == o* ]]; then
                    start_caa="only"
                elif [[ $start_caa == @(y|t|e|1)* ]]; then
                    start_caa="yes"
                else
                    start_caa=""
                fi
            else
                rc=$RC_INCORRECT_INPUT
            fi
        fi

        #
        : Do not validate PowerHA start-up inputs if only CAA is being started
        #
        if [[ $start_caa != "only" ]]; then
            if [[ -z $when ]]; then
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 114 '\nWarning: "%1$s" must be specified. Since it was not,\n a default of "%2$s" will be used.\n\n' WHEN now
                when="-N"
            else
                CL=$LINENO verify_in_set WHEN "$when" "NOW RESTART BOTH" when
                if (( $? == RC_SUCCESS )); then
                    when="-${when:0:1}"    # First letter becomes the flag
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

            if [[ -z $manage ]]; then
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 114 '\nWarning: "%1$s" must be specified. Since it was not,\n a default of "%2$s" will be used.\n\n' MANAGE auto
                manage="-A"
            else
                CL=$LINENO verify_in_set MANAGE "$manage" "AUTOMATIC MANUAL" manage
                if (( $? == RC_SUCCESS )); then
                    if [[ $manage == M* ]]; then
                        manage="-M"
                    else
                        manage="-A"
                    fi
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

            if [[ -n $broadcast ]]; then
                CL=$LINENO verify_in_set BROADCAST "$broadcast" "yes true enable 1 no false disable 0" broadcast
                if (( $? == RC_SUCCESS )); then
                    if [[ $broadcast == @(n|f|d|0)* ]]; then
                        broadcast=""
                    else
                        broadcast="-b"
                    fi
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            else
                broadcast="-b"    # Broadcast by default
            fi

            if [[ -n $clinfo ]]; then
                CL=$LINENO verify_in_set CLINFO "$clinfo" "yes true enable 1 no false disable 0 consistent" clinfo
                if (( $? == RC_SUCCESS )); then
                    # Any other accepted value leaves start_clinfo empty
                    case $clinfo in
                        @(t|y|1|e)*) start_clinfo="-i" ;;
                        c*)          start_clinfo="-I" ;;
                    esac
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

            if [[ -n $force ]]; then
                CL=$LINENO verify_in_set FORCE "$force" "yes true enable 1 no false disable 0" force
                if (( $? == RC_SUCCESS )); then
                    if [[ $force == @(y|t|e|1)* ]]; then
                        force="-v"
                    else
                        force=""
                    fi
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

            if [[ -n $fix ]]; then
                CL=$LINENO verify_in_set FIX "$fix" "yes true enable 1 no false disable 0 interactively" fix
                if (( $? == RC_SUCCESS )); then
                    # NOTE(review): when CLMGR_FORCE is non-zero, the "i*"
                    # arm leaves "fix" holding the validated input text,
                    # which is later passed to cl_rc.cluster as-is. Confirm
                    # whether it should be cleared (or mapped to "-C yes").
                    case $fix in
                        i*)          (( ! CLMGR_FORCE )) && fix="-C interactive" ;;
                        @(y|t|e|1)*) fix="-C yes" ;;
                        *)           fix="" ;;
                    esac
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi
        elif [[ -n "$when$manage$broadcast$clinfo$force$fix$timeout_arg" ]]; then
            # The raw TIMEOUT text is tested here; the integer "timeout"
            # is never empty and would make this warning unconditional.
            cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 1491 'Warning: when the "%1$s" input is used with a value of "%2$s", all other option(s) are invalid and will be disregarded.\n' "START_CAA" "$start_caa"
        fi

        #===============================================================
        : Dynamically set the timeout based upon the number of nodes.
        : Of course, this is only done if the customer did not already
        : specify a timeout, or specified a timeout that is too short.
        #===============================================================
        # A missing TIMEOUT leaves the integer at 0, so one test covers both
        if (( timeout < 180 )); then
            # Allow 5 minutes for a 2-node cluster; more than enough
            timeout=300

            # Add 90 seconds for each extra node beyond 2
            nc=0
            for node in ${nodes//,/ }; do
                (( ++nc > 2 )) && (( timeout += 90 ))
            done
        fi
    fi

    #==========================================================================
    : First, check to see if the specified nodes already have cluster services
    : running. If so, then this command is not needed for those nodes, and will
    : result in an erroneous exit code of 1.
    #==========================================================================
    typeset new_nodelist= orig_nodes=$nodes
    for node in ${nodes//,/ }; do
        state=$(CL=$LINENO KLIB_HACMP_get_node_state "$node" 2>>$CLMGR_TMPLOG)

        # If clstrmgrES subsystem is inoperative, clstart will start it, so no further action is required.
        if [[ $state != *@(INIT|NOT_CONFIGURED|UNMANAGED|INOPERATIVE) ]]; then
            if [[ $state == "UNKNOWN" ]]; then
                typeset COMMPATH=$(clodmget -q "name=$node AND object=COMMUNICATION_PATH" -n -f value HACMPnode)
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 194 '\nERROR: unable to communicate with "%1$s" (%2$s). Verify that the node is powered up and active, and that clcomd is properly configured and running on it. If the problem persists, also check the local clcomd, and verify that the network is functioning normally.\n\n' "$node" "$COMMPATH" 1>&2
                rc=$RC_ERROR
            else
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 501 'Warning: cluster services are already online on node\n "%1$s" (state is "%2$s").\n Removing that node from the startup list.\n' "$node" "$state"
            fi
        else
            new_nodelist="$new_nodelist,$node"
        fi
    done
    nodes=${new_nodelist#,}    # Only the nodes that still need starting

    if [[ -n $nodes ]] && (( $rc == RC_UNKNOWN )); then
        if [[ -n $start_caa ]]; then
            #=========================================================
            : The customer has requested that a start request be
            : sent to the CAA cluster on the nodes for site "$site".
            #=========================================================
            typeset CNAME=$(LC_ALL=C lscluster -c 2>>$CLMGR_TMPLOG | grep "^Cluster Name: ")
            CNAME=${CNAME#*: }
            [[ -z $CNAME ]] && CNAME=$(clodmget -n -f name HACMPcluster)
            if [[ -n $CNAME ]]; then
                # The start request covers every node of the site, not
                # just the ones that are still offline
                typeset CAA_NODE="" CAA_NODES=""
                for node in ${orig_nodes//,/ }; do
                    CAA_NODE=$(VERBOSE_LOGGING="" cl_nn2hn $node)
                    if [[ -n $CAA_NODE ]]; then
                        [[ -n $CAA_NODES ]] && CAA_NODES="$CAA_NODES,"
                        CAA_NODES="$CAA_NODES$CAA_NODE"
                    fi
                done

                date > $HAETC/clmgr.$$
                print "$0()[$LINENO]($SECONDS): clctrl -start -n $CNAME -m $CAA_NODES" >>$CLMGR_TMPLOG
                clctrl -start -n $CNAME -m $CAA_NODES
                # NOTE(review): the clctrl status is only logged; it is never
                # folded into "rc", so START_CAA=only leaves "rc" unknown.
                print "$0()[$LINENO]($SECONDS): clctrl RC: $?" >>$CLMGR_TMPLOG
                rm -f $HAETC/clmgr.$$
            else
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 191 "\nERROR: unable to determine the name of the cluster. Knowing the name of the cluster is needed in order to stop or start Cluster Aware AIX cluster services. Verify that this cluster is fully configured and has been successfully synchronized before attempting this operation again.\n\n" 1>&2
            fi
        fi
    fi

    if [[ -n $nodes ]] && (( $rc == RC_UNKNOWN )); then
        #====================================================
        : Make sure CAA is active before starting the nodes
        #====================================================
        for node in ${nodes//,/ }; do
            print "$0()[$LINENO]($SECONDS): $CLRSH $node /bin/true" >>$CLMGR_TMPLOG
            $CLRSH $node /bin/true
            if (( $? == RC_SUCCESS )); then
                : Looks like clcomd is working
                print "$0()[$LINENO]($SECONDS): $CLRSH $node lscluster -c" >>$CLMGR_TMPLOG
                $CLRSH $node lscluster -c >>$CLMGR_TMPLOG 2>&1
                if (( $? != 0 )); then
                    # Run it again so that the customer sees the output
                    $CLRSH $node lscluster -c 1>&2
                    cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 535 '\nERROR: the Cluster Aware AIX cluster services appear to be offline on \"%1$s\".\nIt is possible that CAA was manually stopped using STOP_CAA option.\nIn that case, you could use any one of the below command to start CAA and PowerHA services.\n\"clmgr online cluster START_CAA=yes\" OR \n \"clmgr online node START_CAA=yes\" OR \n \"clmgr online site START_CAA=yes\"\n\n' "$node" 1>&2
                    rc=$RC_ERROR
                fi
            else
                typeset COMMPATH=$(clodmget -q "name=$node AND object=COMMUNICATION_PATH" -n -f value HACMPnode)
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 194 '\nERROR: unable to communicate with "%1$s" (%2$s). Verify that the node is powered up and active, and that clcomd is properly configured and running on it. If the problem persists, also check the local clcomd, and verify that the network is functioning normally.\n\n' "$node" "$COMMPATH" 1>&2
                rc=$RC_ERROR
            fi
        done
    fi

    if [[ $start_caa == "only" ]]; then
        : The "only" option was specified with the "START_CAA" input, so only
        : CAA is to be started, not PowerHA cluster services. Nulling the
        : "nodes" variable disables the following node start code segment.
        nodes=""
    fi

    if [[ -n $nodes ]] && (( $rc == RC_UNKNOWN )); then
        set +e
        print "$0()[$LINENO]($SECONDS): _SPOC_FORCE=Y fix_args nop cl_rc.cluster $when -cspoc-n \"$nodes\" $manage $broadcast $start_clinfo $force $fix" >>$CLMGR_TMPLOG # Always log commands
        VERBOSE_LOGGING="" _SPOC_FORCE=Y fix_args \
            nop cl_rc.cluster \
            $when \
            -cspoc-n "$nodes" \
            $manage \
            $broadcast \
            $start_clinfo \
            $force \
            $fix
        rc=$?
        print "cl_rc.cluster RC: $rc" >>$CLMGR_TMPLOG # Always log command result
        set -e

        #============================================================
        : The command executed above is asynchronous. So in order
        : for the correct status to be returned, it is necessary
        : to loop here, and poll the cluster manager. Of course,
        : a visual indication is provided for the customer, so
        : they can see that something is happening. And eventually,
        : we will give up, and report a problem message.
        #============================================================
        if (( $rc == RC_SUCCESS )); then
            rc=$RC_ERROR    # Assume the worst, until proven otherwise

            # check_nodes holds the nodes not yet seen stable; each name is
            # padded with blanks so it can be removed as a whole word.
            typeset node= check_nodes=" ${nodes//,/ } "
            typeset -i failed=0 nap=0

            # errexit is on here, so plain assignments and "if" tests are
            # used instead of bare (( )) commands that could evaluate to 0.
            used=0
            while true; do
                for node in $check_nodes; do
                    state=$(CL=$LINENO KLIB_HACMP_get_node_state "$node" 2>>$CLMGR_TMPLOG)
                    if [[ $state == "ST_STABLE" ]]; then
                        (( used > 0 )) && print    # Terminate the "..." line
                        dspmsg -s $CLMGR_SET $CLMGR_MSGS 521 'Node "%1$s" in site "%2$s" is now online.\n' "$node" "$site"
                        check_nodes=${check_nodes// $node / }
                    elif [[ $state == @(ST_RP_FAILED|INOPERATIVE) ]]; then
                        # No recovering from this
                        # script failure / clstrmgr failed to start / clstrmgr died during startup (before having joined)
                        (( used > 0 )) && print
                        cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 514 '\nERROR: cluster services have experienced a problem on node\n "%1$s" in site "%2$s" which has left the site\n in an error state. Manual intervention will be required\n to recover from this problem.\n\n' "$node" "$site" 1>&2
                        failed=1
                        break
                    fi
                done

                # Leave as soon as the outcome is known, or time has run out
                if (( failed )) || [[ $check_nodes == *([[:space:]]) ]] || (( used >= timeout )); then
                    break
                fi

                # One progress dot and one sleep per polling pass, so that
                # "used" tracks the time actually spent waiting.
                print -n "."
                nap=$(( timeout - used ))
                if (( nap > interval )); then
                    nap=$interval
                fi
                sleep $nap
                used=$(( used + nap ))
            done

            if (( ! failed )); then
                if [[ $check_nodes == *([[:space:]]) ]]; then
                    rc=$RC_SUCCESS
                else
                    # Timed out: report each node that never became stable
                    print
                    for node in $check_nodes; do
                        cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 522 "\nWarning: unable to determine if node \"%1\$s\" in site\n \"%2\$s\" has come online (the process is still not\n complete). Wait a few minutes, then manually check the site's\n state using \"clmgr query site %1\$s\".\n\n" "$node" "$site" "$CLMGR_PROGNAME"
                    done
                fi
            fi
        else
            rc=$RC_ERROR
        fi
    elif [[ -z $nodes && $start_caa != "only" ]] && (( $rc == RC_UNKNOWN )); then
        : All nodes are up, so we have the desired end state
        rc=$RC_SUCCESS
    fi

    # NOTE(review): this is shown for every successful outcome, including
    # after nodes were just started; confirm the wording of message 532.
    if (( rc == $RC_SUCCESS )); then
        dspmsg -s $CLMGR_SET $CLMGR_MSGS 532 '\nThe nodes for site "%1$s" are already online.\n' "$site"
    fi

    #=======================================================================
    : If a user input error was detected, provide some helpful suggestions
    #=======================================================================
    if (( $rc == RC_MISSING_INPUT || $rc == RC_INCORRECT_INPUT )) && \
       [[ $CLMGR_GUI == *([[:space:]]) ]]
    then
        cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 104 'For more information about available options and syntax, try\n"$HAUTILS/clmgr %1$s". As an\nalternative, if the PowerHA SystemMirror man pages have been installed, invoke\n"$HAUTILS/clmgr -hv" (or "/usr/bin/man clmgr"),\nsearching for "%2$s" in the displayed text.\n\n' \
            "online site -h" "SITE:" "$CLMGR_PROGNAME" 1>&2
    fi

    log_return_msg "$rc" "$0()" "$LINENO"
    return $?
} # End of "KLIB_HACMP_online_site()"


#============================================================================
#
# Name:        on_exit_online_site
#
# Description: This function is invoked whenever this script exits. It
#              checks the current "$rc" value, and if it is still set
#              to unknown, exits with an error code.
#
# Inputs:      None, explicitly. However, the "$rc" variable is read.
#
# Outputs:     A blank line, but only when "$rc" is still unknown.
#
# Returns:     Nothing. The program exits when "$rc" is still unknown.
#
#============================================================================
function on_exit_online_site
{
    # A definite return code was reached; there is nothing to clean up.
    (( $rc == RC_UNKNOWN )) || return 0

    # Still "unknown": emit a blank line, then end the program with the
    # generic error code.
    print
    exit $RC_ERROR
}


#============================================================================
#
# NOTE(review): this header names KLIB_HACMP_online_node and describes
#               nodes, yet the function that follows it in this file is
#               devDoc(). It looks like a copy from the online_node
#               library; confirm and retitle.
#
# Name:        KLIB_HACMP_online_node
#
# Description: This is the main, FPATH function that is invoked by clmgr
#              to bring cluster services online on one or more specified
#              nodes (defaulting to the local node). Each node is first
#              checked to see if it is inactive. Only inactive nodes are
#              sent a start command. If the nodes are already all online,
#              then the desired end state already exists, and a success
#              code is returned. Otherwise, the appropriate start command
#              is issued for the inactive nodes, and the results are
#              returned. It is worth mentioning that this command employs
#              a user-configurable timeout to avoid hangs.
#
#              If the CAA cluster services are also being brought online,
#              that is done *before* the SystemMirror cluster services are
#              brought online. One limitation to be aware of is that it
#              is only possible for one node to start CAA on another node
#              if the local node currently has active CAA services. Put
#              another way, if CAA is not active on the *local* node, you
#              will not be able to manage CAA on any remote nodes from
#              this node until CAA services are restored.
#                 Workaround 1: use clrsh to run clmgr on the remote nodes.
#                 Workaround 2: log in to the remote nodes and run clmgr.
#                 Workaround 3: when working with a subset of the cluster
#                               nodes, always manipulate them all at once.
#                 Workaround 4: always manage the local node last.
#
#              The "START_CAA" option is really intended for use only
#              during specific, exceptional conditions. Normally, it is
#              neither recommended nor needed to directly manipulate the
#              CAA state.
#
# Inputs:      See the "devDoc()" function, below.
#
# Outputs:     The only outputs are any messages, error or otherwise,
#              that might be needed.
#
# Returns:     Zero if no errors are detected.
#              Otherwise, an appropriate non-zero value is returned.
#              Refer to the "RETURN" section of the "devDoc()" function,
#              below, for the standard return code values/meanings for
#              clmgr.
#
#============================================================================
# devDoc() holds the POD developer documentation for
# KLIB_HACMP_online_site. The text is a quoted here-document fed to the
# ":" no-op with all output discarded, so calling the function does
# nothing; the POD directives must stay in column one.
function devDoc
{
    : <<'=cut' >/dev/null 2>&1

=head1 NAME

KLIB_HACMP_online_site

=head1 SYNOPSIS

    clmgr online site <site> \
            WHEN={now|restart|both} \
            MANAGE={auto|manual} \
            [ BROADCAST={false|true} ] \
            [ CLINFO={false|true|consistent} ] \
            [ FORCE={false|true} ] \
            [ FIX={no|yes|interactively} ] \
            [ TIMEOUT=<seconds> ] \
            [ START_CAA={no|yes|only} ]

    NOTE: the "TIMEOUT" attribute defaults to 300 seconds, plus 90
          seconds for each node beyond the second. A value below 180
          seconds is replaced by that default.

=head1 DESCRIPTION

Attempts to bring all the member nodes of the specified site online.

=head1 ARGUMENTS

    1. site        [REQUIRED] [string]
                   The name/label of the site that is to be brought online.
    2. when        [OPTIONAL] [string]
                   An indicator of when the action should take place.
                   Valid values include: {now|restart|both}
                   If omitted, a warning is displayed and "now" is used.
    3. manage      [OPTIONAL] [string]
                   An indicator of the manner in which any resource groups
                   managed within the specified site should be handled.
                   Valid values include: {auto|manual}
                   If omitted, a warning is displayed and "auto" is used.
    4. broadcast   [OPTIONAL] [string]
                   A Boolean-like indicator of whether or not to broadcast
                   a message announcing the startup on each affected node.
    5. clinfo      [OPTIONAL] [string]
                   An indicator of whether to start up clinfo, and how.
                   Valid values include: {false|true|consistent}
    6. force       [OPTIONAL] [string]
                   A Boolean-like indicator for specifying whether or not
                   the startup should continue even if verification fails.
    7. fix         [OPTIONAL] [string]
                   An indicator of whether or not to attempt to
                   automatically correct certain errors that might be
                   found during verification, and how.
                   Valid values include: {no|yes|interactively}
    8. timeout     [OPTIONAL] [integer]
                   The amount of time to allow the startup to continue
                   before giving up and reporting a failure. Of course,
                   just because clmgr gives up does not mean that PowerHA
                   does, and the startup may occur anyway, simply taking
                   longer than the timeout to complete! The default
                   timeout is 300 seconds, plus 90 additional seconds
                   for each node beyond the second node (i.e. that would
                   mean 180 added seconds in a 4 node cluster).
    9. start_caa   [OPTIONAL] [set]
                   An indicator for specifying whether or not to
                   explicitly send a start request to the CAA cluster
                   running on the node in the specified site.
                   Accepted values are: true, false, only.

=head1 RETURN

    0: no errors were detected; the operation appears to have been successful
    1: a general error has occurred
    2: a specified resource does not exist, or could not be found
    3: some required input was missing
    4: some detected input was incorrect in some way
    5: a required dependency does not exist
    6: a specified search failed to match any data

=cut
} # End of "devDoc()"


#==============================================================================
# The following, comment block attempts to enforce coding standards when this
# file is edited via emacs or vim. This block _must_ appear at the very end
# of the file, or the editor will not find it, and it will be ignored.
#==============================================================================
# Local Variables:
# indent-tabs-mode: nil
# tab-width: 4
# End:
#==============================================================================
# vim: tabstop=4 shiftwidth=4 expandtab
#==============================================================================