#!/bin/ksh93
# ALTRAN_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# Copyright (C) Altran ACT S.A.S. 2017,2018,2019,2021 All rights reserved.
#
# ALTRAN_PROLOG_END_TAG
#
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# 61haes_r721 src/43haes/lib/ksh93/hacmp/KLIB_HACMP_get_cluster_state.sh 1.21
#
# Licensed Materials - Property of IBM
#
# Restricted Materials of IBM
#
# COPYRIGHT International Business Machines Corp. 2005,2016
# All Rights Reserved
#
# US Government Users Restricted Rights - Use, duplication or
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
# IBM_PROLOG_END_TAG
# @(#) 7d4c34b 43haes/lib/ksh93/hacmp/KLIB_HACMP_get_cluster_state.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM

#===============================================================================
#
#   Name:        KLIB_HACMP_get_cluster_state
#
#   Description: This is the main, FPATH function that is invoked by clmgr
#                to retrieve the cluster manager state from each node, and
#                aggregate them into an overall cluster state.
#
#   Inputs:      See the "devDoc()" function, below.
#
#   Outputs:     The properties hash is populated. The only other outputs are
#                any error messages that might be needed.
#
#   Returns:     Zero if no errors are detected. Otherwise, an appropriate
#                non-zero value is returned. Refer to the "RETURN" section
#                of the "devDoc()" function, below, for the standard return
#                code values/meanings for clmgr.
#
#   Notes:       The aggregated state word is written to stdout; all
#                diagnostics are appended to $CLMGR_TMPLOG. The function
#                reads RC_SUCCESS, RC_ERROR, RC_UNKNOWN, CLMGR_GUI, CLRSH,
#                HAUTILS, LOCAL_NODE, NL and TMPDIR from the calling
#                environment. The pipelines below rely on ksh93 running the
#                last stage of a pipeline in the current shell, so that the
#                counters set inside "while read" loops survive the loop.
#
#============================================================================
function KLIB_HACMP_get_cluster_state {
    . "$HALIBROOT/log_entry" "$0()" "$CL"
    : version="@(#) 7d4c34b 43haes/lib/ksh93/hacmp/KLIB_HACMP_get_cluster_state.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM"
    : INPUTS: $*

    [[ $CLMGR_LOGGING == 'med' ]] && set +x # Only trace param values

    #===================================
    : Declare and initialize variables
    #===================================
    typeset -i rc=$RC_UNKNOWN errors=0
    typeset -i cmd_rc=0 data_rc=0 retry=0
    typeset LINE= NODE= OUTPUT= HOST= DATA= ERRMSG= FORCED_HOST=
    typeset -u STATE=UNKNOWN
    typeset -i NODE_COUNT=0 STATE_COUNT=0 OFFLINE=0 UNCONFIG=0 ERROR=0 WARNING=0
    typeset -i JOINING=0 NORMAL=0 UNKNOWN=0 UNMANAGED=0

    # Per-invocation scratch file holding one raw state per node.
    typeset STATE_FILE="$TMPDIR/clmgr.KHgcs.$$"

    # Message emitted by lssrc when the SRC cannot reach clstrmgrES.
    typeset SRC_ERR="The request could not be passed to the clstrmgrES subsystem."

    # Debug hook: a non-empty /tmp/clmgr_cluster_state overrides the real
    # state with the contents of its last line.
    # NOTE(review): this path returns whatever log_return_msg yields for the
    # initial rc (RC_UNKNOWN), not RC_SUCCESS -- confirm that is intended.
    if [[ -s /tmp/clmgr_cluster_state ]]; then
        print "$0()[$LINENO]($SECONDS): attempting to use state found in debug file, /tmp/clmgr_cluster_state." >>"$CLMGR_TMPLOG"
        STATE=$(tail -1 /tmp/clmgr_cluster_state)
        print "$0()[$LINENO]($SECONDS): using fake state \"$STATE\"" >>"$CLMGR_TMPLOG"
        if [[ $STATE != *([[:space:]]) ]]; then
            print -- "$STATE"
            log_return_msg "$rc" "$0()" "$LINENO"
            return $?
        else
            print "$0()[$LINENO]($SECONDS): WARNING: invalid state found in last line of /tmp/clmgr_cluster_state." >>"$CLMGR_TMPLOG"
        fi
    fi

    #
    : If this cluster has never been synchronized, then the state is known.
    : If HACMPadapter has not been populated, then no nodes have been added.
    : If nodes are present, but their IDs are zero, then the first sync has
    : not occurred. Either way, the state should be NOT_CONFIGURED.
    #
    print "$0()[$LINENO]($SECONDS): odmget HACMPadapter" >>"$CLMGR_TMPLOG"
    print "$0()[$LINENO]($SECONDS): clodmget -q \"object=COMMUNICATION_PATH\" -f node_id HACMPnode | sort -u" >>"$CLMGR_TMPLOG"
    if [[ -z $(odmget HACMPadapter) || \
          $(clodmget -q "object=COMMUNICATION_PATH" -f node_id HACMPnode | sort -u) == "0" ]]
    then
        print "NOT_CONFIGURED"
        log_return_msg "0" "$0()" "$LINENO"
        return $?
    fi

    print "$0()[$LINENO]($SECONDS): cllsnode -cS" >>"$CLMGR_TMPLOG" # Always log commands
    OUTPUT=$(cllsnode -cS)
    rc=$?
    print "$0()[$LINENO]($SECONDS): cllsnode RC: $rc" >>"$CLMGR_TMPLOG" # Always log command results

    if (( rc == RC_SUCCESS )); then
        #
        : See if we can communicate to the other nodes. If not, it
        : is not possible to determine the overall cluster status.
        #
        print "$0()[$LINENO]($SECONDS): lscluster -c" >>"$CLMGR_TMPLOG" # Always log commands
        if ! lscluster -c >>"$CLMGR_TMPLOG" 2>&1; then
            STATE="UNKNOWN"
            print "$0()[$LINENO]($SECONDS): CAA cluster services are not active on this node ($LOCAL_NODE). It is not possible to determine the cluster's state." >>"$CLMGR_TMPLOG"
            rc=$RC_ERROR
        fi
    fi

    if (( rc != RC_SUCCESS )); then
        rc=$RC_ERROR
    else
        # Count the nodes reported by cllsnode (one "name:..." record each).
        print -- "$OUTPUT" |\
        while read LINE; do
            NODE=${LINE%%:*}
            [[ -z $NODE ]] && continue
            (( NODE_COUNT++ ))
        done

        if (( NODE_COUNT )); then
            STATE=""
            HOST=""
            NODE=""
            ERRMSG=""

            if lscluster -c >>"$CLMGR_TMPLOG" 2>&1; then
                print "$0()[$LINENO]($SECONDS): LC_ALL=C clcmd lssrc -ls clstrmgrES" >>"$CLMGR_TMPLOG"
                DATA=$(LC_ALL=C clcmd lssrc -ls clstrmgrES 2>>"$CLMGR_TMPLOG")
                data_rc=$?
            else
                # Fallback: query each node individually and wrap every
                # answer in the same "NODE <name>" banner that clcmd emits,
                # so that a single parser below can handle both sources.
                print -- "$OUTPUT" |\
                while read LINE; do
                    NODE=${LINE%%:*}
                    [[ -z $NODE ]] && continue
                    DATA="$DATA
-------------------------------
NODE $NODE
-------------------------------
$($CLRSH "$NODE" LC_ALL=C lssrc -ls clstrmgrES 2>>"$CLMGR_TMPLOG")"
                    data_rc=$?
                done
            fi

            # In rare cases, "clcmd lssrc -ls clstrmgrES" gives error state due to timing issue
            # Below retry mechanism helps to avoid such scenario (at most 3 retries)
            while [[ $DATA == *"$SRC_ERR"* ]] && (( retry < 3 )); do
                sleep 2
                DATA=$(LC_ALL=C clcmd lssrc -ls clstrmgrES 2>>"$CLMGR_TMPLOG")
                data_rc=$?
                (( retry += 1 ))
            done

            print -- "$DATA\n" >>"$CLMGR_TMPLOG"
            # Log the status of the command that produced DATA, not of the
            # "print" that precedes this line.
            print "$0()[$LINENO]($SECONDS): clcmd RC: $data_rc" >>"$CLMGR_TMPLOG"

            # Never inherit per-node states from a stale file with our PID.
            rm -f "$STATE_FILE"

            # Walk the output one line at a time. A "NODE <name>" banner (or
            # the END_OF_DATA sentinel) closes the previous node's section:
            # its state is finalized and appended to the scratch file.
            print -- "$DATA\nEND_OF_DATA" |\
            while read LINE; do
                if [[ $LINE == NODE\ * || $LINE == "END_OF_DATA" ]]; then
                    if [[ -n $HOST && -z $STATE ]]; then
                        if [[ -n $ERRMSG ]]; then
                            if [[ $ERRMSG == *"$SRC_ERR"* ]]; then
                                print "$0()[$LINENO]($SECONDS): Something appears to be wrong with the clstrmgrES subsystem on $HOST, as tested with the \"lssrc -ls clstrmgrES\" command." >>"$CLMGR_TMPLOG"
                                STATE="ERROR"
                            else
                                print "$0()[$LINENO]($SECONDS): Unable to retrieve the current state of the clstrmgrES subsystem (lssrc -ls clstrmgrES) from $HOST. Possible communication/clcomd error." >>"$CLMGR_TMPLOG"
                                STATE="UNKNOWN"
                            fi
                        else
                            #=========================================================
                            : See if we can ping $HOST. If not, treat it as OFFLINE.
                            : Otherwise, since the host appears to be online, but we
                            : were not able to obtain results, we truly do not know
                            : the node state, so mark it as "UNKNOWN". This could
                            : happen, for example, if clcomd were offline or broken.
                            #=========================================================
                            print "$0()[$LINENO]($SECONDS): ping -c 1 -w 1 $HOST" >>"$CLMGR_TMPLOG"
                            ping -c 1 -w 1 "$HOST" >>"$CLMGR_TMPLOG" 2>&1
                            cmd_rc=$?
                            if (( cmd_rc != RC_SUCCESS )); then
                                print "$0()[$LINENO]($SECONDS): $HOST did not return a state, and is not responding to a ping. It appears to be offline." >>"$CLMGR_TMPLOG"
                                STATE="OFFLINE"
                            else
                                print "$0()[$LINENO]($SECONDS): Unable to retrieve the current state of the clstrmgrES subsystem (lssrc -ls clstrmgrES) from $HOST. Possible communication/clcomd error." >>"$CLMGR_TMPLOG"
                                STATE="UNKNOWN"
                            fi
                        fi
                    fi

                    # The very first banner has no preceding node, so this
                    # records an empty line; blank lines are skipped below.
                    print "$0()[$LINENO]($SECONDS): $HOST State: $STATE" >>"$CLMGR_TMPLOG"
                    print "$STATE" >>"$STATE_FILE"

                    STATE=""
                    ERRMSG=""
                    if [[ -n ${LINE#NODE } && $LINE != 'END_OF_DATA' ]]; then
                        print "$0()[$LINENO]($SECONDS): cl_nn2hn ${LINE#NODE }" >>"$CLMGR_TMPLOG"
                        HOST=$(VERBOSE_LOGGING="" cl_nn2hn ${LINE#NODE })
                        print "$0()[$LINENO]($SECONDS): cl_nn2hn RC: $?, NODE=\"${LINE#NODE }\", HOST=\"$HOST\"" >>"$CLMGR_TMPLOG"
                    fi
                elif [[ $LINE == +([0-9])-+([0-9])\ * || \
                        $LINE == clcmd* || \
                        -n $ERRMSG ]]
                then
                    # Error text: a message-catalog style "NNNN-NNN ..." line,
                    # a clcmd diagnostic, or the continuation of either.
                    if [[ $LINE != *([[:space:]]) && \
                          $LINE != '----------------'* ]]
                    then
                        ERRMSG="${ERRMSG:+$ERRMSG$NL}$LINE"
                    fi
                elif [[ $LINE == Current\ state:* ]]; then
                    STATE=${LINE#*: }
                elif [[ $LINE == Forced\ down\ node\ list:* ]]; then
                    # A node that lists itself as forced down is UNMANAGED.
                    for NODE in ${LINE#*: }; do
                        FORCED_HOST=$(VERBOSE_LOGGING="" cl_nn2hn $NODE)
                        if [[ $HOST == "$FORCED_HOST" ]]; then
                            STATE="UNMANAGED"
                            break
                        fi
                    done
                fi
            done

            #================================================
            : Collect and analyze the status from each node
            #================================================
            if (( rc == RC_SUCCESS )) && [[ -s $STATE_FILE ]]; then
                while read STATE; do
                    [[ $STATE == *([[:space:]]) ]] && continue
                    (( STATE_COUNT++ ))

                    # WARNING is only tracked as its own bucket for the SMUI.
                    if [[ $STATE == *@(INIT|OFFLINE)* ]]; then
                        (( OFFLINE++ ))
                    elif [[ $STATE == *NOT_CONFIGURED* ]]; then
                        (( UNCONFIG++ ))
                    elif [[ $STATE == *@(FAILED|ERROR)* ]]; then
                        (( ERROR++ ))
                    elif [[ $CLMGR_GUI == "SMUI" && $STATE == *WARNING* ]]; then
                        (( WARNING++ ))
                    elif [[ $STATE == *@(STABLE|JOINING|UNSTABLE|VOTING|RUNNING|BARRIER|DONE)* ]]; then
                        (( NORMAL++ ))
                    elif [[ $STATE == *UNMANAGED* ]]; then
                        (( UNMANAGED++ ))
                    else
                        (( UNKNOWN++ ))
                    fi
                done <"$STATE_FILE"
                rm -f "$STATE_FILE"

                print "$0()[$LINENO]($SECONDS): NODE COUNT: $NODE_COUNT" >>"$CLMGR_TMPLOG"
                print "$0()[$LINENO]($SECONDS): STATE RETRIEVALS: $STATE_COUNT" >>"$CLMGR_TMPLOG"

                # Every node without a retrieved state is both an error and
                # an UNKNOWN contribution.
                (( errors = NODE_COUNT - STATE_COUNT ))
                (( UNKNOWN += ( NODE_COUNT - STATE_COUNT ) ))

                #=============================================================
                : Aggregate the node states to derive an appropriate overall
                : cluster state. The goal here is to provide the most useful
                : indicator to the customer. Therefore, the order of the
                : conditionals below matters, and occur in the order that
                : is most likely to be of concern/interest to customers.
                #=============================================================
                if (( NODE_COUNT > 0 )); then
                    if (( ERROR )); then
                        STATE="ERROR"
                    elif (( UNKNOWN )); then
                        STATE="UNKNOWN"
                    elif (( UNCONFIG )); then
                        STATE="NOT_CONFIGURED"
                    elif (( WARNING )); then
                        # Only ever non-zero for the SMUI (see above).
                        STATE="WARNING"
                    elif (( OFFLINE == NODE_COUNT )); then
                        STATE="OFFLINE"
                    elif (( NORMAL == NODE_COUNT )); then
                        STATE="STABLE"
                    elif (( UNMANAGED == NODE_COUNT )); then
                        STATE="UNMANAGED"
                    elif [[ $CLMGR_GUI == "SMUI" ]] && (( NORMAL && NORMAL != NODE_COUNT )); then
                        # Some, but not all, nodes are active: degraded.
                        STATE="WARNING"
                    else
                        STATE="UNKNOWN"
                    fi

                    # All nodes offline: a node that still has node_id=0
                    # entries has never been synchronized.
                    if [[ $STATE == "OFFLINE" ]]; then
                        DATA=$(LC_ALL=C clcmd odmget -q node_id=0 HACMPnode 2>>"$CLMGR_TMPLOG" | grep "^HACMPnode:\$")
                        [[ $DATA != *([[:space:]]) ]] && STATE="NOT_CONFIGURED"
                    fi
                fi
            else
                STATE="UNKNOWN"
            fi
        else
            # No nodes were listed. If the listing itself succeeded, the
            # cluster simply has no nodes configured yet.
            if (( rc == RC_SUCCESS )); then
                STATE=NOT_CONFIGURED
            else
                STATE=UNKNOWN
            fi
        fi
    fi

    if (( rc != RC_ERROR )); then
        if (( errors == 0 )); then
            rc=$RC_SUCCESS
        else
            rc=$RC_ERROR
        fi
    fi

    if (( rc != RC_SUCCESS )); then
        STATE=UNKNOWN
    elif [[ $STATE != @(WARNING|ERROR|NOT_CONFIGURED|UNMANAGED) && \
            $CLMGR_GUI == "SMUI" ]]
    then
        print "$0()[$LINENO]($SECONDS): clcmd $HAUTILS/clodmget -f handle -n HACMPcluster | grep -q '^0\$'" >>"$CLMGR_TMPLOG"
        clcmd $HAUTILS/clodmget -f handle -n HACMPcluster 2>>"$CLMGR_TMPLOG" | grep -q '^0$'
        cmd_rc=$?
        print "$0()[$LINENO]($SECONDS): grep RC: $cmd_rc" >>"$CLMGR_TMPLOG"
        if (( cmd_rc == RC_SUCCESS )); then
            : At least on node has a cluster "handle" value of zero.
            : That indicates that unsynchronized changes have been made.
            STATE="WARNING"
        fi
    fi

    print "$0()[$LINENO]($SECONDS): Cluster State: $STATE" >>"$CLMGR_TMPLOG"
    print -- "$STATE"

    log_return_msg "$rc" "$0()" "$LINENO"
    return $?
} # End of "KLIB_HACMP_get_cluster_state()"


#============================================================================
#
#   Name:        devDoc
#
#   Description: This is a never-to-be-called, wrapper function that all the
#                clmgr FPATH functions implement in order to hide embedded
#                syntax from trace logging. This information is implemented
#                in POD format, and can be viewed in a number of ways using
#                POD tools. Some viewing suggestions for this function's POD-
#                formatted information are:
#
#                   perldoc
#                   pod2text -c
#                   pod2text -c --code
#                   pod2html
#
#                However, the more important use for this information is that
#                it is parsed by clmgr to display the syntax for this file's
#                operation. The information in the "SYNOPSIS" section is used
#                for this purpose. This feature was originally implemented
#                using the man page information. However, in a code review it
#                was pointed out that this approach had to be changed because
#                customers do not have to install the man pages! Therefore, a
#                built-in dependency on man page information would break the
#                automatic help feature of clmgr. So the SYNOPSIS section must
#                be used instead.
#
#                IMPORTANT: As a result of this, it is imperative that the
#                           information in this SYNOPSIS be kept in sync
#                           with the man page information, which is owned
#                           by the IDD team.
#
#   Inputs:      None.
#
#   Outputs:     None.
#
#   Returns:     n/a   (not intended to be invoked)
#
#============================================================================
function devDoc {
# The POD below is fed to the ":" no-op as a here-document. The quoted
# delimiter ('=cut') disables all expansion of the text, and because the
# text is here-document data rather than commands, it does not show up as
# executed statements in trace output. The delimiter doubles as the POD
# terminator, so it must stay at the start of its line.
: <<'=cut' >/dev/null 2>&1

=head1 NAME

KLIB_HACMP_get_cluster_state

=head1 SYNOPSIS

clmgr -cSa STATE query cluster

=head1 DESCRIPTION

Queries every node in the cluster for its current state (retrieved by
calling "lssrc -ls clstrmgrES") and aggregating the results. The
currently known statuses are:

    ST_INIT
    ST_JOINING
    ST_STABLE
    ST_UNSTABLE
    ST_VOTING
    ST_RP_RUNNING
    ST_BARRIER
    ST_RP_FAILED
    ST_CBARRIER
    ST_DONE
    NOT_CONFIGURED
    UNMANAGED

These raw states are aggregated into seven possible statuses:

    NOT_CONFIGURED
    OFFLINE
    ERROR
    STABLE
    WARNING
    UNMANAGED
    UNKNOWN

NOT_CONFIGURED indicates that the node has not yet been configured into
a cluster. If *any* node in the cluster has this state, then the entire
cluster is considered to be "NOT_CONFIGURED".

ST_INIT indicates cluster services are inactive, but AIX is running. If
all the nodes have this state, the cluster state is set to "OFFLINE".

ST_JOINING, ST_UNSTABLE, ST_VOTING, ST_RP_RUNNING, ST_BARRIER,
ST_CBARRIER, and ST_DONE all indicate that cluster services are active,
and something is happening. If *any* of the nodes in the cluster possess
this state, then the cluster state is set to "STABLE".

ST_RP_FAILED indicates a PowerHA error has occurred on that node (other
nodes in the cluster may be in *BARRIER in this case). If *any* node in
the cluster possesses this state, the cluster state is set to "ERROR".

ST_STABLE indicates that cluster services are operating normally. If all
nodes in the cluster have this state, the cluster state is set to
"STABLE".

WARNING indicates a degraded condition within the cluster, or the
detection of a potential problem. For example, if one or more nodes
report as "OFFLINE" while the rest are "STABLE", that is a degraded
condition that will result in a "WARNING" state. This is also true when
something causes the clstrmgr process to exit, but leaves the host
active. Also, if the entire node appears to be down (does not respond to
a ping), that is also considered an offline state, and will result in a
"WARNING" status if any other nodes are online.

If *any* node in the cluster reports an unrecognized state, or cannot be
contacted, the state of the cluster is set to "UNKNOWN". For example, if
clcomd is down, but the host is up (responds to a ping), that results in
an "UNKNOWN" status.

=head1 ARGUMENTS

None.

=head1 RETURN

    0: no errors were detected; the operation appears to have been successful
    1: a general error has occurred
    2: a specified resource does not exist, or could not be found
    3: some required input was missing
    4: some detected input was incorrect in some way
    5: a required dependency does not exist
    6: a specified search failed to match any data

=cut
} # End of "devDoc()"

#==============================================================================
# The following, comment block attempts to enforce coding standards when this
# file is edited via emacs or vim. This block _must_ appear at the very end
# of the file, or the editor will not find it, and it will be ignored.
#==============================================================================
# Local Variables:
# indent-tabs-mode: nil
# tab-width: 4
# End:
#==============================================================================
# vim: tabstop=4 shiftwidth=4 expandtab
#==============================================================================