#!/bin/ksh93
#  ALTRAN_PROLOG_BEGIN_TAG
#  This is an automatically generated prolog.
#
#  Copyright (C) Altran ACT S.A.S. 2017,2018,2019,2021  All rights reserved.
#
#  ALTRAN_PROLOG_END_TAG
#
# IBM_PROLOG_BEGIN_TAG 
# This is an automatically generated prolog. 
#  
# 61haes_r721 src/43haes/lib/ksh93/hacmp/KLIB_HACMP_get_cluster_state.sh 1.21 
#  
# Licensed Materials - Property of IBM 
#  
# Restricted Materials of IBM 
#  
# COPYRIGHT International Business Machines Corp. 2005,2016 
# All Rights Reserved 
#  
# US Government Users Restricted Rights - Use, duplication or 
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp. 
#  
# IBM_PROLOG_END_TAG 
# @(#)  7d4c34b 43haes/lib/ksh93/hacmp/KLIB_HACMP_get_cluster_state.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM

#===============================================================================
#
# Name:        KLIB_HACMP_get_cluster_state
#
# Description: This is the main, FPATH function that is invoked by clmgr
#              to retrieve the cluster manager state from each node, and
#              aggregate them into an overall cluster state.
#
# Inputs:      See the "devDoc()" function, below.
#
# Outputs:     The properties hash is populated. The only other outputs are
#              any error messages that might be needed.
#
# Returns:     Zero if no errors are detected. Otherwise, an appropriate
#              non-zero value is returned. Refer to the "RETURN" section
#              of the "devDoc()" function, below, for the standard return
#              code values/meanings for clmgr.
#
#============================================================================
function KLIB_HACMP_get_cluster_state {
    . $HALIBROOT/log_entry "$0()" "$CL"
    : version="@(#)  7d4c34b 43haes/lib/ksh93/hacmp/KLIB_HACMP_get_cluster_state.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM"
    : INPUTS: $*

    [[ $CLMGR_LOGGING == 'med' ]] && set +x  # Only trace param values

    #---------------------------------------------------------------------
    # Overview
    #   1. Honor the debug override file /tmp/clmgr_cluster_state, if any.
    #   2. Report NOT_CONFIGURED when HACMPadapter is empty, or when every
    #      COMMUNICATION_PATH node_id in HACMPnode is zero.
    #   3. List the nodes (cllsnode -cS) and verify CAA (lscluster -c).
    #   4. Collect "lssrc -ls clstrmgrES" from every node, and derive one
    #      state per node; each state is appended to a scratch file.
    #   5. Count the per-node states, and aggregate them into one cluster
    #      state, which is printed on STDOUT.
    #
    # Return: the status of log_return_msg, which is handed $rc.
    #---------------------------------------------------------------------

    #===================================
    : Declare and initialize variables
    #===================================
    typeset -i rc=$RC_UNKNOWN errors=0 cmd_rc=0 retry=0
    typeset LINE= NODE= OUTPUT= HOST= DATA= ERRMSG= FD_HOST=
    typeset STATE_FILE="$TMPDIR/clmgr.KHgcs.$$"   # One node state per line
    typeset SUBSYS_ERR="The request could not be passed to the clstrmgrES subsystem."
    typeset -u STATE=UNKNOWN
    typeset -i NODE_COUNT=0 STATE_COUNT=0 OFFLINE=0 UNCONFIG=0 ERROR=0 WARNING=0
    typeset -i JOINING=0 NORMAL=0 UNKNOWN=0 UNMANAGED=0

    if [[ -s /tmp/clmgr_cluster_state ]]; then
        print "$0()[$LINENO]($SECONDS): attempting to use state found in debug file, /tmp/clmgr_cluster_state." >>$CLMGR_TMPLOG
        STATE=$(tail -1 /tmp/clmgr_cluster_state)
        print "$0()[$LINENO]($SECONDS): using fake state \"$STATE\"" >>$CLMGR_TMPLOG
        if [[ $STATE != *([[:space:]]) ]]; then
            print -- "$STATE"
            # NOTE(review): rc still holds $RC_UNKNOWN at this point, so the
            # debug path does not report success -- confirm this is intended.
            log_return_msg "$rc" "$0()" "$LINENO"
            return $?
        else
            print "$0()[$LINENO]($SECONDS): WARNING: invalid state found in last line of /tmp/clmgr_cluster_state." >>$CLMGR_TMPLOG
        fi
    fi

    #
    : If this cluster has never been synchronized, then the state is known.
    : If HACMPadapter has not been populated, then no nodes have been added.
    : If nodes are present, but their IDs are zero, then the first sync has
    : not occurred. Either way, the state should be NOT_CONFIGURED.
    #
    print "$0()[$LINENO]($SECONDS): odmget HACMPadapter" >>$CLMGR_TMPLOG
    print "$0()[$LINENO]($SECONDS): clodmget -q \"object=COMMUNICATION_PATH\" -f node_id HACMPnode | sort -u" >>$CLMGR_TMPLOG
    if [[ -z $(odmget HACMPadapter) || \
          $(clodmget -q "object=COMMUNICATION_PATH" -f node_id HACMPnode | sort -u) == "0" ]]
    then
        print "NOT_CONFIGURED"
        log_return_msg "0" "$0()" "$LINENO"
        return $?
    fi

    print "$0()[$LINENO]($SECONDS): cllsnode -cS" >>$CLMGR_TMPLOG  # Always log commands
    OUTPUT=$(cllsnode -cS)
    rc=$?
    print "$0()[$LINENO]($SECONDS): cllsnode RC: $rc" >>$CLMGR_TMPLOG  # Always log command results

    if (( $rc == $RC_SUCCESS ))
    then
        #
        : See if we can communicate to the other nodes. If not, it
        : is not possible to determine the overall cluster status.
        #
        print "$0()[$LINENO]($SECONDS): lscluster -c" >>$CLMGR_TMPLOG  # Always log commands
        if ! lscluster -c >>$CLMGR_TMPLOG 2>&1
        then
            STATE="UNKNOWN"
            print "$0()[$LINENO]($SECONDS): CAA cluster services are not active on this node ($LOCAL_NODE). It is not possible to determine the cluster's state." >>$CLMGR_TMPLOG
            rc=$RC_ERROR
        fi
    fi

    if (( $rc != $RC_SUCCESS )); then
        rc=$RC_ERROR

    else
        # Count the nodes: one "name:..." record per line of cllsnode output.
        print -- "$OUTPUT" |\
        while read LINE; do
            NODE=${LINE%%:*}
            [[ -z $NODE ]] && continue
            (( NODE_COUNT++))
        done

        if (( NODE_COUNT )); then
            STATE="" HOST="" NODE="" ERRMSG=""

            if lscluster -c >>$CLMGR_TMPLOG 2>&1
            then
                print "$0()[$LINENO]($SECONDS): LC_ALL=C clcmd lssrc -ls clstrmgrES" >>$CLMGR_TMPLOG
                DATA=$(LC_ALL=C clcmd lssrc -ls clstrmgrES 2>>$CLMGR_TMPLOG)
                cmd_rc=$?
            else
                # Fallback: query each node individually, and wrap every
                # result in the same "NODE <name>" banner that clcmd emits,
                # so that the parser below can treat both sources alike.
                print -- "$OUTPUT" |\
                while read LINE; do
                    NODE=${LINE%%:*}
                    [[ -z $NODE ]] && continue   # Same filter as the node count
                    DATA="$DATA

-------------------------------
NODE $NODE
-------------------------------
$($CLRSH $NODE LC_ALL=C lssrc -ls clstrmgrES 2>>$CLMGR_TMPLOG)"
                    cmd_rc=$?
                done
            fi

            # In rare cases, "clcmd lssrc -ls clstrmgrES" gives error state due to timing issue
            # Below retry mechanism helps to avoid such scenario (at most 3 retries)
            while (( retry < 3 )) && [[ $DATA == *"$SUBSYS_ERR"* ]]; do
                sleep 2
                DATA=$(LC_ALL=C clcmd lssrc -ls clstrmgrES 2>>$CLMGR_TMPLOG)
                cmd_rc=$?
                (( retry += 1 ))
            done

            print -- "$DATA\n" >>$CLMGR_TMPLOG
            print "$0()[$LINENO]($SECONDS): clcmd RC: $cmd_rc" >>$CLMGR_TMPLOG

            # Discard any stale scratch file left behind by an earlier
            # process that happened to have the same PID.
            rm -f "$STATE_FILE"

            # Parse the collected output. A "NODE <name>" banner (or the
            # END_OF_DATA sentinel) closes the record of the previous node:
            # its state is finalized and appended to the scratch file.
            print -- "$DATA\nEND_OF_DATA" |\
            while read LINE; do

                if [[ $LINE == NODE\ * || $LINE == "END_OF_DATA" ]]
                then
                    if [[ -n $HOST && -z $STATE ]]; then
                        if [[ -n $ERRMSG ]]; then
                            if [[ $ERRMSG == *"$SUBSYS_ERR"* ]]
                            then
                                print "$0()[$LINENO]($SECONDS): Something appears to be wrong with the clstrmgrES subsystem on $HOST, as tested with the \"lssrc -ls clstrmgrES\" command." >>$CLMGR_TMPLOG
                                STATE="ERROR"
                            else
                                print "$0()[$LINENO]($SECONDS): Unable to retrieve the current state of the clstrmgrES subsystem (lssrc -ls clstrmgrES) from $HOST. Possible communication/clcomd error." >>$CLMGR_TMPLOG
                                STATE="UNKNOWN"
                            fi
                        else
                            #=========================================================
                            : See if we can ping $HOST. If not, treat it as OFFLINE.
                            : Otherwise, since the host appears to be online, but we
                            : were not able to obtain results, we truly do not know
                            : the node state, so mark it as "UNKNOWN". This could
                            : happen, for example, if clcomd were offline or broken.
                            #=========================================================
                            print "$0()[$LINENO]($SECONDS): ping -c 1 -w 1 $HOST" >>$CLMGR_TMPLOG
                            ping -c 1 -w 1 "$HOST" >>$CLMGR_TMPLOG 2>&1
                            cmd_rc=$?
                            if (( cmd_rc != RC_SUCCESS )); then
                                print "$0()[$LINENO]($SECONDS): $HOST did not return a state, and is not responding to a ping. It appears to be offline." >>$CLMGR_TMPLOG
                                STATE="OFFLINE"
                            else
                                print "$0()[$LINENO]($SECONDS): Unable to retrieve the current state of the clstrmgrES subsystem (lssrc -ls clstrmgrES) from $HOST. Possible communication/clcomd error." >>$CLMGR_TMPLOG
                                STATE="UNKNOWN"
                            fi
                        fi
                    fi
                    print "$0()[$LINENO]($SECONDS): $HOST State: $STATE" >>$CLMGR_TMPLOG
                    # The very first banner writes an empty line here (no
                    # previous node yet); blank lines are skipped when counting.
                    print -- "$STATE" >>"$STATE_FILE"

                    STATE=""
                    ERRMSG=""

                    if [[ -n  ${LINE#NODE } && $LINE != 'END_OF_DATA' ]]
                    then
                        print "$0()[$LINENO]($SECONDS): cl_nn2hn ${LINE#NODE }" >>$CLMGR_TMPLOG
                        HOST=$(VERBOSE_LOGGING="" cl_nn2hn "${LINE#NODE }")
                        print "$0()[$LINENO]($SECONDS): cl_nn2hn RC: $?, NODE=\"${LINE#NODE }\", HOST=\"$HOST\"" >>$CLMGR_TMPLOG
                    fi

                elif [[ $LINE == +([0-9])-+([0-9])\ * ||\
                        $LINE == clcmd*               ||\
                        -n $ERRMSG ]]
                then
                    # Error text: once an error line has been seen, every
                    # following non-blank, non-separator line of this node
                    # is accumulated into ERRMSG.
                    if [[ $LINE != *([[:space:]]) && \
                          $LINE != '----------------'* ]]
                    then
                        ERRMSG="${ERRMSG:+$ERRMSG$NL}$LINE"
                    fi

                elif [[ $LINE == Current\ state:* ]]
                then
                    STATE=${LINE#*: }

                elif [[ $LINE == Forced\ down\ node\ list:* ]]
                then
                    # Intentionally unquoted: the list is split into words.
                    for NODE in ${LINE#*: }; do
                        FD_HOST=$(VERBOSE_LOGGING="" cl_nn2hn $NODE)
                        if [[ $HOST == "$FD_HOST" ]]; then
                            STATE="UNMANAGED"
                            break
                        fi
                    done
                fi
            done

            #================================================
            : Collect and analyze the status from each node
            #================================================
            if (( $rc == RC_SUCCESS )) && [[ -s $STATE_FILE ]]; then
                while read STATE; do
                    [[ $STATE == *([[:space:]]) ]] && continue

                    (( STATE_COUNT++ ))

                    # A per-node WARNING is only recognized for the SMUI;
                    # otherwise such a state is counted as UNKNOWN.
                    if [[ $STATE == *@(INIT|OFFLINE)* ]]; then
                        (( OFFLINE++ ))
                    elif [[ $STATE == *NOT_CONFIGURED* ]]; then
                        (( UNCONFIG++ ))
                    elif [[ $STATE == *@(FAILED|ERROR)* ]]; then
                        (( ERROR++ ))
                    elif [[ $CLMGR_GUI == "SMUI" && $STATE == *WARNING* ]]; then
                        (( WARNING++ ))
                    elif [[ $STATE == *@(STABLE|JOINING|UNSTABLE|VOTING|RUNNING|BARRIER|DONE)* ]]; then
                        (( NORMAL++ ))
                    elif [[ $STATE == *UNMANAGED* ]]; then
                        (( UNMANAGED++ ))
                    else
                        (( UNKNOWN++ ))
                    fi
                done <"$STATE_FILE"

                print "$0()[$LINENO]($SECONDS): NODE COUNT:       $NODE_COUNT" >>$CLMGR_TMPLOG
                print "$0()[$LINENO]($SECONDS): STATE RETRIEVALS: $STATE_COUNT" >>$CLMGR_TMPLOG
                # Any node without a retrieved state is an error, and UNKNOWN.
                (( errors = NODE_COUNT - STATE_COUNT ))
                (( UNKNOWN += ( NODE_COUNT - STATE_COUNT ) ))

                #=============================================================
                : Aggregate the node states to derive an appropriate overall
                : cluster state. The goal here is to provide the most useful
                : indicator to the customer. Therefore, the order of the
                : conditionals below matters, and they occur in the order
                : that is most likely to be of concern/interest to customers.
                #=============================================================
                if (( NODE_COUNT > 0 )); then
                    if (( ERROR )); then
                        STATE="ERROR"
                    elif (( UNKNOWN )); then
                        STATE="UNKNOWN"
                    elif (( UNCONFIG )); then
                        STATE="NOT_CONFIGURED"
                    elif (( WARNING )); then
                        # WARNING is only ever incremented for the SMUI
                        STATE="WARNING"
                    elif (( OFFLINE == NODE_COUNT )); then
                        STATE="OFFLINE"
                    elif (( NORMAL == NODE_COUNT )); then
                        STATE="STABLE"
                    elif (( UNMANAGED == NODE_COUNT )); then
                        STATE="UNMANAGED"
                    elif [[ $CLMGR_GUI == "SMUI" ]] && (( NORMAL && NORMAL != NODE_COUNT )); then
                        # Some, but not all, nodes are active: degraded
                        STATE="WARNING"
                    else
                        STATE="UNKNOWN"
                    fi

                    if [[ $STATE == "OFFLINE" ]]; then
                        # If any node still has a HACMPnode entry with a
                        # node_id of zero, report NOT_CONFIGURED instead.
                        DATA=$(LC_ALL=C clcmd odmget -q node_id=0 HACMPnode 2>>$CLMGR_TMPLOG | grep "^HACMPnode:\$")
                        [[ $DATA != *([[:space:]]) ]] && STATE="NOT_CONFIGURED"
                    fi
                fi
            else
                STATE="UNKNOWN"
            fi
            rm -f "$STATE_FILE"

        else
            # cllsnode did not report any nodes.
            if (( rc == RC_SUCCESS )); then
                STATE=NOT_CONFIGURED
            else
                STATE=UNKNOWN
            fi
        fi
    fi

    if (( $rc != $RC_ERROR )); then
       (( errors == 0 )) && rc=$RC_SUCCESS || rc=$RC_ERROR
    fi

    if (( $rc != $RC_SUCCESS )); then
        STATE=UNKNOWN

    elif [[ $STATE != @(WARNING|ERROR|NOT_CONFIGURED|UNMANAGED) ]] && \
         [[ $CLMGR_GUI == "SMUI" ]]
    then
        print "$0()[$LINENO]($SECONDS): clcmd $HAUTILS/clodmget -f handle -n HACMPcluster | grep -q '^0\$'" >>$CLMGR_TMPLOG
        clcmd $HAUTILS/clodmget -f handle -n HACMPcluster 2>>$CLMGR_TMPLOG | grep -q '^0$'
        cmd_rc=$?
        print "$0()[$LINENO]($SECONDS): grep RC: $cmd_rc" >>$CLMGR_TMPLOG

        if (( cmd_rc == RC_SUCCESS )); then
            : At least one node has a cluster "handle" value of zero.
            : That indicates that unsynchronized changes have been made.
            STATE="WARNING"
        fi
    fi

    print "$0()[$LINENO]($SECONDS): Cluster State: $STATE" >>$CLMGR_TMPLOG
    print -- "$STATE"

    log_return_msg "$rc" "$0()" "$LINENO"
    return $?
} # End of "KLIB_HACMP_get_cluster_state()"


#============================================================================
#
# Name:        devDoc
#
# Description: This is a never-to-be-called, wrapper function that all the
#              clmgr FPATH functions implement in order to hide embedded
#              syntax from trace logging. This information is implemented
#              in POD format, and can be viewed in a number of ways using
#              POD tools. Some viewing suggestions for this function's POD-
#              formatted information are:
#
#                  perldoc <FILENAME>
#                  pod2text -c <FILENAME>
#                  pod2text -c --code <FILENAME>
#                  pod2html <FILENAME>
#
#              However, the more important use for this information is that
#              it is parsed by clmgr to display the syntax for this file's
#              operation. The information in the "SYNOPSIS" section is used
#              for this purpose. This feature was originally implemented
#              using the man page information. However, in a code review it
#              was pointed out that this approach had to be changed because
#              customers do not have to install the man pages! Therefore, a
#              built-in dependency on man page information would break the
#              automatic help feature of clmgr. So the SYNOPSIS section must
#              be used instead.
#
#              IMPORTANT: As a result of this, it is imperative that the
#                         information in this SYNOPSIS be kept in sync
#                         with the man page information, which is owned
#                         by the IDD team.
#
# Inputs:      None.
#
# Outputs:     None.
#
# Returns:     n/a (not intended to be invoked)
#
#============================================================================
function devDoc {
    # The POD text below is a here-document fed to the ":" no-op builtin,
    # with all output discarded, so invoking this function has no effect.
    : <<'=cut' >/dev/null 2>&1

=head1 NAME

 KLIB_HACMP_get_cluster_state
 
=head1 SYNOPSIS

 clmgr -cSa STATE query cluster

=head1 DESCRIPTION

 Queries every node in the cluster for its current state (retrieved
 by calling "lssrc -ls clstrmgrES") and aggregates the results. The
 currently known statuses are:

         ST_INIT
         ST_JOINING
         ST_STABLE
         ST_UNSTABLE
         ST_VOTING
         ST_RP_RUNNING
         ST_BARRIER
         ST_RP_FAILED
         ST_CBARRIER
         ST_DONE
         NOT_CONFIGURED
         UNMANAGED

 These raw states are aggregated into seven possible statuses:

         NOT_CONFIGURED
         OFFLINE
         ERROR
         STABLE
         WARNING
         UNMANAGED
         UNKNOWN
 
 NOT_CONFIGURED indicates that the node has not yet been configured into a
 cluster. If *any* node in the cluster has this state, then the entire
 cluster is considered to be "NOT_CONFIGURED".

 ST_INIT indicates cluster services are inactive, but AIX is running.
 If all the nodes have this state, the cluster state is set to "OFFLINE".

 ST_JOINING, ST_UNSTABLE, ST_VOTING, ST_RP_RUNNING, ST_BARRIER, ST_CBARRIER,
 and ST_DONE all indicate that cluster services are active, and something is
 happening. Nodes in any of these states are counted as active, exactly like
 ST_STABLE nodes. If *all* of the nodes in the cluster are active, then the
 cluster state is set to "STABLE".

 ST_RP_FAILED indicates a PowerHA error has occurred on that node (other
 nodes in the cluster may be in *BARRIER in this case). If *any* node in
 the cluster possesses this state, the cluster state is set to "ERROR".

 ST_STABLE indicates that cluster services are operating normally. If all
 nodes in the cluster have this state, the cluster state is set to "STABLE".

 WARNING indicates a degraded condition within the cluster, or the detection
 of a potential problem. For example, if one or more nodes report as "OFFLINE"
 while the rest are "STABLE", that is a degraded condition that will result
 in a "WARNING" state. This is also true when something causes the clstrmgr
 process to exit, but leaves the host active. Also, if the entire node appears
 to be down (does not respond to a ping), that is also considered an offline
 state, and will result in a "WARNING" status if any other nodes are online.
 The "WARNING" status is only produced when the CLMGR_GUI variable is set to
 "SMUI"; otherwise, such mixed conditions are reported as "UNKNOWN".

 If *any* node in the cluster reports an unrecognized state, or cannot be
 contacted, the state of the cluster is set to "UNKNOWN". For example, if
 clcomd is down, but the host is up (responds to a ping), that results
 in an "UNKNOWN" status.

=head1 ARGUMENTS

None.

=head1 RETURN

 0: no errors were detected; the operation appears to have been successful
 1: a general error has occurred
 2: a specified resource does not exist, or could not be found
 3: some required input was missing
 4: some detected input was incorrect in some way
 5: a required dependency does not exist
 6: a specified search failed to match any data

=cut
} # End of "devDoc()"


#==============================================================================
# The following, comment block attempts to enforce coding standards when this
# file is edited via emacs or vim. This block _must_ appear at the very end
# of the file, or the editor will not find it, and it will be ignored.
#==============================================================================
# Local Variables:
# indent-tabs-mode: nil
# tab-width: 4
# End:
#==============================================================================
# vim: tabstop=4 shiftwidth=4 expandtab
#==============================================================================