#!/bin/ksh93
#  ALTRAN_PROLOG_BEGIN_TAG                                                    
#  This is an automatically generated prolog.                                  
#                                                                              
#  Copyright (C) Altran ACT S.A.S. 2017,2019,2021.  All rights reserved.  
#                                                                              
#  ALTRAN_PROLOG_END_TAG                                                      
#                                                                              
# @(#)  0772e43 43haes/lib/ksh93/hacmp/KLIB_HACMP_online_cluster.sh, 726, 2147A_aha726, Jul 29 2021 09:46 PM

#============================================================================
#
# Name:        KLIB_HACMP_online_cluster
#
# Description: This is the main, FPATH function that is invoked by clmgr to
#              bring cluster services online on all nodes in the cluster.
#              Each node is first checked to see if it is inactive. Only
#              inactive nodes are sent a start command. If all nodes are
#              already all online, then the desired end state already
#              exists, and a success code is returned. Otherwise, the
#              appropriate start command is issued for the inactive nodes,
#              and the results are returned. It is worth mentioning that
#              this command employs a user-configurable timeout to avoid
#              hangs.
#
#              If the CAA cluster services are also being brought online,
#              that is done *before* the SystemMirror cluster services are
#              brought online. One limitation to be aware of is that it is
#              only possible for this to be done if the local node currently
#              has active CAA services.
#              Workaround: run clmgr on a node with active CAA services
#
#              The "START_CAA" option is really intended for use only
#              during specific, exceptional conditions. Normally, it is
#              neither recommended nor needed to directly manipulate the
#              CAA state.
#
# Inputs:      See the "devDoc()" function, below.
#
# Outputs:     The only outputs are any messages, error or otherwise,
#              that might be needed.
#
# Returns:     Zero if no errors are detected. Otherwise, an appropriate
#              non-zero value is returned. Refer to the "RETURN" section
#              of the "devDoc()" function, below, for the standard return
#              code values/meanings for clmgr.
#
#============================================================================
function KLIB_HACMP_online_cluster {
    #
    # Starts cluster services on every node that is not already running
    # them, optionally starting CAA cluster services first.
    #
    # Positional inputs (surrounding double quotes are stripped):
    #   $1 when       NOW | RESTART | BOTH            (default: now)
    #   $2 manage     AUTOMATIC | MANUAL | DELAYED    (default: auto)
    #   $3 broadcast  Boolean-like                    (default: broadcast)
    #   $4 clinfo     Boolean-like, or "consistent"
    #   $5 force      Boolean-like
    #   $6 fix        Boolean-like, or "interactively"
    #   $7 timeout    Seconds to wait for the cluster to become STABLE.
    #                 Values below 180 (including "not specified") are
    #                 replaced by 300 seconds, plus 90 per node beyond two.
    #   $8 start_caa  Boolean-like, or "only"
    #
    # Globals written: rc (deliberately global; it is read by the EXIT trap
    # handler, on_exit_online_cluster).
    #
    # Returns: whatever log_return_msg returns for the final "$rc".
    #
    trap 'on_exit_online_cluster' EXIT
    . $HALIBROOT/log_entry "$0()" "$CL"
    : version="@(#)  0772e43 43haes/lib/ksh93/hacmp/KLIB_HACMP_online_cluster.sh, 726, 2147A_aha726, Jul 29 2021 09:46 PM"
    : INPUTS: $*

    typeset when=${1//\"/}
    typeset manage=${2//\"/}
    typeset broadcast=${3//\"/}
    typeset clinfo=${4//\"/}
    typeset force=${5//\"/}
    typeset fix=${6//\"/}
    integer timeout=${7//\"/}   # An integer is never empty: unspecified == 0
    typeset start_caa=${8//\"/}

    [[ $CLMGR_LOGGING == 'med' ]] && set +x  # Only trace param values

    #===================================
    : Declare and initialize variables
    #===================================
    integer used=0 interval=5 DELAYED_START=0
    integer nc=0 wait_time=1    # Local, so earlier calls cannot skew them
    rc=$RC_UNKNOWN  # Declare globally, so it is visible in the exit function
    typeset state= start_clinfo=
    typeset nodes= node=        # Local, so a stale global list is never used

    #================================================================
    : Check for a defined cluster. No need to continue without one.
    #================================================================
    CL=$LINENO isClusterDefined
    if (( $? != RC_SUCCESS )); then
        dspmsg -s $CLMGR_SET $CLMGR_MSGS 35 "\nERROR: no cluster is defined yet.\n\n" 1>&2
        rc=$RC_MISSING_DEPENDENCY

    else
        print "$0()[$LINENO]($SECONDS): clnodename" >>$CLMGR_TMPLOG  # Always log commands
        nodes=$(clnodename)
        print "clnodename RC: $?; nodes == \"$nodes\"" >>$CLMGR_TMPLOG  # Always log command result
    fi

    #=================
    : Validate input
    #=================
    if [[ -z $nodes ]]; then
        dspmsg -s $CLMGR_SET $CLMGR_MSGS 130 "\nERROR: no nodes could be found within this cluster.\n\n" 1>&2
        rc=$RC_MISSING_DEPENDENCY
    else
        if [[ -n $start_caa ]]; then
            CL=$LINENO verify_in_set START_CAA "$start_caa" "only yes true enable 1 no false disable 0" start_caa
            if (( $? == RC_SUCCESS )); then
                # Collapse the validated value to one of: "only", "yes", ""
                if [[ $start_caa == o* ]]; then
                    start_caa="only"
                elif [[ $start_caa == @(y|t|e|1)* ]]; then
                    start_caa="yes"
                else
                    start_caa=""
                fi
            else
                rc=$RC_INCORRECT_INPUT
            fi
        fi

        #
        : Do not validate PowerHA start-up inputs if only CAA is being started
        #
        if [[ $start_caa != "only" ]]; then
            if [[ -z $when ]]; then
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 114 '\nWarning: "%1$s" must be specified. Since it was not,\n       a default of "%2$s" will be used.\n\n' WHEN now
                when="-N"
            else
                CL=$LINENO verify_in_set WHEN "$when" "NOW RESTART BOTH" when
                if (( $? == RC_SUCCESS )); then
                    # The first letter of the value is the flag: -N, -R, -B
                    when="-${when:0:1}"
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

            if [[ -z $manage ]]; then
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 114 '\nWarning: "%1$s" must be specified. Since it was not,\n       a default of "%2$s" will be used.\n\n' MANAGE auto
                manage="-A"
            else
                CL=$LINENO verify_in_set MANAGE "$manage" "AUTOMATIC MANUAL DELAYED" manage
                if (( $? == RC_SUCCESS )); then
                    case $manage in
                        M*) manage="-M" ;;
                        # Delayed == start manually, then online all RGs below
                        D*) manage="-M"
                            DELAYED_START=1
                        ;;
                         *) manage="-A" ;;
                    esac
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

            if [[ -n $broadcast ]]; then
                CL=$LINENO verify_in_set BROADCAST "$broadcast" "yes true enable 1 no false disable 0" broadcast
                if (( $? == RC_SUCCESS )); then
                    [[ $broadcast == @(n|f|d|0)* ]] && broadcast="" || broadcast="-b"
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            else
                broadcast="-b"  # Broadcast by default
            fi

            if [[ -n $clinfo ]]; then
                CL=$LINENO verify_in_set CLINFO "$clinfo" "yes true enable 1 no false disable 0 consistent" clinfo
                if (( $? == RC_SUCCESS )); then
                    case $clinfo in
                        @(t|y|1|e)*) start_clinfo="-i" ;;
                        c* )start_clinfo="-I" ;;
                    esac
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

            if [[ -n $force ]]; then
                CL=$LINENO verify_in_set FORCE "$force" "yes true enable 1 no false disable 0" force
                if (( $? == RC_SUCCESS )); then
                    [[ $force == @(y|t|e|1)* ]] && force="-v" || force=""
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

            if [[ -n $fix ]]; then
                CL=$LINENO verify_in_set FIX "$fix" "yes true enable 1 no false disable 0 interactively" fix
                if (( $? == RC_SUCCESS )); then
                    case $fix in
                        # NOTE(review): when CLMGR_FORCE is set, "fix" keeps
                        # the raw keyword and is later passed verbatim to
                        # cl_rc.cluster. Confirm whether "" or "-C yes" is
                        # the intended value in that case.
                        i*) (( ! CLMGR_FORCE )) && fix="-C interactive" ;;
                        @(y|t|e|1)*)               fix="-C yes"         ;;
                        *)                         fix=""               ;;
                    esac
                else
                    rc=$RC_INCORRECT_INPUT
                fi
            fi

        # "timeout" is an integer, so it must be tested numerically; testing
        # it as a string would make this warning unconditional.
        elif [[ -n "$when$manage$broadcast$clinfo$force$fix" ]] || (( timeout > 0 )); then
            cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 1491 'Warning: when the "%1$s" input is used with a value of "%2$s", all other option(s) are invalid and will be disregarded.\n' "START_CAA" "$start_caa"
        fi

        #===============================================================
        : Dynamically set the timeout based upon the number of nodes.
        : Of course, this is only done if the customer did not already
        : specify a timeout, or specified a timeout that is too short.
        #===============================================================
        if (( timeout < 180 )); then
            # Allow 5 minutes for a 2-node cluster; more than enough
            timeout=300

            # Add 90 seconds for each extra node beyond 2
            for node in ${nodes//,/ }; do
                (( ++nc > 2 )) && (( timeout += 90 ))
            done
        fi
    fi

    #==========================================================================
    : First, check to see if the specified nodes already have cluster services
    : running. If so, then this command is not needed for those nodes, and will
    : result in an erroneous exit code of 1.
    #==========================================================================
    typeset new_nodelist= state=
    for node in ${nodes//,/ }; do
        state=$(CL=$LINENO KLIB_HACMP_get_node_state $node 2>>$CLMGR_TMPLOG)
        # If clstrmgrES subsystem is inoperative, clstart will start it, so no further action is required.
        if [[ $state != *@(INIT|NOT_CONFIGURED|UNMANAGED|WARNING|INOPERATIVE) ]] && [[ $when != "-R" || $state == "UNKNOWN" ]]; then
            if [[ $state == "UNKNOWN" ]]; then
                typeset COMMPATH=$(clodmget -q "name=$node AND object=COMMUNICATION_PATH" -n -f value HACMPnode)
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 194 '\nERROR: unable to communicate with "%1$s" (%2$s). Verify that the node is powered up and active, and that clcomd is properly configured and running on it. If the problem persists, also check the local clcomd, and verify that the network is functioning normally.\n\n' "$node" "$COMMPATH" 1>&2
                rc=$RC_ERROR
            else
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 501 'Warning: cluster services are already online on node\n         "%1$s" (state is "%2$s").\n         Removing that node from the startup list.\n' "$node" "$state"
            fi
        else
            new_nodelist="$new_nodelist,$node"
        fi
    done
    nodes=${new_nodelist#,}

    if [[ -n $nodes ]] && (( $rc == RC_UNKNOWN )); then
        if [[ -n $start_caa ]]; then
            #==================================================
            : The customer has requested that a start request
            : be sent to the CAA cluster on all nodes.
            #==================================================
            typeset CNAME=$(LC_ALL=C lscluster -c 2>>$CLMGR_TMPLOG | grep "^Cluster Name: ")
                    CNAME=${CNAME#*: }
            # lscluster yields nothing when CAA is down; fall back to the ODM
            [[ -z $CNAME ]] && CNAME=$(clodmget -n -f name HACMPcluster)
            if [[ -n $CNAME ]]; then
                date > $HAETC/clmgr.$$
                print "$0()[$LINENO]($SECONDS): clctrl -start -n $CNAME -a" >>$CLMGR_TMPLOG
                clctrl -start -n $CNAME -a
                print "$0()[$LINENO]($SECONDS): clctrl RC: $?" >>$CLMGR_TMPLOG
                rm -f $HAETC/clmgr.$$
            else
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 191 "\nERROR: unable to determine the name of the cluster. Knowing the name of the cluster is needed in order to stop or start Cluster Aware AIX cluster services. Verify that this cluster is fully configured and has been successfully synchronized before attempting this operation again.\n\n" 1>&2
                # The requested CAA start could not even be attempted
                rc=$RC_ERROR
            fi
        fi
    fi

    if [[ -n $nodes ]] && (( $rc == RC_UNKNOWN )); then
        #======================================================
        : Make sure CAA is active before starting the cluster
        #======================================================
        for node in $(clnodename); do
            print "$0()[$LINENO]($SECONDS): $CLRSH $node /bin/true" >>$CLMGR_TMPLOG
            $CLRSH $node /bin/true
            if (( $? == RC_SUCCESS )); then
                : Looks like clcomd is working
                print "$0()[$LINENO]($SECONDS): $CLRSH $node lscluster -c" >>$CLMGR_TMPLOG
                $CLRSH $node lscluster -c >>$CLMGR_TMPLOG 2>&1
                if (( $? != 0 )); then
                    # Run it again, so the customer sees the failure output
                    $CLRSH $node lscluster -c 1>&2
                    cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 535 '\nERROR: the Cluster Aware AIX cluster services appear to be offline on \"%1$s\".\nIt is possible that CAA was manually stopped using STOP_CAA option.\nIn that case, you could use any one of the below command to  start CAA and  PowerHA services.\n\"clmgr online cluster START_CAA=true\"  OR \n \"clmgr online node <nodename> START_CAA=true\"  \n\n' "$node" 1>&2
                    rc=$RC_ERROR
                fi
            else
                typeset COMMPATH=$(clodmget -q "name=$node AND object=COMMUNICATION_PATH" -n -f value HACMPnode)
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 194 '\nERROR: unable to communicate with "%1$s" (%2$s). Verify that the node is powered up and active, and that clcomd is properly configured and running on it. If the problem persists, also check the local clcomd, and verify that the network is functioning normally.\n\n' "$node" "$COMMPATH" 1>&2
                rc=$RC_ERROR
            fi
        done
    fi

    if [[ $start_caa == "only" ]]; then
        : The "only" option was specified with the "START_CAA" input, so only
        : CAA is to be started, not PowerHA cluster services. Nulling the
        : "nodes" variable disables the following node start code segment.
        nodes=""

        # Nothing below will resolve "rc" in this mode. If no problem was
        # recorded above, report success; otherwise the EXIT handler would
        # convert the still-unknown status into an error.
        if (( $rc == RC_UNKNOWN )); then
            rc=$RC_SUCCESS
        fi
    fi

    if [[ -n $nodes ]] && (( $rc == RC_UNKNOWN )); then
        set +e
        print "$0()[$LINENO]($SECONDS): _SPOC_FORCE=Y fix_args nop cl_rc.cluster $when -cspoc \"-n $nodes\" $manage $broadcast $start_clinfo $force $fix" >>$CLMGR_TMPLOG  # Always log commands
        VERBOSE_LOGGING=""
        # The option variables are intentionally unquoted: empty ones must
        # vanish, and "-C yes" must split into two words.
        _SPOC_FORCE=Y fix_args \
            nop cl_rc.cluster \
            $when \
            -cspoc "-n ${nodes//+([[:space:]])/,}" \
            $manage \
            $broadcast \
            $start_clinfo \
            $force \
            $fix
        rc=$?
        print "$0()[$LINENO]($SECONDS): cl_rc.cluster RC: $rc" >>$CLMGR_TMPLOG  # Always log command result
        set -e

        #============================================================
        : The command executed above is asynchronous. So in order
        : for the correct status to be returned, it is necessary
        : to loop here, and poll the cluster manager. Of course,
        : a visual indication is provided for the customer, so
        : they can see that something is happening. And eventually,
        : we will give up, and report a problem message.
        #============================================================
        if (( $rc == RC_SUCCESS )); then
            rc=$RC_ERROR  # Assume the worst, until proven otherwise
            for (( used=0; used < timeout; used += interval )); do
                CL=$LINENO KLIB_HACMP_get_cluster_state | read state
                if [[ $state == "STABLE" ]]; then
                    (( used > 0 )) && print
                    rc=$RC_SUCCESS

                    if (( DELAYED_START )); then
                        #
                        : The cl_rc.cluster command logs normal, non-error info
                        : to STDOUT for some reason. Since STDERR is displayed
                        : last by clmgr, that will cause cluster startup info
                        : to be displayed _after_ the RG startup info, which
                        : may cause customer confusion. To defeat this, the
                        : error log is transferred to the normal output log
                        : before launching the RG startup.
                        #
                        if [[ -s $CLMGR_ERRLOG ]]; then
                            cat $CLMGR_ERRLOG
                            >$CLMGR_ERRLOG
                        fi
                    fi

                    dspmsg -s $CLMGR_SET $CLMGR_MSGS 516 "\nThe cluster is now online.\n\n"
                    break

                elif [[ $state == "ERROR" ]]; then  # No recovering from this
                    (( used > 0 )) && print
                    cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 503 "\nERROR: cluster services have experienced a problem which has left\n       the cluster in an error state. Manual intervention will be\n       required to recover from this problem.\n\n" 1>&2
                    break
                else
                    print -n "."
                    sleep $interval
                fi
            done

            # Both "break" paths leave used < timeout, so reaching the limit
            # means the polling window ran out without a final state.
            if (( used >= timeout )); then
                print
                cl_dspmsg -s $CLMGR_SET $CLMGR_MSGS 517 "\nWarning: unable to determine if the cluster has come fully online\n         (the process is still not complete). Wait a few minutes, then\n         manually check the cluster's state using \"clmgr query cluster\".\n\n" "$CLMGR_PROGNAME"
            fi

            if (( DELAYED_START )); then
                if (( rc == RC_SUCCESS )); then
                    #
                    : Wait for cluster to become stable.Errors will be seen if we try to bring RGs
                    : before cluster gets STABLE
                    #
                    for (( wait_time = 1; wait_time < 120; wait_time++ )); do
                        CL=$LINENO KLIB_HACMP_get_cluster_state | read state
                        [[ $state == "STABLE" ]] && break
                        sleep 1
                    done
                    print "$0()[$LINENO]($SECONDS): Since delayed management of resource groups was requested, all resource groups will now be brought online." >>$CLMGR_TMPLOG
                    print "$0()[$LINENO]($SECONDS): KLIB_HACMP_online_resourcegroup all" >>$CLMGR_TMPLOG
                    CL=$LINENO KLIB_HACMP_online_resourcegroup all
                    rc=$?
                    print "$0()[$LINENO]($SECONDS): KLIB_HACMP_online_resourcegroup RC: $rc" >>$CLMGR_TMPLOG
                else
                    print "$0()[$LINENO]($SECONDS): Since the cluster startup may have had a problem, the requested delayed management of resource groups will not be done." >>$CLMGR_TMPLOG
                fi
            fi
        else
            rc=$RC_ERROR
        fi

    elif [[ -z $nodes && $start_caa != "only" ]] && (( $rc == RC_UNKNOWN )); then
        : All nodes are up, so we have the desired cluster end state
        dspmsg -s $CLMGR_SET $CLMGR_MSGS 530 '\nCluster "%1$s" is already online.\n' "$(CL=$LINENO KLIB_HACMP_get_cluster_name)"
        rc=$RC_SUCCESS
    fi

    log_return_msg "$rc" "$0()" "$LINENO"
    return $?
} # End of "KLIB_HACMP_online_cluster()"


#============================================================================
#
# Name:        on_exit_online_cluster
#
# Description: This function is invoked whenever this script exits. It
#              checks the current "$rc" value, and if it is still set
#              to unknown, exits with an error code.
#
# Inputs:      None, explicitly. However, the "$rc" variable is read.
#
# Outputs:     None.
#
# Returns:     Nothing. The program exits.
#
#============================================================================
function on_exit_online_cluster {
    # EXIT-trap handler: a resolved "$rc" means there is nothing to do here.
    (( $rc == RC_UNKNOWN )) || return 0

    # The status was never resolved: emit a blank line, then terminate
    # the program with the general error code.
    print
    exit $RC_ERROR
}


#============================================================================
#
# Name:        KLIB_HACMP_online_node
#
# Description: This is the main, FPATH function that is invoked by clmgr
#              to bring cluster services online on one or more specified
#              nodes (defaulting to the local node). Each node is first
#              checked to see if it is inactive. Only inactive nodes are
#              sent a start command. If the nodes are already all online,
#              then the desired end state already exists, and a success
#              code is returned. Otherwise, the appropriate start command
#              is issued for the inactive nodes, and the results are
#              returned. It is worth mentioning that this command employs
#              a user-configurable timeout to avoid hangs.
#
#              If the CAA cluster services are also being brought online,
#              that is done *before* the SystemMirror cluster services are
#              brought online. One limitation to be aware of is that it
#              is only possible for one node to start CAA on another node
#              if the local node currently has active CAA services. Put
#              another way, if CAA is not active on the *local* node, you
#              will not be able to manage CAA on any remote nodes from
#              this node until CAA services are restored.
#              Workaround 1: use clrsh to run clmgr on the remote nodes.
#              Workaround 2: log in to the remote nodes and run clmgr.
#              Workaround 3: when working with a subset of the cluster
#                            nodes, always manipulate them all at once.
#              Workaround 4: always manage the local node last.
#
#              The "START_CAA" option is really intended for use only
#              during specific, exceptional conditions. Normally, it is
#              neither recommended nor needed to directly manipulate the
#              CAA state.
#
# Inputs:      See the "devDoc()" function, below.
#
# Outputs:     The only outputs are any messages, error or otherwise,
#              that might be needed.
#
# Returns:     Zero if no errors are detected. Otherwise, an appropriate
#              non-zero value is returned. Refer to the "RETURN" section
#              of the "devDoc()" function, below, for the standard return
#              code values/meanings for clmgr.
#
#============================================================================
function devDoc {
    # Developer documentation in POD format. The here-document is fed to
    # the ":" no-op with all output discarded, so calling this function
    # does nothing; the text exists to be read in the source.
    : <<'=cut' >/dev/null 2>&1

=head1 NAME

 KLIB_HACMP_online_cluster

=head1 SYNOPSIS

 clmgr online cluster \
             [ WHEN={now|restart|both} ] \
             [ MANAGE={auto|manual|delayed} ] \
             [ BROADCAST={false|true} ] \
             [ CLINFO={false|true|consistent} ] \
             [ FORCE={false|true} ] \
             [ FIX={no|yes|interactively} ] \
             [ TIMEOUT=<seconds_to_wait_for_completion> ] \
             [ START_CAA={no|yes|only} ]

 NOTE: the "TIMEOUT" attribute defaults to 300 seconds, plus 90 seconds
       for each node beyond the second. A value below 180 seconds is
       replaced by that default.
 NOTE: the alias for "cluster" is "cl".

=head1 DESCRIPTION

Attempts to bring all the member nodes of the cluster online.

=head1 ARGUMENTS

 1. when [OPTIONAL] [string]
    An indicator of when the action should take place.

    Valid values include: {now|restart|both}

 2. manage [OPTIONAL] [string]
    An indicator of the manner in which any resource groups managed
    within the cluster should be handled. Automatic management
    honors the policies established in the cluster configuration.
    Manual management ignores all established policies, and leaves
    the resource groups offline after cluster services are started.
    Delayed management also ignores all established policies until
    cluster services are started, but then starts all resource groups
    after that.

    Valid values include: {auto|manual|delayed}

 3. broadcast [OPTIONAL] [boolean]
    A Boolean-like indicator of whether or not to broadcast a message
    announcing the startup on each affected node.

 4. clinfo [OPTIONAL] [string]
    An indicator of whether to start up clinfo, and how.

    Valid values include: {false|true|consistent}

 5. force [OPTIONAL] [boolean]
    A Boolean-like indicator for specifying whether or not the startup
    should continue even if verification fails.

 6. fix [OPTIONAL] [string]
    An indicator of whether or not to attempt to automatically correct
    certain errors that might be found during verification, and how.

    Valid values include: {no|yes|interactively}

 7. timeout [OPTIONAL] [integer]
    The amount of time to allow the startup to continue before giving
    up and reporting a failure. Of course, just because clmgr gives up
    does not mean that PowerHA does, and the startup may occur anyway,
    simply taking longer than the timeout to complete!

    The default timeout is 300 seconds, plus 90 additional seconds for
    each node beyond the second node (i.e. that would mean 180 added
    seconds in a 4 node cluster).

 8. start_caa [OPTIONAL] [set]
    An indicator for specifying whether or not to explicitly send
    a start request to the CAA cluster running on all the nodes in
    the cluster. Accepted values are: true, false, only.

=head1 RETURN

 0: no errors were detected; the operation appears to have been successful
 1: a general error has occurred
 2: a specified resource does not exist, or could not be found
 3: some required input was missing
 4: some detected input was incorrect in some way
 5: a required dependency does not exist
 6: a specified search failed to match any data

=cut
} # End of "devDoc()"


#==============================================================================
# The following, comment block attempts to enforce coding standards when this
# file is edited via emacs or vim. This block _must_ appear at the very end
# of the file, or the editor will not find it, and it will be ignored.
#==============================================================================
# Local Variables:
# indent-tabs-mode: nil
# tab-width: 4
# End:
#==============================================================================
# vim: tabstop=4 shiftwidth=4 expandtab
#==============================================================================