#!/bin/ksh93
# ALTRAN_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# Copyright (C) Altran ACT S.A.S. 2017,2021.  All rights reserved.
#
# ALTRAN_PROLOG_END_TAG
#
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# 61haes_r714 src/43haes/usr/sbin/cluster/events/utils/cl_fence_vg.sh 1.22
#
# Licensed Materials - Property of IBM
#
# COPYRIGHT International Business Machines Corp. 2007,2011
# All Rights Reserved
#
# US Government Users Restricted Rights - Use, duplication or
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
# IBM_PROLOG_END_TAG
# @(#) 7d4c34b 43haes/usr/sbin/cluster/events/utils/cl_fence_vg.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM
###############################################################################
#
# COMPONENT_NAME: hacmp.events
#
# Name:
#       cl_fence_vg.sh
#
#
# Description:
#
#       Perform any fencing operations associated with CRITICAL volume
#       groups, in response to a node_down event.  A CRITICAL volume group is
#       one that has been so configured.  The implication of "CRITICAL" is
#       that in the event of a potential cluster partition, some drastic
#       action must be taken by a node that does not retain access to a quorum
#       of disks in the volume group.
#
#       The expected use of CRITICAL volume groups is in "concurrent" resource
#       groups, to hold information critical to coordinating application servers
#       across the cluster.
#
#
# Function:
#
#       Find those CRITICAL volume groups accessed by both the local
#       node and the node on which the event occurred.  If a given volume
#       group is marked as CRITICAL, check to see if this node can access
#       a quorum of disks.  If it cannot, it is fenced away from that
#       volume group.
#
#       The actual fencing operation is performed by LVM.  This forces off the
#       volume group, in the same fashion as would happen if the volume group
#       lost quorum.
#
#       Additionally, CAA fencing is used to make the disks inaccessible.
#
#       Further processing is then based on the configured monitor action, if
#       any, for this volume group, as saved in HACMPmonitor.  Possible
#       choices are: do nothing, invoke a notification method, take the resource
#       group offline, shut down cluster services on this node, or halt the
#       node.
#
#
# Input:
#
#       EVENTNODE is the HACMP node name of the failed node
#
#       LOCALNODENAME is the HACMP node name of the current node
#
#
# Environment:
#
#       Use is made of HACMPadapter, HACMPresource, HACMPmonitor, HACMPgroup,
#       CuDeP
#
#
# Return Values:
#       0       success
#       1       failure
#
# Questions?  Comments?  Expressions of Astonishment?  mailto:hafeedbk@us.ibm.com
#
###############################################################################

###############################################################################
#
#
# Function:     vg_fence_action
#
#       Take the configured action from the HACMPmonitor class for a volume
#       group
#
#
# Input:
#
#       Volume group name
#
#
# Output:
#
#       None
#
#
###############################################################################
function vg_fence_action
{
    PS4_FUNC="vg_fence_action"
    [[ $VERBOSE_LOGGING == high ]] && set -x

    typeset VG
    VG=$1
    integer rc=0
    typeset RG=$(clodmget -q"value = $VG" -f group -n HACMPresource)

    #
    : At this point, we have looked at all the disks in
    : this volume group and do not retain access to a sufficient number
    : of them.  Fence out access to the volume group.
    #
    cl_forceoff_vg $VG
    rc=$?
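
    # cl_forceoff_vg is expected to force the volume group offline via LVM,
    # much as LVM itself would close the volume group on loss of quorum.  The
    # value of rc at this point reflects only the outcome of that force-off
    # attempt.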
    if (( 0 == $rc ))
    then
        cl_echo 10410 "Fenced volume group $VG away from node $LOCALNODENAME" $VG $LOCALNODENAME
    else
        cl_log 10510 "$PROGNAME: Volume group $VG could not be fenced away from node $LOCALNODENAME" $PROGNAME $VG $LOCALNODENAME
    fi

    #
    : Do a CAA fence, too.  Effectively mark the disks as failed.
    #
    cl_set_vg_fence_height $VG "ff"
    rc=$?
    if (( 0 != $rc ))
    then
        cl_log 10511 "$PROGNAME: Volume group $VG fence height could not be set to FENCE_FAIL" $PROGNAME $VG FENCE_FAIL
    fi

    #
    : Now, find the action configured for loss of this volume group
    #
    action=""
    monitor=$(clodmget -q "name = RESOURCE_TO_MONITOR and type = CRITICAL_VG and value = $VG" -f monitor -n HACMPmonitor)
    if [[ -z $monitor ]]        # migration - allow for earlier formulation
    then
        monitor=$(clodmget -q "name = RESOURCE_TO_MONITOR and value = $VG" -f monitor -n HACMPmonitor)
    fi

    if [[ -n $monitor ]]
    then
        #
        : Monitor $monitor is configured for this volume group.  Check
        : to see what action it calls for
        #
        action=$(clodmget -q "monitor = $monitor and name = FAILURE_ACTION and type = CRITICAL_VG" -f value -n HACMPmonitor)
        if [[ -z $action ]]     # migration - allow for earlier formulation
        then
            action=$(clodmget -q "monitor like '*$VG' and name = FAILURE_ACTION" -f value -n HACMPmonitor)
        fi

        #
        : If there is a notification method, always run that, even if
        : other actions are specified.  This allows the specification of
        : both a notification mechanism, and a specific action
        #
        notify_method=$(clodmget -q "monitor = $monitor and type = CRITICAL_VG and name = NOTIFY_METHOD" -f value -n HACMPmonitor)
        if [[ -z $notify_method ]]      # migration - allow for earlier formulation
        then
            notify_method=$(clodmget -q "monitor like '*$VG' and name = NOTIFY_METHOD" -f value -n HACMPmonitor)
        fi

        if [[ -n $notify_method ]]
        then
            cl_echo 10411 "Calling notify method $notify_method for loss of quorum on volume group $VG in resource group $RG" $notify_method $VG $RG

            #
            : Call the notification method, $notify_method, backgrounded to
            : avoid dealing with hangs, errors, etc.
            #
            $notify_method &
        fi
    fi

    #
    : Perform the action $action
    #
    if [[ -z $action ]]
    then
        #
        : No action has been set.  Halt the node as the default action
        #
        cl_echo 10414 "Halting $LOCALNODENAME due to loss of quorum on volume group $VG" $LOCALNODENAME $VG
        clexit.rc clstrmgrES

    elif [[ $action == "fence" || $action == "notify" ]]
    then
        #
        : This has already been done above, so there is nothing more to do
        #

    elif [[ $action == "fallover" ]]
    then
        #
        : The expected response to loss of quorum on $VG is to take
        : resource group $RG offline
        #
        cl_echo 10412 "Taking resource group $RG offline due to loss of quorum on volume group $VG" $RG $VG

        #
        : Generate a request to the cluster manager to take the resource
        : group offline on this node.
        # Note: the simplistic coding below does not handle
        # child/parent or location dependencies
        #
        GROUP_ID=$(clodmget -q "group = $RG" -f id -n HACMPgroup)
        USER_RG_OFFLINE="3"
        TARGET_HANDLE=$(clhandle | cut -f1 -d' ')
        SET_POL="1"
        PERSISTENT="0"
        SITE_OVERRIDE="0"
        clRMupdate user_rg_move $GROUP_ID $USER_RG_OFFLINE $TARGET_HANDLE $SET_POL $PERSISTENT $SITE_OVERRIDE

    elif [[ $action == "shutdown" ]]
    then
        #
        : The expected response to loss of quorum on $VG is to take down
        : HACMP on this node
        #
        cl_echo 10413 "Stopping PowerHA SystemMirror on $LOCALNODENAME due to loss of quorum on volume group $VG" $LOCALNODENAME $VG
        clstop -g -s -N

    elif [[ $action == "halt" ]]
    then
        #
        : The expected response to loss of quorum on $VG is to halt the
        : node.
        : Use clexit.rc to log and drive that action
        #
        cl_echo 10414 "Halting $LOCALNODENAME due to loss of quorum on volume group $VG" $LOCALNODENAME $VG
        clexit.rc clstrmgrES
    fi
}

###############################################################################
#
#
# Function:     vg_fence_check
#
#       Check to see if a quorum of disks is accessible in a given volume group
#
#
# Input:
#
#       Volume group name
#
# Output:
#
#       return code 0 - quorum accessible
#       return code 1 - quorum not accessible
#
#
###############################################################################
function vg_fence_check
{
    PS4_FUNC="vg_fence_check"
    [[ $VERBOSE_LOGGING == high ]] && set -x

    typeset VG=$1
    integer quorum=0
    integer total_vg_disks=0
    integer present=0
    typeset RG=$(clodmget -q"value = $VG" -f group -n HACMPresource)

    #
    : Find all the disks in volume group $VG - space separated list
    #
    vg_disks=$(print "$lspv_out" | grep -w $VG | cut -f1 -d' ' | paste -s -d' ' - )
    total_vg_disks=$(print $vg_disks | wc -w)

    #
    : Compute the quorum for $VG - this is "one more than half" of the
    : disks in the volume group
    #
    quorum=$(( (total_vg_disks+2)/2 ))

    #
    : Check each disk in the volume group to see if it
    : is still accessible
    #
    present=0
    for vg_disk in $vg_disks
    do
        #
        : Check to see if disk $vg_disk is still
        : accessible - can we read the PVID of the disk
        #
        if cl_querypv -q $vg_disk
        then
            if (( ++present >= quorum ))
            then
                #
                : We are able to access a quorum - $quorum - of the
                : disks in volume group $VG.  Return success for this
                : volume group
                #
                return 0
            fi
        else
            #
            : Disk $vg_disk is inaccessible.  One
            : more disk that counts against achieving quorum.
            #
            cl_echo 10512 "$PROGNAME: Unable to access disk $vg_disk in volume group $VG in resource group $RG shared with node $EVENTNODE" $PROGNAME $vg_disk $VG $RG $EVENTNODE
        fi
    done

    #
    : If we get here, it means that a quorum of accessible
    : disks has not been found.  Return an error indication
    #
    return 1
}

################################################################################
#
: Main procedure in cl_fence_vg
#
################################################################################

typeset PROGNAME=${0##*/}
export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"
if [[ $VERBOSE_LOGGING == "high" ]]; then
    set -x
    version='%I%'
fi

#
: Collect list of disks, for use later
#
lspv_out=$(lspv)

if [[ -z $LOCALNODENAME ]]
then
    #
    : '$LOCALNODENAME' should have been set by caller
    #
    LOCALNODENAME=$(/usr/es/sbin/cluster/utilities/get_local_nodename)
fi

#
: Accept a formal parameter of 'name of node that failed' if none was set
: in the environment
#
EVENTNODE=${EVENTNODE:-$1}
if [[ -z $EVENTNODE ]]
then
    #
    : No indication of the node that failed, so cannot proceed
    #
    exit 1
fi

#
: An explicit volume group list can be passed after the name of
: the node that failed.  Pick up any such
#
shift
vg_list="$*"                    # volume group list or null
common_groups=""                # resource groups common between this node and the one that failed
common_critical_vgs=""          # CRITICAL volume groups common between this node and the one that failed

if [[ -z $vg_list ]]
then
    #
    : Find all the concurrent resource groups that contain both $EVENTNODE and $LOCALNODENAME
    #
    for group in $(clodmget -q "startup_pref = OAAN" -f group -n HACMPgroup)
    do
        #
        : Check to see if $group is in FORCEDOWN_GROUPS.
        : If it is, then the associated VGs should be varied off
        : and we will not be able to process them.
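        # FORCEDOWN_GROUPS is assumed to be supplied in the event environment
        # as a space separated list of resource groups already forced down on
        # this node; the grep -w membership test below relies on that form.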
        #
        if [[ -n $FORCEDOWN_GROUPS && -n "$(print $FORCEDOWN_GROUPS | grep -w $group)" ]]
        then
            #
            : Skipping $group
            #
            continue
        fi

        #
        : If the resource group is OFFLINE on the local node, skip it, since it
        : might have been brought offline deliberately, either by a clstop or by
        : a user initiated resource group move to OFFLINE.
        #
        if LC_ALL=C clRGinfo -s $group | grep -w $LOCALNODENAME | grep -w "OFFLINE"
        then
            #
            : Skipping $group for being offline on the local node
            #
            continue
        fi

        #
        : Extract the node list
        #
        rg_node_list=$(clodmget -q "group = $group" -f nodes -n HACMPgroup)

        #
        : If the rg node list contains both $LOCALNODENAME and
        : $EVENTNODE, add it to the list of groups to process
        #
        if print $rg_node_list | grep -w $LOCALNODENAME | grep -qw $EVENTNODE
        then
            common_groups="$common_groups $group"
        fi
    done

    #
    : Look at each of the resource groups in turn to determine what CRITICAL
    : volume groups the local node $LOCALNODENAME shares access with $EVENTNODE
    #
    for RG in $common_groups
    do
        #
        : What CRITICAL volume groups does the local node $LOCALNODENAME share access
        : with $EVENTNODE in resource group $RG
        #
        common_critical_vgs="$common_critical_vgs $(clodmget -q "group = $RG and name = CRITICAL_VG" -f value -n HACMPresource)"
    done

else
    #
    : Given a list of volume groups.  Just check these
    #
    common_critical_vgs=$vg_list
fi

#
: Process the list of common CRITICAL volume groups, $common_critical_vgs
#
for VG in $common_critical_vgs
do
    #
    : Go check if this volume group retains access to a quorum of disks
    #
    if ! vg_fence_check $VG
    then
        #
        : Quorum has been lost.  Take the configured action
        #
        vg_fence_action $VG
    fi
done
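
#
# Usage sketch, for illustration only - the node and volume group names below
# are hypothetical.  This script is normally driven by the node_down event,
# with EVENTNODE and LOCALNODENAME already set in the environment.  It can
# also be invoked directly, passing the name of the failed node followed by
# an optional list of CRITICAL volume groups to check, for example:
#
#   cl_fence_vg.sh nodeB crit_vg1 crit_vg2
#
# With no volume group list, the CRITICAL volume groups are discovered from
# the concurrent resource groups shared with the failed node, as above.
#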