#!/bin/ksh93
# ALTRAN_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# Copyright (C) Altran ACT S.A.S. 2017,2021.  All rights reserved.
#
# ALTRAN_PROLOG_END_TAG
#
# IBM_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# 61haes_r714 src/43haes/usr/sbin/cluster/events/utils/cl_fence_vg.sh 1.22
#
# Licensed Materials - Property of IBM
#
# COPYRIGHT International Business Machines Corp. 2007,2011
# All Rights Reserved
#
# US Government Users Restricted Rights - Use, duplication or
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
# IBM_PROLOG_END_TAG
# @(#) 7d4c34b 43haes/usr/sbin/cluster/events/utils/cl_fence_vg.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM
###############################################################################
#
# COMPONENT_NAME: hacmp.events
#
# Name:
#       cl_fence_vg.sh
#
#
# Description:
#
#       Perform any fencing operations associated with CRITICAL volume
#       groups, in response to a node_down event.  A CRITICAL volume group is
#       one that has been so configured.  The implication of "CRITICAL" is
#       that in the event of a potential cluster partition, some drastic
#       action must be taken by a node that does not retain access to a quorum
#       of disks in the volume group.
#
#       The expected use of CRITICAL volume groups is in "concurrent" resource
#       groups, to hold information critical to coordinating application servers
#       across the cluster.
#
#
# Function:
#
#       Find those CRITICAL volume groups accessed by both the local
#       node and the node on which the event occurred.  If a given volume
#       group is marked as CRITICAL, check to see if this node can access
#       a quorum of disks.  If it cannot, it is fenced away from that
#       volume group.
#
#       The actual fencing operation is performed by LVM.  This forces off the
#       volume group, in the same fashion as would happen if the volume group
#       lost quorum.
#
#       Additionally, CAA fencing is used to make the disks inaccessible.
#
#       Further processing is then based on the configured monitor action, if
#       any, for this volume group, as saved in HACMPmonitor.  Possible
#       choices are: do nothing, invoke a notification method, take the resource
#       group offline, shut down cluster services on this node, or halt the
#       node.
#
#
# Input:
#
#       EVENTNODE is the HACMP node name of the failed node
#
#       LOCALNODENAME is the HACMP node name of the current node
#
#
# Environment:
#
#       Use is made of HACMPadapter, HACMPresource, HACMPmonitor, HACMPgroup,
#       CuDeP
#
#
# Return Values:
#       0       success
#       1       failure
#
# Questions?  Comments?  Expressions of Astonishment?  mailto:hafeedbk@us.ibm.com
#
###############################################################################

###############################################################################
#
#
# Function:     vg_fence_action
#
#       Take the configured action from the HACMPmonitor class for a volume
#       group
#
#
# Input:
#
#       Volume group name
#
#
# Output:
#
#       None
#
#
###############################################################################
function vg_fence_action
{
    PS4_FUNC="vg_fence_action"
    [[ $VERBOSE_LOGGING == high ]] && set -x

    typeset VG
    VG=$1
    integer rc=0
    typeset RG=$(clodmget -q"value = $VG" -f group -n HACMPresource)

    #
    : At this point, we have looked at all the disks in
    : this volume group and do not retain access to a sufficient number
    : of them.  Fence out access to the volume group.
    #
    cl_forceoff_vg $VG
    rc=$?
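
    # cl_forceoff_vg is expected to force the volume group offline via LVM,
    # much as LVM itself would close the volume group on loss of quorum.  The
    # value of rc at this point reflects only the outcome of that force-off
    # attempt.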
    if (( 0 == $rc ))
    then
        cl_echo 10410 "Fenced volume group $VG away from node $LOCALNODENAME" $VG $LOCALNODENAME
    else
        cl_log 10510 "$PROGNAME: Volume group $VG could not be fenced away from node $LOCALNODENAME" $PROGNAME $VG $LOCALNODENAME
    fi

    #
    : Do a CAA fence, too.  Effectively mark the disks as failed.
    #
    cl_set_vg_fence_height $VG "ff"
    rc=$?
    if (( 0 != $rc ))
    then
        cl_log 10511 "$PROGNAME: Volume group $VG fence height could not be set to FENCE_FAIL" $PROGNAME $VG FENCE_FAIL
    fi

    #
    : Now, find the action configured for loss of this volume group
    #
    action=""
    monitor=$(clodmget -q "name = RESOURCE_TO_MONITOR and type = CRITICAL_VG and value = $VG" -f monitor -n HACMPmonitor)
    if [[ -z $monitor ]]        # migration - allow for earlier formulation
    then
        monitor=$(clodmget -q "name = RESOURCE_TO_MONITOR and value = $VG" -f monitor -n HACMPmonitor)
    fi

    if [[ -n $monitor ]]
    then
        #
        : Monitor $monitor is configured for this volume group.  Check
        : to see what action it calls for
        #
        action=$(clodmget -q "monitor = $monitor and name = FAILURE_ACTION and type = CRITICAL_VG" -f value -n HACMPmonitor)
        if [[ -z $action ]]     # migration - allow for earlier formulation
        then
            action=$(clodmget -q "monitor like '*$VG' and name = FAILURE_ACTION" -f value -n HACMPmonitor)
        fi

        #
        : If there is a notification method, always run that, even if
        : other actions are specified.  This allows the specification of
        : both a notification mechanism, and a specific action
        #
        notify_method=$(clodmget -q "monitor = $monitor and type = CRITICAL_VG and name = NOTIFY_METHOD" -f value -n HACMPmonitor)
        if [[ -z $notify_method ]]      # migration - allow for earlier formulation
        then
            notify_method=$(clodmget -q "monitor like '*$VG' and name = NOTIFY_METHOD" -f value -n HACMPmonitor)
        fi

        if [[ -n $notify_method ]]
        then
            cl_echo 10411 "Calling notify method $notify_method for loss of quorum on volume group $VG in resource group $RG" $notify_method $VG $RG

            #
            : Call the notification method, $notify_method, backgrounded to
            : avoid dealing with hangs, errors, etc.
            #
            $notify_method &
        fi
    fi

    #
    : Perform the action $action
    #
    if [[ -z $action ]]
    then
        #
        : No action has been set.  Halt the node as the default action
        #
        cl_echo 10414 "Halting $LOCALNODENAME due to loss of quorum on volume group $VG" $LOCALNODENAME $VG
        clexit.rc clstrmgrES

    elif [[ $action == "fence" || $action == "notify" ]]
    then
        #
        : This has already been done above, so there is nothing more to do
        #

    elif [[ $action == "fallover" ]]
    then
        #
        : The expected response to loss of quorum on $VG is to take
        : resource group $RG offline
        #
        cl_echo 10412 "Taking resource group $RG offline due to loss of quorum on volume group $VG" $RG $VG

        #
        : Generate a request to the cluster manager to take the resource
        : group offline on this node.
        # Note: the simplistic coding below does not handle
        # child/parent or location dependencies
        #
        GROUP_ID=$(clodmget -q "group = $RG" -f id -n HACMPgroup)
        USER_RG_OFFLINE="3"
        TARGET_HANDLE=$(clhandle | cut -f1 -d' ')
        SET_POL="1"
        PERSISTENT="0"
        SITE_OVERRIDE="0"
        clRMupdate user_rg_move $GROUP_ID $USER_RG_OFFLINE $TARGET_HANDLE $SET_POL $PERSISTENT $SITE_OVERRIDE

    elif [[ $action == "shutdown" ]]
    then
        #
        : The expected response to loss of quorum on $VG is to take down
        : HACMP on this node
        #
        cl_echo 10413 "Stopping PowerHA SystemMirror on $LOCALNODENAME due to loss of quorum on volume group $VG" $LOCALNODENAME $VG
        clstop -g -s -N

    elif [[ $action == "halt" ]]
    then
        #
        : The expected response to loss of quorum on $VG is to halt the
        : node.
        : Use clexit.rc to log and drive that action
        #
        cl_echo 10414 "Halting $LOCALNODENAME due to loss of quorum on volume group $VG" $LOCALNODENAME $VG
        clexit.rc clstrmgrES
    fi
}

###############################################################################
#
#
# Function:     vg_fence_check
#
#       Check to see if a quorum of disks is accessible in a given volume group
#
#
# Input:
#
#       Volume group name
#
# Output:
#
#       return code 0 - quorum accessible
#       return code 1 - quorum not accessible
#
#
###############################################################################
function vg_fence_check
{
    PS4_FUNC="vg_fence_check"
    [[ $VERBOSE_LOGGING == high ]] && set -x

    typeset VG=$1
    integer quorum=0
    integer total_vg_disks=0
    integer present=0
    typeset RG=$(clodmget -q"value = $VG" -f group -n HACMPresource)

    #
    : Find all the disks in volume group $VG - space separated list
    #
    vg_disks=$(print "$lspv_out" | grep -w $VG | cut -f1 -d' ' | paste -s -d' ' - )
    total_vg_disks=$(print $vg_disks | wc -w)

    #
    : Compute the quorum for $VG - this is "one more than half" of the
    : disks in the volume group
    #
    quorum=$(( (total_vg_disks+2)/2 ))

    #
    : Check each disk in the volume group to see if it
    : is still accessible
    #
    present=0
    for vg_disk in $vg_disks
    do
        #
        : Check to see if disk $vg_disk is still
        : accessible - can we read the PVID of the disk
        #
        if cl_querypv -q $vg_disk
        then
            if (( ++present >= quorum ))
            then
                #
                : We are able to access a quorum - $quorum - of the
                : disks in volume group $VG.  Return success for this
                : volume group
                #
                return 0
            fi
        else
            #
            : Disk $vg_disk is inaccessible.  One
            : more disk that counts against achieving quorum.
            #
            cl_echo 10512 "$PROGNAME: Unable to access disk $vg_disk in volume group $VG in resource group $RG shared with node $EVENTNODE" $PROGNAME $vg_disk $VG $RG $EVENTNODE
        fi
    done

    #
    : If we get here, it means that a quorum of accessible
    : disks has not been found.  Return an error indication
    #
    return 1
}

################################################################################
#
: Main procedure in cl_fence_vg
#
################################################################################

typeset PROGNAME=${0##*/}
export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"
if [[ $VERBOSE_LOGGING == "high" ]]; then
    set -x
    version='%I%'
fi

#
: Collect list of disks, for use later
#
lspv_out=$(lspv)

if [[ -z $LOCALNODENAME ]]
then
    #
    : '$LOCALNODENAME' should have been set by caller
    #
    LOCALNODENAME=$(/usr/es/sbin/cluster/utilities/get_local_nodename)
fi

#
: Accept a formal parameter of 'name of node that failed' if none was set
: in the environment
#
EVENTNODE=${EVENTNODE:-$1}
if [[ -z $EVENTNODE ]]
then
    #
    : No indication of the node that failed, so cannot proceed
    #
    exit 1
fi

#
: An explicit volume group list can be passed after the name of
: the node that failed.  Pick up any such
#
shift
vg_list="$*"                    # volume group list or null
common_groups=""                # resource groups common between this node and the one that failed
common_critical_vgs=""          # CRITICAL volume groups common between this node and the one that failed

if [[ -z $vg_list ]]
then
    #
    : Find all the concurrent resource groups that contain both $EVENTNODE and $LOCALNODENAME
    #
    for group in $(clodmget -q "startup_pref = OAAN" -f group -n HACMPgroup)
    do
        #
        : Check to see if $group is in FORCEDOWN_GROUPS.
        : If it is, then the associated VGs should be varied off
        : and we will not be able to process them.
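        # FORCEDOWN_GROUPS is assumed to be supplied in the event environment
        # as a space separated list of resource groups already forced down on
        # this node; the grep -w membership test below relies on that form.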
        #
        if [[ -n $FORCEDOWN_GROUPS && -n "$(print $FORCEDOWN_GROUPS | grep -w $group)" ]]
        then
            #
            : Skipping $group
            #
            continue
        fi

        #
        : If the resource group is OFFLINE on the local node, skip it, since it
        : might have been brought offline deliberately, either by a clstop or by
        : a user initiated resource group move to OFFLINE.
        #
        if LC_ALL=C clRGinfo -s $group | grep -w $LOCALNODENAME | grep -w "OFFLINE"
        then
            #
            : Skipping $group for being offline on the local node
            #
            continue
        fi

        #
        : Extract the node list
        #
        rg_node_list=$(clodmget -q "group = $group" -f nodes -n HACMPgroup)

        #
        : If the rg node list contains both $LOCALNODENAME and
        : $EVENTNODE, add it to the list of groups to process
        #
        if print $rg_node_list | grep -w $LOCALNODENAME | grep -qw $EVENTNODE
        then
            common_groups="$common_groups $group"
        fi
    done

    #
    : Look at each of the resource groups in turn to determine what CRITICAL
    : volume groups the local node $LOCALNODENAME shares access with $EVENTNODE
    #
    for RG in $common_groups
    do
        #
        : What CRITICAL volume groups does the local node $LOCALNODENAME share access
        : with $EVENTNODE in resource group $RG
        #
        common_critical_vgs="$common_critical_vgs $(clodmget -q "group = $RG and name = CRITICAL_VG" -f value -n HACMPresource)"
    done

else
    #
    : Given a list of volume groups.  Just check these
    #
    common_critical_vgs=$vg_list
fi

#
: Process the list of common CRITICAL volume groups, $common_critical_vgs
#
for VG in $common_critical_vgs
do
    #
    : Go check if this volume group retains access to a quorum of disks
    #
    if ! vg_fence_check $VG
    then
        #
        : Quorum has been lost.  Take the configured action
        #
        vg_fence_action $VG
    fi
done
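
#
# Usage sketch, for illustration only - the node and volume group names below
# are hypothetical.  This script is normally driven by the node_down event,
# with EVENTNODE and LOCALNODENAME already set in the environment.  It can
# also be invoked directly, passing the name of the failed node followed by
# an optional list of CRITICAL volume groups to check, for example:
#
#   cl_fence_vg.sh nodeB crit_vg1 crit_vg2
#
# With no volume group list, the CRITICAL volume groups are discovered from
# the concurrent resource groups shared with the failed node, as above.
#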