#!/bin/ksh93
# ALTRAN_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# Copyright (C) Altran ACT S.A.S. 2017,2019,2021. All rights reserved.
#
# ALTRAN_PROLOG_END_TAG
#
# @(#) 7d4c34b 43haes/usr/sbin/cluster/events/utils/cl_deactivate_vgs.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM
#
#   COMPONENT_NAME: EVENTUTILS
#
#   FUNCTIONS: none
#
#   ORIGINS: 27
#
#
#   (C) COPYRIGHT International Business Machines Corp. 1990,1994
#   All Rights Reserved
#   Licensed Materials - Property of IBM
#   US Government Users Restricted Rights - Use, duplication or
#   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
#
###############################################################################
#
#   Name:       cl_deactivate_vgs
#
#   Given a list of volume groups, we try and varyoff any of them
#   that are currently varied on.
#
#   Usage:      cl_deactivate_vgs [-c] volume_group ...
#               -c  call out to the OEM "OFFLINE" method instead of using
#                   the AIX LVM commands
#
#               When JOB_TYPE is set to something other than "GROUP" the
#               volume groups are taken from the RESOURCE_GROUPS and
#               VOLUME_GROUPS environment variables instead of the
#               command line.
#
#   Returns:
#       0  - All of the volume groups are successfully varied off
#       1  - varyoffvg of at least one volume group failed
#       2  - Zero arguments were passed
#       11 - at least one varyoff failed when called with parameters
#            passed through the environment (JOB_TYPE set)
#
#   Arguments:  list of volume groups
#
#   Environment: VERBOSE_LOGGING, PATH
#
###############################################################################

#########################################################################
#
#   Function:       vgs_varyoff
#
#   Input:          $1 - Volume group name
#                   $2 - Volume group mode (as determined by lqueryvg)
#
#   Environment:    PROGNAME            - name of program for error messages
#                   TMP_VARYOFF_STATUS  - temporary file name for status
#                   EVENT_TYPE, VG_ON_LIST, TWO_NODE_CLUSTER, FS_TYPES,
#                   RG_MOVE_EVENT, RG_MOVE_ONLINE, USER_RG_MOVE_TYPE
#
#   Output:         Volume group is vary'd off
#                   "<vg> <status>" is appended to the temporary status file
#
#   Returns:        0 when open filesystems are found during a reconfig
#                   (DARE) event - in that case nothing is done and no
#                   status line is written; otherwise the status that was
#                   written to the status file (0 or 1).
#
#########################################################################
vgs_varyoff ()
{
    typeset PS4_TIMER="true"
    [[ "$VERBOSE_LOGGING" == "high" ]] && set -x

    typeset VG=$1
    typeset MODE=$2

    typeset OPEN_FSs=""
    typeset OPEN_LVs=""
    typeset TMP_VG_LIST
    typeset TS_FLAGS=""

    integer STATUS=0
    integer RC=0
    typeset SELECTIVE_FAILOVER="false"

    typeset LV
    typeset lv_list=
    typeset FS
    typeset FS_MOUNTED=""

    integer rc_fuser=0
    integer rc_varyonvg=0
    integer rc_varyoffvg=0
    integer rc_lsvg=0
    integer rc_dfs=0
    integer rc_dvg=0

    # Changes for defect 825279. This is a temporary change which can be backed out once
    # DCR MR0517121725 is serviced
    integer FV FR FM FF             # VRMF of bos.rte.filesystem
    typeset -Z2 FR                  # two digit release
    typeset -Z3 FM                  # three digit modification
    typeset -Z3 FF                  # three digit fix
    integer FVRMF=0
    integer fuser_lvl=601004000

    TMP_VG_LIST=$(LC_ALL=C lsvg -l -L $VG 2>/dev/null)
    rc_lsvg=$?

    if [[ $EVENT_TYPE == reconfig* ]]
    then
        #
        : This is a DARE event. Find any open filesystems on the given VG
        #
        OPEN_FSs=$(print "$TMP_VG_LIST" | awk '$2 ~ /jfs2?$/ && $6 ~ /open/ && $7 != "N/A" {print $7}')

        if [[ -n $OPEN_FSs ]]
        then
            #
            : Open filesystems can legitimately happen during a DARE event.
            : This really means that the VG should not be brought offline.
            #
            return 0
        fi
    fi

    if [[ -n $TMP_VG_LIST ]]
    then
        #
        : Get list of open logical volumes corresponding to filesystems
        #
        OPEN_LVs=$(print "$TMP_VG_LIST" | awk '$2 ~ /jfs2?$/ && $6 ~ /open/ {print $1}')
    fi

    if [[ $rc_lsvg != 0 && "${RG_MOVE_EVENT:-false}" == "true" && "${RG_MOVE_ONLINE:-TMP_ERROR}" != "ONLINE" ]]
    then
        #
        : this is an rg_move on selective fallover, lsvg -l might not work.
        : try looking up the LVs in the ODM if the VG is online
        #
        export SELECTIVE_FAILOVER="true"

        if print $VG_ON_LIST | grep -qw $VG
        then
            #
            : The VG is online, get a list of all its logical volumes from ODM
            #
            OPEN_LVs=$(LC_ALL=C clodmget -q "name = $VG" -f dependency -n CuDep)
        fi
    fi

    if [[ -n $OPEN_LVs ]]
    then
        #
        : Attempt to kill off any processes using the logical volumes.
        #   varyoff will then hopefully work. Varyoff is guaranteed to fail if there
        #   are open connections to any logical volume.
        #
        lslpp -lcqOr bos.rte.filesystem | cut -f3 -d':' | IFS=. read FV FR FM FF
        FVRMF=$FV$FR$FM$FF

        cltime
        for LV in $OPEN_LVs             # look at each logical volume
        do
            # Changes for defect 825279.
            # The -O flag is only passed to fuser at or above fuser_lvl.
            if (( $FVRMF >= $fuser_lvl ))
            then
                fuser -O -k -u -x /dev/$LV      # try to kill anything that has it open
                rc_fuser=$?
            else
                fuser -k -u -x /dev/$LV         # try to kill anything that has it open
                rc_fuser=$?
            fi
            : rc_fuser=$rc_fuser
            cltime
        done

        sleep 5                         # give kill time to work
    fi

    if [[ $TWO_NODE_CLUSTER == TRUE ]]
    then
        #
        : For two-node clusters, special processing for the highly available NFS
        : server function: tell NFS to dump the dup cache into the jfslog or jfs2log
        #   if the level of AIX supports it to allow it to be picked up
        #   by the next node to get this volume group.
        #
        : Find the first log device in the saved list of logical volumes
        #
        pattern=$FS_TYPES

        # The pattern is handed to awk with -v so that the '?' in it is never
        # exposed to shell filename generation.
        logdev=$(print "$TMP_VG_LIST" | awk -v pat="$pattern" '$2 ~ pat {printf "/dev/%s\n", $1 ; exit}')

        if [[ -z $logdev && "${RG_MOVE_EVENT:-false}" == "true" && "${RG_MOVE_ONLINE:-TMP_ERROR}" != "ONLINE" ]]
        then
            #
            : this is an rg_move on selective fallover, lsvg -l might not work.
            : try looking up log device info in the ODM
            #
            if [[ $pattern == 'jfs2?log' ]]
            then
                #
                : odmget search pattern for both jfslog and jfs2log
                #   yes, its different from the one needed for awk.
                #
                pattern='jfs*log'
            fi

            #
            : If this logical volume looks like a loglv type, pick it up
            #
            for LV in $OPEN_LVs
            do
                if [[ -n $(odmget -q "name = ${LV} and attribute = type and value like ${pattern}" CuAt) ]]
                then
                    logdev="${logdev} /dev/${LV}"
                    #
                    : nfso expects exactly one log device, so stop looking
                    #
                    break
                fi
            done
        fi

        if [[ -n $logdev ]]             # JFS or maybe JFS2 log logical volume
        then
            nfso -H dump_dupcache $logdev
        fi
    fi                                  # end 2 nodes w/exports

    #
    : Finally, vary off the volume group
    #
    # Format for consumption by cl_am utility
    amlog_trace $AM_VG_VARYOFF_BEGIN "Deactivating Volume Group|$VG"

    if [[ 32 == $MODE ]]                # enhanced concurrent mode
    then
        #
        : This VG is ECM. Move to passive mode.
        #
        #   If the volume group is an enhanced concurrent mode volume
        #   group, this may be due to it having been varyd on in passive
        #   mode at node up, and turned to active mode later.  If so, turn
        #   it back to passive mode.
        #   We do this by attempting to varyon in passive mode first.  This
        #   will only succeed if it was originally varied on in passive mode.
        #   If it fails, then we know that it was varied on active from the
        #   beginning and that it needs to be varied off now.  Note that we
        #   purposefully discard stderr for the passive varyon attempt since
        #   we fully expect and are prepared to deal with a failure.
        #
        TS_FLAGS='-o'
        cltime
        varyonvg -c -n -P $VG 2>/dev/null       # move to passive mode
        rc_varyonvg=$?
        : return code from varyonvg -c -n -P $VG is $rc_varyonvg
        cltime

        if (( 0 != $rc_varyonvg ))
        then
            #
            : VG was not varydon into active from passive. Do a regular varyoff.
            #
            cltime
            varyoffvg $VG
            rc_varyoffvg=$?
            : varyoffvg $VG return code is $rc_varyoffvg
            cltime
            TS_FLAGS='-f'
        fi
    else
        #
        : ordinary VG, do a regular varyoff
        #
        cltime
        varyoffvg $VG
        rc_varyoffvg=$?
        : varyoffvg $VG return code is $rc_varyoffvg
        cltime
        TS_FLAGS='-f'
    fi

    # Format for consumption by cl_am utility
    if [[ $rc_varyoffvg != 0 ]]
    then
        amlog_err $AM_VG_VARYOFF_FAILURE "Deactivating Volume Group|$VG"
    else
        amlog_trace $AM_VG_VARYOFF_END "Deactivating Volume Group|$VG"
    fi

    RC=$rc_varyoffvg

    #
    : Update LVM volume group timestamps in ODM
    #
    cl_update_vg_odm_ts $TS_FLAGS $VG

    if (( 0 == $rc_varyoffvg ))
    then
        #
        : successful varyoff, set the fence height to read-only
        #   This should protect the VG from inadvertent modification by this node.
        #
        cl_set_vg_fence_height -c $VG ro
        RC=$?
        if (( $RC != 0 ))
        then
            #
            : Log any error, but continue. If this is a real problem, the varyonvg will fail
            #
            ro=$(dspmsg -s 103 cspoc.cat 350 'read only' | cut -f1 -d,)
            cl_log 10511 "$PROGNAME: Volume group $VG fence height could not be set to read/only" $PROGNAME $VG $ro
        fi
    else
        #
        : varyoffvg failed, but we continue to try other possibility to
        : recalculate LV list for this Volume Group. Mouned FS of the LV indicates
        : that the LV might be changed lv_label
        #
        for LV in $(clodmget -q "name = $VG" -f dependency -n CuDep)
        do
            LV_type=$(lsattr -E -l $LV -a type -F value)
            if [[ $LV_type == jfs?(2) ]]
            then
                FS=$(clodmget -q "name = $LV and attribute = label" -f value -n CuAt)
                if [[ -n $FS ]]
                then
                    #
                    : check if filesystem is mounted
                    #
                    if mount | grep -Fw "$LV" | read skip FS_MOUNTED skip   # check mounted fs with the LV
                    then
                        # Only unmount when the actual mount point differs
                        # from the label recorded in the ODM.
                        if [[ -n "$FS_MOUNTED" && "$FS" != "$FS_MOUNTED" ]]
                        then
                            FS=$FS_MOUNTED
                            umount $FS
                            rc_dfs=$?
                            if (( 0 != $rc_dfs ))
                            then
                                cl_log 28 "$PROGNAME: Failed unmount $FS.\n" $PROGNAME $FS
                            fi
                        fi
                    fi                  # mounted fs checked
                fi                      # go through LVs
            fi
        done

        if [[ -n $FS_MOUNTED ]] && (( 0 == $rc_dfs ))
        then
            #
            : try again after umounted fs on the VG
            #
            cltime
            varyonvg -c -n -P $VG 2>/dev/null   # move to passive mode
            rc_dvg=$?
            : rc_dvg=$rc_dvg
            cltime
            if (( 0 == $rc_dvg ))
            then
                RC=0
            fi
        fi

        if (( 0 != $RC ))
        then
            #
            : varyoffvg failed, but we continue since takeover node may handle it
            #   Varyoffvg errors are logged here, but do not stop processing, nor
            #   do they result in an event error.  If the error that prevents
            #   varyoffvg from working is local - dead adapter, broken path -
            #   then the takeover node may be successful.  However, if user wants
            #   to bring down the resource group manually, failure in deactivating
            #   any non-concurrent mode volume groups should be reported as an event error.
            #
            cl_log 28 "$PROGNAME: Failed varyoff of $VG.\n" $PROGNAME $VG

            if (( 0 != $rc_dvg ))
            then
                #
                : Volume Group could not varied on to passive mode, caused by umount fs failure on the VG .
                #
                STATUS=1
                cl_RMupdate resource_error $VG $PROGNAME "varyonvg -cnP $VG failed"
            fi

            if [[ ${RG_MOVE_EVENT:-} == "true" && ${USER_RG_MOVE_TYPE:-} == "USER_RG_OFFLINE" ]]
            then
                #
                : User requested RG offline, so promote varyoff fail to event_error if non-concurrent VG
                #   If concurrent, we will not generate an error as the VG might still be active on takeover node.
                #
                VG_ID=$(/usr/sbin/getlvodm -v $VG)
                CONC=$(lqueryvg -g $VG_ID -C)
                if (( $CONC != 1 ))
                then
                    #
                    : VG is non-concurrent, and this was a manual event, so varyoff fail becomes event_error
                    #
                    STATUS=1
                    cl_RMupdate resource_error $VG $PROGNAME "varyoffvg $VG failed with return code $RC"
                fi
            fi
        fi
    fi

    #
    : Append status to the status file.
    #   Append is used because there may be many instances of this subroutine
    #   appending status, as volume groups are processed in parallel.
    #
    echo $VG $STATUS >> $TMP_VARYOFF_STATUS     # volume group name and status

    return $STATUS
}

#########################################################################
#
#   Function:       deactivate_oem_vgs
#
#   Input:          $1 - blank separated list of OEM volume group names
#
#   Environment:    PROGNAME - name of program for error messages
#                   STATUS   - script wide status, set to 1 on failure
#
#   Output:         For each volume group the OEM type is looked up with
#                   cl_get_oem_type, the "OFFLINE" method for that type is
#                   looked up with cl_get_oem_method, and that method is
#                   run with the volume group name as its only argument.
#                   A non-zero exit from the method is reported through
#                   cl_RMupdate resource_error.
#
#   Returns:        The current value of STATUS
#
#########################################################################
deactivate_oem_vgs ()
{
    [[ "$VERBOSE_LOGGING" == "high" ]] && set -x

    typeset OEM_VG="$1"

    for vg in $OEM_VG
    do
        #
        : get OEM type and custom method
        #
        # Look up the type of the volume group being processed in this
        # iteration, not of the whole list.
        OEM_TYPE=$(cl_get_oem_type -v $vg)
        OEM_METHOD_TO_DEACTIVATE=$(cl_get_oem_method -m "OFFLINE" -t $OEM_TYPE)

        $OEM_METHOD_TO_DEACTIVATE "$vg"
        RC=$?
        : exit status of OEM_METHOD_TO_DEACTIVATE "'$OEM_METHOD_TO_DEACTIVATE $vg'" : $RC

        # Any non-zero exit is a failure, as the message below states.
        if (( $RC != 0 ))
        then
            #
            : Update the Resource Manager - release failure
            #
            cl_RMupdate resource_error $vg $PROGNAME
            echo "$PROGNAME: User defined method returned non-zero exit code"
            STATUS=1
        fi
    done

    return $STATUS
}

#########################################################################
#
#   Start of main
#
#########################################################################

PROGNAME=${0##*/}
export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"

# Including Availability metrics library file
. /usr/es/lib/ksh93/availability/cl_amlib

[[ $VERBOSE_LOGGING == "high" ]] && {
    set -x
    version='%I%'
}

integer STATUS=0
TMP_VARYOFF_STATUS="/tmp/_deactivate_vgs.tmp"
sddsrv_off=FALSE
ALLVGS="All_volume_groups"
OEM_CALL="false"

if (( $# != 0 )) && [[ $1 == "-c" ]]
then
    #
    : Note if need a callout to an OEM disk accomodation method
    #
    OEM_CALL="true"
    shift
fi

EVENT_TYPE=${EVENT_TYPE:-"not_set"}     # Workaround 'set -u'
EVENT_TYPE=${EVENT_TYPE##*/}            # Remove pathname if specified

#
: if JOB_TYPE is set and is not "'GROUP'", then process_resources is parent
#
if [[ ${JOB_TYPE:-0} != 0 && $JOB_TYPE != "GROUP" ]]
then
    #
    : parameters passed from process_resources thru environment
    #
    PROC_RES=true                       # parameters passed thru environment
else
    #
    : not called from process_resources, check for valid call
    #
    PROC_RES=false
    if (( $# == 0 ))                    # no volume groups specified?
    then                                # then caller is confused
        cl_echo 29 "usage: $PROGNAME volume_groups_to_varyoff\n" $PROGNAME
        exit 2
    fi
fi

#
: set -u will report an error if any variable used in the script is not set
#
set -u

#
: Remove the status file if it currently exists
#
rm -f $TMP_VARYOFF_STATUS

#
: Each of the V, R, M and F fields are padded to fixed length,
: to allow reliable comparisons. E.g., maximum VRMF is
: 99.99.999.999
#
integer V R M F
typeset -Z2 R                           # two digit release
typeset -Z3 M                           # three digit modification
typeset -Z3 F                           # three digit fix
integer VRMF=0

#
#   If the 'sddsrv' daemon is running - vpath dead path detection and
#   recovery - turn it off, since interactions with the fibre channel
#   device driver will, in the case where there actually is a dead path,
#   slow down every vpath operation.
#   This is only applicable to older SDD software levels.
#
if ls /dev/vpath* > /dev/null 2>&1
then
    #
    : Check to see if we are running an early level of SDD
    #
    sdd_level=106003000

    if lslpp -lcq "devices.sdd.*.rte" | cut -f3 -d':' | IFS=. read V R M F
    then
        VRMF=$V$R$M$F                   # get the SDD level
    fi

    # R has no value when the lslpp query produced nothing; default it so
    # that 'set -u' does not abort the script here.
    if (( ${R:-0} >= 07 )); then
        sdd_level=107002005
    fi

    if (( $VRMF < $sdd_level )) && lssrc_out=$(LC_ALL=C lssrc -s sddsrv)
    then
        : SDD is active, and an early level. Stop it.

        integer pid=0
        print "$lssrc_out" | tail -1 | read subsys rest
        # The last two words of the lssrc line are taken as pid and state.
        (set -- $rest ; eval print \${$(($#-1))} \${$#}) | read pid state

        if [[ $subsys == "sddsrv" && $state == "active" ]] && (( $pid != 0 ))
        then
            date                        # took how long to shut down SDD

            #
            : The stopsrc command does not include the -c flag for 2 reasons:
            : 1. The possible SIGKILL could result in "Invalid vpaths", and
            : 2. Time for the daemon to go inoperative could be several
            : minutes in cases where many vpaths are not accessible
            #
            stopsrc -s sddsrv

            echo "$PROGNAME: Waiting for sddsrv to go inoperative. This could take several minutes when some vpaths are inaccessible.\n"

            #
            : No need to clog the log file with this
            #
            set +x

            #
            #   Now wait for sddsrv to shut down
            #
            while [[ $subsys == "sddsrv" && $state != "inoperative" ]] ; do
                sleep 1
                if ! lssrc_out=$(LC_ALL=C lssrc -s sddsrv)
                then
                    #
                    : SRC stopped talking to us. No longer wait for it
                    #
                    break
                else
                    #
                    : Pick up current state
                    #
                    lssrc_out=$(LC_ALL=C lssrc -s sddsrv | tail -1)
                    state=$(set -- $lssrc_out ; eval print \${$#})
                fi
            done

            [[ "$VERBOSE_LOGGING" == "high" ]] && set -x

            date                        # took how long to shut down SDD
            sddsrv_off=TRUE             # Note that it was turned off
        fi
    fi
fi

#
: Special processing for 2-node NFS clusters
#
export TWO_NODE_CLUSTER=FALSE
# Regular expression matched by vgs_varyoff against the LV type column of
# 'lsvg -l' to find a jfslog or jfs2log logical volume.
export FS_TYPES="jfs2?log"

if (( 2 == $(clodmget -q "object = VERBOSE_LOGGING" -f name -n HACMPnode | wc -l ) )) &&
   [[ -n ${EXPORT_FILESYSTEM:-} ]]
then
    : two nodes, with exported filesystems
    export TWO_NODE_CLUSTER=TRUE
fi

#
: Pick up a list of currently varyd on volume groups
#
VG_ON_LIST=$(lsvg -L -o 2>/tmp/lsvg.err)

#
: if not called from process_resources, use old-style environment and parameters
#
if [[ $PROC_RES == false ]]
then
    #
    : Update the Resource Manager - releasing VGs
    #
    cl_RMupdate resource_releasing $ALLVGS $PROGNAME

    typeset PS4_LOOP=""

    #
    : Now, process the list of volume groups passed in
    #
    for vg in $*
    do
        PS4_LOOP="$vg"

        #
        #   Find out if it is varied on
        #
        if [[ $OEM_CALL == "false" ]]
        then
            #
            : Dealing with AIX LVM volume groups
            #
            if ! print $VG_ON_LIST | grep -qw $vg
            then
                #
                : This VG is not varyd on - skip it
                #
                cl_echo 30 "$PROGNAME: Volume group $vg already varied off.\n" $PROGNAME $vg
            else
                #
                : This VG is varyd on, so go vary it off. Get the VG mode first
                #
                MODE=9999
                VGID=$(/usr/sbin/getlvodm -v $vg)       # get the VGID
                MODE=$(lqueryvg -g $VGID -X)            # what kind of volume group?
                RC=$?
                (( $RC != 0 )) && MODE=0;               # from lqueryvg
                : exit status of lqueryvg -g $VGID -X: $RC

                # Run in the background; results are collected from the
                # status file after the 'wait' below.
                vgs_varyoff $vg $MODE &
            fi
        else
            #
            : Call out for OEM volume groups
            #
            deactivate_oem_vgs $vg
            RC=$?
            : exit status of deactivate_oem_vgs $vg: $RC
            if (( $STATUS == 0 && $RC != 0 ))
            then
                STATUS=1                # non zero exit code.
            fi
        fi
    done

    unset PS4_LOOP
else
    #
    : Called from process_resources
    #
    LIST_OF_VOLUME_GROUPS_FOR_RG=""

    for GROUPNAME in $RESOURCE_GROUPS
    do
        export GROUPNAME

        #
        : Discover the volume groups for this resource group.
        #   The format of the list is
        #       "rg1vg1 rg1vg2 rg1vg3:rg2vg1 rg2vg2 rg2vg3:rg3vg1..."
        #   The expression below picks up everything to the first ':'
        #
        echo $VOLUME_GROUPS | IFS=: read LIST_OF_VOLUME_GROUPS_FOR_RG VOLUME_GROUPS

        #
        : Reverse the order, so that VGs release in reverse order of acquisition
        #   In rare scenarios, PowerHA hangs on trying to unmount rootvg during RG move.
        #   So, skip the reserved VGs like rootvg, caavg_private, altinst_rootvg, old_rootvg.
        #
        LIST_OF_COMMASEP_VG_FOR_RG=$(echo $LIST_OF_VOLUME_GROUPS_FOR_RG | sed 's/ /,/g')
        # One VG name per line reaches egrep, so the alternatives must not
        # contain any blanks or they can never match.
        LIST_OF_VOLUME_GROUPS_FOR_RG=$(echo $LIST_OF_COMMASEP_VG_FOR_RG | tr ',' '\n' |
            egrep -v -w 'rootvg|caavg_private|altinst_rootvg|old_rootvg' | sort -ru)

        #
        : Update Resource Manager - releasing VGs for this RG
        #
        cl_RMupdate resource_releasing $ALLVGS $PROGNAME

        #
        : Process the volume groups for this resource group
        #
        for vg in $LIST_OF_VOLUME_GROUPS_FOR_RG
        do
            PS4_LOOP="$vg"

            #
            #   Find out if this volume group is varyd on
            #
            if ! print $VG_ON_LIST | grep -qw $vg
            then
                #
                : This VG is not varyd on - skip it
                #
                cl_echo 30 "$PROGNAME: Volume group $vg already varied off.\n" $PROGNAME $vg
            else
                #
                : Thie VG is varied on, so go vary it off. Get the VG mode first
                #
                MODE=9999
                VGID=$(/usr/sbin/getlvodm -v $vg)       # get the VGID
                MODE=$(lqueryvg -g $VGID -X)            # what kind of volume group?
                RC=$?
                (( $RC != 0 )) && MODE=0;               # from lqueryvg
                : exit status of lqueryvg -g $VGID -X: $RC

                vgs_varyoff $vg $MODE
            fi
        done

        unset PS4_LOOP
    done
fi

#
: Wait for the background instances of vgs_varyoff
#
wait

#
: Collect any failure indications from backgrounded varyoff processing
#
if [[ -f $TMP_VARYOFF_STATUS ]]
then
    #
    #   Check failures.  A status of '1' indicates a problem with varyoff.
    #
    while read VGNAME VARYOFF_STATUS
    do
        if [[ $VARYOFF_STATUS == "1" ]]
        then
            #
            : There was a problem with a varyoff.
            #
            if [[ $PROC_RES == true ]]
            then
                STATUS=11
            else
                STATUS=1
            fi
            break
        fi
    done < $TMP_VARYOFF_STATUS

    rm -f $TMP_VARYOFF_STATUS
fi

#
: Update Resource Manager - release success for the non-error VGs
#
ALLNOERRVGS="All_nonerror_volume_groups"
if [[ $PROC_RES == false ]]
then
    cl_RMupdate resource_down $ALLNOERRVGS $PROGNAME
else
    for GROUPNAME in $RESOURCE_GROUPS
    do
        cl_RMupdate resource_down $ALLNOERRVGS $PROGNAME
    done
fi

if [[ $sddsrv_off == TRUE ]] && ! LC_ALL=C lssrc -s sddsrv | grep -iqw active
then
    #
    : sddsrv was turned off, turn it back on again
    #
    startsrc -s sddsrv
fi

exit $STATUS