#!/bin/ksh93
# ALTRAN_PROLOG_BEGIN_TAG
# This is an automatically generated prolog.
#
# Copyright (C) Altran ACT S.A.S. 2017,2018,2019,2021. All rights reserved.
#
# ALTRAN_PROLOG_END_TAG
# @(#) 7d4c34b 43haes/usr/sbin/cluster/events/cl_ffdc.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM
###############################################################################
#
# Name:         cl_ffdc
#
# This program is called to perform FFDC for various failures.
#
# Arguments:    -a <exit code>  - capture logs for an app monitor exit
#               -c <serial>     - capture logs for a config_too_long event
#               -e <serial>     - capture logs for an event_error
#               -f <file>       - app monitor log file name (only valid with -a)
#               -i              - capture logs for an interface failure
#               -u              - capture logs for a utility failure
#               -x              - capture logs for an xd failure
#
# Returns:      0 - program success
#               1 - program failed
#
###############################################################################

###############################################################################
#
# Notes on first failure data capture:
#
# FFDC is an attempt to collect relevant data as close as possible to the
# actual failure. This function represents the best guess at what is
# relevant data, based on the current implementation.
# It should be obvious that as experience is gained and new features are
# added, this function will need to be enhanced and extended, so this
# information is provided as a guide for modifying the function in the
# future.
#
# Overall structure:
#   FFDC data is collected under the default log directory - typically
#   /var/hacmp/log/ffdc - with individual subdirectories under that
#   named for the type of ffdc data, e.g. application_monitor, event_error,
#   etc.
#   The type of data to collect (and hence the resulting folder name) is
#   specified with a simple command line flag, e.g. "-e" directs collection
#   of ffdc for "event_error". Should you wish to add a new collection
#   "type", you will need to add to getopts() and the subsequent case
#   block which sets the global collection type flag.
#   The set of data to collect for each type is specified in 2 ways:
#   1) given adequate free space in the target filesystem, and being
#      careful not to consume all remaining free space, the variable
#      MAX_FILES defines the set of files to collect under ideal conditions.
#   2) if there is not adequate space to collect all the desired files,
#      the variable MIN_FILES defines the smallest set of files to collect
#      (if possible - we still don't want to consume all free space, even
#      if it means we don't collect ffdc at all).
#   When modifying this function, you will need to consider the bare
#   minimum set of files which will be of use when debugging the problem
#   and add/define them as MIN_FILES, then think about what other files
#   may be of use and add them to MAX_FILES.
#   Keep in mind the balance between capturing necessary data and over
#   collecting: ffdc is not free - it consumes resources on the customer's
#   mission critical high availability system so we must be extremely
#   sensitive to the environment where this function is run.
#
###############################################################################
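#
# Illustration only (node names and timestamps are hypothetical): an
# invocation by the cluster manager for a failed event with serial 1234,
#
#     cl_ffdc -e 1234
#
# would leave a compressed archive such as
#
#     /var/hacmp/log/ffdc/event_error/eventlogs.2021.02.05.21.50.00.pax.Z
#
# with the other collection types landing in sibling subdirectories
# (application_monitor, config_too_long, interface_failure, utilities, xd).
#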
###############################################################################
#
# Name:         gather_clverify_ffdc
#
# This function gathers any error indications from the clverify logs -
# this may prove useful for problem determination, e.g. if clverify
# flagged a problem that later led to an event failure.
#
# Arguments:    none
#
# Returns:      0 - always
#
###############################################################################
function gather_clverify_ffdc
{
    #
    # grep out some keywords - although this does not provide all the
    # failure information, it can provide a quick indication of recent
    # problems
    #
    egrep '^Totals|^ERROR' $clverify_log_dir/* >$LOG_DIRECTORY/clverify.$TIMESTAMP
}

###############################################################################
#
# Name:         handle_monitor_ffdc
#
# This function does some special processing for application monitor logs.
#
# Arguments:    file name       - app monitor file name
#               clamd exit code - exit code from the app monitor - see
#                                 src/43haes/usr/sbin/cluster/clamd/clamd_exits.h
#
# Returns:      0 - success
#               1 - unable to collect
#
###############################################################################
function handle_monitor_ffdc
{
    logFileName=$1
    errorStringSuffix=$2

    if [[ -z "$errorStringSuffix" || ! -s $logFileName ]]
    then
        echo "$0: failed to capture $logFileName with suffix $errorStringSuffix at $(date)" >>$hacmpout_log_dir/hacmp.out
        return 1
    fi

    #
    # - set base clappmond daemon log file name:
    #   clappmond.appMonName.rgName.log
    # - set first instance archive log file name
    # - set last instance archive log file name
    #
    clAppMonDaemonLogName="$logFileName"
    firstInstanceName1="$clAppMonDaemonLogName.first_$errorStringSuffix"
    lastInstanceName1="$clAppMonDaemonLogName.last_$errorStringSuffix"

    #----------------------------------------
    # - if the first instance log file exists
    #   and is of non zero bytes
    #   - set archive name to:
    #     last instance name
    #   else
    #   - set archive name to:
    #     first instance name
    #----------------------------------------
    if [[ -s $firstInstanceName1 ]]
    then
        archiveFileName1="$lastInstanceName1"
    else
        archiveFileName1="$firstInstanceName1"
    fi

    #----------------------------------------
    # - archive the clappmond daemon log file
    #   preserving the permissions.
    #----------------------------------------
    cp -fp $logFileName $archiveFileName1
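    #
    # Illustration only (hypothetical names): for a daemon log
    # clappmond.appmon1.rg1.log and exit code CLAMD_EXIT_MONITOR_TIMEOUT,
    # the first failure is preserved as
    #     clappmond.appmon1.rg1.log.first_CLAMD_EXIT_MONITOR_TIMEOUT
    # and each subsequent failure overwrites
    #     clappmond.appmon1.rg1.log.last_CLAMD_EXIT_MONITOR_TIMEOUT
    # so that both the earliest and the most recent occurrences are kept.
    #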
    #------------------------------------------------
    # - if the error logged in the log file is:
    #   - a timeout where the application monitor
    #     did not finish
    #   or
    #   - a non zero return code was
    #     returned from the application monitor
    #   then
    #   - save the application monitor log as well
    #------------------------------------------------
    if [[ $errorStringSuffix == CLAMD_EXIT_MONITOR_TIMEOUT || $errorStringSuffix == CLAMD_EXIT_MONITOR_DETECTED ]]
    then
        #------------------------------------------------------------
        # - create archive log file names for application
        #   monitor log file clappmond.appMonName.rgName.monitor.log:
        #   clappmond.appMonName.rgName.monitor.log.{first,last}_CLAMD_*
        # - copy the log file to the archive name
        #------------------------------------------------------------
        appMonLogName=$(echo "$clAppMonDaemonLogName" | sed "s/[.]log$/.monitor.log/g")
        firstInstanceName2="$appMonLogName.first_$errorStringSuffix"
        lastInstanceName2="$appMonLogName.last_$errorStringSuffix"

        #----------------------------------------
        # - if the first instance log file exists
        #   and is of non zero bytes
        #   - set archive name to:
        #     last instance name
        #   else
        #   - set archive name to:
        #     first instance name
        #----------------------------------------
        if [[ -s $firstInstanceName2 ]]
        then
            archiveFileName2="$lastInstanceName2"
        else
            archiveFileName2="$firstInstanceName2"
        fi

        #-------------------------------------------
        # - archive the application monitor log file
        #   preserving the permissions.
        #-------------------------------------------
        cp -fp $appMonLogName $archiveFileName2
        return $?
    fi

    return 0
}

###############################################################################
#
# main starts here
#
###############################################################################

PROGNAME=${0##*/}

#
# Include Availability Metrics library
#
. /usr/es/lib/ksh93/availability/cl_amlib

[[ "$VERBOSE_LOGGING" == "high" ]] && set -x

export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"
export LC_ALL=C

#
# Fetch local node name
#
LOCALNODENAME=$(get_local_nodename)

#
# The user can optionally disable ffdc through /etc/environment
#
if [[ -n "$FFDC_COLLECTION" && $FFDC_COLLECTION == "disable" ]]
then
    echo "$PROGNAME: FFDC_COLLECTION set to disable - return without collecting ffdc"
    exit 0
fi

#
# Collect logs for FFDC
#

#
# given adequate free space on the target filesystem, this variable defines
# the full set of logs to collect
#
typeset MAX_FILES=""

#
# if there is not enough space to collect the desired set, this variable
# defines the bare minimum set to collect
#
typeset MIN_FILES=""

#
# both these lists must contain full path names to the logs, e.g.
# /var/hacmp/log/hacmp.out (not just "hacmp.out")
#

#
# re-use the location set for hacmp.out for ffdc
#
LOG_DIRECTORY=$(clodmget -q "name=hacmp.out" -nf value HACMPlogs)
LOG_DIRECTORY=${LOG_DIRECTORY:-"/var/hacmp/log"}
FFDC_DIRECTORY="$LOG_DIRECTORY/ffdc"
TIMESTAMP=$(date +%Y.%m.%d.%H.%M.%S)

#
# on the off chance that the customer has redirected logs, look them up
# here
#
clutils_log_dir=$(clodmget -q "name = clutils.log" -f value -n HACMPlogs)
[[ -z $clutils_log_dir ]] && clutils_log_dir=$(clodmget -q "name = clutils.log" -f defaultdir -n HACMPlogs)
clutils_log_dir=${clutils_log_dir:-"/var/hacmp/log"}

clstrmgrdebug_log_dir=$(clodmget -q "name = clstrmgr.debug" -f value -n HACMPlogs)
[[ -z $clstrmgrdebug_log_dir ]] && clstrmgrdebug_log_dir=$(clodmget -q "name = clstrmgr.debug" -f defaultdir -n HACMPlogs)
clstrmgrdebug_log_dir=${clstrmgrdebug_log_dir:-"/var/hacmp/log"}

cspoc_log_dir=$(clodmget -q "name = cspoc.log" -f value -n HACMPlogs)
[[ -z $cspoc_log_dir ]] && cspoc_log_dir=$(clodmget -q "name = cspoc.log" -f defaultdir -n HACMPlogs)
cspoc_log_dir=${cspoc_log_dir:-"/var/hacmp/log"}

hacmpout_log_dir=$(clodmget -q "name = hacmp.out" -f value -n HACMPlogs)
[[ -z $hacmpout_log_dir ]] && hacmpout_log_dir=$(clodmget -q "name = hacmp.out" -f defaultdir -n HACMPlogs)
hacmpout_log_dir=${hacmpout_log_dir:-"/var/hacmp/log"}

clverify_log_dir=$(clodmget -q "name = clverify.log" -f value -n HACMPlogs)
[[ -z $clverify_log_dir ]] && clverify_log_dir=$(clodmget -q "name = clverify.log" -f defaultdir -n HACMPlogs)
clverify_log_dir=${clverify_log_dir:-"/var/hacmp/log"}

#
# clappmond logs are hard coded to
#
clappmond_log_dir="/var/hacmp/log"

#
# The location of the caa log is defined in the syslog.conf for whichever
# version of syslog is enabled
#
typeset SYSLOG_CONF=""
SYSLOG_CONF=$(clgetsyslog)
# Use default configuration file for any kind of failures
if (( $? != 0 ))
then
    SYSLOG_CONF="/etc/syslog.conf"
fi
caa_log_file=$(grep "^caa.debug" ${SYSLOG_CONF} | tail -1 | awk '{print $2}')
caa_log_dir=$(dirname $caa_log_file)

#
# rsct logs
#
rsct_domain=$(lsrpdomain | tail -1 | cut -f1 -d' ')
rsct_log_dir="/var/ct/$rsct_domain/log/cthags"
rsct_logs=$(find $rsct_log_dir | egrep -v "aua_blobs|last|nmDiag|netmon_mux")
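#
# Illustration only (domain name is hypothetical): if "lsrpdomain" reports a
# peer domain named myClusterDomain, the RSCT group services logs are picked
# up from /var/ct/myClusterDomain/log/cthags, minus the bulky or binary
# entries filtered out above.
#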
#
# what flavor of ffdc are we collecting ?
#
# when adding a new "flavor", you will add the flag to getopts() and add
# to the case statement to set ffdc_mode and any other flags from the
# command line
#
typeset ffdc_mode=""
clamd_exit_code=""
event_serial=""
while getopts "a:c:e:f:iux" opt
do
    case $opt in
        a)  ffdc_mode=application_monitor
            clamd_exit_code=$OPTARG
            ;;

        c)  ffdc_mode=config_too_long
            event_serial=$OPTARG
            ;;

        e)  ffdc_mode=event_error
            event_serial=$OPTARG
            ;;

        f)  # -f only valid for application_monitor mode
            if [[ $ffdc_mode != "application_monitor" ]]
            then
                echo "$PROGNAME: -f only valid with -a (application_monitor mode)"
                exit 1
            fi
            clamd_file_name=$OPTARG
            ;;

        i)  ffdc_mode=interface_failure
            ;;

        u)  ffdc_mode=utilities
            ;;

        x)  ffdc_mode=xd
            ;;

        *)  # this utility is only called by clstrmgr, but just in case,
            # check for invalid args and exit
            echo "$PROGNAME: invalid argument [$opt]"
            exit 1
    esac
done

# this utility is only called by clstrmgr, but just in case,
# check for missing args and exit
if [[ -z $ffdc_mode ]]
then
    echo "$PROGNAME: missing argument"
    exit 1
fi

#
# create the target folder if necessary
#
FFDC_DIRECTORY=$FFDC_DIRECTORY/$ffdc_mode
if [[ ! -d $FFDC_DIRECTORY ]]
then
    mkdir -p $FFDC_DIRECTORY
fi

#
# retain the 8 most recent FFDC collections - we do this before calculating
# free space, even though we may not collect anything in this pass
#
rm -f $(ls -t $FFDC_DIRECTORY/eventlogs.* 2>/dev/null | tail +8)
rm -f $(ls -t $LOG_DIRECTORY/processes.* 2>/dev/null | tail +8)
rm -f $(ls -t $LOG_DIRECTORY/errpt.* 2>/dev/null | tail +8)
rm -f $(ls -t $LOG_DIRECTORY/event_serial.* 2>/dev/null | tail +8)
rm -f $(ls -t $LOG_DIRECTORY/clverify.* 2>/dev/null | tail +8)

#
# Get the free space for the target directory
#
typeset -i FREE_SPACE=0
FREE_SPACE=$(df -k $FFDC_DIRECTORY | tail -1 | awk '{print $3}')
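#
# Illustration only (numbers are hypothetical): on AIX, "df -k" prints a
# header plus one line per filesystem, e.g.
#
#     Filesystem    1024-blocks      Free %Used    Iused %Iused Mounted on
#     /dev/hd9var       1048576    524288   50%     8123     7% /var
#
# so the third field of the last line - free space in KB - is what is
# captured in FREE_SPACE above.
#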
#
# we will collect the minimal set of logs if space is limited, otherwise
# collecting a larger set - this is where you define or modify the set
# of logs to collect for each type of ffdc
#
case $ffdc_mode in

    application_monitor)
        #
        # a monitor can exit with different codes - some fatal, some not.
        # the clamd will only call this function for errors of interest,
        # so in addition to saving logs we want to rename them based on
        # the specific failure so that they are more easily identified
        #
        if ! handle_monitor_ffdc $clamd_file_name $clamd_exit_code
        then
            # no need to continue
            exit 0
        fi

        #
        # app monitor failure is not likely to be accompanied by anything
        # in hacmp.out or clstrmgr.debug, so the minimal set of logs is just
        #
        MIN_FILES="$clappmond_log_dir/clappmond*"

        #
        # if space allows, collect a few more, as they may have interesting
        # background related to the monitor failure (or prior failures)
        #
        MAX_FILES="$MIN_FILES $hacmpout_log_dir/hacmp.out $clstrmgrdebug_log_dir/clstrmgr.debug $clstrmgrdebug_log_dir/clstrmgr.debug.long"
        ;;

    config_too_long)
        # capture the serial number of the current event
        echo "$PROGNAME current event serial number is $event_serial" > $LOG_DIRECTORY/event_serial.$TIMESTAMP

        #
        # config too long fires after 6 minutes of inactivity from a
        # cluster event - it may indicate an error occurred, in which case
        # ffdc would have already been called for event_error, and this
        # invocation is not likely to capture anything more useful, but
        # if config too long occurs because of a hang, the most critical
        # log will be hacmp.out
        #
        MIN_FILES="$LOG_DIRECTORY/event_serial.$TIMESTAMP $hacmpout_log_dir/hacmp.out"

        #
        # if space allows, collect a few more
        #
        MAX_FILES="$MIN_FILES $hacmpout_log_dir/hacmp.out.1 $clstrmgrdebug_log_dir/clstrmgr.debug* $caa_log_dir/syslog.caa* $rsct_logs"
        ;;

    event_error)
        # capture the serial number of the failed event
        echo "$PROGNAME current event serial number is $event_serial" > $LOG_DIRECTORY/event_serial.$TIMESTAMP

        #
        # event_error occurs when an event script exits with a non-zero
        # return code: the most critical log will be hacmp.out
        #
        MIN_FILES="$LOG_DIRECTORY/event_serial.$TIMESTAMP $hacmpout_log_dir/hacmp.out"

        #
        # if space allows, collect a few more
        #
        MAX_FILES="$MIN_FILES $clstrmgrdebug_log_dir/clstrmgr.debug $clstrmgrdebug_log_dir/clstrmgr.debug.long"
        ;;

    interface_failure)
        #
        # interface monitoring is done by caa and the notification is
        # passed up through rsct to the clstrmgr, which may or may not
        # run an event in response, so the minimum set is
        #
        MIN_FILES="$clstrmgrdebug_log_dir/clstrmgr.debug $clstrmgrdebug_log_dir/clstrmgr.debug.long $caa_log_dir/syslog.caa"

        #
        # if space allows, collect a few more
        #
        MAX_FILES="$MIN_FILES $hacmpout_log_dir/hacmp.out $clstrmgrdebug_log_dir/clstrmgr.debug* $caa_log_dir/syslog.caa* $rsct_logs"
        ;;

    utilities)
        #
        # any time a utility fails, it may be the result of a user error,
        # which is not of much interest, or it may be because a system
        # command failed, in which case we find interesting info in
        #
        typeset THIS_USER=${SUDO_USER:-${LOGNAME:-"root"}}
        if [[ $THIS_USER == root ]]; then
            smit_logs="/smit.log"
        else
            smit_logs=$(ls -1 $HOME/smit.* 2>/dev/null)
        fi
        MIN_FILES="$smit_logs $clutils_log_dir/clutils.log* $cspoc_log_dir/cspoc.*"

        #
        # not much more to grab
        #
        MAX_FILES="$MIN_FILES"
        ;;

    xd)
        #
        # the clxd log files will only be in HACMPlogs if xd is installed,
        # so we don't look them up along with the others, we do it here,
        # since xd ffdc will only be called if xd is installed
        #
        clxd_log_dir=$(clodmget -q "name = clxd.log" -f value -n HACMPlogs)
        [[ -z $clxd_log_dir ]] && clxd_log_dir=$(clodmget -q "name = clxd.log" -f defaultdir -n HACMPlogs)
        clxd_log_dir=${clxd_log_dir:-"/var/hacmp/xd/log"}

        #
        # failures in the xd subsystems may indicate a problem with storage,
        # so we capture all storage specific (EMC, ds8k, etc) stuff by
        # default - it is not likely that customers will have multiple
        # mixed storage technologies, so grabbing them all should not be
        # very expensive. the base logs are
        #
        MIN_FILES="$clxd_log_dir/clxd.log $clxd_log_dir/clxd_debug.log"

        #
        # space permitting, collect them all
        #
        MAX_FILES="$MIN_FILES $clxd_log_dir/clxd*.log* /var/hacmp/log/svcpprc/* /var/hacmp/log/pprc/* /var/hacmp/log/spprc/* /var/hacmp/log/consistgroup/*"
        ;;
esac
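#
# Illustration only (default log locations assumed): after the case block an
# event_error collection would typically end up with
#
#     MIN_FILES="/var/hacmp/log/event_serial.<timestamp> /var/hacmp/log/hacmp.out"
#     MAX_FILES="$MIN_FILES /var/hacmp/log/clstrmgr.debug /var/hacmp/log/clstrmgr.debug.long"
#
# before the common entries below are appended to both lists.
#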
#
# add ps, clverify and errpt
#
# Avoided the use of "ps -ef" here due to a reported field issue where, when
# user accounts are defined in LDAP, the "-f" listing attempts to look up the
# user name from the LDAP server. If it is unreachable, it can take 5 - 10
# minutes to time out. The "ps -eo" string below collects the same information
# as "ps -ef", but without the user ID conversion to a proper name. Hopefully
# this will avoid the problem that the customer saw.
#
ps -eo 'uid,pid,ppid,cpu,stime,tty,time,args' > $LOG_DIRECTORY/processes.$TIMESTAMP
errpt -a -s $(date +'%m%d0000%y') > $LOG_DIRECTORY/errpt.$TIMESTAMP
gather_clverify_ffdc

MIN_FILES="$MIN_FILES $LOG_DIRECTORY/processes.$TIMESTAMP $LOG_DIRECTORY/errpt.$TIMESTAMP $LOG_DIRECTORY/clverify.$TIMESTAMP"
MAX_FILES="$MAX_FILES $LOG_DIRECTORY/processes.$TIMESTAMP $LOG_DIRECTORY/errpt.$TIMESTAMP $LOG_DIRECTORY/clverify.$TIMESTAMP"

#
# prune the lists - in general, files older than a couple of days won't have
# valuable data
#
MAX_FILES=$(find $MAX_FILES -prune -type f -mtime -2)
MIN_FILES=$(find $MIN_FILES -prune -type f -mtime -2)

#
# check the space, starting with the max desired list
#
FILES=$MAX_FILES
REQUIRED_SPACE=$(du -sk $FILES | awk '{x+=$1}END{print x}')

#
# even though there may be adequate space, we must be careful not to
# consume all the free space, which could impact the customer's application.
# therefore we multiply the required space, just to be sure
#
(( REQUIRED_SPACE *= 2 ))

if (( FREE_SPACE < REQUIRED_SPACE ))
then
    #
    # not enough space for all the files, try the min list
    #
    FILES=$MIN_FILES
    REQUIRED_SPACE=$(du -sk $FILES | awk '{x+=$1}END{print x}')
    (( REQUIRED_SPACE *= 2 ))
fi

#
# check again to make sure we have any chance of success. this script is
# called from clstrmgr, which will ignore any errors, so log the problem
# to hacmp.out if we cannot collect
#
if (( FREE_SPACE < REQUIRED_SPACE ))
then
    echo "$PROGNAME: FFDC collection failed at $(date) because of a lack of free space in $FFDC_DIRECTORY:\navailable space was $FREE_SPACE, required space is $REQUIRED_SPACE.\n" >>$hacmpout_log_dir/hacmp.out
    exit 1
fi

echo "$PROGNAME: FFDC collection started at $(date)\n" >>$hacmpout_log_dir/hacmp.out

#
# Log FFDC start entry in availability log file
#
amlog_trace $AM_FFDC_BEGIN "FFDC|$LOCALNODENAME"

#
# pax and compress the logs
#
pax -s "!/var/adm/ras!./ffdc.$TIMESTAMP!" -s "!^!./ffdc.$TIMESTAMP!" -wvf $FFDC_DIRECTORY/eventlogs.$TIMESTAMP.pax $FILES 2>>$hacmpout_log_dir/hacmp.out
if (( $? ))
then
    echo "$PROGNAME: FFDC collection failed at $(date) because the pax command failed\n" >>$hacmpout_log_dir/hacmp.out

    #
    # Log FFDC failure entry in availability log file
    #
    amlog_trace $AM_FFDC_FAILURE "FFDC|$LOCALNODENAME"
    exit 1
fi
compress $FFDC_DIRECTORY/eventlogs.$TIMESTAMP.pax

#
# misc logs were saved in the pax archive - delete them now
# (they were created under $LOG_DIRECTORY above)
#
rm -f $LOG_DIRECTORY/processes.$TIMESTAMP
rm -f $LOG_DIRECTORY/errpt.$TIMESTAMP
rm -f $LOG_DIRECTORY/event_serial.$TIMESTAMP

echo "FFDC event log collection saved to $FFDC_DIRECTORY/eventlogs.$TIMESTAMP.pax.Z\n" >>$hacmpout_log_dir/hacmp.out

#
# Log FFDC end entry in availability log file
#
amlog_trace $AM_FFDC_END "FFDC|$LOCALNODENAME"

exit 0
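#
# Illustration only (not executed here): a saved collection can be examined
# later by listing the archive members, e.g.
#
#     uncompress -c /var/hacmp/log/ffdc/event_error/eventlogs.<timestamp>.pax.Z | pax
#
# which shows the captured logs relocated under ./ffdc.<timestamp>/ as a
# result of the -s substitutions used with the pax command above.
#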