#!/bin/ksh93
#  ALTRAN_PROLOG_BEGIN_TAG                                                    
#  This is an automatically generated prolog.                                  
#                                                                              
#  Copyright (C) Altran ACT S.A.S. 2017,2018,2019,2021.  All rights reserved.  
#                                                                              
#  ALTRAN_PROLOG_END_TAG                                                      

# @(#)  7d4c34b 43haes/usr/sbin/cluster/events/cl_ffdc.sh, 726, 2147A_aha726, Feb 05 2021 09:50 PM

###############################################################################
#
#  Name:  cl_ffdc
#
#  This program is called to perform FFDC for various failures.
#
#  Arguments: -a <clamd exit code> - capture logs for app monitor exit
#             -c - capture logs for config_too_long event
#             -e - capture logs for event_error 
#             -i - capture logs for interface failure
#
#  Returns:     0 - program success
#               1 - program failed 
#
###############################################################################

###############################################################################
#
#  Notes on first failure data capture:
#
#  FFDC is an attempt to collect relevant data as close as possible to the
#  actual failure. This function represents the best guess at what is
#  relevant data, based on the current implementation.
#  It should be obvious that as experience is gained and new features are
#  added, this function will need to be enhanced and extended, so
#  this information is provided as a guide for modifying the function
#  in the future.
#  
#  Overall structure:
#    FFDC data is collected under the default log directory - typically
#    /var/hacmp/log/ffdc - with individual subdirectories under that
#    named for the type of ffdc data, e.g. application_monitor, event_error,
#    etc.
#    The type of data to collect (and hence the resulting folder name) is
#    specified with a simple command line flag, e.g. "-e" directs collection
#    of ffdc for "event_error". Should you wish to add a new collection
#    "type", you will need to add to getopts() and the subsequent case
#    block which sets the global collection type flag.
#    The set of data to collect for each type is specified in 2 ways:
#    1) given  adequate free space in the target filesystem, and being
#       careful not to consume all remaining free space, the variable
#       MAX_FILES defines the set of files to collect under ideal conditions.
#    2) if there is not adequate space to collect all the desired files,
#       the variable MIN_FILES defines the smallest set of files to collect
#       (if possible - we still don't want to consume all free space, even
#       if it means we don't collect ffdc at all).
#   When modifying this function, you will need to consider the bare 
#   minimum set of files which will be of use when debugging the problem
#   and add/define them as MIN_FILES, then think about what other files 
#   may be of use and add them to MAX_FILES.
#   Keep in mind the balance between capturing necessary data and over
#   collecting: ffdc is not free - it consumes resources on the customer's
#   mission critical high availability system so we must be extremely 
#   sensitive to the environment where this function is run.
#
###############################################################################

###############################################################################
#
#  Name:  gather_clverify_ffdc
#
#  This function gathers any error indications from the clverify logs - 
#  this may prove useful for problem determination, e.g. if clverify
#  flagged a problem that later led to an event failure.
#
#  Arguments: none
#
#  Returns:     0 - always
#
###############################################################################
function gather_clverify_ffdc {

    #
    # grep out some keywords - although this does not provide all the
    # failure information, it can provide a quick indication of recent
    # problems (clverify writes problem lines starting with "ERROR" and
    # summary lines starting with "Totals")
    #
    # use "grep -E" rather than the deprecated egrep, and quote the
    # directory in case it was redirected to a path containing spaces
    #
    grep -E '^Totals|^ERROR' "$clverify_log_dir"/* >"$LOG_DIRECTORY/clverify.$TIMESTAMP"

    # per the prolog, always return 0 - grep returning non-zero just
    # means no recent clverify problems were recorded, which is fine
    return 0
}

###############################################################################
#
#  Name:  handle_monitor_ffdc
#
#  This function does some special processing for application monitor logs.
#
#  Arguments: file name - app monitor file name
#             clamd exit code - exit code from the app monitor - see
#             src/43haes/usr/sbin/cluster/clamd/clamd_exits.h
#
#  Returns:     0 - success
#               1 - unable to collect
#
###############################################################################
function handle_monitor_ffdc {

    # $1 - full path of the clappmond daemon log file
    #      (clappmond.appMonName.rgName.log)
    # $2 - symbolic clamd exit code, used as the archive name suffix -
    #      see src/43haes/usr/sbin/cluster/clamd/clamd_exits.h
    #
    # all working variables are typeset so they do not leak into the
    # caller's (script global) name space
    typeset logFileName="$1"
    typeset errorStringSuffix="$2"

    #
    # nothing useful can be archived without an exit code or with a
    # missing/empty daemon log - note the failure in hacmp.out, since
    # the caller ignores our return code
    #
    if [[ -z "$errorStringSuffix" || ! -s "$logFileName" ]]
    then
        echo "$0: failed to capture $logFileName with suffix $errorStringSuffix at $(date)" >>$hacmpout_log_dir/hacmp.out
        return 1
    fi
    #
    # - set base clappmond daemon log file name:
    #      clappmond.appMonName.rgName.log
    # - set first instance archive log file name
    # - set last  instance archive log file name
    #
    typeset clAppMonDaemonLogName="$logFileName"
    typeset firstInstanceName1="$clAppMonDaemonLogName.first_$errorStringSuffix"
    typeset lastInstanceName1="$clAppMonDaemonLogName.last_$errorStringSuffix"
    typeset archiveFileName1=""

    #----------------------------------------
    # - if the first instance archive already
    #   exists and is of non zero bytes, this
    #   is a repeat failure:
    #      - archive under the last  instance
    #        name so the very first occurrence
    #        is preserved
    #   else
    #      - archive under the first instance
    #        name
    #----------------------------------------
    if [[ -s "$firstInstanceName1" ]]
    then
         archiveFileName1="$lastInstanceName1"
    else
         archiveFileName1="$firstInstanceName1"
    fi

    #----------------------------------------
    # - archive the clappmond daemon log file
    #   preserving the permissions.
    #----------------------------------------
    cp -fp "$logFileName" "$archiveFileName1"

    #------------------------------------------------
    # - if the error logged in the log file is:
    #      - a timeout where the application monitor
    #        did not finish
    #      or
    #      - a non zero return code was
    #        returned from the application monitor
    #   then
    #      - save the application monitor log as well
    #------------------------------------------------
    if [[ "$errorStringSuffix" == CLAMD_EXIT_MONITOR_TIMEOUT ||
          "$errorStringSuffix" == CLAMD_EXIT_MONITOR_DETECTED ]]
    then
          #------------------------------------------------------------
          # - create archive log file name for application
          #   monitor log file clappmond.appMonName.rgName.monitor.log:
          #      clappmond.appMonName.rgName.monitor.log.latest_CLAMD_*
          # - copy the log file to the archive name
          #   (the substitution is anchored at end of name, so the
          #   original "g" flag was meaningless and has been dropped)
          #------------------------------------------------------------
          typeset appMonLogName=$(echo "$clAppMonDaemonLogName" | sed "s/[.]log$/.monitor.log/")
          typeset firstInstanceName2="$appMonLogName.first_$errorStringSuffix"
          typeset lastInstanceName2="$appMonLogName.last_$errorStringSuffix"
          typeset archiveFileName2=""

          #----------------------------------------
          # same first/last instance selection as
          # for the daemon log above
          #----------------------------------------
          if [[ -s "$firstInstanceName2" ]]
          then
               archiveFileName2="$lastInstanceName2"
          else
               archiveFileName2="$firstInstanceName2"
          fi

          #-------------------------------------------
          # - archive the application monitor log file
          #   preserving the permissions.
          #-------------------------------------------
          cp -fp "$appMonLogName" "$archiveFileName2"
          return $?
    fi
    return 0
}


###############################################################################
#
# main starts here
#
###############################################################################

PROGNAME=${0##*/}
#
# Include Availability Metrics library
# (provides the amlog_trace calls used below to record FFDC
# begin/end/failure entries in the availability log)
#
. /usr/es/lib/ksh93/availability/cl_amlib

# trace this script when the cluster is configured for verbose logging
[[ "$VERBOSE_LOGGING" == "high" ]] && set -x
export PATH="$(/usr/es/sbin/cluster/utilities/cl_get_path all)"
# force the POSIX locale so the output of commands parsed below
# (df, ls, grep, etc.) is predictable
export LC_ALL=C

#
# Fetch local node name
# (only used to tag the amlog_trace entries; presumably resolved via
# the PATH set above or the sourced library - confirm)
#
LOCALNODENAME=$(get_local_nodename)

#
# The user can optionally disable ffdc through /etc/environment
#
if [[ -n "$FFDC_COLLECTION" && $FFDC_COLLECTION == "disable" ]]
then
    echo "$PROGNAME: FFDC_COLLECTION set to disable - return without collecting ffdc"
    exit 0
fi

#
# Collect logs for FFDC
#

#
# given adequate free space on the target filesystem, this variable defines
# the full set of logs to collect
#
typeset MAX_FILES=""
#
# if there is not enough space to collect the desired set, this variable
# defines the bare minimum set to collect
#
typeset MIN_FILES=""
#
# both these lists must contain full path names to the logs, e.g.
# /var/hacmp/log/hacmp.out (not just "hacmp.out")
#

#
# re-use the location set for hacmp.out for ffdc - fall back to the ODM
# default directory and then the hardcoded default, the same chain used
# for the individual log directory lookups below (previously the
# defaultdir fallback was skipped here, so LOG_DIRECTORY could disagree
# with hacmpout_log_dir derived from the very same HACMPlogs entry)
#
LOG_DIRECTORY=$(clodmget -q "name = hacmp.out" -f value -n HACMPlogs)
[[ -z $LOG_DIRECTORY ]] &&
    LOG_DIRECTORY=$(clodmget -q "name = hacmp.out" -f defaultdir -n HACMPlogs)
LOG_DIRECTORY=${LOG_DIRECTORY:-"/var/hacmp/log"}
FFDC_DIRECTORY="$LOG_DIRECTORY/ffdc"
TIMESTAMP=$(date +%Y.%m.%d.%H.%M.%S)

#
# on the off chance that the customer has redirected logs, look them up
# here
#

#
# Look up the directory for a given log in the HACMPlogs ODM class:
# prefer the user-set "value" field, then the ODM "defaultdir" field,
# then the supplied hardcoded default.
#   $1 - log name as recorded in HACMPlogs (e.g. clutils.log)
#   $2 - hardcoded default directory
# Outputs the chosen directory on stdout; always returns 0.
#
function lookup_log_dir {
    typeset log_name="$1" default_dir="$2" dir=""

    dir=$(clodmget -q "name = $log_name" -f value -n HACMPlogs)
    [[ -z $dir ]] &&
        dir=$(clodmget -q "name = $log_name" -f defaultdir -n HACMPlogs)
    printf '%s\n' "${dir:-$default_dir}"
    return 0
}

clutils_log_dir=$(lookup_log_dir clutils.log /var/hacmp/log)
clstrmgrdebug_log_dir=$(lookup_log_dir clstrmgr.debug /var/hacmp/log)
cspoc_log_dir=$(lookup_log_dir cspoc.log /var/hacmp/log)
hacmpout_log_dir=$(lookup_log_dir hacmp.out /var/hacmp/log)
clverify_log_dir=$(lookup_log_dir clverify.log /var/hacmp/log)

#
# clappmond logs are hard coded to
#
clappmond_log_dir="/var/hacmp/log"

#
# The location of the caa log is defined in the syslog.conf for whichever
# version of syslog is enabled
#
typeset SYSLOG_CONF=""
SYSLOG_CONF=$(clgetsyslog)
# Use default configuration file for any kind of failures
if (( $? != 0 ))
then
    SYSLOG_CONF="/etc/syslog.conf"
fi
# escape the '.' so we match "caa.debug" literally (the unescaped dot
# would match any character); quote the dirname argument so an absent
# caa.debug entry does not produce a usage error
caa_log_file=$(grep "^caa\.debug" ${SYSLOG_CONF} | tail -1 | awk '{print $2}')
caa_log_dir=$(dirname "$caa_log_file")

#
# rsct logs - everything under the active peer domain's cthags log
# directory except the bulky/diagnostic artifacts filtered out below
#
rsct_domain=$(lsrpdomain | tail -1 | cut -f1 -d' ')
rsct_log_dir="/var/ct/$rsct_domain/log/cthags"
rsct_logs=$(find $rsct_log_dir | grep -E -v "aua_blobs|last|nmDiag|netmon_mux")

#
# what flavor of ffdc are we collecting ?
#
# when adding a new "flavor", you will add the flag to getopts() and add
# to the case statement to set ffdc_mode and any other flags from the
# command line
#
typeset ffdc_mode="" clamd_exit_code="" event_serial=""
while getopts "a:c:e:f:iux" flag
do
    case $flag in
        a)  ffdc_mode=application_monitor
            clamd_exit_code=$OPTARG
            ;;
        c)  ffdc_mode=config_too_long
            event_serial=$OPTARG
            ;;
        e)  ffdc_mode=event_error
            event_serial=$OPTARG
            ;;
        f)  # -f only valid for application_monitor mode
            if [[ $ffdc_mode != "application_monitor" ]]
            then
                echo "$PROGNAME: -f <file> only valid with -a (application_monitor mode)"
                exit 1
            fi
            clamd_file_name=$OPTARG
            ;;
        i)  ffdc_mode=interface_failure
            ;;
        u)  ffdc_mode=utilities
            ;;
        x)  ffdc_mode=xd
            ;;
        *)  # this utility is only called by clstrmgr, but just in case,
            # check for invalid args and exit
            echo "$PROGNAME: invalid argument [$flag]"
            exit 1
            ;;
    esac
done

#
# this utility is only called by clstrmgr, but just in case,
# check for missing args and exit
#
if [[ -z $ffdc_mode ]]
then
    echo "$PROGNAME: missing argument"
    exit 1
fi

#
# create the target folder if necessary
#
FFDC_DIRECTORY=$FFDC_DIRECTORY/$ffdc_mode
if [[ ! -d  $FFDC_DIRECTORY ]]
then
    mkdir -p $FFDC_DIRECTORY
fi

#
# Remove all but the 8 most recent copies of the given set of files.
#   $@ - expanded file names (or the literal unmatched pattern, which
#        ls silently skips)
# Returns 0 always - there is nothing useful to do on a prune failure.
#
function prune_old_copies {
    typeset old_files=""

    # "tail -n +9" skips the 8 newest entries of the "ls -t" listing.
    # the previous "tail +8" kept only 7, contradicting the stated
    # intent, and that obsolescent form is rejected by POSIX-2008 tail
    # implementations. only call rm when there is something to remove.
    old_files=$(ls -t "$@" 2>/dev/null | tail -n +9)
    [[ -n $old_files ]] && rm -f $old_files
    return 0
}

#
# retain the 8 most recent FFDC collections - we do this before calculating
# free space, even though we may not collect anything in this pass
#
prune_old_copies $FFDC_DIRECTORY/eventlogs.*
prune_old_copies $LOG_DIRECTORY/processes.*
prune_old_copies $LOG_DIRECTORY/errpt.*
prune_old_copies $LOG_DIRECTORY/event_serial.*
prune_old_copies $LOG_DIRECTORY/clverify.*

#
# Get the free space for the target directory
#
typeset -i FREE_SPACE=0
FREE_SPACE=$(df -k $FFDC_DIRECTORY | tail -1 | awk '{print $3}')

#
# we will collect the minimal set of logs if space is limited, otherwise
# collect a larger set - this is where you define or modify the set
# of logs to collect for each type of ffdc
#
case $ffdc_mode in
    application_monitor)
        #
        # a monitor can exit with different codes - some fatal, some not.
        # the clamd will only call this function for errors of interest,
        # so in addition to saving logs we want to rename them based on
        # the specific failure so that they are more easily identified
        #
        if ! handle_monitor_ffdc $clamd_file_name $clamd_exit_code
        then
            # no need to continue
            exit 0
        fi

        #
        # app monitor failure is not likely to be accompanied by anything
        # in hacmp.out or clstrmgr.debug, so the minimal set of logs is just
        #
        MIN_FILES="$clappmond_log_dir/clappmond*"
        #
        # if space allows, collect a few more, as they may have interesting
        # background related to the monitor failure (or prior failures)
        #
        MAX_FILES="$MIN_FILES $hacmpout_log_dir/hacmp.out $clstrmgrdebug_log_dir/clstrmgr.debug $clstrmgrdebug_log_dir/clstrmgr.debug.long"
        ;;
    config_too_long)
        # capture the serial number of the current event
        echo "$PROGNAME current event serial number is $event_serial" > $LOG_DIRECTORY/event_serial.$TIMESTAMP

        #
        # config too long fires after 6 minutes of inactivity from a
        # cluster event - it may indicate an error occurred, in which case
        # ffdc would have already been called for event_error, and this
        # invocation is not likely to capture anything more useful, but
        # if config too long occurs because of a hang, the most critical
        # log will be hacmp.out
        #
        MIN_FILES="$LOG_DIRECTORY/event_serial.$TIMESTAMP $hacmpout_log_dir/hacmp.out"
        #
        # if space allows, collect a few more
        #
        MAX_FILES="$MIN_FILES $hacmpout_log_dir/hacmp.out.1 $clstrmgrdebug_log_dir/clstrmgr.debug* $caa_log_dir/syslog.caa* $rsct_logs"
        ;;
    event_error)
        # capture the serial number of the failed event
        echo "$PROGNAME current event serial number is $event_serial" > $LOG_DIRECTORY/event_serial.$TIMESTAMP

        #
        # event_error occurs when an event script exits with a non-zero
        # return code: the most critical log will be hacmp.out
        #
        MIN_FILES="$LOG_DIRECTORY/event_serial.$TIMESTAMP $hacmpout_log_dir/hacmp.out"
        #
        # if space allows, collect a few more
        #
        MAX_FILES="$MIN_FILES $clstrmgrdebug_log_dir/clstrmgr.debug $clstrmgrdebug_log_dir/clstrmgr.debug.long"
        ;;
    interface_failure)
        #
        # interface monitoring is done by caa and the notification is
        # passed up through rsct to the clstrmgr, which may or may not
        # run an event in response, so the minimum set is
        #
        MIN_FILES="$clstrmgrdebug_log_dir/clstrmgr.debug $clstrmgrdebug_log_dir/clstrmgr.debug.long $caa_log_dir/syslog.caa"
        #
        # if space allows, collect a few more
        #
        MAX_FILES="$MIN_FILES $hacmpout_log_dir/hacmp.out $clstrmgrdebug_log_dir/clstrmgr.debug* $caa_log_dir/syslog.caa* $rsct_logs"
        ;;
    utilities)
        #
        # any time a utility fails, it may be the result of a user error
        # which is not of much interest, or it may be because a system command
        # failed, in which case we find interesting info in
        #
        typeset THIS_USER=${SUDO_USER:-${LOGNAME:-"root"}}
        if [[ $THIS_USER == root ]]; then
            # root's home directory on AIX is /
            smit_logs="/smit.log"
        else
            smit_logs=$(ls -1 $HOME/smit.* 2>/dev/null)
        fi

        MIN_FILES="$smit_logs $clutils_log_dir/clutils.log* $cspoc_log_dir/cspoc.*"
        #
        # not much more to grab
        #
        MAX_FILES="$MIN_FILES"
        ;;
    xd)
        #
        # the clxd log files will only be in HACMPlogs if xd is installed,
        # so we don't look them up along with the others, we do it here,
        # since xd ffdc will only be called if xd is installed
        #
        clxd_log_dir=$(clodmget -q "name = clxd.log" -f value -n HACMPlogs)
        [[ -z $clxd_log_dir ]] &&
            clxd_log_dir=$(clodmget -q "name = clxd.log" -f defaultdir -n HACMPlogs)
        # fall back to the clxd default location - the original code
        # mistakenly expanded $cspoc_log_dir here, discarding the
        # directory just looked up from the ODM
        clxd_log_dir=${clxd_log_dir:-"/var/hacmp/xd/log"}

        #
        # failures in the xd subsystems may indicate a problem with storage,
        # so we capture all storage specific (EMC, ds8k, etc)
        # stuff by default - it is not likely that customers will have
        # multiple mixed storage technologies, so grabbing them all should
        # not be very expensive. the base logs are
        #
        MIN_FILES="$clxd_log_dir/clxd.log $clxd_log_dir/clxd_debug.log"
        #
        # space permitting, collect them all
        # (the stray word "and" in the original list would have been
        # handed to pax as a file name - it has been removed)
        #
        MAX_FILES="$MIN_FILES $clxd_log_dir/clxd*.log* /var/hacmp/log/svcpprc/* /var/hacmp/log/pprc/* /var/hacmp/log/spprc/* /var/hacmp/log/consistgroup/*"
        ;;
esac

#
# add ps, clverify and errpt
#
# Avoided the use of "ps -ef" here due to a reported field issue where when
# user accounts are defined in LDAP, the "-f" listing attempts to look up the
# user name from the LDAP server. If it is unreachable, it can take 5 - 10
# minutes to time out. The "ps -eo" string below collects the same information
# as "ps -ef", but without the user ID conversions to a proper name. Hopefully
# this will avoid the problem that the customer saw.
#
ps -eo 'uid,pid,ppid,cpu,stime,tty,time,args' > $LOG_DIRECTORY/processes.$TIMESTAMP
errpt -a -s $(date +'%m%d0000%y') > $LOG_DIRECTORY/errpt.$TIMESTAMP
gather_clverify_ffdc

MIN_FILES="$MIN_FILES $LOG_DIRECTORY/processes.$TIMESTAMP $LOG_DIRECTORY/errpt.$TIMESTAMP $LOG_DIRECTORY/clverify.$TIMESTAMP"
MAX_FILES="$MAX_FILES $LOG_DIRECTORY/processes.$TIMESTAMP $LOG_DIRECTORY/errpt.$TIMESTAMP $LOG_DIRECTORY/clverify.$TIMESTAMP"

#
# prune the lists - in general, files older than a couple days wont have
# valuable data
#
MAX_FILES=$(find $MAX_FILES -prune -type f -mtime -2)
MIN_FILES=$(find $MIN_FILES -prune -type f -mtime -2)

#
# check the space, starting with the max desired list
#
FILES=$MAX_FILES
REQUIRED_SPACE=$(du -sk $FILES | awk '{x+=$1}END{print x}')

#
# even though there may be adequate space, we must be careful to not
# consume all the free space, which could impact the customers application.
# therefore we multiply the required space, just to be sure
#
(( REQUIRED_SPACE *= 2 ))
if (( FREE_SPACE < REQUIRED_SPACE ))
then
    #
    # not enough space for all the files, try the min list
    #
    FILES=$MIN_FILES
    REQUIRED_SPACE=$(du -sk $FILES | awk '{x+=$1}END{print x}')
    (( REQUIRED_SPACE *= 2 ))
fi

#
# check again to make sure we have any chance of success. this script is
# called from clstrmgr which will ignore any errors, so log the problem
# to hacmp.out if we cannot collect
#
if (( FREE_SPACE < REQUIRED_SPACE ))
then
    echo "$PROGNAME: FFDC collection failed at $(date) because of a lack of free space in $FFDC_DIRECTORY:\navailable space was $FREE_SPACE, required space is $REQUIRED_SPACE.\n" >>$hacmpout_log_dir/hacmp.out
    exit 1
fi

echo "$PROGNAME: FFDC collection started at $(date)\n" >>$hacmpout_log_dir/hacmp.out

#
# Log FFDC start entry in availability log file
#
amlog_trace $AM_FFDC_BEGIN "FFDC|$LOCALNODENAME"

#
#  pax and compress the logs - the -s substitutions relocate everything
#  under a ./ffdc.$TIMESTAMP prefix inside the archive
#
pax -s "!/var/adm/ras!./ffdc.$TIMESTAMP!" -s "!^!./ffdc.$TIMESTAMP!" -wvf $FFDC_DIRECTORY/eventlogs.$TIMESTAMP.pax $FILES 2>>$hacmpout_log_dir/hacmp.out
if (( $? ))
then
    echo "$PROGNAME: FFDC collection failed at $(date) because the pax command failed\n" >>$hacmpout_log_dir/hacmp.out
    #
    # Log FFDC failure entry in availability log file
    #
    amlog_trace $AM_FFDC_FAILURE "FFDC|$LOCALNODENAME"
    exit 1
fi
compress $FFDC_DIRECTORY/eventlogs.$TIMESTAMP.pax

#
# misc logs were saved in the archive - delete the copies now.
# processes and errpt were written to $LOG_DIRECTORY above, so they
# must be removed from there: the original code removed them from
# $FFDC_DIRECTORY, where they never existed, leaving the copies behind.
# (clverify.$TIMESTAMP is deliberately left to the pruning done on the
# next pass - TODO confirm it should not be removed here as well)
#
rm -f $LOG_DIRECTORY/processes.$TIMESTAMP
rm -f $LOG_DIRECTORY/errpt.$TIMESTAMP
rm -f $LOG_DIRECTORY/event_serial.$TIMESTAMP

echo "FFDC event log collection saved to $FFDC_DIRECTORY/eventlogs.$TIMESTAMP.pax.Z\n" >>$hacmpout_log_dir/hacmp.out

#
# Log FFDC end entry in availability log file
#
amlog_trace $AM_FFDC_END "FFDC|$LOCALNODENAME"

exit 0
