# $Header: emdb/sysman/admin/scripts/alertlog.pl /st_emdbsa_11.2/2 2008/12/22 10:30:22 ychan Exp $
#
# alertlog.pl
# 
# Copyright (c) 2002, 2008, Oracle and/or its affiliates.All rights reserved. 
#
#    NAME
#      alertlog.pl - scans an alert log file for patterns
#
#    DESCRIPTION
#      Given a filename and a pattern, alertlog greps thru an oracle log
#      looking for one pattern while (optionally) ignoring another.  Matching
#      lines are returned in a collection along with the most recent time
#      stamp logged.
#
#    NOTES
#      The output format (column list) of this script is dynamic: the number of
#       columns == n+5, n==EM_METRIC_ERROR_PATTERNS.
#       1: Timestamp/Line Number
#       2: Errors matching no patterns (generic)
#       3..(n+2): Errors matching patterns
#          EM_METRIC_ERROR_PATTERN1..EM_METRIC_ERROR_PATTERNn
#       n+3: Trace file
#       n+4: Error code list
#       n+5: Alert log filename
#
#    MODIFIED   (MM/DD/YY)
#      ychan     12/10/08 - XbranchMerge ychan_bug-6343005 from
#                           st_emdbgc_10.2.0.1.0
#      ychan     11/26/08 - Fix bug 6343005
#      ajdsouza  10/28/06 - Big fix 7422639 
#      mkiran    03/10/07 - 5711362: Report deadlock (ORA-0*60) at GC.
#      hying     12/10/05 - Backport hying_bug-4734310 from main 
#      hying     12/01/05 - 4734310, NT CRSAlert fix 
#      ychan     09/16/05 - Fix bug 4613946 
#      hying     08/11/05 - 4547860
#      hying     04/08/05 - no generic error for CRS 
#      hying     01/03/05 - errorPrefix for alertlog 
#      jsoule    11/29/04 - accept filename/timestamp pattern 
#      jsoule    03/09/04 - provide for flexible columns 
#      jsoule    01/22/04 - tune alertlog monitoring for large error stacks 
#      jsoule    10/27/03 - report error if alert log dir not found 
#      jsoule    10/03/03 - use agentStateDir to locate state 
#      jsoule    09/04/03 - correct offset lookup 
#      jsoule    08/29/03 - allow either alert log file format 
#      jsoule    07/08/03 - improve scalability
#      jsoule    05/27/03 - discard incomplete error stacks
#      jsoule    04/14/03 - add 16038 to the archiver hung category
#      jsoule    03/28/03 - roll up all similar error stacks
#      jsoule    11/27/02 - check ignores after switching to specific error column
#      jsoule    11/26/02 - capture archiver errors appropriately
#      jsoule    11/06/02 - move general alertlog logic to alertlog_util
#      jsoule    10/25/02 - move metric state files from upload to state
#      jsoule    10/17/02 - remove unnecessary if & count lines on bad state
#      jsoule    10/15/02 - return a single-part key
#      jsoule    09/30/02 - do not return the results of the first scan
#      jsoule    09/04/02 - only rollup 600 & 7445 errors
#      jsoule    08/30/02 - change output behavior for "no data
#      jsoule    08/28/02 - fix trace file match
#      jsoule    08/22/02 - modify alert log metric implementation
#      ychan     07/11/02 - Filter db ora errors
#      xxu       06/24/02 - remove /usr/local/bin/perl
#      jsoule    04/05/02 - support timestamps between the 1st and 9th of the month.
#      jsoule    03/14/02 - return null for no problems; transient error code list.
#      prabuck   02/25/02 - changed target directory for writing alert log state files
#      prabuck   02/21/02 - removed parameters known to environment from command line
#      qsong     01/25/02 - Merged qsong_alertlog_again
#      prabuck   01/09/02 - Creation
#

# use/require appropriate libraries

use strict;
use Time::HiRes;
use HTTP::Date;

require "emd_common.pl";
require "alertlog_util.pl";

# Command Line Arguments

my $log_file_absolute    = shift(@ARGV); # log file (absolute)
my $log_timestamp_format = shift(@ARGV); # log (entry) timestamp format
my $errorPrefix = shift(@ARGV); # error prefix for alert log, ORA or CRS

# If CRS, errorPrefix is in the format of CRS%CRSVersion%
if ($errorPrefix =~ /CRS(.+)/)
{
  if ($1 =~ /10.1/)
  {
    EMAGENT_PERL_INFO("CRS Alert is not supported in version $1");
    exit 0;
  }
  ## Strip CRSVersion off
  $errorPrefix = 'CRS';
}

my $backgroundDumpDest   = shift(@ARGV); # background_dump_dest
my $userDumpDest         = shift(@ARGV); # user_dump_dest

my $limitSwitch = shift(@ARGV);    # The switch that limits the amount of
                                   # output this scan can produce.
                                   # Convention: > 0 => time (sec.) to spend
                                   #             eq "EOF" => skip to the end
                                   #             otherwise ignore

my $ignorePattern = shift(@ARGV);  # The pattern (a PERL regular expression)
                                   # for determining which log entries to
                                   # ignore.

# End Arguments

# get target_name, $ORACLE_HOME, and $ORACLE_SID from environment
my $targetGuid = $ENV{EM_TARGET_GUID};
my $oracle_home = $ENV{EM_TARGET_ORACLE_HOME};
my $SID = $ENV{EM_TARGET_ORACLE_SID};
if (EMAGENT_isPerlDebugEnabled())
{
    EMAGENT_PERL_DEBUG("target: $targetGuid; (pattern to suppress: $ignorePattern)");
}

# establish dump destinations
$backgroundDumpDest = get_absolute_dump_dest($backgroundDumpDest, $oracle_home);
$userDumpDest = get_absolute_dump_dest($userDumpDest, $oracle_home);

if ($log_file_absolute eq "[MISSING_LOG]")
{
    # silently quit if the alert log file has been removed
    EMAGENT_PERL_INFO("No log file provided.");
    exit 0;
}
else
{
    if (EMAGENT_isPerlDebugEnabled())
    {
        EMAGENT_PERL_DEBUG("log file (absolute): $log_file_absolute.");
    }
}

# 4734310
if ($^O =~ m/MSWin32/) {
  $log_file_absolute =~ s/\//\\/g;
}
my @path_comps = split(($^O =~ m/MSWin32/ ? "\\\\" : "\/"), $log_file_absolute);
my $log_file_relative = $path_comps[$#path_comps];

my $state_root = $ENV{EM_AGENT_STATE_DIR};

# establish the columns that are not "generic"
my @columnPatterns = ();
&initializeColumnPatterns;

# extract array of numbers (31, 023, etc.) from log_timestamp_format

if (EMAGENT_isPerlDebugEnabled())
{
    EMAGENT_PERL_DEBUG("log timestamp format spec: $log_timestamp_format.");
}

my @max = split(/\D/, $log_timestamp_format);
my $max = \@max;

# translate format into regular expression
$log_timestamp_format =~ s/\d+/(\\d+)/g;

if (EMAGENT_isPerlDebugEnabled())
{
    EMAGENT_PERL_DEBUG("log timestamp format regexp: $log_timestamp_format.");
}

#
# Check the accessibility of the alert log.
#
if (!open(LOG,"< $log_file_absolute"))
{
    # This can happen in two cases:
    # 1) Alert log exists (i.e., was identifiable) but is not readable.  This
    #    certainly warrants a warning message.
    # 2) Alert log does not exist, and has been removed since the log file was
    #    identified.  This is not technically a warning situation -- ideally,
    #    the alert log identification should be cleared, but at this point
    #    that's not feasible.
    if (EMAGENT_isPerlWarningEnabled())
    {
        EMAGENT_PERL_WARN("target: $targetGuid; unable to open alert log at $log_file_absolute");
    }
    exit 0;
}

#
# Build the fully qualify scanned file path.
#

my $separator = $^O =~ m/MSWin32/ ? "\\" : "\/";

#
# Location of the offsets state file is
#  $EMDROOT/sysman/emd/state/<TARGET_GUID>.<alert log filename>
#
my $scannedFile = $state_root.$separator."sysman".$separator."emd".$separator."state".$separator."$targetGuid.$log_file_relative";

# capture start time, in seconds and track the number of stacks processed so
# the time check is performed relatively infrequently.
my $start_time = Time::HiRes::time;
my $stacks_processed = 0;

#
# Limit switch must be:
# - a positive integer (representing seconds)
# - the string "EOF" (indicating that the tail should be skipped)
# - the string "tail" (indicating the normal behavior, process the entire tail)
# When it's something other than these, the "tail" behavior is followed.
#
if ($limitSwitch =~ m/^\d+$/)
{
    if (EMAGENT_isPerlDebugEnabled())
    {
        EMAGENT_PERL_DEBUG("Stopping soon after $limitSwitch seconds.");
    }
}
elsif ($limitSwitch eq "EOF")
{
    if (EMAGENT_isPerlDebugEnabled())
    {
        EMAGENT_PERL_DEBUG("Skipping remaining entries in $log_file_absolute");
    }
}
else
{
    if ($limitSwitch ne "tail")
    {
        if (EMAGENT_isPerlWarningEnabled())
        {
            EMAGENT_PERL_WARN("Invalid limit switch specification ($limitSwitch) -- using 'tail' option");
        }
    }

    $limitSwitch = 0;
}

my $offset = 0;
my $line = 0;

#
# Open scanned file for reading (if scanned file does not exist, create it.)
#
if (!open(SCANNED, "+>> $scannedFile"))
{
    EMAGENT_PERL_ERROR("target: $targetGuid; Cannot open $scannedFile for read/write.");
    exit 1;
}

# seek to top of file

seek(SCANNED, 0, 0);

# file only contains a single line with two numbers:  an offset and line number
# get these and close the scanned file

while (<SCANNED>)
{
    my @pos = split(' ', $_);
    my $pos = \@pos;

    $offset = $pos->[0];
    $line = $pos->[1];
}

close(SCANNED);

my @errStackKeys;          # "keys" for already seen error stacks
my $initialTimestamp = ""; # timestamp for the current error stack

if ($offset == 0 || $limitSwitch eq "EOF")
{
    if (EMAGENT_isPerlDebugEnabled())
    {
        EMAGENT_PERL_DEBUG("target $targetGuid; counting alert log lines only");
    }

    if ($offset == 0)
    {
        #
        # Trust neither offset nor line, and start counting from the beginning.
        #
        $line = 0;
    }
    else
    {
        #
        # Trust both offset and line, and start counting from the offset.
        #
        seek(LOG, $offset, 0);
    }

    while (<LOG>)
    {
        $line++;
    }
}
else
{
    if (EMAGENT_isPerlDebugEnabled())
    {
        EMAGENT_PERL_DEBUG("target $targetGuid; processing alert log tail starting at byte offset $offset, line $line");
    }

    #
    # Scan, and process, the alert log as long as the byte offset is valid
    # for the existing alert log.
    #
    my $lineNumber = 0;        # line number of timestamp
    my $errStack = "";         # content of the current error stack
    my $errCodes = "";         # key for the current error stack
    my $traceFileName = "";    # name of trace file

    #
    # Go directly to EOF (Do not pass GO. Do not collect $200.)
    # Do not read past the end of the file (at the time we first looked).
    #
    seek(LOG, 0, 2);
    my $eof_offset = tell(LOG);
    if (EMAGENT_isPerlDebugEnabled())
    {
        EMAGENT_PERL_DEBUG("EOF position is $eof_offset.");
    }

    #
    # Also, record the file's offset before the current line read, in case we
    # are time- or size-limited and need to reset the LOG file descriptor.
    #
    my $prev_offset;

    if ($offset <= $eof_offset)
    {
        #
        # A valid offset was provided, seek to it and process the lines.
        #
        seek(LOG, $offset, 0);
        $prev_offset = $offset;

        my $addContent;
        my $stack_offset = $offset;

        while (<LOG>)
        {
            $line++;

            #
            # Clean up the line (remove \n, leading, and trailing whitespace).
            #
            chop;
            $_ =~ s/^\s+//;
            $_ =~ s/\s+$//;

            #
            # Discontinue adding error stack content if we've hit the limit
            # on string size supported by EM.
            #
            $addContent = ($prev_offset - $stack_offset < 4000);

            #
            # Look for a new timestamp entry.
            #
            if ($_ =~ m/$log_timestamp_format/ && &timestampValid($&))
            {
                #
                # Process error stack timestamp.
                #

                #
                # Check for a time or size limiting condition.
                # (Note: $prev_offset is a function of the current size of the
                #        file, $eof_offset is the original size of the file;
                #        we don't want to "chase" the end of the file when
                #        it's growing at a rate faster than our scan rate!)
                #
                if (&outOfTime || $prev_offset >= $eof_offset)
                {
                    #
                    # "Unscan" the latest timestamp by backing up to the
                    # previous offset and a single line.
                    #
                    $line--;
                    seek(LOG, $prev_offset, 0);

                    if (EMAGENT_isPerlDebugEnabled())
                    {
                        EMAGENT_PERL_DEBUG("Stopped at " . Time::HiRes::time);
                    }

                    #
                    # Break from the input loop BEFORE writing the error stack
                    # (the last error stack is written after all lines to be
                    # read are).
                    #
                    last;
                }

                #
                # Dump the existing errStack, if there is one.
                #
                if ($initialTimestamp && $errCodes)
                {
                    &writeErrStack($initialTimestamp, $lineNumber,
                                   $errStack, $errCodes,
                                   $traceFileName);
                }

                #
                # Begin a new error stack.  All tests passed, save valid
                # timestamp
                #
                $initialTimestamp = $&;
                $lineNumber = $line;

                $errCodes = "";   # reset information regarding the stack
                $errStack = "";
                $traceFileName = &traceFileOf($_);
                $stack_offset = tell(LOG);

                $stacks_processed++;

                $prev_offset = $stack_offset;
            }
            else
            {
                #
                # Process error stack content.
                #

                #
                # Augment archiver errors so that they are identified
                # (appropriately) as ORA- errors.
                #
                s/(ARC[0-9]*: Error )([0-9]+[^0-9].*)/\1ORA\-\2/;

                if (!$traceFileName && $lineNumber + 1 == $line)
                {
                    #
                    # (Trace file identification is not reported in the stack)
                    #
                    $traceFileName = &traceFileOf($_);
                    $addContent = ($traceFileName eq "");

                    # 5711362: Error stack for deadlock includes ORA-0*60 and
                    # the trace file name on the same line. Inorder to report
                    # deadlock at GC, we consider ORA-0*60 as an exception to 
                    # trace file skip from errStack.
                    if ($traceFileName ne "")
                    {
                        if ($_ =~ m/$errorPrefix-(\d*)/)
                        {
                            my $error = $1;
                            if ( ($errorPrefix eq "ORA") && ($error == 60) )
                            {
                                $addContent = 1;
                            }
                            # bug fix 7422639
                            elsif ( $errorPrefix eq "CRS" )
                            {
                                $addContent = 1;
                            }
                        }
                    }
                }

                if ($addContent)
                {
                    #
                    # (Any other content is, as long as we haven't hit a limit)
                    #
                    if ($errStack eq "")
                    {
                      $errStack .= $_;
                    }
                    else
                    {
                      $errStack .= "~".$_;
                    }

                    my $errCode = &errorCodeOf($_);
                    if ($errCode)
                    {
                        if ($errCodes)
                        {
                            $errCodes .= ", ";
                        }
                        $errCodes .= $errCode;
                    }
                }

                $prev_offset = tell(LOG);
            }
        }

        #
        # Dump the final errStack.
        #
        if ($initialTimestamp && $errCodes)
        {
            &writeErrStack($initialTimestamp, $lineNumber,
                           $errStack, $errCodes,
                           $traceFileName);
        }
    }
    else
    {
        #
        # We know the offset is bogus, so alert log has been recycled;
        # offset will be computed later, but line count should be computed now.
        #
        seek(LOG, 0, 0);

        $line = 0;
        while (<LOG>)
        {
            $line++;
        }
    }
}

$offset = tell(LOG);

#
# Reopen scanned file and write out new offset (log size), line number,
# and time
#
open (SCANNED, "+> $scannedFile") or die "em_error=Cannot open $scannedFile";
print SCANNED "$offset $line $initialTimestamp\n";
close(SCANNED);

close(LOG);

#
# subroutine initializeColumnPatterns
#  Initialize the array of patterns that define non-generic error categories.
#
sub initializeColumnPatterns {
    $#columnPatterns = $ENV{EM_ALERTLOG_ERROR_TYPES} - 1;
    if (EMAGENT_isPerlDebugEnabled())
    {
        EMAGENT_PERL_DEBUG("$ENV{EM_ALERTLOG_ERROR_TYPES} additional columns defined.");
    }
    for ($[+1..$#columnPatterns+1)
    {
        @columnPatterns[$_-1] = $ENV{"EM_ALERTLOG_ERROR_TYPE$_"};
        if (!@columnPatterns[$_-1])
        {
            EMAGENT_PERL_WARN("Column $_ contains no definition.");
        }
        else
        {
            # translate commas to regexp alternatives; drop whitespace
            @columnPatterns[$_-1] =~ tr/, /|/d;

            if (EMAGENT_isPerlDebugEnabled())
            {
                EMAGENT_PERL_DEBUG("Column $_ matches errors @columnPatterns[$_-1].");
            }
        }
    }
}

#
# subroutine timestampValid
#  Check the validity of a timestamp.
#
# $_[0]      - timestamp to validate
#

sub timestampValid {

    my $tsValid = 1;

    my @current = split (/\D/, $_[0]);  # split out each number from timestamp
    my $currval = \@current;

    my $i = 0;

    my $maxval;

    foreach $maxval (@max)   # verify each number is within valid range
    {
        if ($maxval != "")
        {
            while (ord(substr($currval->[$i], 0, 1)) < 48 && 
                   $i < scalar(@current))
            {
                $i++;    # skip null values within @current
            }

            if ($currval->[$i] > $maxval)
            {
                $tsValid = 0;   # timestamp invalid...number too large  
            }

            $currval->[$i] =~ s/0//;
            
            if ($currval->[$i] == "") 
            {
                if (ord(substr($maxval, 0, 1)) > 48)
                {
                    $tsValid = 0;   # timestamp invalid...number = 0
                }
            }
            $currval->[$i] = "";
        }
    }

    $tsValid == 1;
}

#
# subroutine outOfTime
#  Determine if we've bumped up against a time limit.
#
sub outOfTime {
  #
  # Check every 512 log entries to see if we've been time-limited
  #
  !($stacks_processed % 128) &&
  $limitSwitch > 0 &&
  Time::HiRes::time - $start_time >= $limitSwitch;
}

#
# subroutine traceFileOf
#  Extract the trace file of the line.
#
# $_[0]      - line to search for error codes in
#

sub traceFileOf {

    my $traceFilePortion;
    my $loc = $[;

    if (($loc = index($_[0], $backgroundDumpDest, $loc)) >= $[)
    {
        $traceFilePortion = substr($_[0], $loc);
    }
    elsif (($loc = index($_[0], $userDumpDest, $loc)) >= $[)
    {
        $traceFilePortion = substr($_[0], $loc);
    }
    else
    {
        $traceFilePortion = "";
    }

    if ($traceFilePortion ne "")
    {
        if ($traceFilePortion =~ m/([^\s]*).*$/)
        {
            if (substr($traceFilePortion, length($1)-1) eq ":")
            {
                #
                # Remove trailing colon.
                #
                $traceFilePortion = substr($1, 0, length($1)-1);
            }
	}
    }

    $traceFilePortion;
}

#
# subroutine errorCodeOf
#  Extract the error codes of the line.
#
# $_[0]      - line to search for error codes in
#

sub errorCodeOf {

    my $error = "";
    if ($_[0] =~ m/$errorPrefix-(\d*)/)
    {
        $error = $1;
        if ($errorPrefix eq "ORA" && ($error == 600 || $error == 7445) &&
            $_[0] =~ m/^[^\[]*(\[[^\]]*\])/)
        {
            $error .= $1;
        }
    }

    $error;
}

#
# subroutine writeErrStack
#  Write out the current error stack.  Do nothing if we've seen this before.
#
# $_[0]      - initialTimestamp
# $_[1]      - lineNumber
# $_[2]      - errStack
# $_[3]      - errCodes
# $_[4]      - traceFileName
#

sub writeErrStack {

    #
    # combine all similar error stacks
    #
    # to revert to only rolling up identical 600 and 7445 errors,
    #  and (&&) the following condition to the first argument of grep:
    #
    #    m/(600\[[^\]]*\])|(7445\[[^\]]*\])/
    #

    my @matchedStackKeys = grep($_ eq $_[3], @errStackKeys);
    
    if ($#matchedStackKeys == ($[ - 1))
    {
        push(@errStackKeys, $_[3]);

        #
        # Compose the output line (just in case it isn't ignored).
        #
        my $isGeneric = 1;
        my $outputLine = "$_[0]/$_[1]|"; # errStackTimestamp/# lineNumber

        my $errorColumns = "|";
        if ($errorPrefix eq "CRS") # No generic column
        {
            $errorColumns = "";
        }
        my $column = 0;
        my $errorPattern;

	while ($column < @columnPatterns)
        {
            # if we're still considering this error stack possibly generic and
            #  an error list was supplied...
            if ($isGeneric && @columnPatterns[$column])
            {
                $errorPattern = "$errorPrefix-(" . @columnPatterns[$column] . ")";
                # ...attempt a match to reroute this error to a different
                #     column.
                if ($_[2] =~ m/$errorPattern/)
                {
                    $errorColumns .= $_[2];
                    $isGeneric = 0;
                }
            }
            $errorColumns .= "|";
            $column++;
        }
        if (($errorPrefix ne "CRS") && $isGeneric)
        {
            $outputLine .= $_[2];                  # generic error column
        }
        if ($errorPrefix ne "CRS")
        {
            $outputLine .= $errorColumns .             # other error columns
                           "$_[4]|$_[3]|$log_file_absolute\n";  # traceFileName,
                                                   # errCodes,
                                                   # fileName
            #
            # Write the line when either
            #  - there is no pattern to ignore or
            #  - this is not a generic error stack or
            #  - the generic error stack does not match the ignore pattern
            #
            if (!$ignorePattern || !$isGeneric || ($_[2] !~ m/$ignorePattern/))
            {
                # write the timestamp as the format e.g 2008-11-25 02:06:22
                # put it as the last column
                my ($year, $month, $day, $hour, $min, $sec) = HTTP::Date::parse_date($_[0]);
                my $agentTimestamp = $year . "-" . $month . "-" . $day . " ".
                                 $hour . ":" . $min . ":" . $sec ;
                $outputLine = "em_result=" .$agentTimestamp . "|" . $outputLine; 
                print $outputLine;
                EMD_PERL_DEBUG($outputLine);
            }
        }
        else ## for CRS alert log, no traceFileName
        {
            $outputLine .= $errorColumns .             # other error columns
                           "$_[3]|$log_file_absolute\n"; # errCodes,
                                                   # fileName
            #
            # Write the line when it is not generic
            if (!$isGeneric)
            {
                $outputLine = "em_result=" . $outputLine;
                print $outputLine;
                EMD_PERL_DEBUG($outputLine);
            }
        }

    }
}

exit 0;
