#!/bin/ksh
# Script Name: ha_report.ksh
# Author: Mohankumar Gandhi
# EMAIL: mohankumarg@in.ibm.com
# Creation Date: 10th Aug 2017
# Function 1: Try to identify the reason for the recent reboot.
# Function 2: Ping test between the cluster pair.
# Function 3: Analyzing the cl_event_summary log.
# Function 4: Analyzing the cluster.log.
# Function 5: Analyzing the cspoc.log.
# Function 6: Analyzing the Cluster health status.
# Function 7: Analyzing the clverify.log.
# Function 8: Analyzing the Resource Group(s).
# Function 9: Analyzing the cluster ODM verification across cluster participant nodes.
# Function 10: Checking the existence of "config_too_long" process.
# Version = 9.0
# Host this report is generated on; used in every message below.
NODE=$(hostname)
# Every "Problem:" line is appended to this file and replayed at the end of
# the report as the overall problem summary.
HA_REP_SUMM="/tmp/ha_rep_summ.log"

# Opening banner. Each echo must carry its text on the same line: an echo
# followed by the string on the next line prints an empty line and then tries
# to run the string as a command.
echo "\033[1;36m*************************************************************\033[m"
echo "\033[1;36m STARTING THE HA REPORT FOR $NODE SERVER \033[m"
echo "\033[1;36m*************************************************************\033[m"
echo " "
### START THE ERRPT PORTION TO CHECK SHUTDOWN/REBOOT
# Scans the AIX error log (errpt) over the last LOOKBACK_DAYS days for four
# groups of error labels (reboot/shutdown, software, hardware, PowerHA/HACMP),
# prints a Good/Problem verdict for each group and appends every Problem line
# to $HA_REP_SUMM.

# errpt_rule
#   Print the horizontal rule that frames the errpt detail listings.
errpt_rule() {
  echo "----------------------------------------------------------------------------"
}

# errpt_collect LABELS LOGFILE
#   Append to LOGFILE the errpt summary lines for the comma-separated error
#   LABELS logged between $Ydate and $Curr_date, without the header line.
errpt_collect() {
  errpt -J "$1" -s "$Ydate" -e "$Curr_date" | grep -v IDENTIFIER >> "$2"
}

# errpt_report LOGFILE PROBLEM_TEXT DETAIL_TITLE GOOD_TEXT
#   Non-empty LOGFILE: print PROBLEM_TEXT in red, record the same line in
#   $HA_REP_SUMM and list LOGFILE under DETAIL_TITLE.
#   Empty or missing LOGFILE: print GOOD_TEXT in green.
errpt_report() {
  if [[ -s "$1" ]]; then
    echo "\033[1;31m $2\033[m"
    echo "\033[1;31m $2\033[m" >> "$HA_REP_SUMM"
    errpt_rule
    echo " $3"
    errpt_rule
    cat "$1"
    errpt_rule
  else
    echo "\033[1;32m $4\033[m"
  fi
}

# errpt_report_reboot
#   Reboot verdict based on $SHUT_REBO_LOG. When one of the collected entries
#   carries the identifier $REB_ID, the detailed errpt text held in $REB_ERR
#   is printed as well.
errpt_report_reboot() {
  if [[ -s "$SHUT_REBO_LOG" ]]; then
    echo "\033[1;31m Problem: $NODE got rebooted within 2 days\033[m"
    echo "\033[1;31m Problem: $NODE got rebooted within 2 days\033[m" >> "$HA_REP_SUMM"
    echo " Checking REBOOT_ID on the $NODE"
    # grep -q succeeds however many reboot entries were logged; comparing the
    # whole grep output with the identifier only worked for exactly one entry.
    if awk '{print $1}' "$SHUT_REBO_LOG" | grep -q "$REB_ID"; then
      errpt_rule
      echo " ERRPT Details for the Reboot"
      errpt_rule
      # REB_ERR holds the report text itself, not a file name, so it is
      # printed rather than passed to cat.
      printf '%s\n' "$REB_ERR"
      errpt_rule
    else
      echo " Unable to identify the REBOOT type, Please check errpt manually"
    fi
  else
    echo "\033[1;32m Good: $NODE did not reboot within 2 days\033[m"
  fi
}

echo "\033[1;33m******* STARTING THE ERRPT PORTION TO CHECK SHUTDOWN/REBOOT/HA ERRORS *******\033[m"
echo " $NODE server's recent Reboot/Shutdown status"
errpt_rule

SHUT_REBO_LOG="/tmp/shut_rebo.log"
SW_PROB_LOG="/tmp/sw_prob.log"
HW_PROB_LOG="/tmp/hw_prob.log"
HA_PROB_LOG="/tmp/ha_prob.log"
REB_ID="2BFA76F6"
REB_ERR=$(errpt -aj "$REB_ID")

# Start every run with empty work files.
for errpt_file in "$HA_REP_SUMM" "$SHUT_REBO_LOG" "$SW_PROB_LOG" "$HW_PROB_LOG" "$HA_PROB_LOG"
do
  : > "$errpt_file"
done

# Time window in the mmddhhmmyy form used with errpt -s / -e.
# The messages below all say "within 2 days", so the window is two days; the
# previous perl expression only went back 86400 seconds (one day).
LOOKBACK_DAYS=2
Curr_date=$(date "+%m%d%H%M%y")
Ydate=$(perl -MPOSIX -e 'print strftime("%m%d%H%M%y\n", localtime(time - $ARGV[0] * 86400))' "$LOOKBACK_DAYS")
# Ydate=$(date +"%m$(date +"%d" | awk '{ printf ("%02d",($1-2)) }')%H%M%y")

REBOOT_LABELS="REBOOT_ID,ERRLOG_ON,ERRLOG_OFF,SYS_RESET,DUMP_STATS,MINIDUMP"
REBOOT_LABELS="$REBOOT_LABELS,KERNEL_PANIC,DOUBLE_PANIC,CONFIGRM_REBOOTOS_E"
SW_LABELS="DSI_PROC,ISI_PROC,PROGRAM_INT"
HW_LABELS="SCAN_ERROR_CHRP,SCANOUT"
HA_LABELS="TS_DMS_WARNING_ST,CL_LOST_AHAFS_EVENT,TS_CRITICAL_CLNT_ER,CL_DEADMAN_LIMIT"
HA_LABELS="$HA_LABELS,CL_REPOS_DISK_DOWN,CL_REPOS_INACCESS,CL_NETWORK_ISSUE"
HA_LABELS="$HA_LABELS,CL_MULTICAST_BLOCK,CL_ARU_FAILED,CL_AST_PANIC,TS_CL_CLINFO_ER"
HA_LABELS="$HA_LABELS,TS_CL_CLINFOFMT_ER,TS_CL_CLREG_ER,TS_CL_CLREGWR_ER,TS_CL_CMDFAIL_ER"
HA_LABELS="$HA_LABELS,TS_CL_DUPINFO_ER,TS_CL_FATAL_GEN_ER,TS_CL_INVCLINFO_ER,TS_CL_NO_TSTBL_ER"

errpt_collect "$REBOOT_LABELS" "$SHUT_REBO_LOG"
errpt_collect "$SW_LABELS" "$SW_PROB_LOG"
errpt_collect "$HW_LABELS" "$HW_PROB_LOG"
errpt_collect "$HA_LABELS" "$HA_PROB_LOG"

errpt_report_reboot
echo " "
errpt_report "$SW_PROB_LOG" \
  "Problem: $NODE might crashed within 2 days due to the Software Error, Please check errpt" \
  "ERRPT Details for the Software Error" \
  "Good: $NODE did not crash due to any Software Error within 2 days "
echo " "
errpt_report "$HW_PROB_LOG" \
  "Problem: $NODE might crashed within 2 days due to Hardware Malfunction " \
  "ERRPT Details for the Hardware Malfunction" \
  "Good: $NODE did not crash due to any Hardware Malfunction within 2 days "
echo " "
errpt_report "$HA_PROB_LOG" \
  "Problem: $NODE might have an issue with PowerHA/HACMP, Please check errpt" \
  "ERRPT Details for the PowerHA/HACMP cluster" \
  "Good: $NODE might not have any issue with PowerHA/HACMP within 2 days "
echo " "
echo "\033[1;33m******* END OF THE ERRPT PORTION TO CHECK SHUTDOWN/REBOOT/HA ERRORS *******\033[m"
echo " "
# Tail of the cl_event_summary analysis: prints the header for the recent
# events and expands the event names in /tmp/total_last_events.log.
# NOTE(review): LAST_EVENT_TIME, /tmp/total_last_events.log and
# /tmp/cl_even_desc.info are not set up in the visible part of the file --
# presumably an earlier part of this section does that; confirm.
echo "----------------------------------------------------------------------------"
echo " Last Cluster Event on $NODE server is on $LAST_EVENT_TIME"
echo " Recent RG related activities on this $NODE server are,"
echo "----------------------------------------------------------------------------"
echo " "
# Empty the scratch copy that the loop below rewrites on every pass.
> /tmp/total_last_events.log1
# cat /tmp/total_last_events.log
# Take the second field of every log line containing "Event" (any case),
# de-duplicated, and for each value replace its first occurrence on each log
# line with the matching line(s) from /tmp/cl_even_desc.info.
cat /tmp/total_last_events.log |grep -i "Event" |awk '{print $2}' |sort |uniq |
while read line
do
# Description line(s) containing this value (case-insensitive match).
a=`cat /tmp/cl_even_desc.info | grep -i "$line"`
# NOTE(review): $line and $a go into the sed expression unescaped, so a "/",
# "&" or a multi-line match in either would break or alter the substitution.
sed "s/$line/$a/" /tmp/total_last_events.log > /tmp/total_last_events.log1
# Make the rewritten copy the input for the next pass.
cp /tmp/total_last_events.log1 /tmp/total_last_events.log
done
#rm /tmp/total_last_events.log1
# NOTE(review): the "if" closed by this "fi" is not in the visible part of
# the file.
fi
# Show the log with the substitutions applied.
cat /tmp/total_last_events.log
echo "\033[1;33m******* END OF THE CL_EVENT_SUMMARY ANALYZING *******\033[m"
echo " "
### END OF ANALYZING THE CL_EVENT_SUMMARY
# Resource Group (RG) report: lists the RGs named in $Cl_RG_list_log and, for
# each one, prints its participating nodes, its home and secondary node, the
# node it is ONLINE on, and whether that node is the home node.
# NOTE(review): Cl_RG_list_log is assigned outside the visible part of the
# file.
RG_RULE="---------------------------------------------------------------------------"
CL_UTILS="/usr/es/sbin/cluster/utilities"

# rg_is_home HOME_NODE ONLINE_NODE
#   Succeed only when ONLINE_NODE is known and equals HOME_NODE. Two empty
#   values (both lookups failed) must not count as "online on home node".
rg_is_home() {
  [[ -n "$2" && "$1" = "$2" ]]
}

if [[ -s "$Cl_RG_list_log" ]]; then
  echo "$RG_RULE"
  echo " List of Resource Groups (RGs) on this $NODE server is,"
  echo "$RG_RULE"
  cat "$Cl_RG_list_log"
  echo "$RG_RULE"
  echo " "
  # Word splitting is intended here: one RG name per whitespace-separated word.
  for rg_name in $(cat "$Cl_RG_list_log"); do
    echo " Participating Nodes for the RG $rg_name are,"
    # Query clshowres once per RG and reuse the line for every field below.
    RG_NODES=$("$CL_UTILS/clshowres" -g "$rg_name" | grep "Participating Node Name")
    if [[ -n "$RG_NODES" ]]; then
      printf '%s\n' "$RG_NODES"
    fi
    echo "$RG_RULE"
    # Fields 4 and 5 of the "Participating Node Name" line are taken as the
    # home node and the secondary node.
    PNODE=$(printf '%s\n' "$RG_NODES" | awk '{print $4}')
    SNODE=$(printf '%s\n' "$RG_NODES" | awk '{print $5}')
    echo " Primary/Home Node of the RG $rg_name is $PNODE"
    echo " Secondary Node of the RG $rg_name is $SNODE"
    # Third field of the clRGinfo line that names this RG and says ONLINE.
    ONODE=$("$CL_UTILS/clRGinfo" | grep "$rg_name" | grep ONLINE | awk '{print $3}')
    echo " RG $rg_name is online on $ONODE"
    if rg_is_home "$PNODE" "$ONODE"; then
      echo "\033[1;32m Good: RG $rg_name is ONLINE on HOME-Node \033[m"
      echo " "
    else
      echo " RG $rg_name is either \033[1;31mNOT ONLINE on HOME-Node or with ERROR state\033[m, Please check..."
      echo "$RG_RULE"
    fi
  done
else
  echo "$RG_RULE"
  echo " There is no Resource Groups (RGs) on this $NODE server"
  echo "$RG_RULE"
fi
echo "\033[1;33m******* END OF THE RESOURCE GROUP STATUS ANALYZING *******\033[m"
echo " "
### END OF RG STATUS
### START OF To Validate the cluster ODM values and alert if it is not "Good"
# Runs clconfig, keeps its standard output in $Cl_odm_status_log and reports
# one of three states: Problem (the log reports "Error"), Good, or unable to
# verify (clconfig produced no output). Problem lines also go to $HA_REP_SUMM.
echo "\033[1;35m******* STARTING THE CLUSTER ODM STATUS ANALYZING, PLEASE WAIT...... !!! *******\033[m"
Cl_odm_status_log="/tmp/Cl_odm_status_rg.log"
: > "$Cl_odm_status_log"
## /usr/es/sbin/cluster/diag/clconfig >> /tmp/Cl_odm_status_rg.log 2>/dev/null
/usr/es/sbin/cluster/diag/clconfig >> "$Cl_odm_status_log" 2>/dev/null

# The same header precedes all three outcomes, so it is printed once here.
ODM_RULE="---------------------------------------------------------------------------"
echo "$ODM_RULE"
echo " Cluster ODM verification Status on the $NODE server is,"
echo "$ODM_RULE"

if [[ -s "$Cl_odm_status_log" ]]; then
  # NOTE(review): this only flags a problem when every line containing
  # "Error" is exactly the word "Error"; a line such as "Error: ..." is
  # treated as good. Kept as written -- confirm against real clconfig output.
  ODM_STATE=$(grep "Error" "$Cl_odm_status_log" | sort -u)
  if [[ "$ODM_STATE" = "Error" ]]; then
    echo "\033[1;31m Problem: Cluster ODM is having ISSUE/MISMATCH on $NODE and it's pair\033[m"
    echo "\033[1;31m Problem: Cluster ODM is having ISSUE/MISMATCH on $NODE and it's pair\033[m" >> "$HA_REP_SUMM"
  else
    echo "\033[1;32m Good: Cluster ODM is good on $NODE\033[m"
  fi
else
  echo "\033[1;31m Problem: Unable to verify Cluster ODM on $NODE and it's PAIR\033[m"
  echo "\033[1;31m Problem: Clconfig command output is EMPTY, Please check cluster service on $NODE and it's PAIR\033[m"
  echo "\033[1;31m Problem: Unable to verify Cluster ODM on $NODE and it's PAIR\033[m" >> "$HA_REP_SUMM"
  echo "\033[1;31m Problem: Clconfig command output is EMPTY, Please check cluster service on $NODE and it's PAIR\033[m" >> "$HA_REP_SUMM"
fi
echo " "
### END OF To Validate the cluster ODM values and alert if it is not "Good"
# The closing banner names this section; it used to repeat the Resource Group
# section's end banner.
echo "\033[1;35m******* END OF THE CLUSTER ODM STATUS ANALYZING *******\033[m"
echo " "
### START OF CHECKING CONFIG_TOO_LONG ISSUE ###
# Looks for a process whose ps -ef line mentions config_too_long (any case).
# When one is found, a Problem line is recorded in $HA_REP_SUMM and an
# explanatory note is printed.
echo "\033[1;33m******* STARTING TO CHECKING OF CONFIG_TOO_LONG PROCESS EXISTENCE *******\033[m"
# grep -v "grep" drops the grep process itself from the listing.
CONF_TOO_LONG=$(ps -ef | grep -i "config_too_long" | grep -v "grep")
if [[ -z "$CONF_TOO_LONG" ]]; then
  echo "\033[1;32m Good: There is no config_too_long issue with this $NODE server \033[m"
else
  echo "Problem: config_too_long process is running on this $NODE Server" >> "$HA_REP_SUMM"
  echo "\033[1;31m Problem: config_too_long process is running on this $NODE Server \033[m"
  echo "\033[0;31m Note: If a cluster event, such as a node_up or a node_down event, lasted longer than 360 seconds, \033[m"
  echo "\033[0;31m then every 30 seconds PowerHA SystemMirror issued a config_too_long warning message inside the hacmp.out file.\033[m"
  echo "\033[0;31m Activities that the script is performing take longer than the specified time to complete \033[m"
  echo "\033[0;31m For example, this could happen with events involving many disks or complex scripts \033[m"
fi
echo " "
echo "\033[1;33m******* END OF THE CHECKING OF CONFIG_TOO_LONG PROCESS EXISTENCE *******\033[m"
### END OF CHECKING CONFIG_TOO_LONG ISSUE ###
echo " "

# print_problem_summary SUMMARY_FILE
#   Replay the Problem lines collected in SUMMARY_FILE inside a red frame.
#   Prints nothing when the file is empty or missing.
print_problem_summary() {
  [[ -s "$1" ]] || return 0
  SUMM_RULE="\033[1;31m*************************************************************************************************************\033[m"
  echo "$SUMM_RULE"
  echo "\033[1;31m OVERALL PROBLEM SUMMARIES OF THE $NODE SERVER \033[m"
  echo "\033[1;31m Note: THIS IS JUST A SUMMARY OF THE PROBLEM STATES ALONE, PLEASE READ COMPLETE SCRIPT OUTPUT \033[m"
  echo "$SUMM_RULE"
  echo " "
  cat "$1"
  echo " "
  echo "$SUMM_RULE"
}

print_problem_summary "$HA_REP_SUMM"

# Closing banner of the report.
echo "\033[1;36m*************************************************************\033[m"
echo "\033[1;36m END OF HA REPORT FOR $NODE server \033[m"
echo "\033[1;36m*************************************************************\033[m"
echo " "