#!/usr/bin/ksh
#
# Version 3.10 - March 2019
# Stephen Diwell and Earl Jew and Patrice Quet and Chris Gibson and Phil Langerholc.
# Earl    = earlj@us.ibm.com
# Stephen = Stephen.Diwell@dxc.com
# Patrice = patrice.quet@caissedesdepots.fr
# Chris   = cgibson@au1.ibm.com
# Phil    = Phil.Langerholc@ibm.com

#  Assumes IBM System P POWER4-->POWER9 and AIX 5.3/6.1/7.1/7.2
#  Earl Jew --  Senior IT Management Consultant - IBM Power Systems and IBM Systems Storage
#  IBM Lab Services and Training - US Power Systems (group/dept)
#  IBM Certified Technical Sales Specialist - Power Systems and AIX - V1
#  IBM Certified Specialist - Midrange Storage Technical Support V2
#  IBM Certified Specialist - Enterprise Storage Technical Support V2
#  400 North Brand Blvd., c/o IBM 8th floor, Glendale, CA 91203

#  earlj@us.ibm.com   (310) 251-2907 cell
#  Version 3.15 : June 26, 2020 -- added redundant executions and a longer collection of detailed statistics

#================================================================================
#  Mundane Performance Data Collection script

#  Please execute as root-user using syntax not unlike:
#  /bin/ksh ./AIX_Perf_Monitor.ksh
#  Please send a compressed collection report to Earl, earlj@us.ibm.com
#  and I will review and offer my findings by telephone/conf call.

#  Please execute this script when there is an active workload of concern.
#  The script below collects 500kb-20mb of textdata per run.
#================================================================================

#================================================================================
#  Stephen Diwell - Version 2.0
#                 - Added some code to avoid running root only commands (hpmstat/vmo/ioo etc)
#                   if a normal non-root user executes the script.  if [[ ${IAM} == 'root' ]]
#                 - Fixed the awk statement in the lsdev line for entstat and fcstat commands.
#                   I removed the \ and grep statement and added the search string to awk.
#                 - Fixed the lsvg command to lsvg -o so script only queries varied-on VGs.
#                 - Added the -n flag to netstat commands to avoid DNS resolution pauses.
#
#  Stephen Diwell - Version 3.0
#                 - This script now creates its own unique logfile and
#                   re-directs all output to this file and the screen.
#                 - Added the -e and -h flag options.
#                   If you run as root, it will only run the hpmstat command
#                   if you add the -e flag.  The -h is for basic help.
#                 - Added extra ps -fe | sort -nk4 to show most recent active processes.
#                 - Added iostat -F to show most active file systems for IO
#                 - Added some INFORMATION heading, more will be added in next release.
#
#  Stephen Diwell - Version 3.1
#                 - Added hostname to output filename.
#
#  Stephen Diwell - Version 3.2
#                 - Added ps avx - processes and memory size output.
#                 - Added lsvg -l ${VG}
#                 - Added lsattr output for disk attributes.
#                 - Added lslv -l ${LV} output - show disks LV is on.
#                 - Added lsuser -a id pgrp  ALL - For correlation of stats to user.
#
#  Stephen Diwell - Version 3.3
#                 - Added lsattr for Ethernet and Fibre adapters.
#
#  Stephen Diwell - Version 3.4 - June 2017
#                 - Removed the lsuser output at Earl's request.
#                 - Added the pmcycles -d output.
#  
#  Patrice Quet   - Version 3.5 - March 2018
#                 - Adding ps -kfZ and ps -efZ to display page sizes used by processes.
#
#  Patrice Quet   - Version 3.6 - June 2018
#                 - Adding KDB mempool subcommand to show mempsum free frames
#
#  Patrice Quet   - Version 3.7 - July 2018
#                 - Adding vmstat -IP all to display number of used and free frames of different sizes.
#
#  Patrice Quet   - Version 3.8 - November 2018
#                 - Added option to svmon -G command to display affinity domains related to memory
#                 - Added mpstat -a into cumulative mpstat statistics
#
#  Chris Gibson    - Version 3.9 - February 2019
#	          - Added -E option to lparstat to display processor clock frequency.
#
#  Patrice Quet   - Version 3.10 - March 2019
#		  - Added MPIO cumulative and real-time statistics with iostat -m
#
#  Stephen Diwell - Version 3.11 - May 2019 - Added mpstat -v for > AIX 7.1 TL3 SP2
#
#  Phil Langerholc - Version 3.13 - November 2019 - added mmdf for GPFS installations to check the data in the file system is balanced
#	Added -V to iostat -F to eliminate inactive file systems during the monitoring period
#
#  Phil Langerholc -  Version 3.14 - June 2020 - added in mmlsconfig to display GPFS configuration
#
#================================================================================

# Start from a clean screen and let ksh track command locations.
clear
set -o trackall

# Values used to build a unique work directory and log file name.
MY_PID=$$
SCRIPTNAME=$( basename "$0" )
HOSTNAME=$( hostname -s )
DATE=$( date +%Y%m%d )

# Per-run work directory; the PID keeps concurrent runs apart.
BASEDIR=/tmp/AIX_Perf_Monitor_${MY_PID}

[[ ! -d ${BASEDIR} ]] && mkdir -m 755 "${BASEDIR}"

# The directory name is predictable and /tmp is writable by everyone, so
# besides checking that the directory exists, refuse a symbolic link or a
# directory owned by another account: the report must not end up in a
# location that somebody else prepared in advance.
if [[ ! -d ${BASEDIR} || -L ${BASEDIR} || ! -O ${BASEDIR} ]]
then
	print "ERROR:  Unable to create temporary work directory ${BASEDIR}"
	exit 1
fi

#
# Setup logging and capture all output.
#
# The log file is created first so that the background "tail -f" has a file
# to follow; the PID of that tail is kept in TAIL_PID so it can be stopped
# by cleanup() later.
LOGFILE=${BASEDIR}/${SCRIPTNAME}.${HOSTNAME}.${DATE}.txt
touch ${LOGFILE}
tail -f ${LOGFILE} &
TAIL_PID=$!

# stdout is not redirected yet, so these two messages are written directly
# to wherever the script was started from; the script then pauses 6 seconds.
print "\n\nINFORMATION: Please collect log files from ${BASEDIR}"
print "INFORMATION: Redirecting output to ${LOGFILE}"
sleep 6

#
# Trap most signals so we cleanup first.
#
# The trap text calls cleanup(), which is defined further down the file; the
# text is only evaluated when a signal is received.
# NOTE(review): the trap is installed after the 6-second pause above, so an
# interrupt during that pause is not handled by it - confirm this is intended.
# NOTE(review): the numeric list presumably includes signals that cannot be
# caught (such as KILL/STOP) - verify against the platform's signal table.
trap 'cleanup ; exit 99' 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

#
# Redirect stderr and stdout to logfile.
# User will also see the output due to the tail -f above.
#
exec 1> ${LOGFILE} 2>&1


## Identify the invoking account.  RUN_ROOT is TRUE only when that
## account is named "root" and gates the root-only collectors later on.
IAM=$( whoami )
if [[ ${IAM} != "root" ]]
then
	RUN_ROOT=FALSE
else
	RUN_ROOT=TRUE
fi


## hpmstat stays switched off unless it is explicitly requested.
## There are known instances of LPARs hanging or crashing with
## hpmstat - use that command at your own risk.
HPMSTAT=FALSE


#
# Basic help for -h or -? flags.
#
usage()
{
# Show the option summary and the hpmstat warning, pause briefly
# (presumably so the background tail can still display the text),
# tear down logging via cleanup and end the script with status 1.
print "
${SCRIPTNAME} [ -h | -e  ]
Where:  -h  shows this help
        -e  runs the hpmstat command

WARNING: There are known issues with hpmstat and LPARs crashing.
WARNING: Continue to run the hpmstat at your own risk.
"
sleep 2
cleanup
exit 1
}

#
# Clean up the tail process and logging redirection.
#
cleanup()
{
# Announce the teardown, pause for a second (presumably so the
# background tail can still display the message), close the redirected
# stdout and stderr, then stop the tail process recorded in TAIL_PID.
# Returns 0 when there is no tail PID to stop, otherwise the status of kill.
print "\n\nINFORMATION: Cleanup tail process and log redirection."
sleep 1
exec 1>&-
exec 2>&-

# Without a recorded PID there is nothing to stop; calling kill with no
# process argument would only produce a usage error.
[[ -z ${TAIL_PID} ]] && return 0

# A plain termination request is enough for tail; a forced kill is not
# needed as the first resort.
kill "${TAIL_PID}" 2>/dev/null
}

#
# Command Line Arguments.
#
# parse_options [flags...]
# Process the command-line flags of the script.
#   -e  enable the hpmstat collection (only honoured when RUN_ROOT is TRUE)
#   -h  show the help text through usage
# Every other flag is routed to usage as well.
# Globals: reads RUN_ROOT; writes HPMSTAT and OPT.
parse_options()
{
	# The option string lists exactly the supported flags.  Its leading ":"
	# selects the silent mode of getopts: an unknown flag sets OPT to "?"
	# without a diagnostic from getopts itself, and the catch-all branch
	# below shows the usage text instead.
	while getopts ":eh" OPT
	do
		case ${OPT} in

			e)
				if [[ ${RUN_ROOT} == "TRUE" ]]
				then
					HPMSTAT=TRUE
					print
					print "WARNING: The hpmstat command has been reqeusted to run."
					print "WARNING: There are known issues with hpmstat and LPARs crashing."
					print
					print "WARNING: Continue to run the hpmstat at your own risk."
					print "WARNING: Press ctrl-c within 20 seconds to abort."
					print
					sleep 20
				else
					# Say so instead of dropping the request without a trace.
					print
					print "INFORMATION: The -e flag is ignored, hpmstat is only run for the root user."
					print
				fi
				;;

			h)	usage
				;;

			*)	usage
				;;

		esac

	done
}

parse_options "$@"


#
# Get the current AIX, TL and SP Levels.
# oslevel -s output = 7200-03-02-1846
#
# Split the "oslevel -s" output (for example 7200-03-02-1846) into its parts.
# The script relies on ksh running the last stage of a pipeline in the
# current shell, which is what lets this "read" set the variables.
oslevel -s | tr "-" " " | read AIX_LVL TL_LVL SP_LVL REST


# Version 3.11 Update.
# Some commands are only available at certain AIX levels or higher.
# This section tries to deal with that, to avoid commands returning an error.

# mpstat_v_supported AIX_LVL TL_LVL SP_LVL
# Returns 0 when the level is 7200 or later, or 7100 at TL 3 SP 2 or
# later (TL above 3 counts as later); returns 1 otherwise, including when
# the values are empty because the level could not be determined.
mpstat_v_supported()
{
	# Drop one leading zero from TL and SP ("03" -> "3") so that no shell
	# can take a value such as "08" for a malformed octal number.
	set -- "${1:-0}" "${2#0}" "${3#0}"

	[[ $1 -ge 7200 ]] && return 0
	[[ $1 -eq 7100 ]] || return 1
	[[ ${2:-0} -gt 3 ]] && return 0
	[[ ${2:-0} -eq 3 && ${3:-0} -ge 2 ]]
}

#
# First check: can "mpstat -v" be used at this level?
# The threshold implemented here (7100 TL3 SP2, and any 7200 or later)
# is the one named in the Version 3.11 changelog entry.
#
if mpstat_v_supported "${AIX_LVL}" "${TL_LVL}" "${SP_LVL}"
  then
	MPSTAT_V=TRUE
  else
	MPSTAT_V=FALSE
fi




#
# Start of functions.
#

# HPMSTAT Function.
run_hpmstat() {

## Leave with a note unless the hpmstat collection was switched on.
if [[ ${HPMSTAT} == "FALSE" ]]
then
	print "\n\nINFORMATION: The hpmstat command not requested to run."
	return
fi

## Requires root-user to execute; makes no changes.

## The plain invocation comes first.
print "\n\n---- hpmstat ----\n"
hpmstat

## Every further entry is the argument list of one hpmstat run; the same
## text is shown in the heading printed just before that run.
for HPM_ARGS in \
	"-g 53 10" \
	"-g 108 10" \
	"-H 10" \
	"-H -g 53 10" \
	"-H -g 108 10" \
	"-k 10" \
	"-k -g 53 10" \
	"-k -g 108 10" \
	"-u 10" \
	"-u -g 53 10" \
	"-u -g 108 10"
do
	print "\n\n---- hpmstat ${HPM_ARGS} ----\n"
	hpmstat ${HPM_ARGS}
done

print "\n\n---- END hpmstat END ----\n"

}


#
# Get root only stats.
#
root_stats() {

## Requires root-user to execute; makes no changes

print "\n\nINFORMATION: Running root user commands."
print

# Run each tuning command with -L, announcing it first and leaving
# blank lines after its output.
for TUNE_CMD in vmo ioo no nfso schedo raso asoo lvmo
do
	print "INFORMATION: Running ${TUNE_CMD} -L."
	${TUNE_CMD} -L
	print "\n\n"
done


# Then the lvmo settings of every volume group reported by "lsvg -o".
for VG in $( lsvg -o )
do
	print "\n\nINFORMATION: Running lvmo -a -v ${VG}"
	print
	lvmo -a -v ${VG}
	print "\n\n"
done

}


#########################################################################################
# Start of main script.
#########################################################################################
# Trace every command from here on.  stderr was redirected to the log file
# earlier, so the trace ends up in the report next to the command output.
set -x

# Section: identity of the system and of the invoking user.
print "\n\nINFORMATION: Display basic system configuration."
print

date

id

uname -a

oslevel -s

lparstat -i

lssrad -av

pmcycles -d

# Section: paging space, detailed listing (-a) and summary (-s).
print "\n\nINFORMATION: Display paging space."
print

lsps -a
lsps -s

# The user listing below is intentionally disabled (see the Version 3.4
# changelog entry: removed at Earl's request).
##print "\n\nINFORMATION: Display basic user account and UID data."
##print
##
##lsuser -a id pgrp  ALL

# Section: vmstat counters, followed by 80 one-second samples.
print "\n\nINFORMATION: Display vmstat data."
print

uptime
vmstat -s
vmstat -v
vmstat -IP all

vmstat -IWwt 1 80

# Section: kdb output.  The subcommand is fed to kdb on stdin and sed keeps
# everything from the first line that mentions "mempsum" to the end.
print "\n\nINFORMATION: Display KDB output."
print

print "\n\n---- KDB mempool subcommand ----\n"
echo "mempsum *" | kdb | sed -n '/mempsum/,$p'

print "\n\n---- END KDB output END ----\n"

# Section: shared memory segments.
print "\n\nINFORMATION: Display shared memory segments."
print

ipcs -bm

# File system and NFS export information; these commands have no heading of
# their own and therefore appear under the heading printed above.
mount
df -k
cat /etc/filesystems
cat /etc/xtab
showmount

print "\n\nINFORMATION: Display ps kernel data."
print

# Processes selected by name from the ps -ekf listing; the grep -v stage
# removes lines that mention egrep itself.
KPROC_NAMES="syncd|lrud|nfsd|biod|wait|getty|wlm|vtiol|j2pg|sched|swapper"
ps -ekf | grep -v egrep | egrep "${KPROC_NAMES}"

print "\n\nINFORMATION: Display processes and threads counts."
print

# Line/word/character counts of the process listing and of the THREAD
# listing, first with -e and then with -k.
for PS_SCOPE in e k
do
	ps -${PS_SCOPE}l | wc
	ps -${PS_SCOPE}lmo THREAD | wc
done

# Last 100 lines of each listing after a numeric sort on column 4.
# Each entry is "<word used in the heading> <ps flags>".
for PS_TOP in "user -fe" "kernel -fk"
do
	print "\n\nINFORMATION: Display top 100 active ${PS_TOP%% *} processes."
	print

	ps ${PS_TOP##* } | sort -nk4 | tail -100
done

print "\n\nINFORMATION: Display ps memory size output."
print

ps avxww

print "\n\nINFORMATION: Display prtconf output."
print

prtconf

print "\n\nINFORMATION: Display nfs stats."
print

uptime
nfsstat

print "\n\n---- START Cumulative historical mpstat/iostat ----\n"

# One mpstat report per flag, each followed by a blank line.
for MPSTAT_FLAG in -h -a -w -d
do
	mpstat ${MPSTAT_FLAG}
	print
done

# Only specific AIX version support the -v flag.
if [[ ${MPSTAT_V} == "TRUE" ]]
then
	mpstat -v
fi
print

date
uptime

# One iostat report per flag, each followed by a blank line.
for IOSTAT_FLAG in -a -FVT -m -DlRT
do
	iostat ${IOSTAT_FLAG}
	print
done

print "\n\n---- END Cumulative historical mpstat/iostat ----\n"

# Interval mpstat sampling: every variation takes 80 one-second samples.
# Between the groups, uptime and the last 100 lines of the process list
# (sorted numerically on column 4) are recorded.
# Each START heading names exactly the command line that follows it,
# including the -w flag where the command uses it.
uptime
print "\n\n---- ps -fe | sort -nk4 ----\n"
ps -fe | sort -nk4 | tail -100

print
print "\n\n---- START mpstat -w 1 80 ----\n"
mpstat -w 1 80

uptime
print "\n\n---- ps -fe | sort -nk4 ----\n"
ps -fe | sort -nk4 | tail -100

print "\n\n---- START mpstat -dw 1 80 ----\n"
mpstat -dw 1 80

uptime
print "\n\n---- ps -fe | sort -nk4 ----\n"
ps -fe | sort -nk4 | tail -100

print "\n\n---- START mpstat -aw 1 80 ----\n"
mpstat -aw 1 80
print "\n\n---- START mpstat -iw 1 80 ----\n"
mpstat -iw 1 80
print "\n\n---- START mpstat -s 1 80 ----\n"
mpstat -s 1 80

uptime
print "\n\n---- ps -fe | sort -nk4 ----\n"
ps -fe | sort -nk4 | tail -100
print "\n\n---- START mpstat -h 1 80 ----\n"
mpstat -h 1 80

# Only specific AIX version support the -v flag.
[[ ${MPSTAT_V} == "TRUE" ]] && {
	print "\n\n---- START mpstat -v 1 80 ----\n"
	mpstat -v 1 80
	}

print "\n\n---- END mpstat variations ----\n"
uptime

# Cumulative vmstat reports for -f and -i, each wrapped in START/END
# markers and followed by a blank line.
for VMSTAT_FLAG in -f -i
do
	print "\n\n---- START Cumulative historical vmstat ${VMSTAT_FLAG} ----\n"
	vmstat ${VMSTAT_FLAG}
	print "\n\n---- END Cumulative historical vmstat ${VMSTAT_FLAG} ----\n"
	print
done

# Then 40 one-second samples of vmstat -i.
print "\n\n---- START vmstat -i 1 40 ----\n"
vmstat -i 1 40
print "\n\n---- END vmstat -i 1 40 ----\n"


print "\n\nINFORMATION: Display fcs stats."
print

# Every device whose lsdev line starts with "fcs": the full fcstat report
# first, then only its lines containing "No", then a separator.
for FC_DEV in $( lsdev | awk '/^fcs/{print $1}' )
do
	print "\n\n---- fcstat ${FC_DEV} ----\n"
	fcstat ${FC_DEV}
	print
	fcstat ${FC_DEV} | grep "No"
	print "\n\n-------------------------\n"
done

print "\n\nINFORMATION: Display ent stats."
print

# Every device whose lsdev line starts with "ent": entstat -d report
# followed by a blank line.
for ENT_DEV in $( lsdev | awk '/^ent/{print $1}' )
do
	entstat -d ${ENT_DEV}
	print
done


# hpmstat is attempted for root only; run_hpmstat itself checks whether
# the collection was requested.
if [[ ${RUN_ROOT} == "TRUE" ]]
then
	run_hpmstat
fi


svmon -G -O affinity=on,unit=GB

uptime

# sar sampling, 80 one-second intervals per report, collected for root only.
# NOTE(review): the earlier remarks on these lines said both "requires
# root-user to execute" and "Works as non-root user" - confirm whether the
# root guard is still wanted.  The commands make no changes.
if [[ ${RUN_ROOT} == "TRUE" ]]
then
	print "\n\nINFORMATION: Display sar stats."
	print

	for SAR_FLAG in -a -b -c -k -d
	do
		sar ${SAR_FLAG} 1 80
	done
fi


print "\n\nINFORMATION: Display system configuration."
print

lsdev
lscfg
lsconf

print "\n\nINFORMATION: Display pv details."
print

# Turn off debugging here, it intermixes with the outputs.
# Just for the disks, when there are lots of disks.
set +x

# For every disk reported by lspv: its name and its attribute list, reduced
# by the trailing grep to the lines mentioning hdisk, reserve, algorithm or
# queue (case-insensitive).
lspv | while read DISK REST
do
	print ${DISK}
	lsattr -El ${DISK}
	print
done | grep -i -e hdisk -e reserve -e algorithm -e queue

print "\n\nINFORMATION: Display fcs details."
print

# Turn on debugging here.
set -x

# The awk patterns are anchored to the start of the line, i.e. to the device
# name, like the fcs/ent stats loops earlier in the script.  An unanchored
# pattern also matches the description text (for example "Client" contains
# "ent"), which selected unrelated adapters.
lsdev -Cc adapter -S Available | awk '/^fcs/{print $1}' | while read FCS REST
do
	print ${FCS}
	lsattr -El ${FCS}

	# Attributes of the matching fscsiN device (fcs0 -> fscsi0).
	FSCSI=$( print ${FCS} | sed 's;fcs;fscsi;' )
	print ${FSCSI}
	lsattr -El ${FSCSI}
done

print "\n\nINFORMATION: Display eth details."
print

lsdev -Cc adapter -S Available | awk '/^ent/{print $1}' | while read ENT REST
do
	print ${ENT}
	lsattr -El ${ENT}

	# Attributes of the matching enN device (ent0 -> en0).
	EN=$( print ${ENT} | sed 's;ent;en;' )
	print ${EN}
	lsattr -El ${EN}
done

# Tunables only the root user can list - vmo, lvmo, schedo etc.
if [[ ${RUN_ROOT} == "TRUE" ]]
then
	root_stats
fi

# hpmstat is attempted for root only.
if [[ ${RUN_ROOT} == "TRUE" ]]
then
	run_hpmstat
fi


print "\n\nINFORMATION: Display vmstat outputs."
print

uptime
vmstat -IWwt 1 80

# hpmstat is attempted for root only.
if [[ ${RUN_ROOT} == "TRUE" ]]
then
	run_hpmstat
fi


print "\n\nINFORMATION: Display nfs outputs."
print

nfso -a

print "\n\nINFORMATION: Display pv outputs."
print

lspv

print "\n\nINFORMATION: Display vg outputs."
print

# For every volume group reported by "lsvg -o": the plain listing, then the
# -p and the -l listing, each followed by a blank line.  The empty first
# entry expands to no flag at all.
for VG in $( lsvg -o )
do
	for LSVG_FLAG in "" -p -l
	do
		lsvg ${LSVG_FLAG} ${VG}
		print
	done
done

# list_vg_lvs VG
# Print the logical volume names of one volume group, one per line.
# "lsvg -l" starts with two heading lines (the "<vg>:" line and the column
# titles).  They are skipped by position: filtering on the volume group
# name instead also discarded every logical volume whose name or mount
# point contains that name.
list_vg_lvs()
{
	lsvg -l "$1" | awk 'NR > 2 { print $1 }'
}

print "\n\nINFORMATION: Display lv outputs."
print

# For every volume group reported by "lsvg -o": its name, then the
# "lslv -l" listing of each of its logical volumes.
for VG in $( lsvg -o )
do
	print
	print ${VG}
	for LV in $( list_vg_lvs "${VG}" )
	do
		lslv -l ${LV}
	done
	print
done



print "\n\nINFORMATION: Display ps outputs."
print

# Complete listings: -e variants first, then the matching -k variants.
print "\n\n---- ps -ef ----\n"
ps -ef
print "\n\n---- ps -kf ----\n"
ps -kf
print "\n\n---- ps -el ----\n"
ps -el
print "\n\n---- ps -kl ----\n"
ps -kl

print "\n\n---- ps -elmo THREAD ----\n"
ps -elmo THREAD
print "\n\n---- ps -klmo THREAD ----\n"
ps -klmo THREAD

print "\n\n---- ps guw ----\n"
ps guw
print "\n\n---- ps gvw ----\n"
ps gvw

# Sorted excerpts: numeric sort on the named column, last 100 lines kept.
print "\n\n---- ps -fe | sort -nk4 ----\n"
ps -fe | sort -nk4 | tail -100

print "\n\n---- ps -kf | sort -nk4 ----\n"
ps -kf | sort -nk4 | tail -100

# The -o flag needs its format argument; without THREAD the command did not
# produce the listing that the heading announces.
print "\n\n---- ps -elmo THREAD | sort -nk6 ----\n"
ps -elmo THREAD | sort -nk6 | tail -100

print "\n\n---- ps guw | sort -nk6 ----\n"
ps guw | sort -nk6 | tail -100

print "\n\n---- ps -fe | sort -nk4 ----\n"
ps -fe | sort -nk4 | tail -100

# Listings with the Z flag (page sizes, per the Version 3.5 changelog entry).
print "\n\n---- ps -efZ ----\n"
ps -efZ
print "\n\n---- ps -kfZ ----\n"
ps -kfZ

print "\n\n---- ps guww ----\n"
ps guww
print "\n\n---- ps gvww ----\n"
ps gvww

print "\n\n---- END ps ----\n"


print "\n\nINFORMATION: Display netstat outputs."
print

ifconfig -a
print

# Heading without a line break (\c), so the count printed by grep -c lands
# on the same line; the count is the number of netstat -n lines with "EST".
print "\n\n---- Count of TCP/IP Connections (AIX:netstat | grep -c EST): \c"
netstat -n | grep -c "EST"
print

# One netstat report per flag, then the plain report last.
for NETSTAT_FLAG in -ss -in -rn -m -v -c -Cn -D -s -M -An
do
	netstat ${NETSTAT_FLAG}
done
netstat

print "\n\nINFORMATION: Display iostat outputs."
print

# Each entry is the argument list of one iostat run (flag, interval in
# seconds, number of samples); the heading repeats the exact command line.
for IOSTAT_ARGS in "-a 1 40" "-b 1 80" "-s 1 40" "-FVT 1 80" "-m 1 40"
do
	print "\n\n---- iostat ${IOSTAT_ARGS} ----\n"
	iostat ${IOSTAT_ARGS}
done
print "\n\n---- END iostat ----\n"

print "\n\nINFORMATION: Display lparstat outputs."
print

uptime
print "\n\n---- lparstat # Statistics are cumulative since boot ----\n"
lparstat
print "\n\n---- lparstat -i ----\n"
lparstat -i

# These two runs are made for root only; they make no changes.
if [[ ${RUN_ROOT} == "TRUE" ]]
then
	for LPAR_ROOT_ARGS in "-h 1 80" "-H"
	do
		print "\n\n---- lparstat ${LPAR_ROOT_ARGS} ----\n"
		lparstat ${LPAR_ROOT_ARGS}
	done
fi

# Each entry is "<arguments>" optionally followed by " # <remark>".  The
# whole entry is shown in the heading; only the part in front of the remark
# is handed to lparstat.
for LPAR_ITEM in \
	"-d # Shows the detailed CPU utilization statistics since boot" \
	"-d 1 80 # Shows the detailed CPU utilization statistics 1sec intervals for 80 secs" \
	"-m" \
	"-h 1 80" \
	"-me" \
	"-mp" \
	"-Et 1 80 # Display processor clock frequency statistics 1sec intervals for 80 secs" \
	"-Ewt 1 80 # Display processor clock frequency & CPU actual vs normalised statistics 1sec intervals for 80 secs"
do
	print "\n\n---- lparstat ${LPAR_ITEM} ----\n"
	lparstat ${LPAR_ITEM%% \#*}
done
print "\n\n---- END lparstat END ----\n"

# hpmstat is attempted for root only.
if [[ ${RUN_ROOT} == "TRUE" ]]
then
	run_hpmstat
fi


print "\n\nINFORMATION: Display disk IO counters."

# Each entry is the argument list of one iostat run; the heading repeats
# the exact command line.
for IOSTAT_ARGS in "-DRTl" "-DRTl 60 10" "-FVT" "-FVT 1 80"
do
	print "\n\n---- iostat ${IOSTAT_ARGS} ----\n"
	iostat ${IOSTAT_ARGS}
done

print "\n\nINFORMATION: Display vmstat output."
print

uptime

# The three counter reports, then 80 one-second samples.
for VMSTAT_ARGS in "-s" "-v" "-IP all" "-IWwt 1 80"
do
	vmstat ${VMSTAT_ARGS}
done

print "\n\nINFORMATION: Display KDB output."
print

# The subcommand is fed to kdb on stdin; sed keeps everything from the
# first line that mentions "mempsum" to the end of the output.
print "\n\n---- KDB mempool subcommand ----\n"
echo "mempsum *" | kdb | sed -n '/mempsum/,$p'

print "\n\n---- END KDB output END ----\n"

# gpfs_filesystems
# Print the first column (the file system name) of every "lsfs -v mmfs"
# entry, one per line.  The heading line is skipped by position rather than
# by matching the word "Name", so an entry that happens to contain that word
# is not dropped.
gpfs_filesystems()
{
	lsfs -v mmfs | awk 'NR > 1 { print $1 }'
}

# NOTE(review): no vmstat command follows this heading - confirm whether it
# is a leftover before changing the report layout.
print "\n\nINFORMATION: Display final vmstat output."
print

# GPFS balance check, only when the mmdf binary is installed.
if [[ -f /usr/lpp/mmfs/bin/mmdf ]]
then
	print "\n\nINFORMATION: mmdf checking for GPFS file system balancing."
	print
	for fs in $( gpfs_filesystems )
	do
		/usr/lpp/mmfs/bin/mmdf $fs
	done
	print "\n\n---- END mmdf output END ----\n"
	print

fi # mmdf installed


# GPFS cluster configuration, only when the mmlsconfig binary is installed.
if [[ -f /usr/lpp/mmfs/bin/mmlsconfig ]]
then
	print "\n\nINFORMATION: mmlsconfig output, listing cluster config."
	print

	/usr/lpp/mmfs/bin/mmlsconfig

	print "\n\n---- END mmlsconfig output END ----\n"
	print

fi # mmlsconfig installed

# Closing stamp: time, user and system identity once more, so the end of
# the collection can be told apart from its start in the report.
date
id
uname -a

banner Finished

#
# Finished - Cleanup logging.
#
# These messages still go to the log file (and reach the operator through
# the background tail); LOGFILE is the report that was written by this run.
print
print "\n\nINFORMATION: Please email a compressed report to earlj@us.ibm.com if required."
print "INFORMATION: Report output: ${LOGFILE}"
print

# Clean up the logging and tail process.
cleanup

# Normal completion; interrupted runs leave through the signal trap with 99.
exit 0


