#!/bin/sh
#
# check_cpqim.sh
#
# Nagios plugin script
# Compaq MIBS required
#
# Can be located in the Compaq Management CD in the \TOOLKIT\MIBS directory.
#
# Original code by: Craig Cook    5 Sep 2002 (for Big Brother)
# Modified to work with Nagios by: Thomas Nilsen	5.2.2003
#
# Version 0.1    Initial Version

###########################################################################
# INSTALLATION
#  step 1 - get the Compaq MIBS, either from the Cpq Management CD
#           or from the ucd-snmp found on www.compaq.com and install on your
#           nagios box. 
#
#  step 2 - ensure the Compaq Insight Manager agents have been configured
#           on your compaq server
#
#  step 3 - copy this script to your nagios plugins folder. 
#           usually located on: /usr/local/nagios/libexec/
#	   
#  step 4 - add a check to checkcommands.cfg called check_cpqim
#           define command{
#             command_name check_cpqim
#             command_line $USER1$/check_cpqim.sh -H $HOSTADDRESS$ -C $ARG1$ -L $ARG2$
#           }
#
#  step 5 - add a service check to a host. Syntax for the plugin is:
#           check_cpqim.sh -H host -C community -L <TEST>
#           <TEST> can be one of five: MEMORY, LOG, FAN, THERMAL or TEMP
#           This is an example of a service checking memory corrections.
#
#  define service{
#        use    generic-service 
#        host_name                       hosta
#        service_description             Cpq Memory
#        is_volatile                     0
#        check_period                    24x7
#        max_check_attempts              3
#        normal_check_interval           5
#        retry_check_interval            1
#        contact_groups                  windows-admins
#        notification_interval           0
#        notification_period             24x7
#        notification_options            w,u,c,r
#        check_command                   check_cpqim!public!MEMORY
#	 }
#  step 6 - change the following variables to fit your setup:

# External programs the plugin runs, plus the directory that receives the
# temporary SNMP dump.  Point these at the locations valid on your host.
SNMPWALK=/usr/bin/snmpwalk
SNMPGET=/usr/bin/snmpget
GREP=/bin/grep
SED=/bin/sed
TAIL=/usr/bin/tail
EXPR=/usr/bin/expr
MKTEMP=/bin/mktemp
RM=/bin/rm
TMP=/tmp

#  step 7 - restart Nagios (killall -1 nagios)
#
###########################################################################

# Define and export MIBS to be used/required
MIBS=ALL
export MIBS

#
# Alert thresholds.
# I HAVE NO IDEA OF WHAT'S REASONABLE HERE... tune them for your hardware.
#
# Sensor count used when the SNMP dump does not reveal one.
NUMBER_OF_TEMP_SENSORS=0
# Correctable memory errors, as a percentage of the error threshold read
# from the SNMP dump.  The warning level must stay below the panic level;
# otherwise the warning branch in get_memory_info can never be reached.
MEMORY_ERR_WARN_PERCENT=1     # Warn above this percent
MEMORY_ERR_PANIC_PERCENT=20   # Panic above this percent
# Sensor temperature as a percentage of the sensor's shutdown threshold.
PERCENT_TEMP_WARN=60
PERCENT_TEMP_PANIC=70

###########################################################################
# Defaults used until -H / -C are given on the command line.
HOST="localhost"
COMMUNITY="public"
###########################################################################

##################
# Start of script
##################


#HEALTH_LOG=`$MKTEMP -q $TMP/$0.XXXXXX`
#if [ $? -ne 0 ]; then
#   echo "Can't create temp file, exiting..."
#   exit 3
#fi

#echo $HEALTH_LOG

#if [ "$SNMPWALK" = "" ] ; then
#	echo "SNMPWALK var not set - Script must be run from 'snmp'..."
#	exit 3
#fi

# This is required
#$SNMPWALK -c $COMMUNITY $HOST cpqHealth > $HEALTH_LOG
#if [ $? -ne 0 ]; then
#   echo "Snmpwalk failed: $0"
#   $RM -f $HEALTH_LOG
#   exit 3
#fi

#####
#####  get_memory_info - evaluate the correctable memory error counters
#####
# Reads the SNMP dump in $HEALTH_LOG, prints one status line and sets COLOR:
#   green  - error count is within limits
#   yellow - error percentage is above MEMORY_ERR_WARN_PERCENT
#   red    - error percentage is above MEMORY_ERR_PANIC_PERCENT, or
#            correctable memory error logging is not enabled
#   blue   - a required value is missing from the dump or is not a number
# Uses the globals GREP, EXPR, HEALTH_LOG and the two MEMORY_ERR_* limits.
# Each dump line is parsed as "<oid> = <value>", i.e. the value is word 3.
get_memory_info()
{
  # Forget values from an earlier call so a missing line is noticed below.
  MEM_TOTAL_ERRS=""
  MEM_THRESHOLD=""

  # This value specifies whether this system is currently tracking
  # correctable memory errors.
  RESULT=$($GREP cpqHeCorrMemLogStatus.0 "$HEALTH_LOG")
  if [ "$RESULT" = "" ] ; then
    COLOR="blue"
    echo "Unknown: cpqHeCorrMemLogStatus not found in SNMP response"
    return
  fi
  # "--" keeps a line that starts with "-" from being taken as shell options.
  set -- $RESULT
  MEM_LOG="$3"

  if [ "$MEM_LOG" != "enabled(4)" ] ; then
    echo "Correctable Memory Error logging not enabled"
    COLOR="red"
    return
  fi

  # The number of correctable memory errors that have occurred.
  RESULT=$($GREP cpqHeCorrMemTotalErrs.0 "$HEALTH_LOG")
  if [ "$RESULT" != "" ] ; then
    set -- $RESULT
    MEM_TOTAL_ERRS="$3"
  fi

  # The error threshold for Correctable memory errors. When
  # cpqHeCorrMemErrCount is greater than or equal to this value
  # user action is required to replace the failing memory module.
  RESULT=$($GREP cpqHeCorrMemErrorCntThresh.0 "$HEALTH_LOG")
  if [ "$RESULT" != "" ] ; then
    set -- $RESULT
    MEM_THRESHOLD="$3"
  fi

  # Both operands must be plain non-negative integers before they reach expr.
  for mem_value in "$MEM_TOTAL_ERRS" "$MEM_THRESHOLD" ; do
    case "$mem_value" in
      ''|*[!0-9]*)
        COLOR="blue"
        echo "Unknown: memory error counters missing or not numeric"
        return
        ;;
    esac
  done

  # Get around problem when THRESHOLD is set to 0.. (divide by 0)
  if [ "$MEM_THRESHOLD" -eq 0 ] ; then
    MEM_THRESHOLD="1"
  fi

  MEMORY_ERR_PERCENT=$($EXPR "$MEM_TOTAL_ERRS" \* 100 / "$MEM_THRESHOLD")

  if [ "$MEMORY_ERR_PERCENT" -gt "$MEMORY_ERR_PANIC_PERCENT" ] ; then
    COLOR="red"
    echo "CRITICAL: memory errors ${MEMORY_ERR_PERCENT}% (panic at ${MEMORY_ERR_PANIC_PERCENT}%)"
  elif [ "$MEMORY_ERR_PERCENT" -gt "$MEMORY_ERR_WARN_PERCENT" ] ; then
    if [ "$COLOR" != "red" ] ; then
      COLOR="yellow"
    fi
    echo "WARNING: memory errors ${MEMORY_ERR_PERCENT}% (warn at ${MEMORY_ERR_WARN_PERCENT}%)"
  else
    echo "Memory Errors: $MEM_TOTAL_ERRS / Threshold: $MEM_THRESHOLD"
    COLOR="green"
  fi
#####
#####  End of get_memory_info proc
#####
}

# get_thermal_info - evaluate the overall thermal condition and the
# temperature sensor status found in the SNMP dump ($HEALTH_LOG).
# Prints one status line and sets COLOR to the worst state seen, so a
# problem reported by the first value is not hidden by the second one:
#   red > yellow > blue (unknown) > green
# Uses the globals GREP and HEALTH_LOG.  Each dump line is parsed as
# "<oid> = <value>", i.e. the value is word 3.
get_thermal_info()
{
  ###########################
  # Check for thermal errors
  ###########################

  thermal_worst=""   # worst state found so far; empty means nothing seen yet

  # This value specifies the overall condition of the system's
  # thermal environment.

  RESULT=$($GREP cpqHeThermalCondition.0 "$HEALTH_LOG")
  if [ "$RESULT" != "" ] ; then
    set -- $RESULT
    THERMAL_CONDITION="$3"
    if [ "$THERMAL_CONDITION" != "ok(2)" ] ; then
      thermal_worst="red"
      printf '%s' "Thermal errors: ${THERMAL_CONDITION} / "
    else
      thermal_worst="green"
      printf '%s' "Thermal Condition: $THERMAL_CONDITION / "
    fi
  fi

  # The status of the system's temperature sensors:
  #
  # This value will be one of the following:
  #     other(1)
  #        Temp sensing is not supported by this system or driver.
  #      ok(2)
  #         All temp sensors are within normal operating range.
  #      degraded(3)
  #         A temp sensor is outside of normal operating range.
  #      failed(4)
  #         A temp sensor detects a condition that could permanently
  #         damage the system.

  RESULT=$($GREP cpqHeThermalTempStatus.0 "$HEALTH_LOG")
  if [ "$RESULT" != "" ] ; then
    set -- $RESULT
    THERMAL_TEMP_STATUS="$3"
    case "$THERMAL_TEMP_STATUS" in
      "failed(4)")
        thermal_worst="red"
        ;;
      "degraded(3)")
        # Only raise the state, never lower an earlier "red".
        if [ "$thermal_worst" != "red" ] ; then
          thermal_worst="yellow"
        fi
        ;;
      *)
        if [ "$thermal_worst" = "" ] ; then
          thermal_worst="green"
        fi
        ;;
    esac
    echo "Thermal temp status: $THERMAL_TEMP_STATUS"
  else
    # Missing temp status is "unknown" unless a real problem was already seen.
    case "$thermal_worst" in
      red|yellow) ;;
      *) thermal_worst="blue" ;;
    esac
    echo "Unknown status of ThermalTemp"
  fi

  COLOR="$thermal_worst"
}
##
## FAN Status
## get_fan_info - evaluate the system fan and processor fan status found in
## the SNMP dump ($HEALTH_LOG).  Prints one status line and sets COLOR to
## the worst of the two results (red > yellow > green); COLOR becomes blue
## when neither value is present in the dump.
## Uses the globals GREP and HEALTH_LOG.  Each dump line is parsed as
## "<oid> = <value>", i.e. the value is word 3.
get_fan_info()
{
  fan_worst=""   # worst state found so far; empty means nothing seen yet
  fan_msg=""     # status text collected for the single output line

  # The status of the fan(s) in the system.

  RESULT=$($GREP cpqHeThermalSystemFanStatus.0 "$HEALTH_LOG")
  if [ "$RESULT" != "" ] ; then
    set -- $RESULT
    THERMAL_SYSTEM_FAN_STATUS="$3"
    case "$THERMAL_SYSTEM_FAN_STATUS" in
      "failed(4)")
        fan_worst="red"
        fan_msg="Thermal system fan status: ${THERMAL_SYSTEM_FAN_STATUS}"
        ;;
      "degraded(3)")
        fan_worst="yellow"
        fan_msg="Thermal system fan status: ${THERMAL_SYSTEM_FAN_STATUS}"
        ;;
      *)
        fan_worst="green"
        fan_msg="Thermal fan status: $THERMAL_SYSTEM_FAN_STATUS"
        ;;
    esac
  fi

  # The status of the processor fan(s) in the system.
  RESULT=$($GREP cpqHeThermalCpuFanStatus.0 "$HEALTH_LOG")
  if [ "$RESULT" != "" ] ; then
    set -- $RESULT
    THERMAL_CPU_FAN_STATUS="$3"
    case "$THERMAL_CPU_FAN_STATUS" in
      "failed(4)")
        fan_worst="red"
        ;;
      "degraded(3)")
        # Only raise the state, never lower a "red" from the system fan.
        if [ "$fan_worst" != "red" ] ; then
          fan_worst="yellow"
        fi
        ;;
      *)
        if [ "$fan_worst" = "" ] ; then
          fan_worst="green"
        fi
        ;;
    esac
    if [ "$fan_msg" != "" ] ; then
      fan_msg="$fan_msg / "
    fi
    fan_msg="${fan_msg}Thermal cpu fan status: $THERMAL_CPU_FAN_STATUS"
  fi

  if [ "$fan_worst" = "" ] ; then
    COLOR="blue"
    echo "Unknown: no fan status found in SNMP response"
    return
  fi

  COLOR="$fan_worst"
  echo "$fan_msg"
#####
#####  End of get_fan_info proc
#####
}

# get_temperature_info - evaluate every temperature sensor found in the
# SNMP dump ($HEALTH_LOG).
# Prints one line holding a "<locale> - <temp> C - <percent>%" entry per
# sensor (entries end with " <BR> ") and sets COLOR to the worst state of
# all sensors:
#   red    - sensor condition is not ok(2), or the reading is above
#            PERCENT_TEMP_PANIC percent of the sensor's shutdown threshold
#   yellow - the reading is above PERCENT_TEMP_WARN percent of the threshold
#   green  - every sensor is fine
#   blue   - no sensor data was found
# Uses the globals GREP, TAIL, SED, EXPR, HEALTH_LOG, PERCENT_TEMP_WARN,
# PERCENT_TEMP_PANIC and NUMBER_OF_TEMP_SENSORS (fallback sensor count).
# Each dump line is parsed as "<oid> = <value>", i.e. the value is word 3.
get_temperature_info()
{
  ###############################
  # Check for temperature errors
  ###############################

  temp_worst=""   # worst state over all sensors; empty means nothing seen

  # The sensor count is taken from the trailing index of the last
  # cpqHeTemperatureIndex line in the dump.
  RESULT=$($GREP cpqHeTemperatureIndex "$HEALTH_LOG" |
           $TAIL -n 1 |
           $SED 's/.*\.\([0-9][0-9]*\) .*/\1/')

  if [ "$RESULT" != "" ] ; then
    NUMBER_OF_TEMP_SENSORS="$RESULT"
  fi

  # Anything that is not a plain number would break the loop test below.
  case "${NUMBER_OF_TEMP_SENSORS:-0}" in
    *[!0-9]*) NUMBER_OF_TEMP_SENSORS=0 ;;
    *)        NUMBER_OF_TEMP_SENSORS="${NUMBER_OF_TEMP_SENSORS:-0}" ;;
  esac

  count=1
  while [ "$count" -le "$NUMBER_OF_TEMP_SENSORS" ]
  do
    # Start every sensor with empty values so nothing leaks over from the
    # previous one when a line is missing.
    SENSOR_DESC=""
    TEMP_CELSIUS=""
    MAX_TEMP_CELSIUS=""
    TEMP_STATUS=""

    # The patterns end in a blank so index 1 does not also match 10, 11, ...

    # This specifies the location of the temperature sensor
    # present in the system.
    RESULT=$($GREP "cpqHeTemperatureLocale\.0\.${count} " "$HEALTH_LOG")
    if [ "$RESULT" != "" ] ; then
      set -- $RESULT
      SENSOR_DESC="$3"
    fi

    # This is the current temperature sensor reading in degrees
    # celsius.
    RESULT=$($GREP "cpqHeTemperatureCelsius\.0\.${count} " "$HEALTH_LOG")
    if [ "$RESULT" != "" ] ; then
      set -- $RESULT
      TEMP_CELSIUS="$3"
    fi

    # This is the shutdown threshold temperature sensor setting
    # in degrees celsius.  This is the temperature in which the
    # sensor will be considered to be in a failed state thus
    # causing the system to be shutdown.
    RESULT=$($GREP "cpqHeTemperatureThreshold\.0\.${count} " "$HEALTH_LOG")
    if [ "$RESULT" != "" ] ; then
      set -- $RESULT
      MAX_TEMP_CELSIUS="$3"
    fi

    # The Temperature sensor condition.
    RESULT=$($GREP "cpqHeTemperatureCondition\.0\.${count} " "$HEALTH_LOG")
    if [ "$RESULT" != "" ] ; then
      set -- $RESULT
      TEMP_STATUS="$3"
    fi

    # An index with no data at all is skipped silently.
    if [ "$SENSOR_DESC$TEMP_CELSIUS$MAX_TEMP_CELSIUS$TEMP_STATUS" != "" ] ; then
      sensor_state="green"
      printf '%s' "$SENSOR_DESC - $TEMP_CELSIUS C - "

      if [ "$TEMP_STATUS" != "ok(2)" ] ; then
        sensor_state="red"
        printf '%s' "Temperature errors ${TEMP_STATUS} - "
      fi

      # The percentage is only computed from two plain non-negative
      # numbers with a threshold above zero (avoids expr errors and
      # division by zero).
      PERCENT_TEMP=""
      case "$TEMP_CELSIUS" in
        ''|*[!0-9]*) ;;
        *)
          case "$MAX_TEMP_CELSIUS" in
            ''|*[!0-9]*) ;;
            *)
              if [ "$MAX_TEMP_CELSIUS" -gt 0 ] ; then
                PERCENT_TEMP=$($EXPR "$TEMP_CELSIUS" \* 100 / "$MAX_TEMP_CELSIUS")
              fi
              ;;
          esac
          ;;
      esac

      if [ "$PERCENT_TEMP" = "" ] ; then
        printf '%s' "n/a"
      else
        printf '%s' "${PERCENT_TEMP}%"
        if [ "$PERCENT_TEMP" -gt "$PERCENT_TEMP_PANIC" ] ; then
          sensor_state="red"
          printf '%s' " Temperature too high: ${PERCENT_TEMP}% (panic at ${PERCENT_TEMP_PANIC}%)"
        elif [ "$PERCENT_TEMP" -gt "$PERCENT_TEMP_WARN" ] ; then
          if [ "$sensor_state" != "red" ] ; then
            sensor_state="yellow"
          fi
          printf '%s' " Temperature high: ${PERCENT_TEMP}% (warn at ${PERCENT_TEMP_WARN}%)"
        fi
      fi
      printf '%s' " <BR> "

      # Fold this sensor into the overall result; never lower the state.
      case "$sensor_state" in
        red)
          temp_worst="red"
          ;;
        yellow)
          if [ "$temp_worst" != "red" ] ; then
            temp_worst="yellow"
          fi
          ;;
        *)
          if [ "$temp_worst" = "" ] ; then
            temp_worst="green"
          fi
          ;;
      esac
    fi

    count=$($EXPR "$count" + 1)
  done

  if [ "$temp_worst" = "" ] ; then
    COLOR="blue"
    echo "Unknown: no temperature sensors found in SNMP response"
    return
  fi

  # Terminate the single output line.
  echo ""
  COLOR="$temp_worst"
#####
#####  End of get_temperature_info proc
#####
}


# get_log_info - evaluate the condition of the Integrated Management Log
# as found in the SNMP dump ($HEALTH_LOG).
# Prints one status line and sets COLOR:
#   green  - log condition is ok(2)
#   yellow - log condition is degraded(3) or other(1)
#   red    - log condition is failed(4)
#   blue   - the log is not supported, a value is missing from the dump,
#            or the condition is not one of the values above
# Uses the globals GREP and HEALTH_LOG.  Each dump line is parsed as
# "<oid> = <value>", i.e. the value is word 3.
#
# NOTE: only the overall log condition is checked.  Walking the individual
# log entries (cpqHeEventLogEntrySeverity, cpqHeEventLogEntryCount,
# cpqHeEventLogErrorDesc, ...) was already disabled in the original
# script and is not done here.
get_log_info()
{
  #######################
  # Check for log errors
  #######################

  # This value specifies if this system supports the Integrated
  # Management Log feature.

  RESULT=$($GREP cpqHeEventLogSupported.0 "$HEALTH_LOG")
  if [ "$RESULT" = "" ] ; then
    COLOR="blue"
    echo "Unknown: cpqHeEventLogSupported not found in SNMP response"
    return
  fi
  set -- $RESULT
  EVENT_LOG="$3"

  if [ "$EVENT_LOG" != "supported(3)" ] ; then
    COLOR="blue"
    echo "Integrated Management Log not supported: $EVENT_LOG"
    return
  fi

  RESULT=$($GREP cpqHeEventLogCondition.0 "$HEALTH_LOG")
  if [ "$RESULT" = "" ] ; then
    COLOR="blue"
    echo "Unknown: cpqHeEventLogCondition not found in SNMP response"
    return
  fi
  set -- $RESULT
  EVENT_LOG_CONDITION="$3"

  case "$EVENT_LOG_CONDITION" in
    "failed(4)")
      COLOR="red"
      echo "Critical Log Condition problem: ${EVENT_LOG_CONDITION}"
      ;;
    "degraded(3)"|"other(1)")
      if [ "$COLOR" != "red" ] ; then
        COLOR="yellow"
      fi
      echo "Log Condition problem: ${EVENT_LOG_CONDITION}"
      ;;
    "ok(2)")
      COLOR="green"
      echo "Event Log OK"
      ;;
    *)
      COLOR="blue"
      echo "Unknown state: $EVENT_LOG_CONDITION"
      ;;
  esac
#####
#####  End of get_log_info proc
#####
}

#
## Remove the SNMP dump and leave the script with the exit status that
## matches COLOR: green -> 0, yellow -> 1, red -> 2, anything else -> 3.
##
#
check_values()
{
  $RM -f $HEALTH_LOG

  case "$COLOR" in
    green)
      exit 0
      ;;
    yellow)
      exit 1
      ;;
    red)
      exit 2
      ;;
    *)
      exit 3
      ;;
  esac
}

# print_help - print version, license notice and the full option list,
# then leave the script with exit status 3.
print_help()
{
echo "check_cpqim v0.1"
echo "The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute"
echo "copies of the plugins under the terms of the GNU General Public License."
echo "For more information about these matters, see the file named COPYING."
echo "Copyright (c) 2000 Bo Kersey/Karl DeBisschop"
echo ""
echo "Usage: check_cpqim -H <host> -C <community> -L <function>"
echo ""
echo "-H hostname    host to query"
echo "-C community   SNMP community string"
echo "-L function    test to run: MEMORY, LOG, FAN, THERMAL or TEMP"
echo "-h             print this help"
echo ""
exit 3
}

#####
#####  Main body
#####

# Exit and complain if no argument(s) given.
# The argument count is tested instead of an unquoted $1, which breaks on
# arguments containing blanks and only worked by accident when $1 is empty.
if [ $# -eq 0 ] ; then
  echo "No target host specified"
  echo "Usage: check_cpqim -H <host> -C <community> -L <function>"
  exit 3
fi

# Parse the command line.
#   -H host, -C community, -L test name, -h help
#   -w / -c store WARNING / CRITICAL
#   -V and -t <arg> are accepted but have no effect
# Any other option ends the script with a usage message and exit status 3.
while getopts "hVt:c:w:H:C:L:" Option
do
  case "$Option" in
    h     ) print_help;;
    H     ) HOST=$OPTARG;;
    C     ) COMMUNITY=$OPTARG;;
    L     ) TEST=$OPTARG;;
    w     ) WARNING=$OPTARG;;
    c     ) CRITICAL=$OPTARG;;
    V | t ) ;;
    # getopts reports an unknown option or a missing argument as "?".
    *     )
      echo "Usage: check_cpqim -H <host> -C <community> -L <function>"
      exit 3
      ;;
  esac
done
shift $(($OPTIND - 1))

#
## Create the temporary file that receives the SNMP dump.
## Sets HEALTH_LOG to the new file below $TMP and installs traps so the
## file is removed when the script ends or is interrupted.
## Leaves the script with exit status 3 when SNMPWALK is not set or the
## file cannot be created.
##
create_temp()
{
# Validate the configuration first so no file is created just to be removed.
if [ "$SNMPWALK" = "" ] ; then
        echo "SNMPWALK var not set - Script must be run from 'snmp'..."
        exit 3
fi

HEALTH_LOG=$($MKTEMP -q "$TMP/cpq.XXXXXX")
if [ $? -ne 0 ] || [ "$HEALTH_LOG" = "" ] ; then
   echo "Can't create temp file, exiting..."
   exit 3
fi

# Single quotes: $RM and $HEALTH_LOG are expanded when the trap fires.
# The EXIT trap does not call exit itself, so the script's exit status
# is left untouched.
trap '$RM -f "$HEALTH_LOG"' EXIT
trap '$RM -f "$HEALTH_LOG"; exit 3' HUP INT TERM
}


# get_snmpwalk - walk the subtree named in SNMPVAL on HOST and append the
# output to HEALTH_LOG.  On failure the dump is removed and the script
# ends with exit status 3.
get_snmpwalk()
{
# $SNMPWALK and $RM stay unquoted so extra options stored in them still
# split into words; the data arguments are quoted so a community string or
# host with blanks or glob characters is passed through unchanged.
$SNMPWALK -c "$COMMUNITY" "$HOST" "$SNMPVAL" >> "$HEALTH_LOG"
if [ $? -ne 0 ]; then
   echo "Snmpwalk failed"
   $RM -f "$HEALTH_LOG"
   exit 3
fi
}

# get_snmpget - fetch the single object named in SNMPVAL from HOST and
# append the output to HEALTH_LOG.  On failure the dump is removed and the
# script ends with exit status 3.
get_snmpget()
{
# $SNMPGET and $RM stay unquoted so extra options stored in them still
# split into words; the data arguments are quoted so a community string or
# host with blanks or glob characters is passed through unchanged.
$SNMPGET -c "$COMMUNITY" "$HOST" "$SNMPVAL" >> "$HEALTH_LOG"
if [ $? -ne 0 ] ; then
   echo "Snmpget failed"
   $RM -f "$HEALTH_LOG"
   exit 3
fi
}
# Validate type input and get data.
# Every known test creates the temp file, fetches the SNMP data it needs,
# evaluates it and then calls check_values, which exits the script.
case "$TEST" in
  MEMORY)
    create_temp
    SNMPVAL="cpqHeCorrectableMemory"
    get_snmpwalk
    get_memory_info
    check_values
    ;;
  THERMAL)
    create_temp
    SNMPVAL="cpqHeThermal"
    get_snmpwalk
    get_thermal_info
    check_values
    ;;
  TEMP)
    create_temp
    SNMPVAL="cpqHeTemperatureEntry"
    get_snmpwalk
    get_temperature_info
    check_values
    ;;
  LOG)
    # Two single objects are fetched; both end up in the same dump file.
    create_temp
    SNMPVAL="cpqHeEventLogSupported.0"
    get_snmpget
    SNMPVAL="cpqHeEventLogCondition.0"
    get_snmpget
    get_log_info
    check_values
    ;;
  FAN)
    create_temp
    SNMPVAL="cpqHeThermal"
    get_snmpwalk
    get_fan_info
    check_values
    ;;
  *)
    # Type selection failed..
    echo "No test selected. Use -L <TYPE> where TYPE can be any of: MEMORY / TEMP / THERMAL / LOG / FAN"
    exit 3
    ;;
esac


#
# Safety net: every branch above exits, so this is only reached if
# check_values ever returns instead of exiting.
#
#clean up our mess
if [ -n "$HEALTH_LOG" ] ; then
  $RM -f "$HEALTH_LOG"
fi

#####################################################################
# End of check_cpqim.sh
#####################################################################

