#!/bin/bash
#
# Ceph health watchdog.
#
# Runs `ceph health` once and e-mails $email about state changes, using
# $errorfile as a marker that a problem has already been reported:
#   * status != HEALTH_OK, no marker     -> create marker, send "Bad news" mail
#   * status != HEALTH_OK, marker older
#     than $renotify_minutes minutes     -> refresh marker, send "FYI" reminder
#   * status == HEALTH_OK, marker exists -> remove marker, send "Good news" mail
#   * anything else                      -> do nothing
#
# NOTE(review): presumably this is run periodically (e.g. from cron); the
# reminder throttle only makes sense across repeated invocations.

errorfile="/tmp/ceph-error"
email="notifications@company.com"
renotify_minutes=30

node=$(cat /etc/hostname)
# If the ceph command fails, $health is empty, which is treated as "not OK"
# below, so an unreachable cluster still triggers a notification.
health=$(/usr/bin/ceph health)

#######################################
# Check whether a file's modification time is old enough to re-notify.
# Arguments: $1 - path of the marker file
#            $2 - age threshold in minutes (passed to `find -mmin +N`)
# Returns:   0 if find reports the file as older than the threshold,
#            non-zero otherwise (including when the file does not exist).
#######################################
errorfile_is_stale() {
  local file=$1
  local minutes=$2
  # find prints the path only when the -mmin test matches; a non-empty
  # result therefore means "stale". Errors (missing file) count as not stale.
  [[ -n "$(find "$file" -mmin "+${minutes}" 2>/dev/null)" ]]
}

#######################################
# Mail a one-line message to the notification address.
# Globals:   email (read)
# Arguments: $1 - subject, $2 - message body
#######################################
send_mail() {
  local subject=$1
  local body=$2
  printf '%s\n' "$body" | mail -s "$subject" "$email"
}

if [[ "$health" != "HEALTH_OK" ]]; then
  if [[ ! -f "$errorfile" ]]; then
    # First time we see the problem: create the marker and notify.
    echo "health not ok, and file does not yet exist"
    touch "$errorfile"
    send_mail "ERROR! ceph status $node" \
      "Bad news: Ceph cluster node $node health status became $health"
  elif errorfile_is_stale "$errorfile" "$renotify_minutes"; then
    # Problem persists and the last notification is old enough: reset the
    # marker's timestamp so the next reminder is throttled again, then notify.
    echo "errorfile older than ${renotify_minutes} minutes"
    touch "$errorfile"
    send_mail "ERROR! ceph status $node" \
      "FYI: Ceph cluster node $node health status is still $health"
  fi
elif [[ -f "$errorfile" ]]; then
  # Health is OK again and a problem had been reported: clear the marker.
  rm -f -- "$errorfile"
  echo "good news"
  send_mail "ceph status $node" \
    "Good news: Ceph cluster node $node health status is again $health"
fi