This commit is contained in:
Cory Hawkless 2022-10-03 14:26:28 +10:30
parent a907034a9b
commit 822720456a
4 changed files with 245 additions and 104 deletions

34
log.log Normal file
View File

@ -0,0 +1,34 @@
Mon 26 Sep 2022 16:31:23 ACST
==============================
checking node google1:
PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data.
64 bytes from 8.8.8.8: icmp_seq=1 ttl=57 time=22.5 ms
64 bytes from 8.8.8.8: icmp_seq=2 ttl=57 time=22.5 ms
64 bytes from 8.8.8.8: icmp_seq=3 ttl=57 time=23.7 ms
64 bytes from 8.8.8.8: icmp_seq=4 ttl=57 time=23.0 ms
64 bytes from 8.8.8.8: icmp_seq=5 ttl=57 time=23.4 ms
--- 8.8.8.8 ping statistics ---
5 packets transmitted, 5 received, 0% packet loss, time 4006ms
rtt min/avg/max/mdev = 22.525/23.038/23.742/0.472 ms
node google1 UP
==============================
checking node NG-router:
PING 149.36.0.253 (149.36.0.253) 56(84) bytes of data.
From 217.17.208.21 icmp_seq=1 Time to live exceeded
From 217.17.208.21 icmp_seq=2 Time to live exceeded
--- 149.36.0.253 ping statistics ---
3 packets transmitted, 0 received, +2 errors, 100% packet loss, time 2000ms
node NG-router DOWN
==============================
Mon 26 Sep 2022 16:31:29 ACST
cntNodes 2
cntNodesDown 1
cntNotify 0

View File

@ -13,39 +13,40 @@
# apt-get install python-pip # apt-get install python-pip
# pip install slacker-cli # pip install slacker-cli
# #git clone https://pypi.python.org/pypi/slacker-cli/ # #git clone https://pypi.python.org/pypi/slacker-cli/
while true
do
# user defined settings
txtEmail="name@example.com"
txtSubject="Path Loss Report"
txtSeparator="==============================\n"
nTrigger=4
nAttempts=5
txtPdServiceKey="000000000000000"
txtSlackBotToken="xoxb-250697658007-4127858946773-gyM0PMGA5XPbh3ZEYot4Yy5Z"
txtSlackChannel="general"
# user defined settings declare -A nodes
txtEmail="name@example.com" nodes=( \
txtSubject="Path Loss Report" [217.17.208.20]="NG-firsthop" \
txtSeparator="==============================\n" [149.36.0.254]="NG-router" \
nTrigger=4
nAttempts=5
txtPdServiceKey="000000000000000"
txtSlackBotToken="xoxb-something-or-other"
txtSlackChannel="pathloss"
declare -A nodes
nodes=( \
[8.8.8.8]="google1 member" \
[8.8.4.4]="google2" \
) )
# local variables # local variables
status="|" status="|"
cntNotify=0 cntNotify=0
tmpLog=$(mktemp) tmpLog=$(mktemp)
tmpPing=$(mktemp) tmpPing=$(mktemp)
cntNodes=0 cntNodes=0
cntNodesDown=0 cntNodesDown=0
cntMembers=0 cntMembers=0
cntMembersDown=0 cntMembersDown=0
declare -a items declare -a items
echo "tmplog @ ${tmpLog}"
# preload output
date > ${tmpLog}
# preload output # loop through nodes and test
date > ${tmpLog} for node in ${!nodes[*]}; do
# loop through nodes and test
for node in ${!nodes[*]}; do
((cntNodes++)) ((cntNodes++))
@ -73,7 +74,10 @@ for node in ${!nodes[*]}; do
if [[ nTrigger -ge value ]]; then if [[ nTrigger -ge value ]]; then
flagNxt="dn" flagNxt="dn"
((cntNodesDown++)) ((cntNodesDown++))
echo "node ${name} DOWN" >> ${tmpLog};
else else
echo "node ${name} UP" >> ${tmpLog};
flagNxt="up" flagNxt="up"
fi fi
@ -99,36 +103,47 @@ for node in ${!nodes[*]}; do
fi fi
if test "${flagPrv}" != "${flagNxt}"; then if test "${flagPrv}" != "${flagNxt}"; then
echo "Above State Change: ${flagPrv}>${flagNxt}" >> ${tmpLog}
touch /tmp/pl.${flagNxt}.${node} touch /tmp/pl.${flagNxt}.${node}
((cntNotify++)); ((cntNotify++));
mtr -w -b --report ${node} >> ${tmpLog}; mtr -w -b --report -n ${node} >> ${tmpLog};
status="${status} ${name} ${flagPrv}>${flagNxt} |" status="${status} ${name} ${flagPrv}>${flagNxt} |"
echo "" >> ${tmpLog} echo "" >> ${tmpLog}
echo "Above State Change: ${flagPrv}>${flagNxt}" >> ${tmpLog}
fi fi
done done
# footer # footer
echo -e ${txtSeparator} >> ${tmpLog} echo -e ${txtSeparator} >> ${tmpLog}
date >> ${tmpLog} date >> ${tmpLog}
echo "cntNodes ${cntNodes}" >> ${tmpLog};
# notify on failure echo "cntNodesDown ${cntNodesDown}" >> ${tmpLog};
if [[ cntNodes -eq cntNodesDown ]]; then echo "cntNotify ${cntNotify}" >> ${tmpLog};
# notify on failure
if [[ cntNodes -eq cntNodesDown ]]; then
echo "path-loss - all nodes unreachable" >> ${tmpLog}
logger "path-loss - all nodes unreachable" logger "path-loss - all nodes unreachable"
else else
# if something to notify # if something to notify
if [[ cntNotify -gt 0 ]]; then if [[ cntNotify -gt 0 ]]; then
echo "Sending an alert" >> ${tmpLog}
# need a pagerduty alert if all important members are down # need a pagerduty alert if all important members are down
if [[ cntMembers -eq cntMembersDown ]]; then # if [[ cntMembers -eq cntMembersDown ]]; then
response=$(cat ${tmpLog} | ./pd-trigger.sh -L -s "${txtPdServiceKey}" -d "${status}") #response=$(cat ${tmpLog} | ./pd-trigger.sh -L -s "${txtPdServiceKey}" -d "${status}")
fi #fi
# attempt an email # attempt an email
#cat ${tmpLog} | mail -s "${txtSubject}:${status}" ${txtEmail}; #cat ${tmpLog} | mail -s "${txtSubject}:${status}" ${txtEmail};
cat ${tmpLog} | /usr/local/bin/slacker -c ${txtSlackChannel} -t ${txtSlackBotToken} echo "All members down sending slack alert with /usr/local/bin/slacker -c ${txtSlackChannel} -t ${txtSlackBotToken}" >> ${tmpLog}
cat ${tmpLog} | slacker -c ${txtSlackChannel} -t ${txtSlackBotToken}
# fi
fi fi
fi fi
# clean up cp ${tmpLog} ./log.log
rm ${tmpLog} cp ${tmpPing} ./ping.log
rm ${tmpPing}
# clean up
rm ${tmpLog}
rm ${tmpPing}
done
exit 0

7
ping.log Normal file
View File

@ -0,0 +1,7 @@
PING 149.36.0.253 (149.36.0.253) 56(84) bytes of data.
From 217.17.208.21 icmp_seq=1 Time to live exceeded
From 217.17.208.21 icmp_seq=2 Time to live exceeded
--- 149.36.0.253 ping statistics ---
3 packets transmitted, 0 received, +2 errors, 100% packet loss, time 2000ms

85
pingmon.py Normal file
View File

@ -0,0 +1,85 @@
from termios import TAB3
import threading
from types import NoneType
from prometheus_client import Counter
import json
from telnetlib import theNULL
import pingparsing
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway
# Ping targets, keyed by test id (iterated in insertion order by the main loop).
# Each entry carries:
#   dest                        - IP address handed to the ping transmitter
#   name                        - human-readable label used in console output
#   packet_loss_rate_permitted  - measured loss rate above this flags an error
#   rtt_max_permitted           - measured maximum RTT above this flags an error
tests = {
    "1": dict(
        dest="66.29.128.140",
        name="African webserver",
        packet_loss_rate_permitted=50,
        rtt_max_permitted=150,
    ),
    "2": dict(
        dest="8.8.8.8",
        name="Google DNS",
        packet_loss_rate_permitted=0,
        rtt_max_permitted=150,
    ),
    "3": dict(
        dest="1.1.1.1",
        name="Cloudflare DNS",
        packet_loss_rate_permitted=0,
        rtt_max_permitted=150,
    ),
}
def pinger(test_id):
    """Ping one configured destination, report breaches and push metrics.

    Looks up ``tests[test_id]``, sends 10 echo requests through
    ``pingparsing`` (transmitter timeout set to 2), prints a one-line
    summary, prints the full parsed statistics as JSON when a threshold is
    exceeded, and finally pushes the ``packet_loss_rate`` and ``rtt_max``
    gauges (labelled with the destination IP) to the Pushgateway.

    Args:
        test_id: Key into the module-level ``tests`` dict.

    Raises:
        KeyError: If ``test_id`` is not present in ``tests``.
    """
    test = tests[test_id]
    dest = test['dest']
    rtt_max_permitted = test['rtt_max_permitted']
    packet_loss_rate_permitted = test['packet_loss_rate_permitted']

    print("Testing {} on IP {} with RTT threshold of {} and packet loss max of {}".format(
        test['name'], dest, rtt_max_permitted, packet_loss_rate_permitted))

    ping_parser = pingparsing.PingParsing()
    transmitter = pingparsing.PingTransmitter()
    transmitter.destination = dest
    transmitter.count = 10
    transmitter.timeout = 2

    result = transmitter.ping()
    data = ping_parser.parse(result).as_dict()
    packet_loss_rate = data["packet_loss_rate"]
    rtt_max = data["rtt_max"]

    notify = 0
    print("Dest: {} Loss: {}% RTT: {}ms".format(dest, packet_loss_rate, rtt_max))

    # rtt_max is None when no reply was received; ordering comparisons against
    # None raise TypeError, so only compare when a value is actually present.
    if rtt_max is not None and rtt_max > rtt_max_permitted:
        print("ERROR: rtt_max_permitted exceeded!")
        notify = 1

    if packet_loss_rate is None:
        # No statistics could be parsed at all: treat it as a failure rather
        # than crashing on the comparison below.
        print("ERROR: no packet loss statistics returned!")
        notify = 1
    elif packet_loss_rate > packet_loss_rate_permitted:
        print("ERROR: packet_loss_rate_permitted exceeded!")
        notify = 1

    if notify:
        # Reuse the already-parsed statistics instead of parsing twice.
        print(json.dumps(data, indent=4))

    # A fresh registry per call so only this destination's samples are pushed.
    registry = CollectorRegistry()
    g_pl = Gauge('packet_loss_rate', 'Amt of packet loss', ['destination_ip'], registry=registry)
    if packet_loss_rate is not None:
        g_pl.labels(dest).set(packet_loss_rate)
    if rtt_max is not None:
        g_rtt = Gauge('rtt_max', 'Round trip time', ['destination_ip'], registry=registry)
        g_rtt.labels(dest).set(rtt_max)
    push_to_gateway('10.10.110.250:9091', job='cory_test_job1', registry=registry)
if __name__ == "__main__":
    # Run every configured test in turn, forever; each call blocks until its
    # ping run and metrics push have finished.
    while True:
        for test_id in tests:
            pinger(test_id)
            # Threaded variant, currently disabled:
            # t = threading.Thread(target=pinger, args=(test_id,))
            # t.start()