diff --git a/log.log b/log.log new file mode 100644 index 0000000..9e8b7bf --- /dev/null +++ b/log.log @@ -0,0 +1,34 @@ +Mon 26 Sep 2022 16:31:23 ACST +============================== + +checking node google1: + +PING 8.8.8.8 (8.8.8.8) 56(84) bytes of data. +64 bytes from 8.8.8.8: icmp_seq=1 ttl=57 time=22.5 ms +64 bytes from 8.8.8.8: icmp_seq=2 ttl=57 time=22.5 ms +64 bytes from 8.8.8.8: icmp_seq=3 ttl=57 time=23.7 ms +64 bytes from 8.8.8.8: icmp_seq=4 ttl=57 time=23.0 ms +64 bytes from 8.8.8.8: icmp_seq=5 ttl=57 time=23.4 ms + +--- 8.8.8.8 ping statistics --- +5 packets transmitted, 5 received, 0% packet loss, time 4006ms +rtt min/avg/max/mdev = 22.525/23.038/23.742/0.472 ms +node google1 UP +============================== + +checking node NG-router: + +PING 149.36.0.253 (149.36.0.253) 56(84) bytes of data. +From 217.17.208.21 icmp_seq=1 Time to live exceeded +From 217.17.208.21 icmp_seq=2 Time to live exceeded + +--- 149.36.0.253 ping statistics --- +3 packets transmitted, 0 received, +2 errors, 100% packet loss, time 2000ms + +node NG-router DOWN +============================== + +Mon 26 Sep 2022 16:31:29 ACST +cntNodes 2 +cntNodesDown 1 +cntNotify 0 diff --git a/path-loss.sh b/path-loss.sh index 1aace72..21278a3 100644 --- a/path-loss.sh +++ b/path-loss.sh @@ -13,122 +13,137 @@ # apt-get install python-pip # pip install slacker-cli # #git clone https://pypi.python.org/pypi/slacker-cli/ +while true +do + # user defined settings + txtEmail="name@example.com" + txtSubject="Path Loss Report" + txtSeparator="==============================\n" + nTrigger=4 + nAttempts=5 + txtPdServiceKey="000000000000000" + txtSlackBotToken="xoxb-250697658007-4127858946773-gyM0PMGA5XPbh3ZEYot4Yy5Z" + txtSlackChannel="general" -# user defined settings -txtEmail="name@example.com" -txtSubject="Path Loss Report" -txtSeparator="==============================\n" -nTrigger=4 -nAttempts=5 -txtPdServiceKey="000000000000000" -txtSlackBotToken="xoxb-something-or-other" 
-txtSlackChannel="pathloss" + declare -A nodes + nodes=( \ + [217.17.208.20]="NG-firsthop" \ + [149.36.0.254]="NG-router" \ + ) -declare -A nodes -nodes=( \ - [8.8.8.8]="google1 member" \ - [8.8.4.4]="google2" \ - ) + # local variables + status="|" + cntNotify=0 + tmpLog=$(mktemp) + tmpPing=$(mktemp) + cntNodes=0 + cntNodesDown=0 + cntMembers=0 + cntMembersDown=0 + declare -a items + echo "tmplog @ ${tmpLog}" + # preload output + date > ${tmpLog} -# local variables -status="|" -cntNotify=0 -tmpLog=$(mktemp) -tmpPing=$(mktemp) -cntNodes=0 -cntNodesDown=0 -cntMembers=0 -cntMembersDown=0 -declare -a items + # loop through nodes and test + for node in ${!nodes[*]}; do -# preload output -date > ${tmpLog} + ((cntNodes++)) -# loop through nodes and test -for node in ${!nodes[*]}; do + # split out node details + # 0: alias/name + # 1: optional 'member' for determining edge outage + info=${nodes[${node}]} + items[1]="none" + ix=0 + for arg in ${info}; do + items[ix]=${arg} + ((ix++)) + done - ((cntNodes++)) + name="${items[0]}" - # split out node details - # 0: alias/name - # 1: optional 'member' for determining edge outage - info=${nodes[${node}]} - items[1]="none" - ix=0 - for arg in ${info}; do - items[ix]=${arg} - ((ix++)) - done - - name="${items[0]}" - - echo -e ${txtSeparator} >> ${tmpLog}; - echo "checking node ${name}:" >> ${tmpLog}; - echo "" >> ${tmpLog} - - ping -W 1 -c ${nAttempts} ${node} > ${tmpPing} - cat ${tmpPing} >> ${tmpLog} - - value=$(grep transmitted ${tmpPing} | cut -d ' ' -f 4) - if [[ nTrigger -ge value ]]; then - flagNxt="dn" - ((cntNodesDown++)) - else - flagNxt="up" - fi - - if test "member" == "${items[1]}"; then - ((cntMembers++)) - if test "dn" = "${flagNxt}"; then ((cntMembersDown++)); fi - fi - - flagPrv="na" - - if [[ -f /tmp/pl.dn.${node} ]]; then - flagPrv="dn" - if test "up" = "${flagNxt}"; then - rm /tmp/pl.dn.${node} - fi - fi - - if [[ -f /tmp/pl.up.${node} ]]; then - flagPrv="up" - if test "dn" = "${flagNxt}"; then - rm 
/tmp/pl.up.${node} - fi - fi - - if test "${flagPrv}" != "${flagNxt}"; then - touch /tmp/pl.${flagNxt}.${node} - ((cntNotify++)); - mtr -w -b --report ${node} >> ${tmpLog}; - status="${status} ${name} ${flagPrv}>${flagNxt} |" + echo -e ${txtSeparator} >> ${tmpLog}; + echo "checking node ${name}:" >> ${tmpLog}; echo "" >> ${tmpLog} - echo "Above State Change: ${flagPrv}>${flagNxt}" >> ${tmpLog} + + ping -W 1 -c ${nAttempts} ${node} > ${tmpPing} + cat ${tmpPing} >> ${tmpLog} + + value=$(grep transmitted ${tmpPing} | cut -d ' ' -f 4) + if [[ nTrigger -ge value ]]; then + flagNxt="dn" + ((cntNodesDown++)) + echo "node ${name} DOWN" >> ${tmpLog}; + + else + echo "node ${name} UP" >> ${tmpLog}; + flagNxt="up" + fi + + if test "member" == "${items[1]}"; then + ((cntMembers++)) + if test "dn" = "${flagNxt}"; then ((cntMembersDown++)); fi + fi + + flagPrv="na" + + if [[ -f /tmp/pl.dn.${node} ]]; then + flagPrv="dn" + if test "up" = "${flagNxt}"; then + rm /tmp/pl.dn.${node} + fi + fi + + if [[ -f /tmp/pl.up.${node} ]]; then + flagPrv="up" + if test "dn" = "${flagNxt}"; then + rm /tmp/pl.up.${node} + fi + fi + + if test "${flagPrv}" != "${flagNxt}"; then + echo "Above State Change: ${flagPrv}>${flagNxt}" >> ${tmpLog} + touch /tmp/pl.${flagNxt}.${node} + ((cntNotify++)); + mtr -w -b --report -n ${node} >> ${tmpLog}; + status="${status} ${name} ${flagPrv}>${flagNxt} |" + echo "" >> ${tmpLog} fi done -# footer -echo -e ${txtSeparator} >> ${tmpLog} -date >> ${tmpLog} - -# notify on failure -if [[ cntNodes -eq cntNodesDown ]]; then - logger "path-loss - all nodes unreachable" -else - # if something to notify - if [[ cntNotify -gt 0 ]]; then - # need a pagerduty alert if all important members are down - if [[ cntMembers -eq cntMembersDown ]]; then - response=$(cat ${tmpLog} | ./pd-trigger.sh -L -s "${txtPdServiceKey}" -d "${status}") - fi - # attempt an email - #cat ${tmpLog} | mail -s "${txtSubject}:${status}" ${txtEmail}; - cat ${tmpLog} | /usr/local/bin/slacker -c 
${txtSlackChannel} -t ${txtSlackBotToken} + # footer + echo -e ${txtSeparator} >> ${tmpLog} + date >> ${tmpLog} + echo "cntNodes ${cntNodes}" >> ${tmpLog}; + echo "cntNodesDown ${cntNodesDown}" >> ${tmpLog}; + echo "cntNotify ${cntNotify}" >> ${tmpLog}; + # notify on failure + if [[ cntNodes -eq cntNodesDown ]]; then + echo "path-loss - all nodes unreachable" >> ${tmpLog} + logger "path-loss - all nodes unreachable" + else + # if something to notify + if [[ cntNotify -gt 0 ]]; then + echo "Sending an alert" >> ${tmpLog} + # need a pagerduty alert if all important members are down + # if [[ cntMembers -eq cntMembersDown ]]; then + #response=$(cat ${tmpLog} | ./pd-trigger.sh -L -s "${txtPdServiceKey}" -d "${status}") + #fi + # attempt an email + #cat ${tmpLog} | mail -s "${txtSubject}:${status}" ${txtEmail}; + echo "All members down sending slack alert with /usr/local/bin/slacker -c ${txtSlackChannel} -t ${txtSlackBotToken}" >> ${tmpLog} + cat ${tmpLog} | slacker -c ${txtSlackChannel} -t ${txtSlackBotToken} + # fi fi fi -# clean up -rm ${tmpLog} -rm ${tmpPing} + cp ${tmpLog} ./log.log + cp ${tmpPing} ./ping.log + + # clean up + rm ${tmpLog} + rm ${tmpPing} +done +exit 0 \ No newline at end of file diff --git a/ping.log b/ping.log new file mode 100644 index 0000000..3f7a3ba --- /dev/null +++ b/ping.log @@ -0,0 +1,7 @@ +PING 149.36.0.253 (149.36.0.253) 56(84) bytes of data. 
"""Ping a fixed set of hosts and push their loss/RTT gauges to a Pushgateway.

Every entry of ``tests`` is probed in turn, forever.  For each probe the
measured packet loss and maximum round-trip time are compared with the
per-destination limits, breaches are printed, and the two values are pushed
as gauges labelled with the destination IP.
"""
import json
import threading
from typing import List, Optional

import pingparsing
from prometheus_client import CollectorRegistry, Counter, Gauge, push_to_gateway

# Where the gauges are pushed and under which job name.
PUSHGATEWAY_ADDRESS = '10.10.110.250:9091'
PUSHGATEWAY_JOB = 'cory_test_job1'

# Values assigned to PingTransmitter.count / PingTransmitter.timeout.
PING_COUNT = 10
PING_TIMEOUT = 2

# Destinations to probe, keyed by an arbitrary test id.
#   dest                       -- address handed to the ping transmitter
#   name                       -- human-readable label used in the console output
#   packet_loss_rate_permitted -- loss above this value is reported as an error
#   rtt_max_permitted          -- maximum RTT above this value is reported as an error
tests = {
    "1": {
        'dest': "66.29.128.140",
        'name': 'African webserver',
        'packet_loss_rate_permitted': 50,
        'rtt_max_permitted': 150,
    },
    "2": {
        'dest': "8.8.8.8",
        'name': 'Google DNS',
        'packet_loss_rate_permitted': 0,
        'rtt_max_permitted': 150,
    },
    "3": {
        'dest': "1.1.1.1",
        'name': 'Cloudflare DNS',
        'packet_loss_rate_permitted': 0,
        'rtt_max_permitted': 150,
    },
}


def _threshold_breaches(
    packet_loss_rate: Optional[float],
    rtt_max: Optional[float],
    packet_loss_rate_permitted: float,
    rtt_max_permitted: float,
) -> List[str]:
    """Return the error messages for every limit the measurement exceeds.

    A measurement that is ``None`` cannot be compared with its limit; a
    missing RTT is skipped silently, while a missing loss figure is reported
    because it means the probe produced no usable statistics.
    """
    breaches = []
    if rtt_max is not None and rtt_max > rtt_max_permitted:
        breaches.append("ERROR: rtt_max_permitted exceeded!")
    if packet_loss_rate is None:
        breaches.append("ERROR: packet_loss_rate unavailable!")
    elif packet_loss_rate > packet_loss_rate_permitted:
        breaches.append("ERROR: packet_loss_rate_permitted exceeded!")
    return breaches


def pinger(test_id):
    """Probe one configured destination, report breaches, and push its gauges.

    Args:
        test_id: Key into ``tests`` selecting the destination to probe.

    Raises:
        KeyError: If ``test_id`` is not a key of ``tests``.
    """
    test = tests[test_id]
    dest = test['dest']
    name = test['name']
    rtt_max_permitted = test['rtt_max_permitted']
    packet_loss_rate_permitted = test['packet_loss_rate_permitted']

    print("Testing {} on IP {} with RTT threshold of {} and packet loss max of {}".format(
        name, dest, rtt_max_permitted, packet_loss_rate_permitted))

    ping_parser = pingparsing.PingParsing()
    transmitter = pingparsing.PingTransmitter()
    transmitter.destination = dest
    transmitter.count = PING_COUNT
    transmitter.timeout = PING_TIMEOUT

    # Parse once and reuse the dict for both the checks and the JSON dump.
    data = ping_parser.parse(transmitter.ping()).as_dict()
    packet_loss_rate = data["packet_loss_rate"]
    # rtt_max is None when no RTT could be measured; every use below must
    # tolerate that.
    # NOTE(review): packet_loss_rate is assumed to be None-able as well when
    # the ping output has no statistics -- confirm against pingparsing.
    rtt_max = data["rtt_max"]

    print("Dest: {} Loss: {}% RTT: {}ms".format(dest, packet_loss_rate, rtt_max))

    breaches = _threshold_breaches(
        packet_loss_rate, rtt_max, packet_loss_rate_permitted, rtt_max_permitted)
    for message in breaches:
        print(message)
    if breaches:
        print(json.dumps(data, indent=4))

    # A fresh registry per probe so that only this destination's samples are
    # part of the push.
    registry = CollectorRegistry()
    if packet_loss_rate is not None:
        g_pl = Gauge('packet_loss_rate', 'Amt of packet loss',
                     ['destination_ip'], registry=registry)
        g_pl.labels(dest).set(packet_loss_rate)
    if rtt_max is not None:
        g_rtt = Gauge('rtt_max', 'Round trip time',
                      ['destination_ip'], registry=registry)
        g_rtt.labels(dest).set(rtt_max)

    try:
        # push_to_gateway replaces every metric stored under the same
        # job/grouping key.  Without a per-destination grouping key each
        # push would wipe the series of the destination probed before it.
        push_to_gateway(
            PUSHGATEWAY_ADDRESS,
            job=PUSHGATEWAY_JOB,
            registry=registry,
            grouping_key={'destination_ip': dest},
        )
    except OSError as exc:
        # A gateway that is temporarily unreachable must not stop the
        # monitor; report it and carry on with the next destination.
        print("ERROR: could not push metrics to {}: {}".format(
            PUSHGATEWAY_ADDRESS, exc))


if __name__ == "__main__":
    # Destinations are probed one after another, forever.
    while True:
        for _id in tests:
            pinger(_id)
            # t = threading.Thread(target=pinger, args=(_id,))
            # t.start()