Built a script to watch the conntrack levels inside all qrouter namespaces
This commit is contained in:
parent
ee8d75b637
commit
384eab9966
|
@ -0,0 +1,37 @@
|
|||
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway
|
||||
import sys, random
|
||||
import subprocess
|
||||
from pyroute2 import netns, NSPopen
|
||||
import socket
|
||||
import time
|
||||
|
||||
registry = CollectorRegistry()

# One time series per (host, namespace) pair for the current conntrack table
# usage and for its configured ceiling.
g_count = Gauge('test_nf_conntrack_count', 'nf_conntrack_count inside given namespace', ['host', 'namespace'], registry=registry)
g_max = Gauge('test_nf_conntrack_max', 'nf_conntrack_max inside given namespace', ['host', 'namespace'], registry=registry)
hostname = socket.gethostname()


def _read_ns_file(namespace, path):
    """Return the stripped output of ``cat <path>`` run inside *namespace*.

    The NSPopen object is always waited on and released, even when reading
    fails, so repeated polling does not accumulate unreleased proxies.
    """
    nsp = NSPopen(namespace, ['cat', path], stdout=subprocess.PIPE)
    try:
        output = nsp.communicate()[0]
        nsp.wait()
    finally:
        # pyroute2 requires an explicit release() for every NSPopen created.
        nsp.release()
    return output.decode('ascii').strip()


while True:
    for _item in netns.listnetns():
        # Only the qrouter-* namespaces are of interest.
        if not str(_item).startswith("qrouter"):
            continue

        nsresult_count = _read_ns_file(_item, '/proc/sys/net/netfilter/nf_conntrack_count')
        nsresult_max = _read_ns_file(_item, '/proc/sys/net/netfilter/nf_conntrack_max')
        print(_item + " " + nsresult_count + " " + nsresult_max)

        try:
            count_value = float(nsresult_count)
            max_value = float(nsresult_max)
        except ValueError:
            # cat produced no numeric output for this namespace; skip it
            # instead of letting one namespace stop the whole monitor.
            print("WARNING: no conntrack values for " + str(_item))
            continue

        g_count.labels(hostname, _item).set(count_value)
        g_max.labels(hostname, _item).set(max_value)

    # Publish the freshly collected values once per cycle, then wait.
    push_to_gateway('10.10.110.250:9091', job='cory_test_job2', registry=registry)
    time.sleep(30)
|
||||
|
||||
# print("Done")
|
||||
# # curl -X GET http://10.10.110.250:9091/api/v1/metrics | jq
|
29
pingmon.py
29
pingmon.py
|
@ -1,6 +1,5 @@
|
|||
from termios import TAB3
|
||||
import threading
|
||||
from types import NoneType
|
||||
from prometheus_client import Counter
|
||||
import json
|
||||
from telnetlib import theNULL
|
||||
|
@ -27,7 +26,27 @@ def slack_message(message, channel):
|
|||
assert e.response["error"] # str like 'invalid_auth', 'channel_not_found'
|
||||
|
||||
|
||||
# tests={
|
||||
# "1": {
|
||||
# 'dest': "10.90.1.254",
|
||||
# 'name': 'Runpod GW',
|
||||
# 'packet_loss_rate_permitted': 100,
|
||||
# 'rtt_max_permitted': 5,
|
||||
# },
|
||||
# "2": {
|
||||
# 'dest': "149.36.0.254",
|
||||
# 'name': 'Nexgen Fortigate',
|
||||
# 'packet_loss_rate_permitted': 0,
|
||||
# 'rtt_max_permitted': 5,
|
||||
# },
|
||||
# "3": {
|
||||
# 'dest': "217.17.208.20",
|
||||
# 'name': 'First Hop',
|
||||
# 'packet_loss_rate_permitted': 0,
|
||||
# 'rtt_max_permitted': 20,
|
||||
# },
|
||||
|
||||
# }
|
||||
|
||||
|
||||
tests={
|
||||
|
@ -76,16 +95,16 @@ def pinger(test_id):
|
|||
|
||||
print("Dest: {} Loss: {}% RTT: {}ms".format(dest,packet_loss_rate,rtt_max))
|
||||
if rtt_max>rtt_max_permitted:
|
||||
print("ERROR: rtt_max_permitted exceeded!")
|
||||
error_msg="ERROR: rtt_max_permitted exceeded!"
|
||||
notify=1
|
||||
|
||||
if packet_loss_rate>packet_loss_rate_permitted:
|
||||
print("ERROR: packet_loss_rate_permitted exceeded!")
|
||||
error_msg="ERROR: packet_loss_rate_permitted exceeded!"
|
||||
notify=1
|
||||
|
||||
if notify:
|
||||
issue_data=json.dumps(ping_parser.parse(result).as_dict(), indent=4)
|
||||
slack_message(issue_data,"ng-alerts")
|
||||
slack_message(error_msg + " " + str(tests[test_id]) + issue_data,"ng-alerts")
|
||||
print(issue_data)
|
||||
|
||||
|
||||
|
@ -94,7 +113,7 @@ def pinger(test_id):
|
|||
g_pl = Gauge('packet_loss_rate', 'Amt of packet loss', ['destination_ip'],registry=registry )
|
||||
g_pl.labels(dest).set(packet_loss_rate)
|
||||
|
||||
if not type(rtt_max)==NoneType:
|
||||
if not type(rtt_max)=="NoneType":
|
||||
g_rtt = Gauge('rtt_max', 'Round trip time', ['destination_ip'],registry=registry )
|
||||
g_rtt.labels(dest).set(rtt_max)
|
||||
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
slack
|
||||
pingparsing
|
||||
prometheus_client
|
Loading…
Reference in New Issue