Built a script to watch the conntrack levels inside all qrouter namespaces

This commit is contained in:
Cory Hawkless 2022-10-04 16:01:16 +10:30
parent ee8d75b637
commit 384eab9966
3 changed files with 65 additions and 6 deletions

View File

@ -0,0 +1,37 @@
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway
import sys, random
import subprocess
from pyroute2 import netns, NSPopen
import socket
import time
# Export the conntrack table usage of every "qrouter" network namespace on
# this host to a Prometheus Pushgateway, refreshed every 30 seconds.
registry = CollectorRegistry()
g_count = Gauge('test_nf_conntrack_count', 'nf_conntrack_count inside given namespace', ['host', 'namespace'], registry=registry)
g_max = Gauge('test_nf_conntrack_max', 'nf_conntrack_max inside given namespace', ['host', 'namespace'], registry=registry)
hostname = socket.gethostname()


def _read_in_namespace(namespace, path):
    """Return the stripped ASCII contents of *path* read inside *namespace*.

    Runs ``cat <path>`` through pyroute2's NSPopen so the file is read from
    within the given network namespace.
    """
    nsp = NSPopen(namespace, ['cat', path], stdout=subprocess.PIPE)
    try:
        stdout, _ = nsp.communicate()
        nsp.wait()
    finally:
        # pyroute2 requires an explicit release() for every NSPopen object;
        # doing it in `finally` ensures no object is left behind, even when
        # communicate() fails.
        nsp.release()
    return stdout.decode('ascii').strip()


while True:
    for _item in netns.listnetns():
        # Only the qrouter-* namespaces are of interest.
        if not str(_item).startswith("qrouter"):
            continue
        nsresult_count = _read_in_namespace(_item, '/proc/sys/net/netfilter/nf_conntrack_count')
        nsresult_max = _read_in_namespace(_item, '/proc/sys/net/netfilter/nf_conntrack_max')
        print(_item + " " + nsresult_count + " " + nsresult_max)
        g_count.labels(hostname, _item).set(nsresult_count)
        g_max.labels(hostname, _item).set(nsresult_max)
    # Push right after collecting so the gateway receives fresh values, then
    # wait for the next cycle.
    push_to_gateway('10.10.110.250:9091', job='cory_test_job2', registry=registry)
    time.sleep(30)
# To inspect what the gateway currently holds:
# curl -X GET http://10.10.110.250:9091/api/v1/metrics | jq

View File

@ -1,6 +1,5 @@
from termios import TAB3 from termios import TAB3
import threading import threading
from types import NoneType
from prometheus_client import Counter from prometheus_client import Counter
import json import json
from telnetlib import theNULL from telnetlib import theNULL
@ -27,7 +26,27 @@ def slack_message(message, channel):
assert e.response["error"] # str like 'invalid_auth', 'channel_not_found' assert e.response["error"] # str like 'invalid_auth', 'channel_not_found'
# tests={
# "1": {
# 'dest': "10.90.1.254",
# 'name': 'Runpod GW',
# 'packet_loss_rate_permitted': 100,
# 'rtt_max_permitted': 5,
# },
# "2": {
# 'dest': "149.36.0.254",
# 'name': 'Nexgen Fortigate',
# 'packet_loss_rate_permitted': 0,
# 'rtt_max_permitted': 5,
# },
# "3": {
# 'dest': "217.17.208.20",
# 'name': 'First Hop',
# 'packet_loss_rate_permitted': 0,
# 'rtt_max_permitted': 20,
# },
# }
tests={ tests={
@ -76,16 +95,16 @@ def pinger(test_id):
print("Dest: {} Loss: {}% RTT: {}ms".format(dest,packet_loss_rate,rtt_max)) print("Dest: {} Loss: {}% RTT: {}ms".format(dest,packet_loss_rate,rtt_max))
if rtt_max>rtt_max_permitted: if rtt_max>rtt_max_permitted:
print("ERROR: rtt_max_permitted exceeded!") error_msg="ERROR: rtt_max_permitted exceeded!"
notify=1 notify=1
if packet_loss_rate>packet_loss_rate_permitted: if packet_loss_rate>packet_loss_rate_permitted:
print("ERROR: packet_loss_rate_permitted exceeded!") error_msg="ERROR: packet_loss_rate_permitted exceeded!"
notify=1 notify=1
if notify: if notify:
issue_data=json.dumps(ping_parser.parse(result).as_dict(), indent=4) issue_data=json.dumps(ping_parser.parse(result).as_dict(), indent=4)
slack_message(issue_data,"ng-alerts") slack_message(error_msg + " " + str(tests[test_id]) + issue_data,"ng-alerts")
print(issue_data) print(issue_data)
@ -94,7 +113,7 @@ def pinger(test_id):
g_pl = Gauge('packet_loss_rate', 'Amt of packet loss', ['destination_ip'],registry=registry ) g_pl = Gauge('packet_loss_rate', 'Amt of packet loss', ['destination_ip'],registry=registry )
g_pl.labels(dest).set(packet_loss_rate) g_pl.labels(dest).set(packet_loss_rate)
if not type(rtt_max)==NoneType: if not type(rtt_max)=="NoneType":
g_rtt = Gauge('rtt_max', 'Round trip time', ['destination_ip'],registry=registry ) g_rtt = Gauge('rtt_max', 'Round trip time', ['destination_ip'],registry=registry )
g_rtt.labels(dest).set(rtt_max) g_rtt.labels(dest).set(rtt_max)

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
slack
pingparsing
prometheus_client