Last active
March 11, 2026 19:41
-
-
Save oddmario/bf1058f0083e237d8a896e3c300c9285 to your computer and use it in GitHub Desktop.
net_drops is a Python script that helps you monitor RX, TX, and qdisc packet drops on Linux systems in real time.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # https://gist.github.com/oddmario/bf1058f0083e237d8a896e3c300c9285 | |
| import os | |
| import time | |
| import subprocess | |
| import re | |
| import argparse | |
| import sys | |
def get_sysfs_value(iface, metric):
    """Return one interface counter from sysfs as an integer.

    The value is read from ``/sys/class/net/<iface>/statistics/<metric>``.
    If the file cannot be opened or read, or its contents are not a valid
    integer, ``0`` is returned instead of raising.
    """
    counter_file = f"/sys/class/net/{iface}/statistics/{metric}"

    # Step 1: fetch the raw text; any I/O or decoding problem means "no data".
    try:
        with open(counter_file, 'r') as handle:
            raw_text = handle.read()
    except (IOError, ValueError):
        return 0

    # Step 2: parse it; malformed contents are treated the same way.
    try:
        return int(raw_text.strip())
    except ValueError:
        return 0
def get_duplex(iface):
    """Return the contents of ``/sys/class/net/<iface>/duplex``.

    Surrounding whitespace is stripped. An empty string is returned when
    the file cannot be opened or read.
    """
    duplex_file = f"/sys/class/net/{iface}/duplex"
    try:
        with open(duplex_file, 'r') as handle:
            contents = handle.read()
    except (IOError, ValueError):
        return ""
    return contents.strip()
def get_qdisc_drops(iface):
    """Sum the ``dropped <n>`` counters printed by ``tc -s qdisc show dev <iface>``.

    Every occurrence of ``dropped`` followed by a number in the command's
    standard output is added together and the total is returned.

    Returns ``0`` when ``tc`` exits with a non-zero status or when anything
    goes wrong while running it or parsing its output (best effort: the
    caller never sees an exception).

    NOTE(review): all matching counters are summed. If a parent qdisc
    (e.g. a multi-queue root) already reports the aggregate of its
    children, this total may count the same drops more than once --
    confirm against real ``tc`` output before relying on absolute values.
    """
    command = ['tc', '-s', 'qdisc', 'show', 'dev', iface]
    try:
        completed = subprocess.run(command, capture_output=True, text=True)
        if completed.returncode != 0:
            return 0
        # One captured group per "dropped N" occurrence in the output.
        return sum(
            int(count)
            for count in re.findall(r'dropped\s+(\d+)', completed.stdout)
        )
    except Exception:
        return 0
def get_interface_stats(iface):
    """Take one snapshot of the counters tracked for *iface*.

    Returns a dict with the keys ``rx_packets``, ``rx_dropped``,
    ``tx_packets`` and ``tx_dropped`` (read via ``get_sysfs_value``) plus
    ``qdisc_dropped`` (read via ``get_qdisc_drops``).
    """
    sysfs_metrics = ('rx_packets', 'rx_dropped', 'tx_packets', 'tx_dropped')
    snapshot = {
        metric: get_sysfs_value(iface, metric) for metric in sysfs_metrics
    }
    snapshot['qdisc_dropped'] = get_qdisc_drops(iface)
    return snapshot
def calculate_rate(dropped_count, good_packet_count):
    """Return the drop percentage for a sample.

    The percentage is ``dropped / (good + dropped) * 100``, i.e. the total
    number of events is the good packets plus the dropped packets.

    Args:
        dropped_count: Number of dropped packets in the sample.
        good_packet_count: Number of successfully handled packets.

    Returns:
        A float in the range ``0.0``-``100.0``. ``0.0`` is returned when
        there were no events at all, and also when either input is
        negative.
    """
    # A negative count is not a real measurement: it shows up when a delta
    # is taken across a counter reset or after a read that fell back to 0.
    # Report "no drops" rather than a negative or >100% percentage.
    if dropped_count < 0 or good_packet_count < 0:
        return 0.0

    total_events = good_packet_count + dropped_count
    if total_events == 0:
        return 0.0
    return (dropped_count / total_events) * 100
def _positive_interval(text):
    """argparse ``type=`` callable: parse a finite, strictly positive number of seconds."""
    try:
        seconds = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid interval: {text!r}") from None
    # The chained comparison also rejects NaN (all comparisons are False).
    if not 0 < seconds < float("inf"):
        raise argparse.ArgumentTypeError(
            f"interval must be a positive number of seconds, got {text!r}"
        )
    # Keep whole numbers as int so "-i 2" is still displayed as "2s".
    return int(seconds) if seconds.is_integer() else seconds


def _print_recommendations():
    """Print the static tuning advice shown once before monitoring starts."""
    print("Recommendations:")
    print("- If you see a high TX drop rate, consider increasing the txqueuelen of your interface (although that's not recommended)")
    print("- If you see a high RX drop rate, consider increasing your kernel net.core.netdev_max_backlog.")
    print("")
    print("""- To monitor the drops happening because of the netdev_max_backlog, type: awk '{printf "CPU %d: Processed: %d, Dropped: %d, Time Squeezed: %d\\n", NR-1, strtonum("0x"$1), strtonum("0x"$2), strtonum("0x"$3)}' /proc/net/softnet_stat""")
    print("* If Dropped is 0 across all cores, your current netdev_max_backlog size is perfectly fine. You are not losing any packets at this layer.")
    print("* If Time Squeezed is rising rapidly but drops are 0, your CPU is struggling to keep up, but your backlog is large enough to absorb the delay (for now).")
    print("* If Dropped is rising, your backlog is actively overflowing.")
    print("")
    print("- If you see a high qdisc drop rate, consider increasing your qdisc's max packets queue limit (note that for some qdiscs, it's normal to have a few drops every now and then because such qdiscs intentionally drop packets to work properly; as long as the drops aren't huge like thousands per second; see https://community.fortinet.com/t5/FortiGate/Troubleshooting-Tip-How-to-check-for-dropped-packets-in-Qdisc/ta-p/391143). It's not recommended to increase the qdisc limits because this causes bufferbloat.")
    print("You can check if you're hitting your qdisc max packets queue size by running `watch -n 0.1 \"tc -s qdisc show dev [iface name] | grep backlog\"`")
    print("The output will be like 'backlog 9084b 2p requeues 0'. This means that there are currently 2 packets in the qdisc queue")
    print("If you're using the fq qdisc: If you're seeing qdisc drops but your currently-queued fq packets aren't exceeding your fq queue size limit, then maybe the drops are happening because the flow_limit of certain connections is being hit. To check whether the fq qdisc drops are because of the flow_limit or the global limit, type `tc -s qdisc show dev <iface_name>` and look for the flows_plimit value. The flows_plimit value shows the count of drops that happened because some flows exceeded their flow_limit. To get the global drops count, subtract the flows_plimit from the total qdisc drops count.")
    print("")
    print("- If increasing the txqueuelen, qdisc queue size and/or the netdev_max_backlog doesn't solve your packet drops, consider increasing your NIC's RX/TX ring buffer sizes using ethtool.")
    print("- A 'high drop rate' is something >= 0.5% or 1.0% (or a continuously increasing drop rate every few seconds/milliseconds)")
    print("- If none of the above solve your problems, you may be running out of bandwidth or you need a better NIC (or maybe there's a [temporary] problem with your ISP/hosting provider's network).")
    print("- You can also try to increase the kernel's TCP and UDP buffer sizes to absorb any excessive packet drops.")
    print("- Make sure NOT to increase any queues/buffer sizes by crazy amounts to avoid causing bufferbloating on your network (especially the TX queues/buffer sizes).")
    print("")


def _format_rate_pair(rate_now, rate_tot):
    """Render an interval/total percentage pair such as ``0.05% / 1.20%``."""
    return f"{rate_now:.2f}% / {rate_tot:.2f}%"


def main():
    """Command-line entry point: print live and cumulative drop rates for one interface.

    Parses the interface name and the ``-i/--interval`` option, exits with
    status 1 if the interface has no ``/sys/class/net`` entry, warns when
    the reported duplex is not ``full``, prints the tuning recommendations,
    and then prints one table row per interval until interrupted with
    Ctrl+C.
    """
    parser = argparse.ArgumentParser(description="Monitor Network Drop Rates (Live vs Cumulative)")
    parser.add_argument("interface", help="The network interface (e.g., eth0)")
    # Validated up front: a negative value would make time.sleep() raise and
    # zero would spin without pausing. Fractional seconds are accepted.
    parser.add_argument("-i", "--interval", type=_positive_interval, default=1, help="Update interval in seconds")
    args = parser.parse_args()
    iface = args.interface

    if not os.path.exists(f"/sys/class/net/{iface}"):
        # Diagnostics go to stderr so they do not mix with the table output.
        print(f"Error: Interface '{iface}' not found.", file=sys.stderr)
        sys.exit(1)

    duplex = get_duplex(iface)
    if duplex != "full":
        print(f"WARNING: The current duplex of your interface is '{duplex}'. The duplex should be 'full' on wired bare-metal servers to ensure that no packet loss will happen. Consider checking if the auto-negotiation behavior of your network is faulty.")
        print("The duplex is usually 'unknown' or unset on VMs using virtio-net and also on wireless interfaces.")
        print("")

    _print_recommendations()

    print(f"Monitoring {iface} (Interval: {args.interval}s)")
    print("Rates displayed as: Current Interval / Total Since Boot")
    print("-" * 85)

    # Header formatting
    headers = f"{'Time':<10} | {'RX Drop % (Now/Tot)':<22} | {'TX Drop % (Now/Tot)':<22} | {'Qdisc % (Now/Tot)':<22}"
    print(headers)
    print("-" * 85)

    try:
        # Baseline snapshot for the first delta calculation.
        prev_stats = get_interface_stats(iface)

        while True:
            time.sleep(args.interval)
            curr_stats = get_interface_stats(iface)

            # --- 1. Deltas: traffic seen during the last interval ---
            d_rx_pkts = curr_stats['rx_packets'] - prev_stats['rx_packets']
            d_rx_drop = curr_stats['rx_dropped'] - prev_stats['rx_dropped']
            d_tx_pkts = curr_stats['tx_packets'] - prev_stats['tx_packets']
            d_tx_drop = curr_stats['tx_dropped'] - prev_stats['tx_dropped']
            d_qdisc_drop = curr_stats['qdisc_dropped'] - prev_stats['qdisc_dropped']

            rx_rate_now = calculate_rate(d_rx_drop, d_rx_pkts)
            tx_rate_now = calculate_rate(d_tx_drop, d_tx_pkts)
            # Qdisc drops are compared against TX packets (qdisc is usually on the TX path).
            qdisc_rate_now = calculate_rate(d_qdisc_drop, d_tx_pkts)

            # --- 2. Totals: computed from the raw cumulative counters ---
            rx_rate_tot = calculate_rate(curr_stats['rx_dropped'], curr_stats['rx_packets'])
            tx_rate_tot = calculate_rate(curr_stats['tx_dropped'], curr_stats['tx_packets'])
            qdisc_rate_tot = calculate_rate(curr_stats['qdisc_dropped'], curr_stats['tx_packets'])

            # --- 3. Output one table row ---
            rx_str = _format_rate_pair(rx_rate_now, rx_rate_tot)
            tx_str = _format_rate_pair(tx_rate_now, tx_rate_tot)
            qd_str = _format_rate_pair(qdisc_rate_now, qdisc_rate_tot)
            timestamp = time.strftime("%H:%M:%S")
            print(f"{timestamp:<10} | {rx_str:<22} | {tx_str:<22} | {qd_str:<22}")

            # The current snapshot becomes the baseline for the next row.
            prev_stats = curr_stats
    except KeyboardInterrupt:
        print("\nStopping calculation.")
# Start the monitor only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment