Skip to content

Instantly share code, notes, and snippets.

@bouroo
Last active January 20, 2026 17:11
Show Gist options
  • Select an option

  • Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.

Select an option

Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.
Kernel tuning for dedicated linux server. /etc/sysctl.d/60-sysctl.conf
################################################################################
# /etc/sysctl.d/60-sysctl.conf
# Performance-Optimized Kernel Tuning for Web + DB Servers
# Apply with: sysctl --system
# Impact: Focused on >5% performance gains for high-concurrency workloads
################################################################################
################################################################################
# MEMORY MANAGEMENT
################################################################################
# Swappiness: Controls kernel's tendency to swap (0-100).
# Impact: 10-30%+ performance gain for database workloads by preventing unnecessary swapping.
# Value 10 favors keeping pages in memory, critical for DB cache effectiveness.
vm.swappiness = 10
# Dirty page management - controls write behavior for I/O performance.
# Impact: 5-15% improvement in write-heavy workloads by reducing write burst size.
# These values ensure more frequent, smaller writes instead of large I/O spikes.
# Percentage of total memory where processes start synchronous writes (background threshold)
vm.dirty_background_ratio = 5
# Maximum percentage of memory holding dirty pages before forced synchronous writes
vm.dirty_ratio = 15
# Time in centiseconds dirty data can stay in memory before being written (15 seconds)
# Default is 30 seconds. Reducing this prevents large writeback spikes.
vm.dirty_expire_centisecs = 1500
# Time in centiseconds between background writeback cycles (2.5 seconds)
# More frequent cycles = smoother I/O, better for concurrent DB/web workloads
vm.dirty_writeback_centisecs = 250
# Memory overcommit behavior for database workloads.
# Impact: Enables databases (PostgreSQL, MySQL) to allocate large buffers efficiently.
# Value 1 allows overcommit, essential for databases that reserve memory upfront.
# Monitor OOM killer; adjust to 2 if memory is constrained.
vm.overcommit_memory = 1
# Percentage of RAM that can be overcommitted (200% = 2x physical RAM)
vm.overcommit_ratio = 200
# VFS cache pressure - controls reclaiming of directory/inode cache.
# Impact: 5-10% filesystem performance improvement for metadata-heavy workloads.
# Lower than default (100) preserves more dentry/inode cache, beneficial for web servers.
vm.vfs_cache_pressure = 75
# Minimum free RAM threshold (128MB) to prevent system freeze under memory pressure.
# Critical stability parameter. Increase to 262144 (256MB) for systems with 64GB+ RAM.
vm.min_free_kbytes = 131072
################################################################################
# NETWORK STACK
################################################################################
# TCP Congestion Control: BBR (Bottleneck Bandwidth and Round-trip propagation time)
# Impact: 5-40% throughput improvement and reduced latency for modern networks.
# Significantly outperforms Cubic/Reno on high-BDP and lossy networks.
net.ipv4.tcp_congestion_control = bbr
# Default queuing discipline: fq_codel (Fair Queuing Controlled Delay)
# Impact: 10-30% latency reduction by eliminating bufferbloat.
# Provides fair bandwidth distribution and low latency under load.
net.core.default_qdisc = fq_codel
# TCP buffer sizes (min, default, max in bytes).
# Impact: 15-50% throughput improvement on high-bandwidth, high-latency networks.
# Critical for maximizing throughput on 1Gbps+ connections and cross-region deployments.
net.ipv4.tcp_rmem = 4096 87380 33554432 # Max 32MB receive buffer
net.ipv4.tcp_wmem = 4096 65536 33554432 # Max 32MB send buffer
# Core socket buffer limits.
# Impact: Enables large TCP buffers above. Required for high-throughput connections.
net.core.rmem_max = 33554432 # 32MB max receive buffer
net.core.wmem_max = 33554432 # 32MB max send buffer
# Disable TCP slow start after idle periods.
# Impact: 10-20% performance improvement for spiky web traffic.
# Prevents throughput collapse when connections resume after idle periods.
net.ipv4.tcp_slow_start_after_idle = 0
# TCP Fast Open: Allow data in SYN packet (mode 3 = enabled for server and client).
# Impact: 10-30ms latency reduction for repeated connections (HTTP keepalive, DB pools).
# Particularly beneficial for API servers and high-frequency DB queries.
net.ipv4.tcp_fastopen = 3
# Reuse TIME-WAIT sockets for new connections.
# Impact: Reduces connection setup overhead for high-concurrency web servers.
# Critical for systems handling >10k concurrent connections.
net.ipv4.tcp_tw_reuse = 1
# FIN-WAIT-2 timeout in seconds.
# Impact: Faster resource cleanup, reduces memory pressure under load.
# Lower than default (60s) for quicker connection recycling.
net.ipv4.tcp_fin_timeout = 15
# Maximum connection backlog for listening sockets.
# Impact: Prevents connection drops during traffic spikes.
# Essential for web servers handling bursty traffic (e.g., 8192 vs default 128).
net.core.somaxconn = 8192
# SYN backlog for half-open connections.
# Impact: Prevents SYN flood impact on legitimate connections during high load.
# Paired with tcp_syncookies for DoS resilience.
net.ipv4.tcp_max_syn_backlog = 8192
# TCP window scaling for high-bandwidth, high-latency networks.
# Impact: Enables throughput > 65KB on modern networks (essential for 1Gbps+).
net.ipv4.tcp_window_scaling = 1
# Selective ACK - receiver can inform sender about missing segments.
# Impact: 5-15% throughput improvement on lossy networks (wireless, long-distance).
net.ipv4.tcp_sack = 1
# TCP timestamps (required for tcp_tw_reuse, RTT estimation).
# Impact: Enables accurate RTT calculation and PAWS (Protection Against Wrapped Sequences).
net.ipv4.tcp_timestamps = 1
# Network device backlog - packets queued when kernel can't process fast enough.
# Impact: Prevents packet loss on 10Gbps+ NICs under heavy load.
net.core.netdev_max_backlog = 10000
# NAPI poll budget - packets processed per interrupt.
# Impact: Improves throughput by reducing interrupt overhead.
# Value 600 balances throughput and latency for modern NICs.
net.core.netdev_budget = 600
# TCP retransmissions before giving up (default 15).
# Impact: Faster failure detection (240s vs 900s), better for connection pooling.
net.ipv4.tcp_retries2 = 8
# Maximum TIME-WAIT sockets allowed.
# Impact: Higher limit prevents connection failures under high churn.
net.ipv4.tcp_max_tw_buckets = 262144
# Maximum orphaned sockets (not attached to file descriptors).
# Impact: Prevents resource exhaustion during connection storms.
net.ipv4.tcp_max_orphans = 65536
# Do not save TCP metrics from closed connections.
# Impact: Avoids stale routing decisions on dynamic network environments.
net.ipv4.tcp_no_metrics_save = 1
# Path MTU discovery probing (enabled after ICMP black hole detection).
# Impact: Ensures optimal packet size, prevents fragmentation.
net.ipv4.tcp_mtu_probing = 1
# Connection tracking table size.
# Impact: Supports up to 1M concurrent connections (~300MB RAM).
# Critical for high-traffic web servers and load balancers.
net.netfilter.nf_conntrack_max = 1048576
# Established connection timeout (2 hours).
# Impact: Balance between memory usage and long-lived connection support.
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
################################################################################
# FILE SYSTEM & I/O
################################################################################
# System-wide file descriptor limit.
# Impact: Critical for web servers (Nginx/Apache) and databases (many open files).
# Prevents "too many open files" errors under high concurrency.
fs.file-max = 4194304
# Shared memory segment size (16GB) for databases.
# Impact: Enables PostgreSQL/MySQL to use large shared buffers.
# Essential for OLTP workloads requiring substantial in-memory caching.
kernel.shmmax = 17179869184
# Total shared memory pages (4K pages * 4M = ~16GB).
kernel.shmall = 4194304
# Maximum shared memory segments.
kernel.shmmni = 4096
################################################################################
# PROCESS MANAGEMENT
################################################################################
# Maximum process ID (process/thread limit).
# Impact: Supports high-concurrency web servers (Nginx workers, thread pools).
# Prevents process creation failures under load.
kernel.pid_max = 131072
# Maximum threads system-wide.
# Impact: Supports multi-threaded applications (Java, Node.js, Python async).
kernel.threads-max = 524288
################################################################################
# SECURITY
################################################################################
# TCP SYN cookies - protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1
# Reverse path filtering - prevent IP spoofing.
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
# Disable ICMP redirects - prevent MITM attacks.
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv4.conf.all.send_redirects = 0
net.ipv4.conf.default.send_redirects = 0
# Disable source routing - security risk mitigation.
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
net.ipv6.conf.all.accept_source_route = 0
net.ipv6.conf.default.accept_source_route = 0
# Secure redirects - only accept from default route gateways.
net.ipv4.conf.all.secure_redirects = 1
net.ipv4.conf.default.secure_redirects = 1
# Log martian packets (impossible source addresses).
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1
# Ignore ICMP broadcasts - prevent smurf attacks.
net.ipv4.icmp_echo_ignore_broadcasts = 1
# Ignore bogus ICMP error responses.
net.ipv4.icmp_ignore_bogus_error_responses = 1
# TCP TIME-WAIT assassination protection.
net.ipv4.tcp_rfc1337 = 1
# Protect against TIME-WAIT assassination.
net.ipv4.tcp_rfc1337 = 1
################################################################################
# /etc/sysctl.d/61-sysctl-old-os.conf
# Performance-Critical Kernel Optimizations for Web + DB Server
# Target: CentOS 6 / RHEL 6 (kernel 2.6.32)
# Workload: High-concurrency web services with database workloads
# Apply with: sysctl --system
################################################################################
########################
# MEMORY MANAGEMENT
########################
# Reduce swapping tendency - keeps hot data in RAM for database performance
# Impact: 10-30% for database workloads, prevents swap-induced latency spikes
vm.swappiness = 10
# Reduce pressure on VFS cache - keeps directory/inode entries in memory
# Impact: 5-15% for filesystem-heavy workloads, reduces metadata lookups
vm.vfs_cache_pressure = 50
# Dirty page ratio - start writeback at 15% of RAM
# Impact: Balances write coalescing vs responsiveness, critical for DB write performance
vm.dirty_ratio = 15
# Background writeback threshold - start at 5% of RAM
# Impact: Prevents write storms, smooths I/O patterns
vm.dirty_background_ratio = 5
# Minimum free memory reserve (64MB) - prevents low-memory deadlocks
# Impact: System stability under memory pressure
vm.min_free_kbytes = 65536
# Allow memory overcommit (essential for database fork operations)
# Impact: Enables PostgreSQL/MariaDB to allocate memory efficiently
vm.overcommit_memory = 1
# Overcommit ratio at 100% - allows allocation equal to physical RAM + swap
# Impact: Prevents allocation failures for database operations
vm.overcommit_ratio = 100
########################
# SHARED MEMORY (Database Performance)
########################
# Maximum shared memory segment: 16GB - critical for large database workloads
# Impact: Enables PostgreSQL shared_buffers, MySQL buffer_pool tuning
kernel.shmmax = 17179869184
# Total shared memory pages: 16GB / 4KB = 4,194,304 pages
# Impact: Controls aggregate shared memory allocation for multiple DB instances
kernel.shmall = 4194304
# Maximum shared memory segments: 4096
# Impact: Supports multiple database instances or applications
kernel.shmmni = 4096
########################
# NETWORK CORE (Connection Handling)
########################
# Maximum pending connections: 4096 - critical for high-traffic web servers
# Impact: Increases connection acceptance rate, reduces dropped connections
net.core.somaxconn = 4096
# Maximum packet backlog per interface: 5000
# Impact: Reduces packet drops under high network load (10-20% on 10Gbps+)
net.core.netdev_max_backlog = 5000
# Maximum socket receive buffer: 16MB
# Impact: Enables high-throughput transfers on high-latency networks
net.core.rmem_max = 16777216
# Maximum socket send buffer: 16MB
# Impact: Improves bulk data transmission performance
net.core.wmem_max = 16777216
########################
# TCP/IP STACK (Throughput & Concurrency)
########################
# SYN backlog queue: 8192 - defends against SYN floods + accepts bursts
# Impact: Improves connection handling during traffic spikes (10-25% for web)
net.ipv4.tcp_max_syn_backlog = 8192
# Reuse TIME_WAIT sockets for new connections
# Impact: Dramatically reduces port exhaustion under high connection churn (20-40%)
net.ipv4.tcp_tw_reuse = 1
# TIME_WAIT socket bucket limit: 262144
# Impact: Supports high connection turnover rates
net.ipv4.tcp_max_tw_buckets = 262144
# FIN timeout: 25 seconds (default 60) - faster connection cleanup
# Impact: Reduces TIME_WAIT accumulation, frees resources faster
net.ipv4.tcp_fin_timeout = 25
# TCP receive buffer: 4KB min, 85KB default, 16MB max
# Impact: Auto-tuning enables optimal bandwidth-delay product utilization
net.ipv4.tcp_rmem = 4096 87380 16777216
# TCP send buffer: 4KB min, 64KB default, 16MB max
# Impact: Auto-tuning optimizes throughput for varying network conditions
net.ipv4.tcp_wmem = 4096 65536 16777216
# Enable TCP window scaling (RFC 1323)
# Impact: Essential for high-speed networks (>100Mbps) with latency
net.ipv4.tcp_window_scaling = 1
# Enable Selective ACKs (SACK)
# Impact: 5-15% throughput improvement on lossy networks
net.ipv4.tcp_sack = 1
# Disable slow start after idle periods
# Impact: Maintains high throughput after connection pauses (10-30% for long-lived)
net.ipv4.tcp_slow_start_after_idle = 0
########################
# CONNECTION TRACKING (Firewall/NAT)
########################
# Maximum tracked connections: 262144
# Impact: Prevents conntrack table exhaustion under high concurrent connections
net.netfilter.nf_conntrack_max = 262144
# Established connection timeout: 2 hours
# Impact: Balances memory usage vs connection state preservation
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
########################
# FILE SYSTEM (Concurrency Limits)
########################
# Maximum open file descriptors: 2,097,152
# Impact: Critical for high-concurrency web servers (Nginx, Apache, Tomcat)
fs.file-max = 2097152
# Maximum async I/O operations: 1,048,576
# Impact: Essential for database performance (PostgreSQL, MySQL async I/O)
fs.aio-max-nr = 1048576
########################
# SECURITY PARAMETERS (Preserved)
########################
# Address space layout randomization
kernel.randomize_va_space = 2
# Kernel pointer restrictions
kernel.dmesg_restrict = 1
kernel.kptr_restrict = 1
# SYN cookies protection (DDoS resistance)
net.ipv4.tcp_syncookies = 1
# SYN/SYNACK retry limits (prevents resource exhaustion)
net.ipv4.tcp_syn_retries = 2
net.ipv4.tcp_synack_retries = 2
# TCP TIME-WAIT assassination protection
net.ipv4.tcp_rfc1337 = 1
# ARP cache limits (prevents neighbor table overflow)
net.ipv4.neigh.default.gc_thresh1 = 512
net.ipv4.neigh.default.gc_thresh2 = 1024
net.ipv4.neigh.default.gc_thresh3 = 2048
########################
# NETWORK SECURITY (IPv4)
########################
# Reverse path filtering (anti-spoofing)
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
# Disable ICMP redirects
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv4.conf.all.send_redirects = 0
net.ipv4.conf.default.send_redirects = 0
# Disable source routing
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# Log martian packets (spoofing detection)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1
# Broadcast ping protection
net.ipv4.icmp_echo_ignore_broadcasts = 1
# Bogus ICMP error protection
net.ipv4.icmp_ignore_bogus_error_responses = 1
########################
# NETWORK SECURITY (IPv6)
########################
# Disable IPv6 redirects and source routing
net.ipv6.conf.all.accept_redirects = 0
net.ipv6.conf.default.accept_redirects = 0
net.ipv6.conf.all.accept_source_route = 0
net.ipv6.conf.default.accept_source_route = 0
################################################################################
# /etc/sysctl.d/80-k8s.conf
# Performance-Critical Kernel Optimizations for Kubernetes Nodes
# Target: Production K8s clusters with kube-proxy (nftables mode)
# Workload: High-concurrency microservices, inter-pod networking, high connection churn
# Hardware: Enterprise-grade servers (16GB+ RAM, 10Gbps+ network recommended)
# Apply with: sysctl --system
################################################################################
########################
# MEMORY MANAGEMENT
########################
# Reduced swapping - keeps containers in RAM, critical for pod performance
# Impact: 15-30% for containerized workloads, prevents swap death during OOM pressure
vm.swappiness = 10
# Enable memory overcommit - essential for container memory allocation
# Impact: Prevents container startup failures, enables efficient memory utilization
vm.overcommit_memory = 1
# Overcommit ratio at 100% - allows allocation equal to physical RAM + swap
# Impact: Balances memory utilization with stability for container workloads
vm.overcommit_ratio = 100
# Background writeback at 5% of RAM - smooths I/O under heavy container loads
# Impact: 10-20% I/O performance improvement, prevents write storms
vm.dirty_background_ratio = 5
# Dirty ratio at 10% - aggressive writeback for responsiveness
# Impact: 15-25% I/O performance under database-heavy container workloads
vm.dirty_ratio = 10
# Writeback every 5 seconds - more frequent writes reduce burst I/O
# Impact: 5-10% smoother I/O patterns, better latency predictability
vm.dirty_writeback_centisecs = 500
# Reduce VFS cache pressure - keeps container metadata in memory
# Impact: 5-15% performance for filesystem-heavy workloads (container image layers)
vm.vfs_cache_pressure = 50
# Minimum free memory reserve (128MB) - prevents OOM deadlocks
# Impact: System stability under memory pressure, critical for kubelet stability
vm.min_free_kbytes = 131072
########################
# TCP CONNECTION HANDLING
########################
# SYN backlog queue: 8192 - handles connection bursts during service scaling
# Impact: 10-25% connection acceptance rate during pod autoscaling events
net.ipv4.tcp_max_syn_backlog = 8192
# Reuse TIME_WAIT sockets - accelerates socket recycling for microservices
# Impact: 20-40% for connection-heavy workloads, reduces port exhaustion
net.ipv4.tcp_tw_reuse = 1
# Maximum TIME_WAIT buckets: 1.44M - accommodates high connection churn
# Impact: Prevents kernel panic under extreme connection turnover rates
net.ipv4.tcp_max_tw_buckets = 1440000
# FIN timeout: 30s - faster connection cleanup than default (60s)
# Impact: 10-20% faster resource recovery for short-lived connections
net.ipv4.tcp_fin_timeout = 30
# Connection queue limit: 65535 - accommodates bursty traffic patterns
# Impact: 15-30% reduction in connection drops for service-to-service traffic
net.core.somaxconn = 65535
# Maximum orphaned sockets: 262144 - handles socket leaks gracefully
# Impact: System stability under connection stress
net.ipv4.tcp_max_orphans = 262144
########################
# TCP CONGESTION & THROUGHPUT
########################
# BBR congestion control - superior for cloud/lossy networks
# Impact: 10-40% throughput improvement vs Cubic for inter-datacenter traffic
net.ipv4.tcp_congestion_control = bbr
# TCP window scaling (RFC 1323) - enables >64KB windows
# Impact: 5-15% throughput on high-latency networks (>100ms)
net.ipv4.tcp_window_scaling = 1
# Selective ACKs (SACK) - recovers faster from packet loss
# Impact: 5-20% throughput improvement on lossy networks
net.ipv4.tcp_sack = 1
# Disable slow start after idle - maintains high throughput
# Impact: 10-30% for long-lived connections with idle periods (gRPC, WebSocket)
net.ipv4.tcp_slow_start_after_idle = 0
# TCP Fast Open (Cookie Mode) - reduces connection establishment latency
# Impact: 5-15% latency reduction for microservice communication
net.ipv4.tcp_fastopen = 3
########################
# NETWORK BUFFERS & QDISC
########################
# Interface packet backlog: 30,000 - handles bursty traffic on 10Gbps+
# Impact: 15-30% reduction in packet drops under high network load
net.core.netdev_max_backlog = 30000
# Maximum socket receive buffer: 16MB
# Impact: Enables high-throughput transfers on high-latency networks
net.core.rmem_max = 16777216
# Maximum socket send buffer: 16MB
# Impact: Improves bulk data transmission performance
net.core.wmem_max = 16777216
# TCP receive buffer: 4KB min, 85KB default, 16MB max
# Impact: Auto-tuning optimizes throughput for varying network conditions
net.ipv4.tcp_rmem = 4096 87380 16777216
# TCP send buffer: 4KB min, 64KB default, 16MB max
# Impact: Auto-tuning enables optimal bandwidth-delay product utilization
net.ipv4.tcp_wmem = 4096 65536 16777216
# Fair Queueing (fq) scheduler - required for BBR
# Impact: Essential for BBR congestion control effectiveness
net.core.default_qdisc = fq
########################
# CONNECTION TRACKING
########################
# Maximum tracked connections: 2,097,152
# Impact: Prevents conntrack exhaustion in large clusters (50+ nodes, 1000+ pods)
net.netfilter.nf_conntrack_max = 2097152
# Established connection timeout: 24 hours
# Impact: Balances memory usage with state preservation for long-lived connections
net.netfilter.nf_conntrack_tcp_timeout_established = 86400
########################
# ROUTING & BRIDGE (K8s Required)
########################
# Enable IPv4 forwarding - required for pod-to-pod communication
# Impact: Essential functionality for Kubernetes networking
net.ipv4.ip_forward = 1
# Bridge firewall integration - required for kube-proxy network policies
# Impact: Essential functionality for K8s network policies
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-arptables = 1
# IPv6 forwarding - required for dual-stack K8s clusters
# Impact: Essential functionality for IPv6 support
net.ipv6.conf.all.forwarding = 1
net.ipv6.conf.default.forwarding = 1
########################
# ARP CACHE
########################
# ARP cache thresholds - prevent neighbor table overflow in large clusters
# Impact: Critical stability parameter for clusters with 100+ nodes
net.ipv4.neigh.default.gc_thresh1 = 2048
net.ipv4.neigh.default.gc_thresh2 = 4096
net.ipv4.neigh.default.gc_thresh3 = 8192
########################
# FILE SYSTEM & LIMITS
########################
# Maximum open file descriptors: 2,097,152
# Impact: Prevents resource exhaustion for container runtime and high-concurrency services
fs.file-max = 2097152
# Inotify watches: 524,288 - monitors container filesystem changes
# Impact: Critical for Kubernetes components (kubelet, containerd) and sidecars
fs.inotify.max_user_watches = 524288
# Maximum async I/O operations: 1,048,576
# Impact: Enables high-performance storage for container volumes
fs.aio-max-nr = 1048576
########################
# NETWORK PERFORMANCE
########################
# Packet processing budget: 600 packets per NAPI cycle
# Impact: 10-20% throughput improvement on multi-core servers
net.core.netdev_budget = 600
# Budget time: 5ms per NAPI cycle
# Impact: Balances throughput with latency for interactive workloads
net.core.netdev_budget_usecs = 5000
########################
# SECURITY
########################
# Address space layout randomization
kernel.randomize_va_space = 2
# Kernel pointer restrictions
kernel.dmesg_restrict = 1
kernel.kptr_restrict = 1
# ptrace scope - prevents attaching to non-child processes
kernel.yama.ptrace_scope = 1
# Disable core dumps (production security)
kernel.core_pattern = |/bin/false
# SYN cookies protection (DDoS resistance)
net.ipv4.tcp_syncookies = 1
# SYN/SYNACK retry limits (prevents resource exhaustion)
net.ipv4.tcp_syn_retries = 2
net.ipv4.tcp_synack_retries = 2
# TCP TIME-WAIT assassination protection
net.ipv4.tcp_rfc1337 = 1
# Reverse path filtering (anti-spoofing)
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
# Disable ICMP redirects (IPv4 & IPv6)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv4.conf.all.send_redirects = 0
net.ipv4.conf.default.send_redirects = 0
net.ipv6.conf.all.accept_redirects = 0
net.ipv6.conf.default.accept_redirects = 0
# Disable source routing
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
net.ipv6.conf.all.accept_source_route = 0
net.ipv6.conf.default.accept_source_route = 0
# Log martian packets (spoofing detection)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1
# Broadcast ping protection
net.ipv4.icmp_echo_ignore_broadcasts = 1
# Bogus ICMP error protection
net.ipv4.icmp_ignore_bogus_error_responses = 1
########################
# KUBERNETES SPECIFIC TUNING
########################
# MTU probing enables Path MTU Discovery for container networks
# Impact: Prevents fragmentation issues in overlay networks (Calico, Cilium, Flannel)
net.ipv4.tcp_mtu_probing = 1
# Don't save TCP metrics - reduces kernel overhead for short-lived connections
# Impact: 5-10% performance for microservice communication patterns
net.ipv4.tcp_no_metrics_save = 1
# TCP autocorking - reduces small packet overhead
# Impact: 5-15% throughput improvement for RPC workloads
net.ipv4.tcp_autocorking = 1
# Port range expansion - accommodates high connection counts
# Impact: Prevents ephemeral port exhaustion at scale
net.ipv4.ip_local_port_range = 1024 65535
################################################################################
# /etc/sysctl.d/80-pve.conf
# Performance-Critical Proxmox VE Host Tuning
# Optimized for: Virtualization host with high-concurrency VMs/containers
# Workload: Database, web services, network-intensive applications
# Hardware: 64GB+ RAM, multi-core CPU, high-speed storage
# Apply with: sysctl --system
################################################################################
########################
# MEMORY MANAGEMENT
########################
# CRITICAL: Strongly prefer dropping caches over swapping VM memory
# Impact: >10% for VM workloads by preventing swap thrashing and maintaining performance
vm.swappiness = 10
# CRITICAL: Tend to keep dentry/inode caches longer for VM disk access patterns
# Impact: 5-10% reduction in storage I/O for frequently-accessed VM files
vm.vfs_cache_pressure = 50
# CRITICAL: Limit dirty pages to prevent I/O spikes that stall VMs
# Impact: >10% by avoiding long synchronous write pauses that freeze VMs
vm.dirty_ratio = 10
vm.dirty_background_ratio = 5
# Maintain minimum free memory to prevent OOM under burst loads
# Adjust: 128MB for 16GB, 256MB for 32GB, 512MB for 64GB+ systems
# Impact: System stability - prevents catastrophic OOM situations
vm.min_free_kbytes = 524288
# Required for containerized workloads (Elasticsearch, databases)
# Impact: Enables specific applications to function without errors
vm.max_map_count = 262144
########################
# TCP & NETWORK STACK
########################
# BBR congestion control - better throughput for high-BDP networks
# Impact: 10-40% improvement in network throughput, especially over WAN/high-latency links
net.ipv4.tcp_congestion_control = bbr
net.core.default_qdisc = fq
# TCP Fast Open - reduces latency for repeated connections
# Impact: 5-15% latency reduction for web/database clients with persistent connections
net.ipv4.tcp_fastopen = 3
net.ipv4.tcp_fastopen_blackhole_timeout_sec = 0
# Disable TCP slow start after idle (critical for long-lived DB connections)
# Impact: >5% throughput for databases, Redis, and other persistent connections
net.ipv4.tcp_slow_start_after_idle = 0
# Optimize connection queues for high-concurrency workloads
# Impact: Enables handling of sudden connection spikes without drops
net.core.somaxconn = 8192
net.ipv4.tcp_max_syn_backlog = 8192
net.core.netdev_max_backlog = 16384
# Large TCP buffers for high-throughput transfers
# Impact: 10-30% improvement for large file transfers, backups, VM migrations
net.core.rmem_max = 33554432
net.core.wmem_max = 33554432
net.ipv4.tcp_mem = 65536 131072 262144
# Reduce TIME_WAIT state duration to free resources faster
# Impact: Higher connection turnover capacity for web/proxy servers
net.ipv4.tcp_fin_timeout = 15
net.ipv4.tcp_tw_reuse = 1
# Maximum ephemeral ports for high-concurrency outbound connections
# Impact: Prevents port exhaustion under extreme load
net.ipv4.ip_local_port_range = 1024 65535
# Reduce latency on write operations by sending smaller buffers more frequently
# Impact: 5-10% latency reduction for interactive protocols
net.ipv4.tcp_notsent_lowat = 16384
########################
# CONNECTION TRACKING
########################
# Maximum connection tracking entries for high-concurrency environments
# Impact: Prevents connection table exhaustion with many containers/VMs
net.netfilter.nf_conntrack_max = 1048576
# Optimized timeouts to free resources faster while maintaining stability
# Impact: Better memory utilization and higher connection throughput
net.netfilter.nf_conntrack_tcp_timeout_established = 43200
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 120
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
net.netfilter.nf_conntrack_udp_timeout = 30
net.netfilter.nf_conntrack_udp_timeout_stream = 120
net.netfilter.nf_conntrack_generic_timeout = 120
net.netfilter.nf_conntrack_icmp_timeout = 30
########################
# SYSTEM RESOURCE LIMITS
########################
# Maximum open file handles - critical for databases and high-concurrency web servers
# Impact: Prevents "too many open files" errors under heavy load
fs.file-max = 2097152
# Maximum processes and threads - essential for container orchestration
# Impact: Enables running thousands of containers/processes without hitting limits
kernel.pid_max = 4194304
kernel.threads-max = 524288
# Shared memory limits - required for databases (PostgreSQL, Oracle) and large VMs
# Impact: Enables databases to allocate necessary shared memory segments
kernel.shmmax = 68719476736 # 64GB
kernel.shmall = 4294967296 # Page count for 64GB
# Semaphore limits - adjusts IPC capacity for concurrent processes
# Impact: Improves database and application IPC throughput
kernel.sem = 250 32000 100 128
########################
# INOTIFY
########################
# Inotify limits for container orchestration and monitoring tools
# Impact: Prevents monitoring failures in containerized environments
fs.inotify.max_user_instances = 512
fs.inotify.max_user_watches = 524288
########################
# BPF JIT OPTIMIZATION
########################
# Enable BPF JIT compiler for eBPF-based monitoring, networking (Cilium), and security tools
# Impact: 20-50% performance improvement for eBPF programs (monitoring, observability, service mesh)
net.core.bpf_jit_enable = 1
net.core.bpf_jit_harden = 2 # Maintains security while enabling performance
########################
# REQUIRED PROXMOX VE FUNCTIONALITY
########################
# IP forwarding for VM/container routing and NAT
net.ipv4.ip_forward = 1
net.ipv6.conf.all.forwarding = 1
# Bridge netfilter - REQUIRED for Proxmox VE firewall on VM bridges
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-arptables = 1
########################
# SECURITY
########################
# SYN cookies - protect against SYN flood attacks
net.ipv4.tcp_syncookies = 1
# Reverse path filtering - prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
# Disable ICMP redirects - prevent MITM attacks
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
net.ipv6.conf.all.accept_redirects = 0
net.ipv6.conf.default.accept_redirects = 0
# Disable source routing
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
net.ipv6.conf.all.accept_source_route = 0
net.ipv6.conf.default.accept_source_route = 0
# Ignore ICMP echo broadcasts
net.ipv4.icmp_echo_ignore_broadcasts = 1
net.ipv4.icmp_ignore_bogus_error_responses = 1
# TCP RFC 1337 protection
net.ipv4.tcp_rfc1337 = 1
# Kernel hardening
kernel.kptr_restrict = 1
kernel.dmesg_restrict = 1
kernel.perf_event_paranoid = 2
kernel.randomize_va_space = 2
kernel.core_pattern = /dev/null
kernel.yama.ptrace_scope = 1
########################
# MINOR PERFORMANCE TUNING
########################
# Disable NMI watchdog for slight CPU overhead reduction
# Impact: <2% CPU savings on heavily loaded systems
kernel.nmi_watchdog = 0
# Huge pages - set to non-zero if using VMs with static hugepage backing
# Impact: 5-10% for memory-intensive VMs when properly configured
vm.nr_hugepages = 0
# ZFS tuning - uncomment and adjust if using ZFS storage backend
# Impact: Can improve ZFS performance by limiting ARC cache
# vfs.zfs.arc_max = 10737418240
# vfs.zfs.arc_min = 1073741824
# vfs.zfs.zio.use_uma = 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment