@mbutler
Created August 27, 2025 14:10
Analyze Apache logs to generate usage stats
#!/usr/bin/env python3
"""
Analyze Apache *combined* access logs and export CSVs.
Outputs:
00_summary.csv # headline metrics
10_pages.csv # per-page counts (querystrings stripped, static excluded)
20_daily_requests.csv # all requests per day
21_daily_pageviews.csv # "pageviews" per day (static excluded)
30_status_codes.csv # status code breakdown
40_referrers.csv # top referrers (excluding "-")
50_404_urls.csv # top 404 paths
Usage:
python3 analyze_apache.py geneticprivacy.log
python3 analyze_apache.py geneticprivacy.log --prefix geneticprivacy --ignore-bots
"""
# --------------------------------------------------------------------
# Apache access log locations
#
# SUSE Linux Enterprise Server (SLES):
#   /var/log/apache2/access_log                  (default global log)
#   /var/log/apache2/geneticprivacy-access.log   (per-vhost log, as configured)
#
# Ubuntu / Debian:
#   /var/log/apache2/access.log                  (default global log)
#   /var/log/apache2/other_vhosts_access.log     (per-vhost requests if using vhost_combined)
#
# Rotated logs typically have suffixes like .1, .2.gz etc.
#   Example: /var/log/apache2/access_log-20250820.gz
#
# To merge them for analysis:
#   zcat -f /var/log/apache2/access_log* > ~/all_access.log
#
# --------------------------------------------------------------------
import argparse, csv, gzip, os, re, sys
from collections import Counter, defaultdict
from datetime import datetime
# Regex for Apache COMBINED format
LOG_RE = re.compile(
    r'(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<ts>[^\]]+)\]\s+'
    r'"(?P<method>\S+)\s+(?P<path>\S+)(?:\s+HTTP/\d\.\d)?"\s+'
    r'(?P<status>\d{3})\s+(?P<size>\S+)\s+'
    r'"(?P<ref>[^"]*)"\s+"(?P<ua>[^"]*)"'
)
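# Example of a line this matches (the classic combined-format sample from the Apache docs):
#   127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"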
# Consider these "static" (excluded from pageview counts)
STATIC_EXT = re.compile(r'\.(css|js|png|jpe?g|svg|gif|ico|woff2?|mp4|webm|pdf|txt)$', re.I)
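# e.g. '/assets/site.css', '/img/logo.png', or '/files/report.pdf' are not counted as pageviews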
# Simple bot filter (optional)
BOTS_RE = re.compile(r'bot|crawler|spider|fetch|monitor|pingdom|uptime|curl|wget|python-requests', re.I)
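# Matched case-insensitively against the user agent and referrer when --ignore-bots is set;
# catches e.g. 'Googlebot', 'bingbot', 'python-requests/2.x', 'curl/8.x'.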
def parse_ts(ts: str) -> datetime:
    # Example: 25/Jun/2025:11:21:04 -0500
    return datetime.strptime(ts, "%d/%b/%Y:%H:%M:%S %z")

def open_maybe_gz(path):
    return gzip.open(path, 'rt', errors='ignore') if path.endswith('.gz') else open(path, 'rt', errors='ignore')
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('logfile', help='Path to Apache access log (combined). .gz supported.')
    ap.add_argument('--prefix', default='report', help='Prefix for output CSV filenames')
    ap.add_argument('--ignore-bots', action='store_true', help='Exclude obvious bots/crawlers by UA')
    args = ap.parse_args()

    if not os.path.exists(args.logfile):
        print(f"File not found: {args.logfile}", file=sys.stderr)
        sys.exit(1)

    total_requests = 0
    unique_ips = set()
    unique_ip_ua = set()

    # Aggregates
    pages = Counter()            # path -> count (static excluded, query stripped)
    daily_requests = Counter()   # YYYY-MM-DD -> count (all)
    daily_pageviews = Counter()  # YYYY-MM-DD -> count (static excluded)
    status_counts = Counter()    # status -> count
    referrers = Counter()        # referrer -> count
    not_found = Counter()        # 404 path -> count

    first_dt = None
    last_dt = None
    with open_maybe_gz(args.logfile) as f:
        for line in f:
            m = LOG_RE.search(line)
            if not m:
                continue

            ip = m.group('ip')
            ts = m.group('ts')
            method = m.group('method')
            path = m.group('path')
            status = int(m.group('status'))
            size = m.group('size')
            ref = m.group('ref') or '-'
            ua = m.group('ua') or '-'

            # Optional bot filter
            if args.ignore_bots and (BOTS_RE.search(ua) or BOTS_RE.search(ref)):
                continue

            # Timestamp parsing
            try:
                dt = parse_ts(ts)
            except Exception:
                continue
            day = dt.strftime('%Y-%m-%d')

            # Update first/last seen
            if first_dt is None or dt < first_dt:
                first_dt = dt
            if last_dt is None or dt > last_dt:
                last_dt = dt

            total_requests += 1
            unique_ips.add(ip)
            unique_ip_ua.add(f"{ip}|{ua}")

            # Daily all-requests
            daily_requests[day] += 1

            # Status
            status_counts[status] += 1

            # Path normalization for pageviews
            clean_path = path.split('?', 1)[0]
            if not STATIC_EXT.search(clean_path):
                pages[clean_path] += 1
                daily_pageviews[day] += 1

            # 404 details
            if status == 404:
                not_found[clean_path] += 1

            # Referrer
            if ref != '-':
                referrers[ref] += 1
    # Write CSV helpers
    def write_pairs(filename, rows, header=('key', 'count')):
        with open(filename, 'w', newline='') as out:
            w = csv.writer(out)
            w.writerow(header)
            for k, v in rows:
                w.writerow([k, v])
    # 00_summary.csv
    summary_path = f"{args.prefix}-00_summary.csv"
    with open(summary_path, 'w', newline='') as out:
        w = csv.writer(out)
        w.writerow(['metric', 'value'])
        w.writerow(['date_range_start', first_dt.isoformat() if first_dt else ''])
        w.writerow(['date_range_end', last_dt.isoformat() if last_dt else ''])
        w.writerow(['total_requests', total_requests])
        w.writerow(['unique_ips', len(unique_ips)])
        w.writerow(['unique_ip_useragent', len(unique_ip_ua)])
        w.writerow(['pageviews_excluding_static', sum(pages.values())])
        # Status buckets
        w.writerow(['status_2xx', sum(c for s, c in status_counts.items() if 200 <= s <= 299)])
        w.writerow(['status_3xx', sum(c for s, c in status_counts.items() if 300 <= s <= 399)])
        w.writerow(['status_4xx', sum(c for s, c in status_counts.items() if 400 <= s <= 499)])
        w.writerow(['status_5xx', sum(c for s, c in status_counts.items() if 500 <= s <= 599)])
    # 10_pages.csv
    write_pairs(f"{args.prefix}-10_pages.csv", pages.most_common(), header=('path', 'count'))

    # 20_daily_requests.csv
    write_pairs(f"{args.prefix}-20_daily_requests.csv",
                sorted(daily_requests.items(), key=lambda kv: kv[0]),
                header=('date', 'count'))

    # 21_daily_pageviews.csv
    write_pairs(f"{args.prefix}-21_daily_pageviews.csv",
                sorted(daily_pageviews.items(), key=lambda kv: kv[0]),
                header=('date', 'count'))

    # 30_status_codes.csv
    write_pairs(f"{args.prefix}-30_status_codes.csv",
                sorted(status_counts.items(), key=lambda kv: kv[0]),
                header=('status', 'count'))

    # 40_referrers.csv
    write_pairs(f"{args.prefix}-40_referrers.csv", referrers.most_common(), header=('referrer', 'count'))

    # 50_404_urls.csv
    write_pairs(f"{args.prefix}-50_404_urls.csv", not_found.most_common(), header=('path', 'count'))

    print(f"Done. CSVs written with prefix: {args.prefix}-*.csv")
if __name__ == '__main__':
    main()
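A quick way to sanity-check the output (a minimal sketch, separate from the script above; it assumes the default --prefix of "report", so the file is report-21_daily_pageviews.csv with columns date,count):

#!/usr/bin/env python3
"""Print the five busiest days from the daily-pageviews CSV produced by analyze_apache.py."""
import csv

with open("report-21_daily_pageviews.csv", newline="") as f:
    rows = list(csv.DictReader(f))  # each row: {'date': 'YYYY-MM-DD', 'count': '123'}

rows.sort(key=lambda r: int(r["count"]), reverse=True)
for r in rows[:5]:
    print(r["date"], r["count"])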