Analyze Apache logs to generate usage stats
#!/usr/bin/env python3
"""
Analyze Apache *combined* access logs and export CSVs.

Outputs (each filename is prepended with the --prefix value, default "report"):
    00_summary.csv          # headline metrics
    10_pages.csv            # per-page counts (querystrings stripped, static excluded)
    20_daily_requests.csv   # all requests per day
    21_daily_pageviews.csv  # "pageviews" per day (static excluded)
    30_status_codes.csv     # status code breakdown
    40_referrers.csv        # top referrers (excluding "-")
    50_404_urls.csv         # top 404 paths

Usage:
    python3 analyze_apache.py geneticprivacy.log
    python3 analyze_apache.py geneticprivacy.log --prefix geneticprivacy --ignore-bots
"""
# --------------------------------------------------------------------
# Apache access log locations
#
# SUSE Linux Enterprise Server (SLES):
#   /var/log/apache2/access_log                  (default global log)
#   /var/log/apache2/geneticprivacy-access.log   (per-vhost log, as configured)
#
# Ubuntu / Debian:
#   /var/log/apache2/access.log                  (default global log)
#   /var/log/apache2/other_vhosts_access.log     (per-vhost requests if using vhost_combined)
#
# Rotated logs typically have suffixes like .1, .2.gz, etc.
#   Example: /var/log/apache2/access_log-20250820.gz
#
# To merge them for analysis:
#   zcat -f /var/log/apache2/access_log* > ~/all_access.log
#
# --------------------------------------------------------------------
import argparse, csv, gzip, os, re, sys
from collections import Counter, defaultdict
from datetime import datetime

# Regex for Apache COMBINED format
LOG_RE = re.compile(
    r'(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<ts>[^\]]+)\]\s+'
    r'"(?P<method>\S+)\s+(?P<path>\S+)(?:\s+HTTP/\d\.\d)?"\s+'
    r'(?P<status>\d{3})\s+(?P<size>\S+)\s+'
    r'"(?P<ref>[^"]*)"\s+"(?P<ua>[^"]*)"'
)
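
# Example of a combined-format line LOG_RE is intended to match
# (illustrative values only, not taken from a real log):
#   203.0.113.7 - - [25/Jun/2025:11:21:04 -0500] "GET /about/ HTTP/1.1" 200 5123 "https://example.com/" "Mozilla/5.0"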

# Consider these extensions "static" assets (excluded from pageview counts)
STATIC_EXT = re.compile(r'\.(css|js|png|jpe?g|svg|gif|ico|woff2?|mp4|webm|pdf|txt)$', re.I)

# Simple bot filter (optional, enabled with --ignore-bots)
BOTS_RE = re.compile(r'bot|crawler|spider|fetch|monitor|pingdom|uptime|curl|wget|python-requests', re.I)
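# e.g. a UA such as "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" matches and is dropped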


def parse_ts(ts: str) -> datetime:
    # Example: 25/Jun/2025:11:21:04 -0500
    return datetime.strptime(ts, "%d/%b/%Y:%H:%M:%S %z")


def open_maybe_gz(path):
    return gzip.open(path, 'rt', errors='ignore') if path.endswith('.gz') else open(path, 'rt', errors='ignore')


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('logfile', help='Path to Apache access log (combined). .gz supported.')
    ap.add_argument('--prefix', default='report', help='Prefix for output CSV filenames')
    ap.add_argument('--ignore-bots', action='store_true', help='Exclude obvious bots/crawlers by UA')
    args = ap.parse_args()

    if not os.path.exists(args.logfile):
        print(f"File not found: {args.logfile}", file=sys.stderr)
        sys.exit(1)

    total_requests = 0
    unique_ips = set()
    unique_ip_ua = set()

    # Aggregates
    pages = Counter()            # path -> count (static excluded, query stripped)
    daily_requests = Counter()   # YYYY-MM-DD -> count (all)
    daily_pageviews = Counter()  # YYYY-MM-DD -> count (static excluded)
    status_counts = Counter()    # status -> count
    referrers = Counter()        # referrer -> count
    not_found = Counter()        # 404 path -> count

    first_dt = None
    last_dt = None

    with open_maybe_gz(args.logfile) as f:
        for line in f:
            m = LOG_RE.search(line)
            if not m:
                continue

            ip = m.group('ip')
            ts = m.group('ts')
            method = m.group('method')
            path = m.group('path')
            status = int(m.group('status'))
            size = m.group('size')
            ref = m.group('ref') or '-'
            ua = m.group('ua') or '-'

            # Optional bot filter
            if args.ignore_bots and (BOTS_RE.search(ua) or BOTS_RE.search(ref)):
                continue

            # Timestamp parsing
            try:
                dt = parse_ts(ts)
            except Exception:
                continue
            day = dt.strftime('%Y-%m-%d')

            # Update first/last seen
            if first_dt is None or dt < first_dt:
                first_dt = dt
            if last_dt is None or dt > last_dt:
                last_dt = dt

            total_requests += 1
            unique_ips.add(ip)
            unique_ip_ua.add(f"{ip}|{ua}")

            # Daily all-requests
            daily_requests[day] += 1

            # Status
            status_counts[status] += 1

            # Path normalization for pageviews
            clean_path = path.split('?', 1)[0]
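            # e.g. '/search?q=example' is counted as '/search' (illustrative path)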
            if not STATIC_EXT.search(clean_path):
                pages[clean_path] += 1
                daily_pageviews[day] += 1

            # 404 details
            if status == 404:
                not_found[clean_path] += 1

            # Referrer
            if ref != '-':
                referrers[ref] += 1

    # Write CSV helpers
    def write_pairs(filename, rows, header=('key', 'count')):
        with open(filename, 'w', newline='') as out:
            w = csv.writer(out)
            w.writerow(header)
            for k, v in rows:
                w.writerow([k, v])

    # 00_summary.csv
    summary_path = f"{args.prefix}-00_summary.csv"
    with open(summary_path, 'w', newline='') as out:
        w = csv.writer(out)
        w.writerow(['metric', 'value'])
        w.writerow(['date_range_start', first_dt.isoformat() if first_dt else ''])
        w.writerow(['date_range_end', last_dt.isoformat() if last_dt else ''])
        w.writerow(['total_requests', total_requests])
        w.writerow(['unique_ips', len(unique_ips)])
        w.writerow(['unique_ip_useragent', len(unique_ip_ua)])
        w.writerow(['pageviews_excluding_static', sum(pages.values())])
        # Status buckets
        w.writerow(['status_2xx', sum(c for s, c in status_counts.items() if 200 <= s <= 299)])
        w.writerow(['status_3xx', sum(c for s, c in status_counts.items() if 300 <= s <= 399)])
        w.writerow(['status_4xx', sum(c for s, c in status_counts.items() if 400 <= s <= 499)])
        w.writerow(['status_5xx', sum(c for s, c in status_counts.items() if 500 <= s <= 599)])

    # 10_pages.csv
    write_pairs(f"{args.prefix}-10_pages.csv", pages.most_common(), header=('path', 'count'))

    # 20_daily_requests.csv
    write_pairs(f"{args.prefix}-20_daily_requests.csv",
                sorted(daily_requests.items(), key=lambda kv: kv[0]),
                header=('date', 'count'))

    # 21_daily_pageviews.csv
    write_pairs(f"{args.prefix}-21_daily_pageviews.csv",
                sorted(daily_pageviews.items(), key=lambda kv: kv[0]),
                header=('date', 'count'))

    # 30_status_codes.csv
    write_pairs(f"{args.prefix}-30_status_codes.csv",
                sorted(status_counts.items(), key=lambda kv: kv[0]),
                header=('status', 'count'))

    # 40_referrers.csv
    write_pairs(f"{args.prefix}-40_referrers.csv", referrers.most_common(), header=('referrer', 'count'))

    # 50_404_urls.csv
    write_pairs(f"{args.prefix}-50_404_urls.csv", not_found.most_common(), header=('path', 'count'))

    print(f"Done. CSVs written with prefix: {args.prefix}-*.csv")


if __name__ == '__main__':
    main()