Analyze Apache logs to generate usage stats
#!/usr/bin/env python3
"""
Analyze Apache *combined* access logs and export CSVs.

Outputs (each filename is prepended with the --prefix value, default "report"):
    00_summary.csv          # headline metrics
    10_pages.csv            # per-page counts (querystrings stripped, static excluded)
    20_daily_requests.csv   # all requests per day
    21_daily_pageviews.csv  # "pageviews" per day (static excluded)
    30_status_codes.csv     # status code breakdown
    40_referrers.csv        # top referrers (excluding "-")
    50_404_urls.csv         # top 404 paths

Usage:
    python3 analyze_apache.py geneticprivacy.log
    python3 analyze_apache.py geneticprivacy.log --prefix geneticprivacy --ignore-bots
"""
# --------------------------------------------------------------------
# Apache access log locations
#
# SUSE Linux Enterprise Server (SLES):
#   /var/log/apache2/access_log                  (default global log)
#   /var/log/apache2/geneticprivacy-access.log   (per-vhost log, as configured)
#
# Ubuntu / Debian:
#   /var/log/apache2/access.log                  (default global log)
#   /var/log/apache2/other_vhosts_access.log     (per-vhost requests if using vhost_combined)
#
# Rotated logs typically have suffixes like .1, .2.gz, etc.
#   Example: /var/log/apache2/access_log-20250820.gz
#
# To merge them for analysis:
#   zcat -f /var/log/apache2/access_log* > ~/all_access.log
#
# --------------------------------------------------------------------
import argparse, csv, gzip, os, re, sys
from collections import Counter, defaultdict
from datetime import datetime

# Regex for Apache COMBINED format
LOG_RE = re.compile(
    r'(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<ts>[^\]]+)\]\s+'
    r'"(?P<method>\S+)\s+(?P<path>\S+)(?:\s+HTTP/\d\.\d)?"\s+'
    r'(?P<status>\d{3})\s+(?P<size>\S+)\s+'
    r'"(?P<ref>[^"]*)"\s+"(?P<ua>[^"]*)"'
)
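
# Example of a combined-format line LOG_RE is intended to match
# (illustrative values only, not taken from a real log):
#   203.0.113.7 - - [25/Jun/2025:11:21:04 -0500] "GET /about/ HTTP/1.1" 200 5123 "https://example.com/" "Mozilla/5.0"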

# Consider these extensions "static" assets (excluded from pageview counts)
STATIC_EXT = re.compile(r'\.(css|js|png|jpe?g|svg|gif|ico|woff2?|mp4|webm|pdf|txt)$', re.I)

# Simple bot filter (optional, enabled with --ignore-bots)
BOTS_RE = re.compile(r'bot|crawler|spider|fetch|monitor|pingdom|uptime|curl|wget|python-requests', re.I)
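# e.g. a UA such as "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" matches and is dropped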


def parse_ts(ts: str) -> datetime:
    # Example: 25/Jun/2025:11:21:04 -0500
    return datetime.strptime(ts, "%d/%b/%Y:%H:%M:%S %z")


def open_maybe_gz(path):
    return gzip.open(path, 'rt', errors='ignore') if path.endswith('.gz') else open(path, 'rt', errors='ignore')


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('logfile', help='Path to Apache access log (combined). .gz supported.')
    ap.add_argument('--prefix', default='report', help='Prefix for output CSV filenames')
    ap.add_argument('--ignore-bots', action='store_true', help='Exclude obvious bots/crawlers by UA')
    args = ap.parse_args()

    if not os.path.exists(args.logfile):
        print(f"File not found: {args.logfile}", file=sys.stderr)
        sys.exit(1)

    total_requests = 0
    unique_ips = set()
    unique_ip_ua = set()

    # Aggregates
    pages = Counter()            # path -> count (static excluded, query stripped)
    daily_requests = Counter()   # YYYY-MM-DD -> count (all)
    daily_pageviews = Counter()  # YYYY-MM-DD -> count (static excluded)
    status_counts = Counter()    # status -> count
    referrers = Counter()        # referrer -> count
    not_found = Counter()        # 404 path -> count

    first_dt = None
    last_dt = None

    with open_maybe_gz(args.logfile) as f:
        for line in f:
            m = LOG_RE.search(line)
            if not m:
                continue

            ip = m.group('ip')
            ts = m.group('ts')
            method = m.group('method')
            path = m.group('path')
            status = int(m.group('status'))
            size = m.group('size')
            ref = m.group('ref') or '-'
            ua = m.group('ua') or '-'

            # Optional bot filter
            if args.ignore_bots and (BOTS_RE.search(ua) or BOTS_RE.search(ref)):
                continue

            # Timestamp parsing
            try:
                dt = parse_ts(ts)
            except Exception:
                continue
            day = dt.strftime('%Y-%m-%d')

            # Update first/last seen
            if first_dt is None or dt < first_dt:
                first_dt = dt
            if last_dt is None or dt > last_dt:
                last_dt = dt

            total_requests += 1
            unique_ips.add(ip)
            unique_ip_ua.add(f"{ip}|{ua}")

            # Daily all-requests
            daily_requests[day] += 1

            # Status
            status_counts[status] += 1

            # Path normalization for pageviews
            clean_path = path.split('?', 1)[0]
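            # e.g. '/search?q=example' is counted as '/search' (illustrative path)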
            if not STATIC_EXT.search(clean_path):
                pages[clean_path] += 1
                daily_pageviews[day] += 1

            # 404 details
            if status == 404:
                not_found[clean_path] += 1

            # Referrer
            if ref != '-':
                referrers[ref] += 1

    # Write CSV helpers
    def write_pairs(filename, rows, header=('key', 'count')):
        with open(filename, 'w', newline='') as out:
            w = csv.writer(out)
            w.writerow(header)
            for k, v in rows:
                w.writerow([k, v])

    # 00_summary.csv
    summary_path = f"{args.prefix}-00_summary.csv"
    with open(summary_path, 'w', newline='') as out:
        w = csv.writer(out)
        w.writerow(['metric', 'value'])
        w.writerow(['date_range_start', first_dt.isoformat() if first_dt else ''])
        w.writerow(['date_range_end', last_dt.isoformat() if last_dt else ''])
        w.writerow(['total_requests', total_requests])
        w.writerow(['unique_ips', len(unique_ips)])
        w.writerow(['unique_ip_useragent', len(unique_ip_ua)])
        w.writerow(['pageviews_excluding_static', sum(pages.values())])
        # Status buckets
        w.writerow(['status_2xx', sum(c for s, c in status_counts.items() if 200 <= s <= 299)])
        w.writerow(['status_3xx', sum(c for s, c in status_counts.items() if 300 <= s <= 399)])
        w.writerow(['status_4xx', sum(c for s, c in status_counts.items() if 400 <= s <= 499)])
        w.writerow(['status_5xx', sum(c for s, c in status_counts.items() if 500 <= s <= 599)])

    # 10_pages.csv
    write_pairs(f"{args.prefix}-10_pages.csv", pages.most_common(), header=('path', 'count'))

    # 20_daily_requests.csv
    write_pairs(f"{args.prefix}-20_daily_requests.csv",
                sorted(daily_requests.items(), key=lambda kv: kv[0]),
                header=('date', 'count'))

    # 21_daily_pageviews.csv
    write_pairs(f"{args.prefix}-21_daily_pageviews.csv",
                sorted(daily_pageviews.items(), key=lambda kv: kv[0]),
                header=('date', 'count'))

    # 30_status_codes.csv
    write_pairs(f"{args.prefix}-30_status_codes.csv",
                sorted(status_counts.items(), key=lambda kv: kv[0]),
                header=('status', 'count'))

    # 40_referrers.csv
    write_pairs(f"{args.prefix}-40_referrers.csv", referrers.most_common(), header=('referrer', 'count'))

    # 50_404_urls.csv
    write_pairs(f"{args.prefix}-50_404_urls.csv", not_found.most_common(), header=('path', 'count'))

    print(f"Done. CSVs written with prefix: {args.prefix}-*.csv")


if __name__ == '__main__':
    main()