# Fetch the daily "top viewed articles per country" report from the Wikimedia
# Pageviews REST API for every day in the configured range, aggregate the view
# counts per (project, article) pair, and write the top results to a CSV file.
import requests
from collections import Counter
from datetime import date, timedelta
import csv
import urllib.parse

YEAR = 2025
COUNTRY = "SE"
START_DATE = date(2025, 1, 1)
END_DATE = date(2025, 11, 30)
OUTPUT_CSV = f"wikimedia_topviews_{COUNTRY}_{YEAR}_jan-nov_top1000.csv"
TOP_N = 1000

HEADERS = {
    "accept": "application/json",
    "User-Agent": "wmse-country-topviews"
}
def is_article(title: str) -> bool:
    """Return True for regular articles, filtering out special and file pages.

    E.g. "Special:Search" and "File:Example.jpg" are rejected, while a plain
    article title passes through.
    """
    title_lower = title.lower()
    if title_lower.startswith("special:"):
        return False
    file_prefixes = [
        "file:", "fil:", "image:", "datei:", "fichier:", "immagine:", "archivo:"
    ]
    if any(title_lower.startswith(prefix) for prefix in file_prefixes):
        return False
    return True
totals = Counter()   # (project, title) -> summed views over the whole period
peak_views = {}      # (project, title) -> (highest single-day views, ISO date)

current = START_DATE
while current <= END_DATE:
    y, m, d = current.year, current.month, current.day
    url = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/top-per-country/"
        f"{COUNTRY}/all-access/{y}/{m:02d}/{d:02d}"
    )
    print(f"Fetching {current.isoformat()} ...")
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if r.status_code == 404:
            print(f"  Skipping {current}: data not available (404)")
        elif r.status_code == 403:
            print(f"  Skipping {current}: forbidden (403)")
        else:
            print(f"  Skipping {current}: HTTP error {r.status_code} ({e})")
        current += timedelta(days=1)
        continue
    except requests.exceptions.RequestException as e:
        print(f"  Skipping {current}: network error {e}")
        current += timedelta(days=1)
        continue

    data = r.json()
    items = data.get("items")
    if not items:
        print(f"  Skipping {current}: no items field")
        current += timedelta(days=1)
        continue

    # Each daily response carries one item whose "articles" list holds the
    # ranked entries for that day (project, article title, view count).
    articles = items[0].get("articles", [])
    for entry in articles:
        # Prefer the views_ceil field when present, falling back to views.
        views = entry.get("views_ceil", entry.get("views"))
        if views is None:
            continue
        project = entry["project"]
        title = entry["article"]
        if not is_article(title):
            continue
        key = (project, title)
        totals[key] += views
        if key not in peak_views or views > peak_views[key][0]:
            peak_views[key] = (views, current.isoformat())

    current += timedelta(days=1)
# Rank (project, article) pairs by total views and keep only the top N.
sorted_items = totals.most_common()
if TOP_N is not None:
    sorted_items = sorted_items[:TOP_N]

print(f"\nTotal unique (project, article) pairs counted: {len(totals)}")
print(f"Writing top {len(sorted_items)} rows to {OUTPUT_CSV} ...")

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["rank", "project", "title", "views", "peak_day", "url"])
    for rank, ((project, title), views) in enumerate(sorted_items, start=1):
        peak_day = peak_views.get((project, title), (0, ""))[1]
        url_title = urllib.parse.quote(title.replace(" ", "_"), safe="")
        article_url = f"https://{project}.org/wiki/{url_title}"
        writer.writerow([rank, project, title, views, peak_day, article_url])

print("Done.")