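"""Year-to-date top-1000 pageview report for Swedish Wikipedia.

Queries the Wikimedia Pageviews REST API for each month's most-viewed
articles on sv.wikipedia.org, sums the views across the year so far,
filters out non-article pages, and writes the 1000 most-viewed titles
(with peak month and article URL) to a CSV file.
"""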
import csv
import urllib.parse
from collections import Counter, defaultdict
from datetime import date

import requests

YEAR = 2025
OUTPUT_CSV = f"svwiki_topviews_{YEAR}_ytd_top1000.csv"
# Wikimedia's API etiquette asks clients to send a descriptive User-Agent.
HEADERS = {
    "accept": "application/json",
    "User-Agent": "wmse-viewcount-script",
}
BASE_ARTICLE_URL = "https://sv.wikipedia.org/wiki/"

today = date.today()
totals = Counter()
monthly_views = defaultdict(lambda: [0] * 13)  # index 1–12 used


def should_exclude(title: str) -> bool:
    """Return True for non-article pages that should not be ranked."""
    if title.startswith("Portal:"):
        return True
    if title == "wiki.phtml":
        return True
    if title.startswith("Fil:"):  # file namespace on svwiki
        return True
    if title.startswith("Special:"):
        return True
    return False
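

# Fetch each month's top-viewed articles and accumulate the counts.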
for month in range(1, 13):
    if YEAR == today.year and month > today.month:
        print(f"Skipping {YEAR}-{month:02d}: future month")
        continue
    url = (
        f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/"
        f"sv.wikipedia.org/all-access/{YEAR}/{month:02d}/all-days"
    )
    print(f"Fetching {url} ...")
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if r.status_code == 404:
            print(f"Skipping {YEAR}-{month:02d}: data not yet available (404)")
            continue
        if r.status_code == 403:
            print(f"Skipping {YEAR}-{month:02d}: forbidden (403)")
            continue
        print(f"Skipping {YEAR}-{month:02d}: HTTP error {e}")
        continue
    except requests.exceptions.RequestException as e:
        print(f"Skipping {YEAR}-{month:02d}: network error {e}")
        continue
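    # A successful response has roughly this shape (abridged):
    #   {"items": [{"project": "sv.wikipedia", ..., "articles":
    #       [{"article": "...", "views": 1234, "rank": 1}, ...]}]}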
    data = r.json()
    if not data.get("items"):
        print(f"Skipping {YEAR}-{month:02d}: no items in response")
        continue
    articles = data["items"][0].get("articles", [])
    for article in articles:
        title = article["article"]
        views = article["views"]
        if should_exclude(title):
            continue
        totals[title] += views
        monthly_views[title][month] += views
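
# Keep only the 1000 most-viewed articles for the year so far.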
sorted_articles = totals.most_common(1000)

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["rank", "title", "views", "peak_month", "url"])
    for rank, (title, views) in enumerate(sorted_articles, start=1):
        # Determine the month with the most views;
        # monthly_views[title] is a list of 13 ints, index 1–12 used.
        month_list = monthly_views[title]
        best_month = max(range(1, 13), key=lambda m: month_list[m])
        peak_month_str = f"{YEAR}-{best_month:02d}"
        # Build the article URL: underscores for spaces, then percent-encode.
        url_title = urllib.parse.quote(title.replace(" ", "_"), safe="")
        url = f"{BASE_ARTICLE_URL}{url_title}"
        writer.writerow([rank, title, views, peak_month_str, url])
print(f"\nWrote top {len(sorted_articles)} rows to {OUTPUT_CSV}\n")
print("Top 20 articles this year so far (after exclusions):\n")
for rank, (title, views) in enumerate(sorted_articles[:20], start=1):
# Same peak-month logic for console output
month_list = monthly_views[title]
best_month = max(range(1, 13), key=lambda m: month_list[m])
peak_month_str = f"{YEAR}-{best_month:02d}"
print(f"{rank}\t{title}\t{views}\t{peak_month_str}")