"""Aggregate Wikimedia top-per-country pageviews for Sweden over
January-November 2025 and write the top 1000 (project, article)
pairs to a CSV file."""

import csv
import urllib.parse
from collections import Counter
from datetime import date, timedelta

import requests

YEAR = 2025
COUNTRY = "SE"
START_DATE = date(2025, 1, 1)
END_DATE = date(2025, 11, 30)
OUTPUT_CSV = f"wikimedia_topviews_{COUNTRY}_{YEAR}_jan-nov_top1000.csv"
TOP_N = 1000  # set to None to keep every (project, article) pair

# Wikimedia's API etiquette asks clients to identify themselves
# with a descriptive User-Agent.
HEADERS = {
    "accept": "application/json",
    "User-Agent": "wmse-country-topviews",
}

def is_article(title: str) -> bool:
    """Filter out non-article pages: Special: pages and file/media pages."""
    title_lower = title.lower()
    if title_lower.startswith("special:"):
        return False
    # File-namespace prefixes in several languages (English, Swedish,
    # German, French, Italian, Spanish, plus the legacy "image:" alias).
    file_prefixes = [
        "file:", "fil:", "image:", "datei:", "fichier:", "immagine:", "archivo:"
    ]
    if any(title_lower.startswith(prefix) for prefix in file_prefixes):
        return False
    return True
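
# Hypothetical quick checks (illustrative only, not executed by the script;
# the titles below are made up):
#   is_article("Special:Search")  -> False  (Special: page)
#   is_article("Fil:Karta.png")   -> False  (Swedish file prefix)
#   is_article("Sverige")         -> True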

totals = Counter()  # (project, title) -> views summed over the whole period
peak_views = {}     # (project, title) -> (highest single-day views, ISO date)

current = START_DATE
while current <= END_DATE:
    y, m, d = current.year, current.month, current.day
    url = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/top-per-country/"
        f"{COUNTRY}/all-access/{y}/{m:02d}/{d:02d}"
    )
    print(f"Fetching {current.isoformat()} ...")
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Days without published data return 404; some data is access-restricted.
        if r.status_code == 404:
            print(f"  Skipping {current}: data not available (404)")
        elif r.status_code == 403:
            print(f"  Skipping {current}: forbidden (403)")
        else:
            print(f"  Skipping {current}: HTTP error {r.status_code} ({e})")
        current += timedelta(days=1)
        continue
    except requests.exceptions.RequestException as e:
        print(f"  Skipping {current}: network error {e}")
        current += timedelta(days=1)
        continue
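    # Optional politeness pause (a sketch, not part of the original script):
    # the loop makes one request per day, roughly 330 calls for Jan-Nov, so a
    # short sleep keeps the request rate modest. Requires `import time`.
    #   time.sleep(0.2)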
    data = r.json()
    items = data.get("items")
    if not items:
        print(f"  Skipping {current}: no items field")
        current += timedelta(days=1)
        continue

    articles = items[0].get("articles", [])
    for entry in articles:
        # The endpoint reports counts as "views_ceil" (a rounded-up figure);
        # fall back to "views" if that key is ever used instead.
        views = entry.get("views_ceil", entry.get("views"))
        if views is None:
            continue
        project = entry["project"]
        title = entry["article"]
        if not is_article(title):
            continue
        key = (project, title)
        totals[key] += views
        # Remember each page's single best day.
        if key not in peak_views or views > peak_views[key][0]:
            peak_views[key] = (views, current.isoformat())

    current += timedelta(days=1)

# Rank by total views over the period and keep the top N.
sorted_items = totals.most_common()
if TOP_N is not None:
    sorted_items = sorted_items[:TOP_N]

print(f"\nTotal unique (project, article) pairs counted: {len(totals)}")
print(f"Writing top {len(sorted_items)} rows to {OUTPUT_CSV} ...")

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["rank", "project", "title", "views", "peak_day", "url"])
    for rank, ((project, title), views) in enumerate(sorted_items, start=1):
        peak_day = peak_views.get((project, title), (0, ""))[1]
        # Build a canonical article URL: spaces become underscores and the
        # rest is percent-encoded.
        url_title = urllib.parse.quote(title.replace(" ", "_"), safe="")
        article_url = f"https://{project}.org/wiki/{url_title}"
        writer.writerow([rank, project, title, views, peak_day, article_url])

print("Done.")