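# Fetch the monthly "top" pageview lists for Swedish Wikipedia from the
# Wikimedia Pageviews REST API, aggregate them across the year, and write
# the 1000 most viewed articles, with peak month and URL, to a CSV file.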
import csv
import urllib.parse
from collections import Counter, defaultdict
from datetime import date

import requests
YEAR = 2025
OUTPUT_CSV = f"svwiki_topviews_{YEAR}_ytd_top1000.csv"

# Wikimedia's API etiquette asks clients to send a descriptive User-Agent.
HEADERS = {
    "accept": "application/json",
    "User-Agent": "wmse-viewcount-script",
}
BASE_ARTICLE_URL = "https://sv.wikipedia.org/wiki/"

today = date.today()
totals = Counter()
monthly_views = defaultdict(lambda: [0] * 13)  # index 1–12 used

def should_exclude(title: str) -> bool:
    """Filter non-article pages out of the top list."""
    if title.startswith("Portal:"):  # portal pages
        return True
    if title == "wiki.phtml":  # legacy title that turns up in pageview data
        return True
    if title.startswith("Fil:"):  # file namespace ("Fil" is Swedish for "File")
        return True
    if title.startswith("Special:"):  # special pages
        return True
    return False
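
# Titles arrive from the API with underscores; typical exclusions would look
# like "Portal:Huvudsida", "Fil:Exempel.jpg" or "Special:Sök" (illustrative
# examples, not taken from actual results).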

for month in range(1, 13):
    if YEAR == today.year and month > today.month:
        print(f"Skipping {YEAR}-{month:02d}: future month")
        continue

    url = (
        f"https://wikimedia.org/api/rest_v1/metrics/pageviews/top/"
        f"sv.wikipedia.org/all-access/{YEAR}/{month:02d}/all-days"
    )
    print(f"Fetching {url} ...")
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if r.status_code == 404:
            print(f"Skipping {YEAR}-{month:02d}: data not yet available (404)")
            continue
        if r.status_code == 403:
            print(f"Skipping {YEAR}-{month:02d}: forbidden (403)")
            continue
        print(f"Skipping {YEAR}-{month:02d}: HTTP error {e}")
        continue
    except requests.exceptions.RequestException as e:
        print(f"Skipping {YEAR}-{month:02d}: network error {e}")
        continue
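
    # The "top" endpoint responds with roughly {"items": [{..., "articles": [...]}]},
    # where each entry in "articles" carries "article", "views" and "rank" keys
    # (shape inferred from the parsing below, not from a captured response).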
    data = r.json()
    if not data.get("items"):
        print(f"Skipping {YEAR}-{month:02d}: no items in response")
        continue

    articles = data["items"][0].get("articles", [])
    for article in articles:
        title = article["article"]
        views = article["views"]
        if should_exclude(title):
            continue
        totals[title] += views
        monthly_views[title][month] += views

# Keep only the 1000 most viewed titles overall.
sorted_articles = totals.most_common(1000)

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["rank", "title", "views", "peak_month", "url"])
    for rank, (title, views) in enumerate(sorted_articles, start=1):
        # Determine the month with the most views;
        # monthly_views[title] is a list of 13 ints, index 1–12 used.
        month_list = monthly_views[title]
        # argmax over months 1..12
        best_month = max(range(1, 13), key=lambda m: month_list[m])
        peak_month_str = f"{YEAR}-{best_month:02d}"
        # Build the article URL.
        url_title = urllib.parse.quote(title.replace(" ", "_"), safe="")
        url = f"{BASE_ARTICLE_URL}{url_title}"
        writer.writerow([rank, title, views, peak_month_str, url])
| print(f"\nWrote top {len(sorted_articles)} rows to {OUTPUT_CSV}\n") | |
| print("Top 20 articles this year so far (after exclusions):\n") | |
| for rank, (title, views) in enumerate(sorted_articles[:20], start=1): | |
| # Same peak-month logic for console output | |
| month_list = monthly_views[title] | |
| best_month = max(range(1, 13), key=lambda m: month_list[m]) | |
| peak_month_str = f"{YEAR}-{best_month:02d}" | |
| print(f"{rank}\t{title}\t{views}\t{peak_month_str}") |