# Fetch the daily "top viewed articles per country" report from the Wikimedia
# Pageviews REST API for every day in the configured range, aggregate the view
# counts per (project, article) pair, and write the top results to a CSV file.
import requests
from collections import Counter
from datetime import date, timedelta
import csv
import urllib.parse

YEAR = 2025
COUNTRY = "SE"
START_DATE = date(2025, 1, 1)
END_DATE = date(2025, 11, 30)
OUTPUT_CSV = f"wikimedia_topviews_{COUNTRY}_{YEAR}_jan-nov_top1000.csv"
TOP_N = 1000

HEADERS = {
    "accept": "application/json",
    "User-Agent": "wmse-country-topviews"
}
def is_article(title: str) -> bool:
    """Return True for regular articles, filtering out special and file pages.

    E.g. "Special:Search" and "File:Example.jpg" are rejected, while a plain
    article title passes through.
    """
    title_lower = title.lower()
    if title_lower.startswith("special:"):
        return False
    file_prefixes = [
        "file:", "fil:", "image:", "datei:", "fichier:", "immagine:", "archivo:"
    ]
    if any(title_lower.startswith(prefix) for prefix in file_prefixes):
        return False
    return True
totals = Counter()   # (project, title) -> summed views over the whole period
peak_views = {}      # (project, title) -> (highest single-day views, ISO date)

current = START_DATE
while current <= END_DATE:
    y, m, d = current.year, current.month, current.day
    url = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/top-per-country/"
        f"{COUNTRY}/all-access/{y}/{m:02d}/{d:02d}"
    )
    print(f"Fetching {current.isoformat()} ...")
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if r.status_code == 404:
            print(f"  Skipping {current}: data not available (404)")
        elif r.status_code == 403:
            print(f"  Skipping {current}: forbidden (403)")
        else:
            print(f"  Skipping {current}: HTTP error {r.status_code} ({e})")
        current += timedelta(days=1)
        continue
    except requests.exceptions.RequestException as e:
        print(f"  Skipping {current}: network error {e}")
        current += timedelta(days=1)
        continue

    data = r.json()
    items = data.get("items")
    if not items:
        print(f"  Skipping {current}: no items field")
        current += timedelta(days=1)
        continue

    # Each daily response carries one item whose "articles" list holds the
    # ranked entries for that day (project, article title, view count).
    articles = items[0].get("articles", [])
    for entry in articles:
        # Prefer the views_ceil field when present, falling back to views.
        views = entry.get("views_ceil", entry.get("views"))
        if views is None:
            continue
        project = entry["project"]
        title = entry["article"]
        if not is_article(title):
            continue
        key = (project, title)
        totals[key] += views
        if key not in peak_views or views > peak_views[key][0]:
            peak_views[key] = (views, current.isoformat())

    current += timedelta(days=1)
# Rank (project, article) pairs by total views and keep only the top N.
sorted_items = totals.most_common()
if TOP_N is not None:
    sorted_items = sorted_items[:TOP_N]

print(f"\nTotal unique (project, article) pairs counted: {len(totals)}")
print(f"Writing top {len(sorted_items)} rows to {OUTPUT_CSV} ...")

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["rank", "project", "title", "views", "peak_day", "url"])
    for rank, ((project, title), views) in enumerate(sorted_items, start=1):
        peak_day = peak_views.get((project, title), (0, ""))[1]
        url_title = urllib.parse.quote(title.replace(" ", "_"), safe="")
        article_url = f"https://{project}.org/wiki/{url_title}"
        writer.writerow([rank, project, title, views, peak_day, article_url])

print("Done.")