Created
February 27, 2026 22:38
-
-
Save fnielsen/70069b61da9f7eb09b721e4d82b63710 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from textwrap import dedent | |
| from datetime import datetime | |
# DOIs of the works to look up on Wikidata and render as a MediaWiki table.
# Matched case-insensitively: Wikidata stores P356 values in upper case, and
# build_table_for_dois() upper-cases these before querying.
DOIS = [
    "10.1016/j.erss.2023.103213",
    "10.1016/j.erss.2021.102482",
    "10.1016/j.erss.2023.103053",
    "10.1016/j.erss.2021.102444",
    "10.1016/j.erss.2020.101837",
    "10.1016/j.erss.2022.102714",
    "10.1007/s40518-021-00184-6",
    "10.1016/j.esd.2024.101546",
    "10.1016/j.erss.2023.103010",
    "10.1002/wcc.932",
    "10.1016/j.erss.2025.104036",
    "10.1016/j.erss.2015.10.004",
    "10.1016/j.erss.2020.101907",
    "10.1002/geo2.70040",
    "10.1016/j.erss.2025.104149",
    "10.1016/j.erss.2022.102862",
    "10.1007/s13280-025-02202-z",
    "10.1016/j.erss.2025.104422",
    "10.1016/j.erss.2020.101716",
    "10.3390/en17143512",
    "10.1016/j.erss.2020.101768",
    "10.1016/j.erss.2025.104016",
    "10.1016/j.erss.2018.06.010",
    "10.1016/j.copsyc.2024.101987",
    "10.3390/su13042128",
    "10.1080/14693062.2023.2256697",
    "10.1016/j.erss.2020.101834",
    "10.1016/j.rser.2025.115892",
    "10.1146/annurev-environ-112621-063400",
    "10.1016/j.erss.2020.101871",
    "10.1016/j.egyr.2024.01.040",
    "10.1016/j.crm.2022.100438",
    "10.1146/annurev-environ-101718-033103",
    "10.1016/j.agsy.2025.104512",
    "10.3390/su15032441",
    "10.1016/j.erss.2025.104067",
    "10.1007/s43621-021-00024-z",
    "10.1186/s13705-021-00330-4",
    "10.3390/su11041023",
    "10.1016/j.rser.2021.111504",
    "10.17573/cepar.2025.2.09",
    "10.1111/gec3.12662",
    "10.1016/j.erss.2020.101876",
    "10.1016/j.eiar.2025.108187",
    "10.1016/j.erss.2023.103221",
    "10.1016/j.erss.2016.04.001",
    "10.1016/j.erss.2021.102333",
    "10.1016/j.erss.2022.102913",
    "10.1017/sus.2025.2",
    "10.1016/j.erss.2021.102257",
    "10.1016/j.jenvman.2024.120804",
    "10.1007/s13412-021-00726-w",
    "10.3390/su16198700",
    "10.1017/sus.2024.24",
    "10.1016/j.jclepro.2024.143470",
    "10.1139/er-2024-0018",
]
def sparql_escape_string(s: str) -> str:
    """Escape *s* for embedding inside a double-quoted SPARQL string literal.

    Backslashes are doubled first so that the quote escaping added afterwards
    is not itself re-escaped.
    """
    escaped = s.replace("\\", "\\\\")
    escaped = escaped.replace('"', '\\"')
    return escaped
| def parse_year(date_str: str | None) -> str: | |
| if not date_str: | |
| return "" | |
| # Scholarly/Wikidata returns ISO-ish strings; safest is take first 4 digits if present. | |
| if len(date_str) >= 4 and date_str[:4].isdigit(): | |
| return date_str[:4] | |
| return "" | |
def wiki_escape_cell(text: str) -> str:
    """Sanitize *text* for use inside a MediaWiki table cell.

    In MediaWiki table syntax "|" separates cells and a newline starts a new
    table line, so both must be neutralized or a title containing them would
    corrupt the table layout.

    Parameters
    ----------
    text : str or None
        Raw cell content; ``None`` yields ``""``.

    Returns
    -------
    str
        The text with pipes replaced by the HTML entity ``&#124;`` (renders
        as a literal pipe without splitting the cell) and CR/LF replaced by
        spaces.
    """
    if text is None:
        return ""
    # Bug fix: the original did text.replace("|", "|"), a no-op that left
    # pipes free to break the table; use the HTML entity instead.
    text = text.replace("|", "&#124;")
    text = text.replace("\n", " ").replace("\r", " ")
    return text
def qid_from_uri(uri: str) -> str:
    """Return the final path segment of an entity URI.

    e.g. http://www.wikidata.org/entity/Q123 -> Q123.  A string without a
    slash is returned unchanged.
    """
    return uri[uri.rfind("/") + 1:]
def _build_query(dois_up: list[str]) -> str:
    """Build the SPARQL query matching works by DOI (P356).

    One row per (work, matched-doi), with the earliest publication date,
    a sample title (P1476, falling back to the English label), and all DOIs
    on the work so multi-DOI items can be flagged.
    """
    values_block = "\n".join(f'"{sparql_escape_string(d)}"' for d in dois_up)
    # Bug fix: the original used
    #   BIND(COALESCE(?titleStr0, STR(?labelEn)) AS ?titleStr0)
    # which rebinds a variable already in scope — forbidden by SPARQL 1.1
    # (the BIND target must be a fresh variable), so the query was invalid.
    # The fallback is now expressed inside the SAMPLE aggregate instead.
    return dedent(f"""
    SELECT ?work ?doi
           (MIN(?pubDate) AS ?earliestDate)
           (SAMPLE(COALESCE(STR(?titleLit), STR(?labelEn))) AS ?titleSample)
           (GROUP_CONCAT(DISTINCT ?allDoi; separator="|") AS ?allDois)
    WHERE {{
      VALUES ?doi {{ {values_block} }}
      ?work wdt:P356 ?doi .
      OPTIONAL {{ ?work wdt:P577 ?pubDate . }}
      OPTIONAL {{ ?work wdt:P1476 ?titleLit . }}
      OPTIONAL {{
        ?work rdfs:label ?labelEn .
        FILTER(LANG(?labelEn) = "en")
      }}
      OPTIONAL {{ ?work wdt:P356 ?allDoi . }}
    }}
    GROUP BY ?work ?doi
    ORDER BY ?doi ?work
    """)


def _fetch_bindings(endpoint: str, query: str, user_agent: str) -> list[dict]:
    """Run *query* against *endpoint* and return the SPARQL JSON bindings.

    Raises requests.HTTPError on a non-2xx response.
    """
    response = requests.get(
        endpoint,
        params={"format": "json", "query": query},
        headers={"Accept": "application/sparql-results+json", "User-Agent": user_agent},
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["results"]["bindings"]


def _collect_rows(bindings: list[dict]):
    """Turn raw bindings into table rows plus DOI/QID cross-indexes.

    Returns (rows, doi_to_qids, qid_to_dois) where rows is a list of
    (doi, qid, year, title, all_dois_set) tuples.
    """
    rows = []
    doi_to_qids: dict[str, set[str]] = {}
    qid_to_dois: dict[str, set[str]] = {}
    for b in bindings:
        qid = qid_from_uri(b["work"]["value"])
        doi = b["doi"]["value"]
        year = parse_year(b.get("earliestDate", {}).get("value"))
        title = wiki_escape_cell(b.get("titleSample", {}).get("value", ""))
        all_dois_raw = b.get("allDois", {}).get("value", "")
        all_dois_set = {d for d in all_dois_raw.split("|") if d}
        rows.append((doi, qid, year, title, all_dois_set))
        doi_to_qids.setdefault(doi, set()).add(qid)
        qid_to_dois.setdefault(qid, set()).update(all_dois_set)
    return rows, doi_to_qids, qid_to_dois


def _warning_lines(doi_to_qids: dict[str, set[str]],
                   qid_to_dois: dict[str, set[str]]) -> list[str]:
    """Build warning lines for DOI→many-QIDs and QID→many-DOIs anomalies."""
    warn_lines: list[str] = []
    multi_qids = {doi: qids for doi, qids in doi_to_qids.items() if len(qids) > 1}
    multi_dois = {qid: doiset for qid, doiset in qid_to_dois.items() if len(doiset) > 1}
    if multi_qids:
        warn_lines.append("**Warning: DOIs with multiple QIDs (possible duplicates in Wikidata):**")
        for doi in sorted(multi_qids.keys()):
            qids = ", ".join(sorted(multi_qids[doi]))
            warn_lines.append(f"* {doi} → {qids}")
    if multi_dois:
        warn_lines.append("**Warning: QIDs with multiple DOIs (multiple P356 on the same item):**")
        for qid in sorted(multi_dois.keys()):
            dois = ", ".join(sorted(multi_dois[qid]))
            warn_lines.append(f"* {qid} → {dois}")
    return warn_lines


def _table_lines(rows) -> list[str]:
    """Render rows as a sortable MediaWiki table (QID linked to Wikidata,
    DOI linked to doi.org)."""
    table_lines = ['{| class="wikitable sortable"', "! QID !! Year !! DOI !! Title"]
    for doi, qid, year, title, _all_dois_set in rows:
        qid_link = f"[[d:{qid}|{qid}]]"
        doi_link = f"[https://doi.org/{doi} {doi}]"
        table_lines.append("|-")
        table_lines.append(f"| {qid_link} || {year} || {doi_link} || {title}")
    table_lines.append("|}")
    return table_lines


def build_table_for_dois(dois: list[str]) -> str:
    """Look up *dois* on the scholarly Wikidata SPARQL endpoint and return a
    MediaWiki table of matches, preceded by warnings about duplicate items
    (one DOI on several QIDs) and multi-DOI items (several P356 on one QID).

    Parameters
    ----------
    dois : list[str]
        DOIs to look up; blanks are dropped and values are upper-cased to
        match Wikidata's P356 convention.

    Returns
    -------
    str
        Warning lines (if any), a blank line, then the wikitable markup.

    Raises
    ------
    requests.HTTPError
        If the SPARQL endpoint returns a non-2xx response.
    """
    endpoint = "https://query-scholarly.wikidata.org/sparql"
    ua = "fnielsen mailto:faan@dtu.dk"
    dois_up = [d.strip().upper() for d in dois if d.strip()]

    bindings = _fetch_bindings(endpoint, _build_query(dois_up), ua)
    rows, doi_to_qids, qid_to_dois = _collect_rows(bindings)
    warn_lines = _warning_lines(doi_to_qids, qid_to_dois)

    # If no matches, still output something useful.
    if not rows:
        warn_lines.insert(0, "**No matches found for the provided DOIs on query-scholarly.wikidata.org.**")

    return "\n".join(warn_lines + [""] + _table_lines(rows))
if __name__ == "__main__":
    # Script entry point: print the MediaWiki table (plus warnings) to stdout.
    print(build_table_for_dois(DOIS))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment