Last active
August 12, 2024 21:08
-
-
Save galtay/8d6d1b95e82a64c4849ce820f312eabb to your computer and use it in GitHub Desktop.
wikipedia_dump_status
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| from dataclasses import dataclass | |
| from typing import Dict | |
| import requests | |
# Base URL of the dumps site; used as the default for the --mirror_url option.
MIRROR_URL = "https://dumps.wikimedia.org"
# Name of the wiki whose dumps are inspected.
# NOTE(review): not referenced in the visible source -- the --wiki option
# repeats the literal "enwiki" as its default; confirm whether it should use this.
WIKI = "enwiki"
@dataclass
class WdsFile:
    """Metadata for a single file listed under a dump job.

    Instances are built by ``WikimediaDumpStatus`` from the per-file entries
    of a job's ``"files"`` mapping; entries that lack a value are filled with
    an empty string (or ``0`` for ``size``) there.
    """

    # File name; the key of the entry in the job's "files" mapping.
    name: str
    # SHA-1 checksum string as reported by the dump status.
    sha1: str
    # MD5 checksum string as reported by the dump status.
    md5: str
    # Reported file size (presumably bytes -- TODO confirm against the feed).
    size: int
    # Download location as reported (may be relative to the mirror -- TODO confirm).
    url: str
@dataclass
class WdsJob:
    """One job of a dump run together with the files it lists."""

    # Job name; the key of the job in the status document's "jobs" mapping.
    name: str
    # Status string as reported (the CLI in this file checks for "done" and "waiting").
    status: str
    # Files listed for this job, keyed by file name.
    files: Dict[str, WdsFile]
    # Value of the job's "updated" entry, kept as the raw string.
    updated: str
@dataclass
class WikimediaDumpStatus:
    """Parsed view of a ``dumpstatus.json`` document.

    The class is decorated with ``@dataclass`` for its generated ``__repr__``
    and ``__eq__`` (over ``jobs`` and ``version``); the hand-written
    ``__init__`` below takes precedence over the generated one.
    """

    # Jobs keyed by job name.
    jobs: Dict[str, WdsJob]
    # Value of the document's top-level "version" entry.
    version: str

    def __init__(self, dumpstatus: Dict) -> None:
        """Build the status object from an already-decoded status document.

        Args:
            dumpstatus: mapping with a ``"version"`` entry and a ``"jobs"``
                mapping of job name -> job info. Every job info must provide
                ``"status"`` and ``"updated"``; its ``"files"`` mapping is
                optional, and missing per-file values default to ``""``
                (``0`` for ``"size"``).

        Raises:
            KeyError: if ``"version"``, ``"jobs"``, or a job's ``"status"`` /
                ``"updated"`` entry is missing.
        """
        # Keep the raw document around for callers that need untouched data.
        self._dumpstatus = dumpstatus
        self.version = dumpstatus["version"]
        self.jobs = {}
        for jobname, jobinfo in dumpstatus["jobs"].items():
            files = {
                filename: WdsFile(
                    name=filename,
                    sha1=fileinfo.get("sha1", ""),
                    md5=fileinfo.get("md5", ""),
                    size=fileinfo.get("size", 0),
                    url=fileinfo.get("url", ""),
                )
                for filename, fileinfo in jobinfo.get("files", {}).items()
            }
            self.jobs[jobname] = WdsJob(
                name=jobname,
                status=jobinfo["status"],
                updated=jobinfo["updated"],
                files=files,
            )

    @classmethod
    def from_url(cls, url, timeout=30.0):
        """Download a status document from *url* and parse it.

        Args:
            url: full URL of the ``dumpstatus.json`` document.
            timeout: seconds to wait for the server before giving up; passed
                straight to ``requests.get``. Without a timeout the request
                could block indefinitely.

        Returns:
            A new instance of this class built from the decoded JSON body.

        Raises:
            SystemExit: if the request fails, the server answers with an HTTP
                error status, or the body is not valid JSON.
        """
        try:
            res = requests.get(url, timeout=timeout)
            # A 4xx/5xx answer (e.g. a dump date that does not exist) would
            # otherwise surface later as a confusing JSON/KeyError failure.
            res.raise_for_status()
            payload = res.json()
        except (requests.exceptions.RequestException, ValueError) as oops:
            # ValueError covers JSON decode errors on requests versions whose
            # decode error is not a RequestException subclass.
            raise SystemExit(oops) from oops
        return cls(payload)
if __name__ == "__main__":
    # Command-line entry point: fetch the status document of one dump run and
    # print each job, sorted by name, with its status and a marker.
    cli = argparse.ArgumentParser(description="Check Wikipedia Dump Status.")
    cli.add_argument("yyyymmdd", help="date string for dump, e.g. 20200601")
    cli.add_argument("--mirror_url", default=MIRROR_URL, help="base url for mirror")
    cli.add_argument("--wiki", default="enwiki", help="which wikipedia?")
    args = cli.parse_args()

    url = f"{args.mirror_url}/{args.wiki}/{args.yyyymmdd}/dumpstatus.json"
    wds = WikimediaDumpStatus.from_url(url)

    # Marker per status; any status not listed here is printed without one.
    status_marks = {
        "done": "\u2705",
        "waiting": "\u274c",
    }
    for jobname in sorted(wds.jobs):
        status = wds.jobs[jobname].status
        mark = status_marks.get(status, "")
        print(f"{jobname}: {status} {mark}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment