Skip to content

Instantly share code, notes, and snippets.

@galtay
Last active August 12, 2024 21:08
Show Gist options
  • Select an option

  • Save galtay/8d6d1b95e82a64c4849ce820f312eabb to your computer and use it in GitHub Desktop.

Select an option

Save galtay/8d6d1b95e82a64c4849ce820f312eabb to your computer and use it in GitHub Desktop.
wikipedia_dump_status
import argparse
from dataclasses import dataclass
from typing import Dict
import requests
# Base URL of the dump site; used as the default for the --mirror_url option.
MIRROR_URL = "https://dumps.wikimedia.org"
# Wiki name; same value as the default of the --wiki option.
WIKI = "enwiki"
@dataclass
class WdsFile:
    """Metadata for a single file listed under a dump job.

    Instances are built by ``WikimediaDumpStatus`` from the per-file
    entries of a job's ``"files"`` mapping.

    Attributes:
        name: File name (the key of the entry in the ``"files"`` mapping).
        sha1: Value of the entry's ``"sha1"`` field, ``""`` when absent.
        md5: Value of the entry's ``"md5"`` field, ``""`` when absent.
        size: Value of the entry's ``"size"`` field, ``0`` when absent.
        url: Value of the entry's ``"url"`` field, ``""`` when absent.
    """

    name: str
    sha1: str
    md5: str
    size: int
    url: str
@dataclass
class WdsJob:
    """One job entry from a dump status document.

    Attributes:
        name: Job name (the key of the entry in the ``"jobs"`` mapping).
        status: The job's ``"status"`` value. The command-line report in
            this file singles out ``"done"`` and ``"waiting"``; any other
            value is passed through unchanged.
        files: Files attached to the job, keyed by file name. Empty when
            the job has no ``"files"`` entry.
        updated: The job's ``"updated"`` value, stored as given.
    """

    name: str
    status: str
    files: Dict[str, WdsFile]
    updated: str
@dataclass
class WikimediaDumpStatus:
    """Parsed view of a decoded ``dumpstatus.json`` document.

    The class is decorated with ``@dataclass`` but supplies its own
    ``__init__``; the decorator leaves a hand-written ``__init__`` in place
    and only contributes the generated ``__repr__``/``__eq__`` over the two
    declared fields.

    Attributes:
        jobs: Jobs in the document, keyed by job name.
        version: The document's ``"version"`` value.
    """

    jobs: Dict[str, WdsJob]
    version: str

    def __init__(self, dumpstatus: Dict) -> None:
        """Build the job table from an already-decoded status document.

        Args:
            dumpstatus: Mapping with a ``"version"`` entry and a ``"jobs"``
                mapping of job name -> job info. Every job info must carry
                ``"status"`` and ``"updated"``; ``"files"`` is optional, and
                missing per-file fields fall back to ``""`` (``0`` for
                ``"size"``).

        Raises:
            KeyError: If ``"version"``, ``"jobs"``, or a job's ``"status"``
                or ``"updated"`` entry is missing.
        """
        # Keep the raw document around alongside the parsed view.
        self._dumpstatus = dumpstatus
        self.version = dumpstatus["version"]
        self.jobs = {}
        for jobname, jobinfo in dumpstatus["jobs"].items():
            files = {
                filename: WdsFile(
                    name=filename,
                    sha1=fileinfo.get("sha1", ""),
                    md5=fileinfo.get("md5", ""),
                    size=fileinfo.get("size", 0),
                    url=fileinfo.get("url", ""),
                )
                for filename, fileinfo in jobinfo.get("files", {}).items()
            }
            self.jobs[jobname] = WdsJob(
                name=jobname,
                status=jobinfo["status"],
                updated=jobinfo["updated"],
                files=files,
            )

    @classmethod
    def from_url(cls, url, timeout=30):
        """Download a status document and parse it.

        Args:
            url: Address of the ``dumpstatus.json`` document.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.

        Returns:
            A new instance built from the decoded JSON body.

        Raises:
            SystemExit: If the request fails, the server answers with an
                HTTP error status, or the body is not valid JSON.
        """
        try:
            res = requests.get(url, timeout=timeout)
            # Turn 4xx/5xx answers (e.g. a dump date that does not exist)
            # into an error here instead of a confusing JSON decode failure.
            res.raise_for_status()
            payload = res.json()
        except (requests.exceptions.RequestException, ValueError) as oops:
            # ValueError covers the JSON decode errors raised by every
            # requests version, old and new.
            raise SystemExit(oops) from oops
        return cls(payload)
if __name__ == "__main__":
    # Command-line entry point: fetch the dump status document for one wiki
    # and one dump date, then print one "<job>: <status> <mark>" line per
    # job, ordered by job name.
    parser = argparse.ArgumentParser(description="Check Wikipedia Dump Status.")
    parser.add_argument(
        "yyyymmdd",
        help="date string for dump, e.g. 20200601",
    )
    parser.add_argument(
        "--mirror_url",
        default=MIRROR_URL,
        help="base url for mirror",
    )
    parser.add_argument(
        "--wiki",
        # Use the module constant, mirroring how --mirror_url uses MIRROR_URL.
        default=WIKI,
        help="which wikipedia?",
    )
    args = parser.parse_args()

    url = "{}/{}/{}/dumpstatus.json".format(
        args.mirror_url, args.wiki, args.yyyymmdd
    )
    wds = WikimediaDumpStatus.from_url(url)

    # Check mark for finished jobs, cross mark for jobs still waiting;
    # every other status is printed without a mark.
    status_marks = {"done": "\u2705", "waiting": "\u274c"}
    for jobname in sorted(wds.jobs):
        status = wds.jobs[jobname].status
        mark = status_marks.get(status, "")
        print("{}: {} {}".format(jobname, status, mark))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment