# Tweaked example script from qwikidata

import sys
import time
from subprocess import call

from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.utils import dump_entities_to_json


def dump_block_of_taxa(taxa, out_fp):
sys.stderr.write(f"Dumping {len(taxa)} to {out_fp} ...\n") |
|
dump_entities_to_json(taxa, out_fp) |
|
retcode = call(["bzip2", out_fp]) |
|
if retcode != 0: |
|
raise RuntimeError(f"bzip2 of {out_fp} failed.\n") |
|
|
|
def main(inp_fp, id_fp, out_fp): |
|
with open(id_fp, "r") as inp: |
|
tmp = [i.strip() for i in inp.readlines()] |
|
ids_to_save = [] |
|
for i in tmp: |
|
if not i: |
|
continue |
|
assert(i.startswith('"')) |
|
assert(i.endswith('"')) |
|
ids_to_save.append(i[1:-1]) |
|
|
|
ids_to_save = frozenset(ids_to_save) |
|
|
|
|
|
# create an instance of WikidataJsonDump |
|
wjd_dump_path = sys.argv[1] |
|
wjd = WikidataJsonDump(wjd_dump_path) |
|
|
|
# create an iterable of WikidataItem representing politicians |
|
authors = [] |
|
t1 = time.time() |
|
prev_dumped = 0 |
|
for ii, entity_dict in enumerate(wjd): |
|
qid = entity_dict["id"] |
|
if qid in ids_to_save: |
|
entity = WikidataItem(entity_dict) |
|
authors.append(entity) |
|
nt = len(authors) |
|
if ii % 1000 == 0: |
|
t2 = time.time() |
|
dt = t2 - t1 |
|
rate = ii/dt |
|
tt = nt + prev_dumped |
|
sys.stderr.write(f"found {tt} entities among {ii} entities [entities/s: {rate:.2f}]\n") |
|
if nt == 1000: |
|
prev_dumped += nt |
|
dump_block_of_taxa(authors, f"block-{prev_dumped}-entities-{out_fp}") |
|
authors = [] |
|
nt = len(authors) |
|
prev_dumped += nt |
|
sys.stderr.write(f"found {prev_dumped} entities among {ii} entities\n") |
|
if authors: |
|
dump_block_of_taxa(authors, f"block-{prev_dumped}-entities-{out_fp}") |
|
sys.stderr.write(f"Done!\n") |
|
|
|
|
|
if __name__ == "__main__": |
|
    sys.exit(main(inp_fp=sys.argv[1],
                  id_fp=sys.argv[2],
                  out_fp="au_entities.json"))