Skip to content

Instantly share code, notes, and snippets.

@mtholder
Last active July 4, 2025 14:42
Show Gist options
  • Select an option

  • Save mtholder/7976f73c1811691979eac497951faf6a to your computer and use it in GitHub Desktop.

Select an option

Save mtholder/7976f73c1811691979eac497951faf6a to your computer and use it in GitHub Desktop.
Extract entities with specific entity IDs from the full Wikidata JSON dump using qwikidata (https://github.com/kensho-technologies/qwikidata/tree/develop)
# Tweaked from a qwikidata example script: saves the entities whose IDs are listed in a file.
import sys
import time
from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.utils import dump_entities_to_json
from subprocess import call
def dump_block_of_taxa(taxa, out_fp):
    """Write one block of entities to *out_fp* as JSON, then bzip2 it.

    Args:
        taxa: sized collection of entities, handed to
            ``dump_entities_to_json`` unchanged.
        out_fp: path of the JSON file to write; the external ``bzip2``
            program is then invoked on that path.

    Raises:
        RuntimeError: if ``bzip2`` exits with a non-zero status.
    """
    count = len(taxa)
    sys.stderr.write(f"Dumping {count} to {out_fp} ...\n")
    dump_entities_to_json(taxa, out_fp)
    # Compression is delegated to the command-line tool, not the bz2 module.
    bzip2_cmd = ["bzip2", out_fp]
    status = call(bzip2_cmd)
    if status == 0:
        return
    raise RuntimeError(f"bzip2 of {out_fp} failed.\n")
def _read_quoted_ids(id_fp):
    """Return the frozenset of entity IDs listed in the file *id_fp*.

    Every non-blank line must hold exactly one ID wrapped in double quotes
    (for example ``"Q42"``); surrounding whitespace is ignored.

    Raises:
        ValueError: if a non-blank line is not a double-quoted string.
    """
    ids = set()
    with open(id_fp, "r") as inp:
        for line_num, raw in enumerate(inp, start=1):
            token = raw.strip()
            if not token:
                continue
            # Explicit check instead of `assert`, which vanishes under -O.
            # The length test rejects a lone '"', which both starts and ends
            # with a quote but contains no ID.
            well_formed = (
                len(token) >= 2
                and token.startswith('"')
                and token.endswith('"')
            )
            if not well_formed:
                raise ValueError(
                    f"{id_fp}:{line_num}: expected a double-quoted ID, "
                    f"got {token!r}"
                )
            ids.add(token[1:-1])
    return frozenset(ids)


def main(inp_fp, id_fp, out_fp, block_size=1000):
    """Extract the entities whose IDs are listed in *id_fp* from a dump.

    The dump is streamed once. Matching entities are collected and written
    out in blocks via ``dump_block_of_taxa``; each block goes to
    ``block-<cumulative match count>-entities-<out_fp>``. Progress is
    reported on stderr every 1000 dump entries.

    Args:
        inp_fp: path of the Wikidata JSON dump (given to ``WikidataJsonDump``).
        id_fp: path of a text file with one double-quoted entity ID per line.
        out_fp: file-name suffix used for every output block.
        block_size: number of matched entities written per block.

    Raises:
        ValueError: if *block_size* is smaller than 1 or *id_fp* contains a
            non-blank line that is not a double-quoted ID.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be at least 1, got {block_size}")
    ids_to_save = _read_quoted_ids(id_fp)

    # Honour the caller-supplied path rather than re-reading sys.argv.
    wjd = WikidataJsonDump(inp_fp)

    matched = []
    prev_dumped = 0  # matches already flushed to earlier blocks
    n_seen = 0  # dump entries examined; stays 0 for an empty dump
    t1 = time.time()
    for ii, entity_dict in enumerate(wjd):
        n_seen = ii + 1
        if entity_dict["id"] in ids_to_save:
            # NOTE(review): every match is wrapped in WikidataItem; presumably
            # the ID list only names item entities -- confirm.
            matched.append(WikidataItem(entity_dict))
        if ii % 1000 == 0:
            dt = time.time() - t1
            # The clock may not have advanced yet on the first iteration.
            rate = ii / dt if dt > 0 else 0.0
            tt = len(matched) + prev_dumped
            sys.stderr.write(
                f"found {tt} entities among {ii} entities "
                f"[entities/s: {rate:.2f}]\n"
            )
        if len(matched) >= block_size:
            prev_dumped += len(matched)
            dump_block_of_taxa(matched, f"block-{prev_dumped}-entities-{out_fp}")
            matched = []

    prev_dumped += len(matched)
    sys.stderr.write(f"found {prev_dumped} entities among {n_seen} entities\n")
    # Flush the final, partially filled block (if any).
    if matched:
        dump_block_of_taxa(matched, f"block-{prev_dumped}-entities-{out_fp}")
    sys.stderr.write("Done!\n")
if __name__ == "__main__":
    # Two positional arguments are required: the Wikidata JSON dump and the
    # file of double-quoted entity IDs. Exit with a usage message rather than
    # an IndexError traceback when either is missing.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <wikidata-json-dump> <quoted-id-file>")
    sys.exit(main(inp_fp=sys.argv[1],
                  id_fp=sys.argv[2],
                  out_fp="au_entities.json"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment