mtholder/author_using_qwikidata.py

## author_using_qwikidata.py
#Tweaked example script from qwikidata
import sys
import time
from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.utils import dump_entities_to_json
from subprocess import call

def dump_block_of_taxa(taxa, out_fp):
    """Write one block of entities to `out_fp` as JSON, then bzip2 that file.

    Args:
        taxa: sized collection of entities, handed unchanged to qwikidata's
            `dump_entities_to_json`.
        out_fp: path of the JSON file to write; the same path is then passed
            to the external `bzip2` program.

    Raises:
        RuntimeError: if `bzip2` exits with a non-zero status.
    """
    sys.stderr.write(f"Dumping {len(taxa)} to {out_fp} ...\n")
    dump_entities_to_json(taxa, out_fp)
    # Compression is delegated to the bzip2 executable found on PATH.
    if call(["bzip2", out_fp]) == 0:
        return
    raise RuntimeError(f"bzip2 of {out_fp} failed.\n")

def main(inp_fp, id_fp, out_fp):
    """Extract the entities listed in `id_fp` from a Wikidata JSON dump.

    Streams the dump at `inp_fp`, keeps every entity whose ``"id"`` appears in
    the ID list, and hands the matches to `dump_block_of_taxa` in blocks of
    1000 (plus one final, possibly smaller, block). Progress is written to
    stderr.

    Args:
        inp_fp: path of the Wikidata JSON dump to scan.
        id_fp: path of a text file holding one double-quoted entity ID per
            line; blank lines are ignored.
        out_fp: file-name suffix for the output blocks; each block is written
            to ``block-<cumulative number found>-entities-<out_fp>``.

    Raises:
        ValueError: if a non-blank line of `id_fp` is not wrapped in double
            quotes.
    """
    block_size = 1000    # matching entities per output block
    report_every = 1000  # scanned entities between progress messages

    wanted = set()
    with open(id_fp, "r") as inp:
        for line_num, line in enumerate(inp, start=1):
            quoted = line.strip()
            if not quoted:
                continue
            # Raise instead of assert: asserts are stripped under `python -O`,
            # which would let a malformed line be silently truncated below.
            well_formed = (
                len(quoted) >= 2
                and quoted.startswith('"')
                and quoted.endswith('"')
            )
            if not well_formed:
                raise ValueError(
                    f"{id_fp}:{line_num}: expected a double-quoted ID, got {quoted!r}"
                )
            wanted.add(quoted[1:-1])
    ids_to_save = frozenset(wanted)

    # Honor the inp_fp argument (the original read sys.argv[1] here, so the
    # parameter was ignored whenever main() was called from other code).
    wjd = WikidataJsonDump(inp_fp)

    # Collect the WikidataItem objects whose IDs were requested.
    authors = []
    prev_dumped = 0
    n_scanned = 0  # stays 0 if the dump yields nothing, so the summary still works
    t1 = time.time()
    for n_scanned, entity_dict in enumerate(wjd, start=1):
        if entity_dict["id"] in ids_to_save:
            authors.append(WikidataItem(entity_dict))
        nt = len(authors)
        if n_scanned % report_every == 0:
            dt = time.time() - t1
            # A coarse clock can report zero elapsed time; avoid dividing by it.
            rate = n_scanned / dt if dt > 0 else float("inf")
            tt = nt + prev_dumped
            sys.stderr.write(
                f"found {tt} entities among {n_scanned} entities [entities/s: {rate:.2f}]\n"
            )
        if nt == block_size:
            prev_dumped += nt
            dump_block_of_taxa(authors, f"block-{prev_dumped}-entities-{out_fp}")
            authors = []

    # Flush whatever is left over after the last full block.
    prev_dumped += len(authors)
    sys.stderr.write(f"found {prev_dumped} entities among {n_scanned} entities\n")
    if authors:
        dump_block_of_taxa(authors, f"block-{prev_dumped}-entities-{out_fp}")
    sys.stderr.write("Done!\n")


if __name__ == "__main__":
    # Usage: <script> WIKIDATA_JSON_DUMP ID_LIST_FILE
    # Fail with a readable message instead of an IndexError when arguments
    # are missing; extra arguments are tolerated, as before.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} WIKIDATA_JSON_DUMP ID_LIST_FILE")
    sys.exit(main(inp_fp=sys.argv[1],
                  id_fp=sys.argv[2],
                  out_fp="au_entities.json"))
	#Tweaked example script from qwikidata
	import sys
	import time
	from qwikidata.entity import WikidataItem
	from qwikidata.json_dump import WikidataJsonDump
	from qwikidata.utils import dump_entities_to_json
	from subprocess import call

	def dump_block_of_taxa(taxa, out_fp):
	    """Write one block of entities to `out_fp` as JSON, then bzip2 that file.

	    Args:
	        taxa: sized collection of entities, handed unchanged to qwikidata's
	            `dump_entities_to_json`.
	        out_fp: path of the JSON file to write; the same path is then passed
	            to the external `bzip2` program.

	    Raises:
	        RuntimeError: if `bzip2` exits with a non-zero status.
	    """
	    sys.stderr.write(f"Dumping {len(taxa)} to {out_fp} ...\n")
	    dump_entities_to_json(taxa, out_fp)
	    # Compression is delegated to the bzip2 executable found on PATH.
	    retcode = call(["bzip2", out_fp])
	    if retcode != 0:
	        raise RuntimeError(f"bzip2 of {out_fp} failed.\n")

	def main(inp_fp, id_fp, out_fp):
	    """Extract the entities listed in `id_fp` from a Wikidata JSON dump.

	    Streams the dump at `inp_fp`, keeps every entity whose ``"id"`` appears in
	    the ID list, and hands the matches to `dump_block_of_taxa` in blocks of
	    1000 (plus one final, possibly smaller, block). Progress is written to
	    stderr.

	    Args:
	        inp_fp: path of the Wikidata JSON dump to scan.
	        id_fp: path of a text file holding one double-quoted entity ID per
	            line; blank lines are ignored.
	        out_fp: file-name suffix for the output blocks; each block is written
	            to ``block-<cumulative number found>-entities-<out_fp>``.

	    Raises:
	        ValueError: if a non-blank line of `id_fp` is not wrapped in double
	            quotes.
	    """
	    block_size = 1000    # matching entities per output block
	    report_every = 1000  # scanned entities between progress messages

	    wanted = set()
	    with open(id_fp, "r") as inp:
	        for line_num, line in enumerate(inp, start=1):
	            quoted = line.strip()
	            if not quoted:
	                continue
	            # Raise instead of assert: asserts are stripped under `python -O`,
	            # which would let a malformed line be silently truncated below.
	            well_formed = (
	                len(quoted) >= 2
	                and quoted.startswith('"')
	                and quoted.endswith('"')
	            )
	            if not well_formed:
	                raise ValueError(
	                    f"{id_fp}:{line_num}: expected a double-quoted ID, got {quoted!r}"
	                )
	            wanted.add(quoted[1:-1])
	    ids_to_save = frozenset(wanted)

	    # Honor the inp_fp argument (the original read sys.argv[1] here, so the
	    # parameter was ignored whenever main() was called from other code).
	    wjd = WikidataJsonDump(inp_fp)

	    # Collect the WikidataItem objects whose IDs were requested.
	    authors = []
	    prev_dumped = 0
	    n_scanned = 0  # stays 0 if the dump yields nothing, so the summary still works
	    t1 = time.time()
	    for n_scanned, entity_dict in enumerate(wjd, start=1):
	        if entity_dict["id"] in ids_to_save:
	            authors.append(WikidataItem(entity_dict))
	        nt = len(authors)
	        if n_scanned % report_every == 0:
	            dt = time.time() - t1
	            # A coarse clock can report zero elapsed time; avoid dividing by it.
	            rate = n_scanned / dt if dt > 0 else float("inf")
	            tt = nt + prev_dumped
	            sys.stderr.write(
	                f"found {tt} entities among {n_scanned} entities [entities/s: {rate:.2f}]\n"
	            )
	        if nt == block_size:
	            prev_dumped += nt
	            dump_block_of_taxa(authors, f"block-{prev_dumped}-entities-{out_fp}")
	            authors = []

	    # Flush whatever is left over after the last full block.
	    prev_dumped += len(authors)
	    sys.stderr.write(f"found {prev_dumped} entities among {n_scanned} entities\n")
	    if authors:
	        dump_block_of_taxa(authors, f"block-{prev_dumped}-entities-{out_fp}")
	    sys.stderr.write("Done!\n")


	if __name__ == "__main__":
	    # Usage: <script> WIKIDATA_JSON_DUMP ID_LIST_FILE
	    # Fail with a readable message instead of an IndexError when arguments
	    # are missing; extra arguments are tolerated.
	    if len(sys.argv) < 3:
	        sys.exit(f"usage: {sys.argv[0]} WIKIDATA_JSON_DUMP ID_LIST_FILE")
	    sys.exit(main(inp_fp=sys.argv[1],
	                  id_fp=sys.argv[2],
	                  out_fp="au_entities.json"))
No results found