Created
October 28, 2022 15:49
-
-
Save richard-orr/499a580b04fb5904ccc12f8031fa3b95 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import csv | |
| import glob | |
| import gzip | |
| import json | |
| import os | |
| SNAPSHOT_DIR = 'openalex-snapshot' | |
| CSV_DIR = 'csv-files' | |
| FILES_PER_ENTITY = int(os.environ.get('OPENALEX_DEMO_FILES_PER_ENTITY', '0')) | |
| csv_files = { | |
| 'institutions': { | |
| 'institutions': { | |
| 'name': os.path.join(CSV_DIR, 'institutions.csv.gz'), | |
| 'columns': [ | |
| 'id', 'ror', 'display_name', 'country_code', 'type', 'homepage_url', 'image_url', 'image_thumbnail_url', | |
| 'display_name_acroynyms', 'display_name_alternatives', 'works_count', 'cited_by_count', 'works_api_url', | |
| 'updated_date' | |
| ] | |
| }, | |
| 'ids': { | |
| 'name': os.path.join(CSV_DIR, 'institutions_ids.csv.gz'), | |
| 'columns': [ | |
| 'institution_id', 'openalex', 'ror', 'grid', 'wikipedia', 'wikidata', 'mag' | |
| ] | |
| }, | |
| 'geo': { | |
| 'name': os.path.join(CSV_DIR, 'institutions_geo.csv.gz'), | |
| 'columns': [ | |
| 'institution_id', 'city', 'geonames_city_id', 'region', 'country_code', 'country', 'latitude', | |
| 'longitude' | |
| ] | |
| }, | |
| 'associated_institutions': { | |
| 'name': os.path.join(CSV_DIR, 'institutions_associated_institutions.csv.gz'), | |
| 'columns': [ | |
| 'institution_id', 'associated_institution_id', 'relationship' | |
| ] | |
| }, | |
| 'counts_by_year': { | |
| 'name': os.path.join(CSV_DIR, 'institutions_counts_by_year.csv.gz'), | |
| 'columns': [ | |
| 'institution_id', 'year', 'works_count', 'cited_by_count' | |
| ] | |
| } | |
| }, | |
| 'authors': { | |
| 'authors': { | |
| 'name': os.path.join(CSV_DIR, 'authors.csv.gz'), | |
| 'columns': [ | |
| 'id', 'orcid', 'display_name', 'display_name_alternatives', 'works_count', 'cited_by_count', | |
| 'last_known_institution', 'works_api_url', 'updated_date' | |
| ] | |
| }, | |
| 'ids': { | |
| 'name': os.path.join(CSV_DIR, 'authors_ids.csv.gz'), | |
| 'columns': [ | |
| 'author_id', 'openalex', 'orcid', 'scopus', 'twitter', 'wikipedia', 'mag' | |
| ] | |
| }, | |
| 'counts_by_year': { | |
| 'name': os.path.join(CSV_DIR, 'authors_counts_by_year.csv.gz'), | |
| 'columns': [ | |
| 'author_id', 'year', 'works_count', 'cited_by_count' | |
| ] | |
| } | |
| }, | |
| 'concepts': { | |
| 'concepts': { | |
| 'name': os.path.join(CSV_DIR, 'concepts.csv.gz'), | |
| 'columns': [ | |
| 'id', 'wikidata', 'display_name', 'level', 'description', 'works_count', 'cited_by_count', 'image_url', | |
| 'image_thumbnail_url', 'works_api_url', 'updated_date' | |
| ] | |
| }, | |
| 'ancestors': { | |
| 'name': os.path.join(CSV_DIR, 'concepts_ancestors.csv.gz'), | |
| 'columns': ['concept_id', 'ancestor_id'] | |
| }, | |
| 'counts_by_year': { | |
| 'name': os.path.join(CSV_DIR, 'concepts_counts_by_year.csv.gz'), | |
| 'columns': ['concept_id', 'year', 'works_count', 'cited_by_count'] | |
| }, | |
| 'ids': { | |
| 'name': os.path.join(CSV_DIR, 'concepts_ids.csv.gz'), | |
| 'columns': ['concept_id', 'openalex', 'wikidata', 'wikipedia', 'umls_aui', 'umls_cui', 'mag'] | |
| }, | |
| 'related_concepts': { | |
| 'name': os.path.join(CSV_DIR, 'concepts_related_concepts.csv.gz'), | |
| 'columns': ['concept_id', 'related_concept_id', 'score'] | |
| } | |
| }, | |
| 'venues': { | |
| 'venues': { | |
| 'name': os.path.join(CSV_DIR, 'venues.csv.gz'), | |
| 'columns': [ | |
| 'id', 'issn_l', 'issn', 'display_name', 'publisher', 'works_count', 'cited_by_count', 'is_oa', | |
| 'is_in_doaj', 'homepage_url', 'works_api_url', 'updated_date' | |
| ] | |
| }, | |
| 'ids': { | |
| 'name': os.path.join(CSV_DIR, 'venues_ids.csv.gz'), | |
| 'columns': ['venue_id', 'openalex', 'issn_l', 'issn', 'mag'] | |
| }, | |
| 'counts_by_year': { | |
| 'name': os.path.join(CSV_DIR, 'venues_counts_by_year.csv.gz'), | |
| 'columns': ['venue_id', 'year', 'works_count', 'cited_by_count'] | |
| }, | |
| }, | |
| 'works': { | |
| 'works': { | |
| 'name': os.path.join(CSV_DIR, 'works.csv.gz'), | |
| 'columns': [ | |
| 'id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'type', 'cited_by_count', | |
| 'is_retracted', 'is_paratext', 'cited_by_api_url', 'abstract_inverted_index' | |
| ] | |
| }, | |
| 'host_venues': { | |
| 'name': os.path.join(CSV_DIR, 'works_host_venues.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'venue_id', 'url', 'is_oa', 'version', 'license' | |
| ] | |
| }, | |
| 'alternate_host_venues': { | |
| 'name': os.path.join(CSV_DIR, 'works_alternate_host_venues.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'venue_id', 'url', 'is_oa', 'version', 'license' | |
| ] | |
| }, | |
| 'authorships': { | |
| 'name': os.path.join(CSV_DIR, 'works_authorships.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'author_position', 'author_id', 'institution_id', 'raw_affiliation_string' | |
| ] | |
| }, | |
| 'biblio': { | |
| 'name': os.path.join(CSV_DIR, 'works_biblio.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'volume', 'issue', 'first_page', 'last_page' | |
| ] | |
| }, | |
| 'concepts': { | |
| 'name': os.path.join(CSV_DIR, 'works_concepts.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'concept_id', 'score' | |
| ] | |
| }, | |
| 'ids': { | |
| 'name': os.path.join(CSV_DIR, 'works_ids.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'openalex', 'doi', 'mag', 'pmid', 'pmcid' | |
| ] | |
| }, | |
| 'mesh': { | |
| 'name': os.path.join(CSV_DIR, 'works_mesh.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'descriptor_ui', 'descriptor_name', 'qualifier_ui', 'qualifier_name', 'is_major_topic' | |
| ] | |
| }, | |
| 'open_access': { | |
| 'name': os.path.join(CSV_DIR, 'works_open_access.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'is_oa', 'oa_status', 'oa_url' | |
| ] | |
| }, | |
| 'referenced_works': { | |
| 'name': os.path.join(CSV_DIR, 'works_referenced_works.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'referenced_work_id' | |
| ] | |
| }, | |
| 'related_works': { | |
| 'name': os.path.join(CSV_DIR, 'works_related_works.csv.gz'), | |
| 'columns': [ | |
| 'work_id', 'related_work_id' | |
| ] | |
| }, | |
| }, | |
| } | |
| def flatten_concepts(): | |
| with gzip.open(csv_files['concepts']['concepts']['name'], 'wt', encoding='utf-8') as concepts_csv, \ | |
| gzip.open(csv_files['concepts']['ancestors']['name'], 'wt', encoding='utf-8') as ancestors_csv, \ | |
| gzip.open(csv_files['concepts']['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv, \ | |
| gzip.open(csv_files['concepts']['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \ | |
| gzip.open(csv_files['concepts']['related_concepts']['name'], 'wt', encoding='utf-8') as related_concepts_csv: | |
| concepts_writer = csv.DictWriter( | |
| concepts_csv, fieldnames=csv_files['concepts']['concepts']['columns'], extrasaction='ignore' | |
| ) | |
| concepts_writer.writeheader() | |
| ancestors_writer = csv.DictWriter(ancestors_csv, fieldnames=csv_files['concepts']['ancestors']['columns']) | |
| ancestors_writer.writeheader() | |
| counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=csv_files['concepts']['counts_by_year']['columns']) | |
| counts_by_year_writer.writeheader() | |
| ids_writer = csv.DictWriter(ids_csv, fieldnames=csv_files['concepts']['ids']['columns']) | |
| ids_writer.writeheader() | |
| related_concepts_writer = csv.DictWriter(related_concepts_csv, fieldnames=csv_files['concepts']['related_concepts']['columns']) | |
| related_concepts_writer.writeheader() | |
| seen_concept_ids = set() | |
| files_done = 0 | |
| for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'concepts', '*', '*.gz')): | |
| print(jsonl_file_name) | |
| with gzip.open(jsonl_file_name, 'r') as concepts_jsonl: | |
| for concept_json in concepts_jsonl: | |
| if not concept_json.strip(): | |
| continue | |
| concept = json.loads(concept_json) | |
| concept_id = concept.get('id') | |
| if not concept_id or concept_id in seen_concept_ids: | |
| continue | |
| seen_concept_ids.add(concept_id) | |
| concepts_writer.writerow(concept) | |
| concept_ids = concept.get('ids') | |
| if concept_ids: | |
| concept_ids['concept_id'] = concept_id | |
| concept_ids['umls_aui'] = json.dumps(concept_ids.get('umls_aui'), ensure_ascii=False) | |
| concept_ids['umls_cui'] = json.dumps(concept_ids.get('umls_cui'), ensure_ascii=False) | |
| ids_writer.writerow(concept_ids) | |
| ancestors = concept.get('ancestors') | |
| if ancestors: | |
| for ancestor in ancestors: | |
| ancestor_id = ancestor.get('id') | |
| if ancestor_id: | |
| ancestors_writer.writerow({ | |
| 'concept_id': concept_id, | |
| 'ancestor_id': ancestor_id | |
| }) | |
| counts_by_year = concept.get('counts_by_year') | |
| if counts_by_year: | |
| for count_by_year in counts_by_year: | |
| count_by_year['concept_id'] = concept_id | |
| counts_by_year_writer.writerow(count_by_year) | |
| related_concepts = concept.get('related_concepts') | |
| if related_concepts: | |
| for related_concept in related_concepts: | |
| related_concept_id = related_concept.get('id') | |
| if related_concept_id: | |
| related_concepts_writer.writerow({ | |
| 'concept_id': concept_id, | |
| 'related_concept_id': related_concept_id, | |
| 'score': related_concept.get('score') | |
| }) | |
| files_done += 1 | |
| if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY: | |
| break | |
| def flatten_venues(): | |
| with gzip.open(csv_files['venues']['venues']['name'], 'wt', encoding='utf-8') as venues_csv, \ | |
| gzip.open(csv_files['venues']['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \ | |
| gzip.open(csv_files['venues']['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv: | |
| venues_writer = csv.DictWriter( | |
| venues_csv, fieldnames=csv_files['venues']['venues']['columns'], extrasaction='ignore' | |
| ) | |
| venues_writer.writeheader() | |
| ids_writer = csv.DictWriter(ids_csv, fieldnames=csv_files['venues']['ids']['columns']) | |
| ids_writer.writeheader() | |
| counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=csv_files['venues']['counts_by_year']['columns']) | |
| counts_by_year_writer.writeheader() | |
| seen_venue_ids = set() | |
| files_done = 0 | |
| for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'venues', '*', '*.gz')): | |
| print(jsonl_file_name) | |
| with gzip.open(jsonl_file_name, 'r') as venues_jsonl: | |
| for venue_json in venues_jsonl: | |
| if not venue_json.strip(): | |
| continue | |
| venue = json.loads(venue_json) | |
| venue_id = venue.get('id') | |
| if not venue_id or venue_id in seen_venue_ids: | |
| continue | |
| seen_venue_ids.add(venue_id) | |
| venue['issn'] = json.dumps(venue.get('issn')) | |
| venues_writer.writerow(venue) | |
| venue_ids = venue.get('ids') | |
| if venue_ids: | |
| venue_ids['venue_id'] = venue_id | |
| venue_ids['issn'] = json.dumps(venue_ids.get('issn')) | |
| ids_writer.writerow(venue_ids) | |
| counts_by_year = venue.get('counts_by_year') | |
| if counts_by_year: | |
| for count_by_year in counts_by_year: | |
| count_by_year['venue_id'] = venue_id | |
| counts_by_year_writer.writerow(count_by_year) | |
| files_done += 1 | |
| if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY: | |
| break | |
| def flatten_institutions(): | |
| file_spec = csv_files['institutions'] | |
| with gzip.open(file_spec['institutions']['name'], 'wt', encoding='utf-8') as institutions_csv, \ | |
| gzip.open(file_spec['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \ | |
| gzip.open(file_spec['geo']['name'], 'wt', encoding='utf-8') as geo_csv, \ | |
| gzip.open(file_spec['associated_institutions']['name'], 'wt', encoding='utf-8') as associated_institutions_csv, \ | |
| gzip.open(file_spec['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv: | |
| institutions_writer = csv.DictWriter( | |
| institutions_csv, fieldnames=file_spec['institutions']['columns'], extrasaction='ignore' | |
| ) | |
| institutions_writer.writeheader() | |
| ids_writer = csv.DictWriter(ids_csv, fieldnames=file_spec['ids']['columns']) | |
| ids_writer.writeheader() | |
| geo_writer = csv.DictWriter(geo_csv, fieldnames=file_spec['geo']['columns']) | |
| geo_writer.writeheader() | |
| associated_institutions_writer = csv.DictWriter( | |
| associated_institutions_csv, fieldnames=file_spec['associated_institutions']['columns'] | |
| ) | |
| associated_institutions_writer.writeheader() | |
| counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=file_spec['counts_by_year']['columns']) | |
| counts_by_year_writer.writeheader() | |
| seen_institution_ids = set() | |
| files_done = 0 | |
| for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'institutions', '*', '*.gz')): | |
| print(jsonl_file_name) | |
| with gzip.open(jsonl_file_name, 'r') as institutions_jsonl: | |
| for institution_json in institutions_jsonl: | |
| if not institution_json.strip(): | |
| continue | |
| institution = json.loads(institution_json) | |
| institution_id = institution.get('id') | |
| if not institution_id or institution_id in seen_institution_ids: | |
| continue | |
| seen_institution_ids.add(institution_id) | |
| # institutions | |
| institution['display_name_acroynyms'] = json.dumps(institution.get('display_name_acroynyms'), ensure_ascii=False) | |
| institution['display_name_alternatives'] = json.dumps(institution.get('display_name_alternatives'), ensure_ascii=False) | |
| institutions_writer.writerow(institution) | |
| # ids | |
| institution_ids = institution.get('ids') | |
| if institution_ids: | |
| institution_ids['institution_id'] = institution_id | |
| ids_writer.writerow(institution_ids) | |
| # geo | |
| institution_geo = institution.get('geo') | |
| if institution_geo: | |
| institution_geo['institution_id'] = institution_id | |
| geo_writer.writerow(institution_geo) | |
| # associated_institutions | |
| associated_institutions = institution.get( | |
| 'associated_institutions', institution.get('associated_insitutions') # typo in api | |
| ) | |
| if associated_institutions: | |
| for associated_institution in associated_institutions: | |
| associated_institution_id = associated_institution.get('id') | |
| if associated_institution_id: | |
| associated_institutions_writer.writerow({ | |
| 'institution_id': institution_id, | |
| 'associated_institution_id': associated_institution_id, | |
| 'relationship': associated_institution.get('relationship') | |
| }) | |
| # counts_by_year | |
| counts_by_year = institution.get('counts_by_year') | |
| if counts_by_year: | |
| for count_by_year in counts_by_year: | |
| count_by_year['institution_id'] = institution_id | |
| counts_by_year_writer.writerow(count_by_year) | |
| files_done += 1 | |
| if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY: | |
| break | |
| def flatten_authors(): | |
| file_spec = csv_files['authors'] | |
| with gzip.open(file_spec['authors']['name'], 'wt', encoding='utf-8') as authors_csv, \ | |
| gzip.open(file_spec['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \ | |
| gzip.open(file_spec['counts_by_year']['name'], 'wt', encoding='utf-8') as counts_by_year_csv: | |
| authors_writer = csv.DictWriter( | |
| authors_csv, fieldnames=file_spec['authors']['columns'], extrasaction='ignore' | |
| ) | |
| authors_writer.writeheader() | |
| ids_writer = csv.DictWriter(ids_csv, fieldnames=file_spec['ids']['columns']) | |
| ids_writer.writeheader() | |
| counts_by_year_writer = csv.DictWriter(counts_by_year_csv, fieldnames=file_spec['counts_by_year']['columns']) | |
| counts_by_year_writer.writeheader() | |
| files_done = 0 | |
| for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'authors', '*', '*.gz')): | |
| print(jsonl_file_name) | |
| with gzip.open(jsonl_file_name, 'r') as authors_jsonl: | |
| for author_json in authors_jsonl: | |
| if not author_json.strip(): | |
| continue | |
| author = json.loads(author_json) | |
| author_id = author.get('id') | |
| if not author_id: | |
| continue | |
| # authors | |
| author['display_name_alternatives'] = json.dumps(author.get('display_name_alternatives'), ensure_ascii=False) | |
| author['last_known_institution'] = (author.get('last_known_institution') or {}).get('id') | |
| authors_writer.writerow(author) | |
| # ids | |
| author_ids = author.get('ids') | |
| if author_ids: | |
| author_ids['author_id'] = author_id | |
| ids_writer.writerow(author_ids) | |
| # counts_by_year | |
| counts_by_year = author.get('counts_by_year') | |
| if counts_by_year: | |
| for count_by_year in counts_by_year: | |
| count_by_year['author_id'] = author_id | |
| counts_by_year_writer.writerow(count_by_year) | |
| files_done += 1 | |
| if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY: | |
| break | |
| def flatten_works(): | |
| file_spec = csv_files['works'] | |
| with gzip.open(file_spec['works']['name'], 'wt', encoding='utf-8') as works_csv, \ | |
| gzip.open(file_spec['host_venues']['name'], 'wt', encoding='utf-8') as host_venues_csv, \ | |
| gzip.open(file_spec['alternate_host_venues']['name'], 'wt', encoding='utf-8') as alternate_host_venues_csv, \ | |
| gzip.open(file_spec['authorships']['name'], 'wt', encoding='utf-8') as authorships_csv, \ | |
| gzip.open(file_spec['biblio']['name'], 'wt', encoding='utf-8') as biblio_csv, \ | |
| gzip.open(file_spec['concepts']['name'], 'wt', encoding='utf-8') as concepts_csv, \ | |
| gzip.open(file_spec['ids']['name'], 'wt', encoding='utf-8') as ids_csv, \ | |
| gzip.open(file_spec['mesh']['name'], 'wt', encoding='utf-8') as mesh_csv, \ | |
| gzip.open(file_spec['open_access']['name'], 'wt', encoding='utf-8') as open_access_csv, \ | |
| gzip.open(file_spec['referenced_works']['name'], 'wt', encoding='utf-8') as referenced_works_csv, \ | |
| gzip.open(file_spec['related_works']['name'], 'wt', encoding='utf-8') as related_works_csv: | |
| works_writer = init_dict_writer(works_csv, file_spec['works'], extrasaction='ignore') | |
| host_venues_writer = init_dict_writer(host_venues_csv, file_spec['host_venues']) | |
| alternate_host_venues_writer = init_dict_writer(alternate_host_venues_csv, file_spec['alternate_host_venues']) | |
| authorships_writer = init_dict_writer(authorships_csv, file_spec['authorships']) | |
| biblio_writer = init_dict_writer(biblio_csv, file_spec['biblio']) | |
| concepts_writer = init_dict_writer(concepts_csv, file_spec['concepts']) | |
| ids_writer = init_dict_writer(ids_csv, file_spec['ids'], extrasaction='ignore') | |
| mesh_writer = init_dict_writer(mesh_csv, file_spec['mesh']) | |
| open_access_writer = init_dict_writer(open_access_csv, file_spec['open_access']) | |
| referenced_works_writer = init_dict_writer(referenced_works_csv, file_spec['referenced_works']) | |
| related_works_writer = init_dict_writer(related_works_csv, file_spec['related_works']) | |
| files_done = 0 | |
| for jsonl_file_name in glob.glob(os.path.join(SNAPSHOT_DIR, 'data', 'works', '*', '*.gz')): | |
| print(jsonl_file_name) | |
| with gzip.open(jsonl_file_name, 'r') as works_jsonl: | |
| for work_json in works_jsonl: | |
| if not work_json.strip(): | |
| continue | |
| work = json.loads(work_json) | |
| work_id = work.get('id') | |
| if not work_id: | |
| continue | |
| # works | |
| abstract = work.get('abstract_inverted_index') | |
| if abstract is not None: | |
| work['abstract_inverted_index'] = json.dumps(abstract, ensure_ascii=False) | |
| works_writer.writerow(work) | |
| # host_venues | |
| host_venue = work.get('host_venue') or {} | |
| if host_venue: | |
| host_venue_id = host_venue.get('id') | |
| if host_venue_id: | |
| host_venues_writer.writerow({ | |
| 'work_id': work_id, | |
| 'venue_id': host_venue_id, | |
| 'url': host_venue.get('url'), | |
| 'is_oa': host_venue.get('is_oa'), | |
| 'version': host_venue.get('version'), | |
| 'license': host_venue.get('license'), | |
| }) | |
| # alternate_host_venues | |
| alternate_host_venues = work.get('alternate_host_venues') | |
| if alternate_host_venues: | |
| for alternate_host_venue in alternate_host_venues: | |
| venue_id = alternate_host_venue.get('id') | |
| if venue_id: | |
| alternate_host_venues_writer.writerow({ | |
| 'work_id': work_id, | |
| 'venue_id': venue_id, | |
| 'url': alternate_host_venue.get('url'), | |
| 'is_oa': alternate_host_venue.get('is_oa'), | |
| 'version': alternate_host_venue.get('version'), | |
| 'license': alternate_host_venue.get('license'), | |
| }) | |
| # authorships | |
| authorships = work.get('authorships') | |
| if authorships: | |
| for authorship in authorships: | |
| author_id = authorship.get('author', {}).get('id') | |
| if author_id: | |
| institutions = authorship.get('institutions') | |
| institution_ids = [i.get('id') for i in institutions] | |
| institution_ids = [i for i in institution_ids if i] | |
| institution_ids = institution_ids or [None] | |
| for institution_id in institution_ids: | |
| authorships_writer.writerow({ | |
| 'work_id': work_id, | |
| 'author_position': authorship.get('author_position'), | |
| 'author_id': author_id, | |
| 'institution_id': institution_id, | |
| 'raw_affiliation_string': authorship.get('raw_affiliation_string'), | |
| }) | |
| # biblio | |
| biblio = work.get('biblio') | |
| if biblio: | |
| biblio['work_id'] = work_id | |
| biblio_writer.writerow(biblio) | |
| # concepts | |
| for concept in work.get('concepts'): | |
| concept_id = concept.get('id') | |
| if concept_id: | |
| concepts_writer.writerow({ | |
| 'work_id': work_id, | |
| 'concept_id': concept_id, | |
| 'score': concept.get('score'), | |
| }) | |
| # ids | |
| ids = work.get('ids') | |
| if ids: | |
| ids['work_id'] = work_id | |
| ids_writer.writerow(ids) | |
| # mesh | |
| for mesh in work.get('mesh'): | |
| mesh['work_id'] = work_id | |
| mesh_writer.writerow(mesh) | |
| # open_access | |
| open_access = work.get('open_access') | |
| if open_access: | |
| open_access['work_id'] = work_id | |
| open_access_writer.writerow(open_access) | |
| # referenced_works | |
| for referenced_work in work.get('referenced_works'): | |
| if referenced_work: | |
| referenced_works_writer.writerow({ | |
| 'work_id': work_id, | |
| 'referenced_work_id': referenced_work | |
| }) | |
| # related_works | |
| for related_work in work.get('related_works'): | |
| if related_work: | |
| related_works_writer.writerow({ | |
| 'work_id': work_id, | |
| 'related_work_id': related_work | |
| }) | |
| files_done += 1 | |
| if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY: | |
| break | |
| def init_dict_writer(csv_file, file_spec, **kwargs): | |
| writer = csv.DictWriter( | |
| csv_file, fieldnames=file_spec['columns'], **kwargs | |
| ) | |
| writer.writeheader() | |
| return writer | |
| if __name__ == '__main__': | |
| flatten_concepts() | |
| flatten_venues() | |
| flatten_institutions() | |
| flatten_authors() | |
| flatten_works() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment