Created
February 10, 2025 20:26
-
-
Save grujdin/4821030a8b69b16fd7ad46dcc4fe54e4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import xml.etree.ElementTree as ET | |
| ############################################################################### | |
| # 1) Define file paths | |
| ############################################################################### | |
| excel_file_path = 'D:/ProjDB/GAUL/public_emdat_GDIS_GAUL_aligned.xlsx' | |
| # GAUL Level-2 XML (for FID_2) | |
| xml_file_path_2 = 'D:/ProjDB/GAUL/g2015_2014_2_GeomExtract.xml' | |
| # GAUL Level-1 XML (for FID_1) | |
| xml_file_path_1 = 'D:/ProjDB/GAUL/g2015_2014_1_GeomExtract.xml' | |
| # Final single output (with both FID_1 and FID_2 columns) | |
| output_file_path = 'D:/ProjDB/GAUL/public_emdat_GDIS_GAUL_FIDs.xlsx' | |
| ############################################################################### | |
| # 2) Load Excel file into DataFrame | |
| ############################################################################### | |
| df = pd.read_excel(excel_file_path) | |
| ############################################################################### | |
| # 3) Register XML namespaces (required for XPath lookups) | |
| ############################################################################### | |
| namespaces = { | |
| 'gml': 'http://www.opengis.net/gml', | |
| 'gaul': 'http://www.fao.org/tempref/AG/Reserved/PPLPF/ftpOUT/GLiPHA/Gaulmaps/gaul_2008/documentation/GAUL%20Doc01%20Ver16.pdf', | |
| 'wfs': 'http://www.opengis.net/wfs', | |
| 'xs': 'http://www.w3.org/2001/XMLSchema', | |
| 'xsi': 'http://www.w3.org/2001/XMLSchema-instance' | |
| } | |
| for prefix, uri in namespaces.items(): | |
| ET.register_namespace(prefix, uri) | |
| ############################################################################### | |
| # 4) Extract FID_2 from GAUL Level-2 XML | |
| ############################################################################### | |
| tree_2 = ET.parse(xml_file_path_2) | |
| root_2 = tree_2.getroot() | |
| fids_2 = [] | |
| for _, row in df.iterrows(): | |
| fid_2 = None # Default if no match found | |
| # Check that row has adm2_code and adm2_name | |
| if pd.notnull(row.get('adm2_code')) and pd.notnull(row.get('adm2_name')): | |
| # Convert to string and escape any double quotes | |
| escaped_adm2_code = str(row['adm2_code']).replace('"', '\\"') | |
| escaped_adm2_name = str(row['adm2_name']).replace('"', '\\"') | |
| # Try matching by adm2_code first (using double quotes in the XPath) | |
| xpath_query = f'.//gaul:adm2_code[.="{escaped_adm2_code}"]/..' | |
| matching_features = root_2.findall(xpath_query, namespaces=namespaces) | |
| # If no match, try adm2_name | |
| if not matching_features: | |
| xpath_query = f'.//gaul:adm2_name[.="{escaped_adm2_name}"]/..' | |
| matching_features = root_2.findall(xpath_query, namespaces=namespaces) | |
| # If we have matches, take the first one's gaul:FID | |
| for feature in matching_features: | |
| fid_element = feature.find('.//gaul:FID', namespaces=namespaces) | |
| if fid_element is not None: | |
| fid_2 = fid_element.text | |
| break # Only the first match | |
| fids_2.append(fid_2) | |
| df['FID_2'] = fids_2 # Add column for FID_2 | |
| ############################################################################### | |
| # 5) Extract FID_1 from GAUL Level-1 XML | |
| ############################################################################### | |
| tree_1 = ET.parse(xml_file_path_1) | |
| root_1 = tree_1.getroot() | |
| fids_1 = [] | |
| for _, row in df.iterrows(): | |
| fid_1 = None # Default if no match found | |
| # Check that row has adm1_code and adm1_name | |
| if pd.notnull(row.get('adm1_code')) and pd.notnull(row.get('adm1_name')): | |
| # Convert to string and escape any double quotes | |
| escaped_adm1_code = str(row['adm1_code']).replace('"', '\\"') | |
| escaped_adm1_name = str(row['adm1_name']).replace('"', '\\"') | |
| # Try matching by adm1_code first | |
| xpath_query = f'.//gaul:adm1_code[.="{escaped_adm1_code}"]/..' | |
| matching_features = root_1.findall(xpath_query, namespaces=namespaces) | |
| # If no match, try adm1_name | |
| if not matching_features: | |
| xpath_query = f'.//gaul:adm1_name[.="{escaped_adm1_name}"]/..' | |
| matching_features = root_1.findall(xpath_query, namespaces=namespaces) | |
| # If we have matches, take the first one's gaul:FID | |
| for feature in matching_features: | |
| fid_element = feature.find('.//gaul:FID', namespaces=namespaces) | |
| if fid_element is not None: | |
| fid_1 = fid_element.text | |
| break # Only the first match | |
| fids_1.append(fid_1) | |
| df['FID_1'] = fids_1 # Add column for FID_1 | |
| ############################################################################### | |
| # 6) Save the final DataFrame with both FID_1 and FID_2 columns | |
| ############################################################################### | |
| df.to_excel(output_file_path, index=False) | |
| print(f"Final file saved with 'FID_1' and 'FID_2' columns: {output_file_path}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment