Created
February 19, 2026 17:08
-
-
Save do-me/a3c68f9a63c7171a0d202cd0db23390a to your computer and use it in GitHub Desktop.
Download all EO metadata from WEkEO platform via API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # dependencies = [ | |
| # "requests", | |
| # "pandas", | |
| # "pyarrow", | |
| # "tqdm", | |
| # ] | |
| # /// | |
| import requests | |
| import pandas as pd | |
| from tqdm import tqdm | |
| import time | |
| import json | |
# --- CONFIGURATION ---
# Root of the WEkEO HDA (Harmonized Data Access) broker REST API, v1.
BASE_URL = "https://gateway.prod.wekeo2.eu/hda-broker/api/v1"
# Destination file; written with pandas + pyarrow at the end of main().
OUTPUT_FILE = "wekeo_datasets_metadata.parquet"
# If the API requires authentication, enter your Bearer token here.
# For many metadata operations on WEkEO, a token is required.
# NOTE: this is a placeholder value — replace it with a real token.
API_TOKEN = "your token"
def get_headers():
    """Build HTTP headers for WEkEO API calls.

    Returns a dict with an ``Accept: application/json`` header, plus an
    ``Authorization: Bearer …`` header when ``API_TOKEN`` has been set to
    a real value (i.e. is non-empty and not a known placeholder).

    Bug fix: the original only compared against the placeholder
    "YOUR_WEKEO_API_TOKEN_HERE", but the module's actual default is
    "your token", so the dummy string was being sent as a Bearer token.
    Both placeholders are now excluded.
    """
    headers = {"Accept": "application/json"}
    placeholders = {"your token", "YOUR_WEKEO_API_TOKEN_HERE"}
    if API_TOKEN and API_TOKEN not in placeholders:
        headers["Authorization"] = f"Bearer {API_TOKEN}"
    return headers
def fetch_all_dataset_ids():
    """Fetch all dataset IDs by iterating through paginated results.

    Pages through ``GET {BASE_URL}/datasets`` using ``startIndex`` /
    ``itemsPerPage`` until either an empty ``features`` list is returned
    or ``properties.totalResults`` has been reached.

    Returns:
        list[str]: every ``dataset_id`` found in the paginated listing.

    Raises:
        requests.HTTPError: on any non-2xx response (via raise_for_status).
    """
    dataset_ids = []
    start_index = 0
    items_per_page = 50  # Fetching in batches
    headers = get_headers()  # hoisted: identical for every page
    print("Fetching dataset list...")
    while True:
        params = {
            "startIndex": start_index,
            "itemsPerPage": items_per_page,
        }
        # timeout added so a stalled connection cannot hang the script forever
        response = requests.get(
            f"{BASE_URL}/datasets",
            params=params,
            headers=headers,
            timeout=60,
        )
        response.raise_for_status()
        data = response.json()
        features = data.get("features", [])
        if not features:
            break
        for item in features:
            if "dataset_id" in item:
                dataset_ids.append(item["dataset_id"])
        start_index += items_per_page
        # Optional: Break if we reached totalResults
        if start_index >= data.get("properties", {}).get("totalResults", 0):
            break
    print(f"Found {len(dataset_ids)} datasets.")
    return dataset_ids
def fetch_dataset_details(dataset_id):
    """Fetch full metadata for a specific dataset ID.

    Args:
        dataset_id: WEkEO dataset identifier, interpolated into the URL path.

    Returns:
        dict | None: the parsed JSON metadata on HTTP 200, otherwise None
        (errors are reported on stdout; this is deliberately best-effort so
        one bad dataset does not abort the whole crawl).
    """
    try:
        # timeout added so one stalled request cannot hang the crawl
        response = requests.get(
            f"{BASE_URL}/datasets/{dataset_id}",
            headers=get_headers(),
            timeout=60,
        )
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error fetching {dataset_id}: {response.status_code}")
            return None
    # Narrowed from bare Exception: requests.get and .json() raise
    # RequestException subclasses / ValueError; anything else is a bug
    # that should surface, not be swallowed.
    except (requests.RequestException, ValueError) as e:
        print(f"Exception for {dataset_id}: {e}")
        return None
def main():
    """Download metadata for every WEkEO dataset and save it as Parquet.

    Steps: list all dataset IDs, fetch each dataset's full metadata,
    flatten the JSON records into a DataFrame (stringifying nested
    lists/dicts so pyarrow can serialize them), and write OUTPUT_FILE.
    """
    # 1. Get IDs
    ids = fetch_all_dataset_ids()
    # 2. Loop through individual datasets
    detailed_data = []
    print("Fetching detailed metadata for each dataset...")
    # Bug fix: the original iterated ids[:5] — a leftover debug slice that
    # limited the crawl to the first 5 datasets. Iterate everything.
    for d_id in tqdm(ids):
        details = fetch_dataset_details(d_id)
        if details:
            detailed_data.append(details)
        # Small sleep to be polite to the API
        time.sleep(1.1)
    if not detailed_data:
        print("No data collected. Check your API Token or Connection.")
        return
    # 3. Process and Flatten
    df = pd.json_normalize(detailed_data)
    # --- FIX: Sanitize columns for Parquet ---
    # Parquet cannot store Python list/dict objects directly, so any column
    # containing them is converted to JSON strings before writing.
    for col in df.columns:
        # Check if the column has any list or dict objects
        is_complex = df[col].apply(lambda x: isinstance(x, (list, dict))).any()
        if is_complex:
            df[col] = df[col].apply(
                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x
            )
    # -----------------------------------------
    # 4. Save to Parquet
    print(f"Saving {len(df)} records to {OUTPUT_FILE}...")
    df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
    print("Done!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment