Skip to content

Instantly share code, notes, and snippets.

@do-me
Created February 19, 2026 17:08
Show Gist options
  • Select an option

  • Save do-me/a3c68f9a63c7171a0d202cd0db23390a to your computer and use it in GitHub Desktop.

Select an option

Save do-me/a3c68f9a63c7171a0d202cd0db23390a to your computer and use it in GitHub Desktop.
Download all EO metadata from WEkEO platform via API
# /// script
# dependencies = [
# "requests",
# "pandas",
# "pyarrow",
# "tqdm",
# ]
# ///
import requests
import pandas as pd
from tqdm import tqdm
import time
import json
# --- CONFIGURATION ---
# Root endpoint of the WEkEO HDA (Harmonized Data Access) broker API.
BASE_URL = "https://gateway.prod.wekeo2.eu/hda-broker/api/v1"
# Destination file for the flattened metadata table.
OUTPUT_FILE = "wekeo_datasets_metadata.parquet"
# If the API requires authentication, enter your Bearer token here.
# For many metadata operations on WEkEO, a token is required.
# NOTE(review): get_headers() below compares against the string
# "YOUR_WEKEO_API_TOKEN_HERE", not this "your token" placeholder — the two
# should agree, otherwise the placeholder is sent as a real Bearer token.
API_TOKEN = "your token"
def get_headers(token=None):
    """Build the HTTP headers for WEkEO API requests.

    Args:
        token: Bearer token to use. Defaults to the module-level
            ``API_TOKEN`` when not provided (backward compatible).

    Returns:
        dict: ``Accept`` header plus, when a real (non-placeholder) token
        is set, an ``Authorization: Bearer ...`` header.
    """
    if token is None:
        token = API_TOKEN
    headers = {"Accept": "application/json"}
    # Bug fix: the original checked against "YOUR_WEKEO_API_TOKEN_HERE" while
    # the configured placeholder is "your token", so the placeholder string was
    # sent as a Bearer token. Treat both placeholder spellings as "no token".
    if token and token not in ("your token", "YOUR_WEKEO_API_TOKEN_HERE"):
        headers["Authorization"] = f"Bearer {token}"
    return headers
def fetch_all_dataset_ids():
    """Fetch all dataset IDs by iterating through paginated results.

    Pages through ``GET /datasets`` with ``startIndex``/``itemsPerPage``
    until a page comes back empty or ``totalResults`` is exceeded.

    Returns:
        list: every ``dataset_id`` reported by the broker.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    dataset_ids = []
    start_index = 0
    items_per_page = 50  # Fetching in batches
    print("Fetching dataset list...")
    while True:
        params = {
            "startIndex": start_index,
            "itemsPerPage": items_per_page,
        }
        # Fix: a timeout prevents the script from hanging indefinitely on a
        # stalled connection (requests has no default timeout).
        response = requests.get(
            f"{BASE_URL}/datasets",
            params=params,
            headers=get_headers(),
            timeout=60,
        )
        response.raise_for_status()
        data = response.json()
        features = data.get("features", [])
        if not features:
            break
        for item in features:
            if "dataset_id" in item:
                dataset_ids.append(item["dataset_id"])
        start_index += items_per_page
        # Optional: stop once we've paged past totalResults (when reported).
        if start_index >= data.get("properties", {}).get("totalResults", 0):
            break
    print(f"Found {len(dataset_ids)} datasets.")
    return dataset_ids
def fetch_dataset_details(dataset_id):
    """Fetch full metadata for a specific dataset ID.

    Args:
        dataset_id: the WEkEO dataset identifier to look up.

    Returns:
        dict | None: parsed JSON metadata on success, or ``None`` on any
        HTTP error status, network failure, or unparsable response body.
    """
    try:
        # Fix: add a timeout so a stalled connection cannot hang the run.
        response = requests.get(
            f"{BASE_URL}/datasets/{dataset_id}",
            headers=get_headers(),
            timeout=60,
        )
        if response.status_code == 200:
            return response.json()
        print(f"Error fetching {dataset_id}: {response.status_code}")
        return None
    except (requests.RequestException, ValueError) as e:
        # Narrowed from a bare Exception: only network errors and JSON
        # decode failures (ValueError) are expected here; anything else
        # is a bug that should surface.
        print(f"Exception for {dataset_id}: {e}")
        return None
def _stringify_complex_columns(df):
    """JSON-encode list/dict cells in place so pyarrow can write the frame.

    Parquet cannot store mixed Python objects; any column containing a
    list or dict is converted to its JSON string representation.
    """
    for col in df.columns:
        # Check if the column has any list or dict objects.
        has_complex = df[col].apply(lambda x: isinstance(x, (list, dict))).any()
        if has_complex:
            df[col] = df[col].apply(
                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x
            )


def main(limit=None):
    """Download metadata for WEkEO datasets and save it as a Parquet file.

    Args:
        limit: optional cap on how many datasets to fetch (useful for a
            quick test run); ``None`` (the default) fetches everything.
    """
    # 1. Get IDs.
    ids = fetch_all_dataset_ids()
    # Fix: the original iterated ids[:5] — a debug leftover that silently
    # limited the run to five datasets, contradicting the script's purpose
    # of downloading ALL metadata. The cap is now an explicit parameter.
    if limit is not None:
        ids = ids[:limit]
    # 2. Loop through individual datasets.
    detailed_data = []
    print("Fetching detailed metadata for each dataset...")
    for d_id in tqdm(ids):
        details = fetch_dataset_details(d_id)
        if details:
            detailed_data.append(details)
        # Small sleep to be polite to the API.
        time.sleep(1.1)
    if not detailed_data:
        print("No data collected. Check your API Token or Connection.")
        return
    # 3. Process and flatten the nested JSON, then sanitize for Parquet.
    df = pd.json_normalize(detailed_data)
    _stringify_complex_columns(df)
    # 4. Save to Parquet.
    print(f"Saving {len(df)} records to {OUTPUT_FILE}...")
    df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
    print("Done!")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment