do-me/download.py

## download.py
# /// script
# dependencies = [
#   "requests",
#   "pandas",
#   "pyarrow",
#   "tqdm",
# ]
# ///

import requests
import pandas as pd
from tqdm import tqdm
import time
import json

# --- CONFIGURATION ---
BASE_URL = "https://gateway.prod.wekeo2.eu/hda-broker/api/v1"
OUTPUT_FILE = "wekeo_datasets_metadata.parquet"
# If the API requires authentication, enter your Bearer token here.
# For many metadata operations on WEkEO, a token is required.
API_TOKEN = "your token"

def get_headers():
    headers = {"Accept": "application/json"}
    if API_TOKEN and API_TOKEN != "YOUR_WEKEO_API_TOKEN_HERE":
        headers["Authorization"] = f"Bearer {API_TOKEN}"
    return headers

def fetch_all_dataset_ids():
    """Fetches all dataset IDs by iterating through paginated results."""
    dataset_ids = []
    start_index = 0
    items_per_page = 50  # Fetching in batches

    print("Fetching dataset list...")
    while True:
        params = {
            "startIndex": start_index,
            "itemsPerPage": items_per_page
        }

        response = requests.get(f"{BASE_URL}/datasets", params=params, headers=get_headers())
        response.raise_for_status()
        data = response.json()

        features = data.get("features", [])
        if not features:
            break

        for item in features:
            if "dataset_id" in item:
                dataset_ids.append(item["dataset_id"])

        start_index += items_per_page
        # Optional: Break if we reached totalResults
        if start_index >= data.get("properties", {}).get("totalResults", 0):
            break

    print(f"Found {len(dataset_ids)} datasets.")
    return dataset_ids

def fetch_dataset_details(dataset_id):
    """Fetches full metadata for a specific dataset ID."""
    try:
        response = requests.get(f"{BASE_URL}/datasets/{dataset_id}", headers=get_headers())
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error fetching {dataset_id}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Exception for {dataset_id}: {e}")
        return None

def main():
    # 1. Get IDs
    ids = fetch_all_dataset_ids()

    # 2. Loop through individual datasets
    detailed_data = []
    print("Fetching detailed metadata for each dataset...")
    for d_id in tqdm(ids[:5]):
        details = fetch_dataset_details(d_id)
        if details:
            detailed_data.append(details)
        # Small sleep to be polite to the API
        time.sleep(1.1)

    if not detailed_data:
        print("No data collected. Check your API Token or Connection.")
        return

    # 3. Process and Flatten
    df = pd.json_normalize(detailed_data)

    # --- FIX: Sanitize columns for Parquet ---
    # Identify columns that contain lists or dicts and convert them to JSON strings
    for col in df.columns:
        # Check if the column has any list or dict objects
        is_complex = df[col].apply(lambda x: isinstance(x, (list, dict))).any()
        if is_complex:
            df[col] = df[col].apply(lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x)
    # -----------------------------------------

    # 4. Save to Parquet
    print(f"Saving {len(df)} records to {OUTPUT_FILE}...")
    df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
    print("Done!")

if __name__ == "__main__":
    main()
	# /// script
	# dependencies = [
	# "requests",
	# "pandas",
	# "pyarrow",
	# "tqdm",
	# ]
	# ///

	import requests
	import pandas as pd
	from tqdm import tqdm
	import time
	import json

	# --- CONFIGURATION ---
	BASE_URL = "https://gateway.prod.wekeo2.eu/hda-broker/api/v1"
	OUTPUT_FILE = "wekeo_datasets_metadata.parquet"
	# If the API requires authentication, enter your Bearer token here.
	# For many metadata operations on WEkEO, a token is required.
	API_TOKEN = "your token"

	def get_headers():
	headers = {"Accept": "application/json"}
	if API_TOKEN and API_TOKEN != "YOUR_WEKEO_API_TOKEN_HERE":
	headers["Authorization"] = f"Bearer {API_TOKEN}"
	return headers

	def fetch_all_dataset_ids():
	"""Fetches all dataset IDs by iterating through paginated results."""
	dataset_ids = []
	start_index = 0
	items_per_page = 50 # Fetching in batches

	print("Fetching dataset list...")
	while True:
	params = {
	"startIndex": start_index,
	"itemsPerPage": items_per_page
	}

	response = requests.get(f"{BASE_URL}/datasets", params=params, headers=get_headers())
	response.raise_for_status()
	data = response.json()

	features = data.get("features", [])
	if not features:
	break

	for item in features:
	if "dataset_id" in item:
	dataset_ids.append(item["dataset_id"])

	start_index += items_per_page
	# Optional: Break if we reached totalResults
	if start_index >= data.get("properties", {}).get("totalResults", 0):
	break

	print(f"Found {len(dataset_ids)} datasets.")
	return dataset_ids

	def fetch_dataset_details(dataset_id):
	"""Fetches full metadata for a specific dataset ID."""
	try:
	response = requests.get(f"{BASE_URL}/datasets/{dataset_id}", headers=get_headers())
	if response.status_code == 200:
	return response.json()
	else:
	print(f"Error fetching {dataset_id}: {response.status_code}")
	return None
	except Exception as e:
	print(f"Exception for {dataset_id}: {e}")
	return None

	def main():
	# 1. Get IDs
	ids = fetch_all_dataset_ids()

	# 2. Loop through individual datasets
	detailed_data = []
	print("Fetching detailed metadata for each dataset...")
	for d_id in tqdm(ids[:5]):
	details = fetch_dataset_details(d_id)
	if details:
	detailed_data.append(details)
	# Small sleep to be polite to the API
	time.sleep(1.1)

	if not detailed_data:
	print("No data collected. Check your API Token or Connection.")
	return

	# 3. Process and Flatten
	df = pd.json_normalize(detailed_data)

	# --- FIX: Sanitize columns for Parquet ---
	# Identify columns that contain lists or dicts and convert them to JSON strings
	for col in df.columns:
	# Check if the column has any list or dict objects
	is_complex = df[col].apply(lambda x: isinstance(x, (list, dict))).any()
	if is_complex:
	df[col] = df[col].apply(lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x)
	# -----------------------------------------

	# 4. Save to Parquet
	print(f"Saving {len(df)} records to {OUTPUT_FILE}...")
	df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
	print("Done!")

	if __name__ == "__main__":
	main()
No results found