Skip to content

Instantly share code, notes, and snippets.

@rsignell
Created November 7, 2025 10:01
Show Gist options
  • Select an option

  • Save rsignell/11b67b82845b9cd9df84d956ed1ac901 to your computer and use it in GitHub Desktop.

Select an option

Save rsignell/11b67b82845b9cd9df84d956ed1ac901 to your computer and use it in GitHub Desktop.
CORDEX_create_icechunk_s3_working.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "2bb23cda-d2dc-4ce0-a773-b0936451aa9d",
"metadata": {},
"source": [
"# Create Icechunk virtual dataset from CORDEX NetCDF files\n",
"* Originally on Azure, now on S3\n",
"* The native NetCDF file chunks are tiny (**60 KiB**): `Float:(1, 133, 116)`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8af6b6a8-728e-4f4c-adbe-4bc1045dc9ce",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<donfig.config_obj.ConfigSet at 0x7462ca509450>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import zarr\n",
"zarr.config.set({'async.concurrency': 128})"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "27632d73-826b-4e48-befb-1a5cfaa5c471",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"import os\n",
"import fsspec\n",
"import icechunk\n",
"import xarray as xr\n",
"from obstore.store import from_url\n",
"\n",
"from virtualizarr import open_virtual_dataset\n",
"from virtualizarr.parsers import HDFParser\n",
"from virtualizarr.registry import ObjectStoreRegistry\n",
"\n",
"import warnings\n",
"from zarr.errors import ZarrUserWarning, UnstableSpecificationWarning\n",
"warnings.filterwarnings(\"ignore\",category=ZarrUserWarning, \n",
" message=\"Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations.\")\n",
"warnings.filterwarnings(\"ignore\",category=UnstableSpecificationWarning, \n",
" message=r\"The data type \\(NullTerminatedBytes\\(length=1\\)\\) does not have a Zarr V3 specification.*\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "bccbad9f-3380-4c57-a3b0-8b9a81173dbc",
"metadata": {},
"outputs": [],
"source": [
"storage_endpoint = 'https://usgs.osn.mghpcc.org'\n",
"storage_bucket = 'esip'\n",
"\n",
"fs = fsspec.filesystem('s3', anon=True, endpoint_url=storage_endpoint)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0121616e-38e8-4bd2-b581-de25c69797e1",
"metadata": {},
"outputs": [],
"source": [
"flist = fs.glob(f's3://{storage_bucket}/rsignell/cordex/arctic/*.nc')\n",
"flist = [f's3://{f}' for f in flist]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "72ff27fa-5b44-41e3-b327-1132c9ce3cf6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"19\n",
"s3://esip/rsignell/cordex/arctic/tasmax_ARC-44_ICHEC-EC-EARTH_rcp85_r12i1p1_SMHI-RCA4_v1_day_20960101-21001231.nc\n"
]
}
],
"source": [
"print(len(flist))\n",
"print(flist[-1])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8cd91ecf-fd2e-462c-aee5-7e8e8d5b0922",
"metadata": {},
"outputs": [],
"source": [
"bucket = \"s3://esip\"\n",
"store = from_url(bucket, region=\"not-used\", skip_signature=True, endpoint=storage_endpoint)\n",
"registry = ObjectStoreRegistry({bucket: store})\n",
"parser = HDFParser()"
]
},
{
"cell_type": "markdown",
"id": "2ab43f66-8b40-4be9-a5f4-d1601c709bfb",
"metadata": {},
"source": [
"#### My original attempt, which fails"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77c8e6a3-4c7e-4ad5-81d8-53ebb2f5d2a0",
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"ds_list = [\n",
" open_virtual_dataset(\n",
" url=url,\n",
" parser=parser,\n",
" registry=registry,\n",
" loadable_variables=[\"time\"],\n",
" )\n",
" for url in flist[:2]\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ea502e7-ba50-429d-aae6-defa29491b71",
"metadata": {},
"outputs": [],
"source": [
"combined_ds = xr.concat(\n",
" ds_list,\n",
" dim=\"time\",\n",
" coords=\"minimal\",\n",
" compat=\"override\",\n",
" combine_attrs=\"override\",\n",
")"
]
},
{
"cell_type": "markdown",
"id": "891826d2-a961-44dd-847c-5ada12429de8",
"metadata": {},
"source": [
"#### The variable types don't seem to change between the two files..."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a8dfcc1-3bbb-4e65-b03d-483215bd875d",
"metadata": {},
"outputs": [],
"source": [
"xr.open_dataset(fs.open(flist[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e549f26d-7b20-4bf3-a477-bd21fa10ac92",
"metadata": {},
"outputs": [],
"source": [
"xr.open_dataset(fs.open(flist[1]))"
]
},
{
"cell_type": "markdown",
"id": "9bb561af-a709-4eab-b94b-78ed06aa9469",
"metadata": {},
"source": [
"#### BUT, if `time_bnds` and `rotated_pole` are included as `loadable_variables`, it works!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2907690e-ebbe-4a4a-ba5f-cd3f2403d119",
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"ds_list = [\n",
" open_virtual_dataset(\n",
" url=url,\n",
" parser=parser,\n",
" registry=registry,\n",
" loadable_variables=[\"time\", \"time_bnds\", \"rotated_pole\"],\n",
" )\n",
" for url in flist[:2]\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ee2d54e-113b-489d-8e31-8686ec0ccadc",
"metadata": {},
"outputs": [],
"source": [
"combined_ds = xr.concat(\n",
" ds_list,\n",
" dim=\"time\",\n",
" coords=\"minimal\",\n",
" compat=\"override\",\n",
" combine_attrs=\"override\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment