Created
November 7, 2025 10:01
-
-
Save rsignell/11b67b82845b9cd9df84d956ed1ac901 to your computer and use it in GitHub Desktop.
CORDEX_create_icechunk_s3_working.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "2bb23cda-d2dc-4ce0-a773-b0936451aa9d", | |
| "metadata": {}, | |
| "source": [ | |
| "# Create Icechunk virtual dataset from CORDEX NetCDF files\n", | |
| "* Originally on Azure, now on S3\n", | |
| "* The native NetCDF file chunks are tiny (**60 KiB**): `Float:(1, 133, 116)`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "8af6b6a8-728e-4f4c-adbe-4bc1045dc9ce", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<donfig.config_obj.ConfigSet at 0x7462ca509450>" | |
| ] | |
| }, | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import zarr\n", | |
| "zarr.config.set({'async.concurrency': 128})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "27632d73-826b-4e48-befb-1a5cfaa5c471", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import warnings\n", | |
| "import os\n", | |
| "import fsspec\n", | |
| "import icechunk\n", | |
| "import xarray as xr\n", | |
| "from obstore.store import from_url\n", | |
| "\n", | |
| "from virtualizarr import open_virtual_dataset\n", | |
| "from virtualizarr.parsers import HDFParser\n", | |
| "from virtualizarr.registry import ObjectStoreRegistry\n", | |
| "\n", | |
| "import warnings\n", | |
| "from zarr.errors import ZarrUserWarning, UnstableSpecificationWarning\n", | |
| "warnings.filterwarnings(\"ignore\",category=ZarrUserWarning, \n", | |
| " message=\"Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations.\")\n", | |
| "warnings.filterwarnings(\"ignore\",category=UnstableSpecificationWarning, \n", | |
| " message=r\"The data type \\(NullTerminatedBytes\\(length=1\\)\\) does not have a Zarr V3 specification.*\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "bccbad9f-3380-4c57-a3b0-8b9a81173dbc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "storage_endpoint = 'https://usgs.osn.mghpcc.org'\n", | |
| "storage_bucket = 'esip'\n", | |
| "\n", | |
| "fs = fsspec.filesystem('s3', anon=True, endpoint_url=storage_endpoint)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "0121616e-38e8-4bd2-b581-de25c69797e1", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "flist = fs.glob(f's3://{storage_bucket}/rsignell/cordex/arctic/*.nc')\n", | |
| "flist = [f's3://{f}' for f in flist]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "72ff27fa-5b44-41e3-b327-1132c9ce3cf6", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "19\n", | |
| "s3://esip/rsignell/cordex/arctic/tasmax_ARC-44_ICHEC-EC-EARTH_rcp85_r12i1p1_SMHI-RCA4_v1_day_20960101-21001231.nc\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(len(flist))\n", | |
| "print(flist[-1])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "8cd91ecf-fd2e-462c-aee5-7e8e8d5b0922", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "bucket = \"s3://esip\"\n", | |
| "store = from_url(bucket, region=\"not-used\", skip_signature=True, endpoint=storage_endpoint)\n", | |
| "registry = ObjectStoreRegistry({bucket: store})\n", | |
| "parser = HDFParser()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "2ab43f66-8b40-4be9-a5f4-d1601c709bfb", | |
| "metadata": {}, | |
| "source": [ | |
| "#### My original attempt, which fails" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "77c8e6a3-4c7e-4ad5-81d8-53ebb2f5d2a0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%%time\n", | |
| "ds_list = [\n", | |
| " open_virtual_dataset(\n", | |
| " url=url,\n", | |
| " parser=parser,\n", | |
| " registry=registry,\n", | |
| " loadable_variables=[\"time\"],\n", | |
| " )\n", | |
| " for url in flist[:2]\n", | |
| "]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "8ea502e7-ba50-429d-aae6-defa29491b71", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "combined_ds = xr.concat(\n", | |
| " ds_list,\n", | |
| " dim=\"time\",\n", | |
| " coords=\"minimal\",\n", | |
| " compat=\"override\",\n", | |
| " combine_attrs=\"override\",\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "891826d2-a961-44dd-847c-5ada12429de8", | |
| "metadata": {}, | |
| "source": [ | |
| "#### The variable types don't seem to change..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "8a8dfcc1-3bbb-4e65-b03d-483215bd875d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "xr.open_dataset(fs.open(flist[0]))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "e549f26d-7b20-4bf3-a477-bd21fa10ac92", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "xr.open_dataset(fs.open(flist[1]))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "9bb561af-a709-4eab-b94b-78ed06aa9469", | |
| "metadata": {}, | |
| "source": [ | |
| "#### BUT, if `time_bnds` and `rotated_pole` are included as `loadable_variables`, it works!" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "2907690e-ebbe-4a4a-ba5f-cd3f2403d119", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%%time\n", | |
| "ds_list = [\n", | |
| " open_virtual_dataset(\n", | |
| " url=url,\n", | |
| " parser=parser,\n", | |
| " registry=registry,\n", | |
| " loadable_variables=[\"time\", \"time_bnds\", \"rotated_pole\"],\n", | |
| " )\n", | |
| " for url in flist[:2]\n", | |
| "]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "4ee2d54e-113b-489d-8e31-8686ec0ccadc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "combined_ds = xr.concat(\n", | |
| " ds_list,\n", | |
| " dim=\"time\",\n", | |
| " coords=\"minimal\",\n", | |
| " compat=\"override\",\n", | |
| " combine_attrs=\"override\",\n", | |
| ")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.13.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment