Created
December 21, 2023 22:38
-
-
Save betolink/fa7c053c540f3e610596e2eda3a6c380 to your computer and use it in GitHub Desktop.
Benchmarking different access patterns to MUR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d5464971-75d2-4dfc-9af9-a773efa4f1a7", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "%pip install earthaccess[kerchunk]==0.8.2" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "6ca826ed-ff77-475a-bb39-abe1e13a1d72", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import xarray as xr\n", | |
| "import rasterio\n", | |
| "import rioxarray\n", | |
| "from pqdm.threads import pqdm\n", | |
| "import earthaccess\n", | |
| "import os\n", | |
| "from timebudget import timebudget\n", | |
| "from pathlib import Path\n", | |
| "from dask.distributed import LocalCluster\n", | |
| "\n", | |
| "\n", | |
| "# assumes we have a ~/.netrc created\n", | |
| "\n", | |
| "cookies = os.path.expanduser(\"~/.urs_cookies\")\n", | |
| "Path(cookies).touch()\n", | |
| "\n", | |
| "auth = earthaccess.login() " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "3a3169b2-6502-4eaf-b9fb-0be9363f08a2", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Granules found: 31\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "dataset=\"MUR-JPL-L4-GLOB-v4.1\"\n", | |
| "\n", | |
| "results = earthaccess.search_data(\n", | |
| " short_name=dataset,\n", | |
| " temporal=(\"2019-01-01\", \"2019-01-31\"),\n", | |
| ")\n", | |
| "\n", | |
| "data_links = [granule.data_links(access=\"external\") for granule in results]\n", | |
| "url_links = [f'{link[0]}' for link in data_links]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "e9110b4e-d375-441d-8934-589395d4471a", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# kerchunking with earthaccess\n", | |
| "if __name__ == \"__main__\":\n", | |
| " # Create a local Dask cluster for parallel metadata consolidation\n", | |
| " # (but works with any Dask cluster)\n", | |
| " cluster = LocalCluster()\n", | |
| " client = cluster.get_client()\n", | |
| " # needs a progressbar! \n", | |
| " outfile = earthaccess.consolidate_metadata(\n", | |
| " results,\n", | |
| " outfile=f\"./direct-mur-metadata.json\", # Writing to a local file for demo purposes\n", | |
| " # outfile=f\"s3://my-bucket/{short_name}-metadata.json\", # We could also write to a remote file\n", | |
| " access=\"direct\",\n", | |
| " # kerchunk_options={\"coo_map\": []}\n", | |
| " kerchunk_options={\"concat_dims\": \"time\"}\n", | |
| " )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "9dec9fe7-ac0e-446e-8eeb-b8b3d3ac4ad8", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "@timebudget\n", | |
| "def via_gdalvsi():\n", | |
| " with rasterio.Env(GDAL_HTTP_COOKIEFILE=cookies, \n", | |
| " GDAL_HTTP_COOKIEJAR=cookies, \n", | |
| " GDAL_HTTP_NETRC=True):\n", | |
| " ds = xr.open_mfdataset(url_links, engine = \"rasterio\", decode_times=False, parallel=True)\n", | |
| " return(ds)\n", | |
| "\n", | |
| "@timebudget\n", | |
| "def via_earthaccess():\n", | |
| " files = earthaccess.open(results)\n", | |
| " # using parallel=True actually slows things down here; serialization overhead? bug in xarray?\n", | |
| " ds = xr.open_mfdataset(files, engine=\"h5netcdf\", decode_times=False) \n", | |
| " return(ds)\n", | |
| "\n", | |
| "@timebudget\n", | |
| "def via_earthaccess_kerchunk():\n", | |
| " fs = earthaccess.get_s3fs_session(\"PODAAC\")\n", | |
| " ds = xr.open_dataset(\n", | |
| " \"reference://\",\n", | |
| " engine=\"zarr\",\n", | |
| " chunks={},\n", | |
| " decode_coords=False, # tricky, the coords are there but encoded in a way xarray can't decode for some reason. Similar to https://github.com/fsspec/kerchunk/issues/177\n", | |
| " backend_kwargs={\n", | |
| " \"consolidated\": False,\n", | |
| " \"storage_options\": {\n", | |
| " \"fo\": \"direct-mur-metadata.json\",\n", | |
| " \"remote_protocol\": \"s3\",\n", | |
| " \"remote_options\": fs.storage_options,\n", | |
| " }\n", | |
| " },\n", | |
| " )\n", | |
| " return(ds)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "31904fe7-3137-48ab-9fb6-5ef4025fc2fc", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/srv/conda/envs/notebook/lib/python3.10/site-packages/rioxarray/_io.py:1132: NotGeoreferencedWarning: Dataset has no geotransform, gcps, or rpcs. The identity matrix will be returned.\n", | |
| " warnings.warn(str(rio_warning.message), type(rio_warning.message)) # type: ignore\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "via_gdalvsi took 54.563sec\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# And here we go!\n", | |
| "ds1 = via_gdalvsi()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "eefdf990-5435-411f-a04d-ff0015502ea8", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Opening 31 granules, approx size: 11.63 GB\n", | |
| "using endpoint: https://archive.podaac.earthdata.nasa.gov/s3credentials\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "022baa89fcb748ad9da897e77e8ef117", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "QUEUEING TASKS | : 0%| | 0/31 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "09902662053945048ad8afbaa5f05caf", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "PROCESSING TASKS | : 0%| | 0/31 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "9407c01469da4e33a086a397636417d1", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "COLLECTING RESULTS | : 0%| | 0/31 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "via_earthaccess took 12.264sec\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "ds2 = via_earthaccess()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "93835307-1c2a-4aff-ab02-2dad31f97cc6", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "via_earthaccess_kerchunk took 3.393sec\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "ds3 = via_earthaccess_kerchunk()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.13" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment