Skip to content

Instantly share code, notes, and snippets.

@betolink
Created December 21, 2023 22:38
Show Gist options
  • Select an option

  • Save betolink/fa7c053c540f3e610596e2eda3a6c380 to your computer and use it in GitHub Desktop.

Select an option

Save betolink/fa7c053c540f3e610596e2eda3a6c380 to your computer and use it in GitHub Desktop.
Benchmarking different access patterns to MUR
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "d5464971-75d2-4dfc-9af9-a773efa4f1a7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%pip install earthaccess[kerchunk]==0.8.2"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6ca826ed-ff77-475a-bb39-abe1e13a1d72",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import xarray as xr\n",
"import rasterio\n",
"import rioxarray\n",
"from pqdm.threads import pqdm\n",
"import earthaccess\n",
"import os\n",
"from timebudget import timebudget\n",
"from pathlib import Path\n",
"from dask.distributed import LocalCluster\n",
"\n",
"\n",
"# Prerequisite: Earthdata Login credentials already live in ~/.netrc.\n",
"\n",
"# Cookie file handed to GDAL further down; create it if it is missing.\n",
"cookies = os.path.expanduser(\"~/.urs_cookies\")\n",
"Path(cookies).touch(exist_ok=True)\n",
"\n",
"auth = earthaccess.login()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3a3169b2-6502-4eaf-b9fb-0be9363f08a2",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Granules found: 31\n"
]
}
],
"source": [
"# Short name of the MUR collection to search.\n",
"# NOTE: no trailing comma here -- it would silently turn the name into a 1-tuple.\n",
"dataset = \"MUR-JPL-L4-GLOB-v4.1\"\n",
"\n",
"# All granules for January 2019.\n",
"results = earthaccess.search_data(\n",
"    short_name=dataset,\n",
"    temporal=(\"2019-01-01\", \"2019-01-31\"),\n",
")\n",
"\n",
"# data_links() yields a list of URLs per granule; keep the first external\n",
"# link of each granule for the GDAL/rasterio benchmark.\n",
"data_links = [granule.data_links(access=\"external\") for granule in results]\n",
"url_links = [links[0] for links in data_links]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9110b4e-d375-441d-8934-589395d4471a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Build a consolidated kerchunk reference file for the granules with earthaccess.\n",
"if __name__ == \"__main__\":\n",
"    # A local Dask cluster parallelizes the metadata consolidation;\n",
"    # any other Dask cluster works as well.\n",
"    cluster = LocalCluster()\n",
"    client = cluster.get_client()\n",
"\n",
"    # Local output path for demo purposes. A remote target such as\n",
"    # \"s3://my-bucket/{short_name}-metadata.json\" could be used instead.\n",
"    reference_path = \"./direct-mur-metadata.json\"\n",
"\n",
"    # TODO: this step needs a progress bar.\n",
"    # (Alternative option that was tried: kerchunk_options={\"coo_map\": []})\n",
"    outfile = earthaccess.consolidate_metadata(\n",
"        results,\n",
"        outfile=reference_path,\n",
"        access=\"direct\",\n",
"        kerchunk_options={\"concat_dims\": \"time\"},\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9dec9fe7-ac0e-446e-8eeb-b8b3d3ac4ad8",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"@timebudget\n",
"def via_gdalvsi():\n",
"    \"\"\"Open every URL in ``url_links`` as one dataset via the rasterio engine.\n",
"\n",
"    The rasterio/GDAL environment is pointed at the ``cookies`` file (both as\n",
"    cookie file and cookie jar) and has GDAL_HTTP_NETRC enabled.\n",
"    ``timebudget`` reports how long each call takes.\n",
"    \"\"\"\n",
"    gdal_http_options = {\n",
"        \"GDAL_HTTP_COOKIEFILE\": cookies,\n",
"        \"GDAL_HTTP_COOKIEJAR\": cookies,\n",
"        \"GDAL_HTTP_NETRC\": True,\n",
"    }\n",
"    with rasterio.Env(**gdal_http_options):\n",
"        return xr.open_mfdataset(\n",
"            url_links,\n",
"            engine=\"rasterio\",\n",
"            decode_times=False,\n",
"            parallel=True,\n",
"        )"
"\n",
"@timebudget\n",
"def via_earthaccess():\n",
"    \"\"\"Open the search ``results`` with earthaccess and combine them with h5netcdf.\"\"\"\n",
"    file_objects = earthaccess.open(results)\n",
"    # parallel=True is deliberately left out: it actually made this path slower\n",
"    # (serialization overhead? a bug in xarray?).\n",
"    return xr.open_mfdataset(file_objects, engine=\"h5netcdf\", decode_times=False)"
"\n",
"@timebudget\n",
"def via_earthaccess_kerchunk():\n",
"    \"\"\"Open the kerchunk reference file ``direct-mur-metadata.json`` as one dataset.\n",
"\n",
"    The references are read through fsspec's ``reference://`` filesystem with\n",
"    the zarr engine; the referenced chunks are fetched from S3 with the\n",
"    credentials of the earthaccess S3 session.\n",
"\n",
"    Returns:\n",
"        The dataset returned by ``xr.open_dataset`` (opened with ``chunks={}``).\n",
"    \"\"\"\n",
"    # The MUR granules are served by PO.DAAC (earthaccess reports the\n",
"    # archive.podaac.earthdata.nasa.gov credentials endpoint when opening them),\n",
"    # so the S3 credentials must be requested for PODAAC, not GES_DISC.\n",
"    fs = earthaccess.get_s3fs_session(\"PODAAC\")\n",
"    reference_options = {\n",
"        \"fo\": \"direct-mur-metadata.json\",\n",
"        \"remote_protocol\": \"s3\",\n",
"        \"remote_options\": fs.storage_options,\n",
"    }\n",
"    return xr.open_dataset(\n",
"        \"reference://\",\n",
"        engine=\"zarr\",\n",
"        chunks={},\n",
"        # Tricky: the coords are there but encoded in a way xarray can't decode.\n",
"        # Similar to https://github.com/fsspec/kerchunk/issues/177\n",
"        decode_coords=False,\n",
"        backend_kwargs={\n",
"            \"consolidated\": False,\n",
"            \"storage_options\": reference_options,\n",
"        },\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "31904fe7-3137-48ab-9fb6-5ef4025fc2fc",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/srv/conda/envs/notebook/lib/python3.10/site-packages/rioxarray/_io.py:1132: NotGeoreferencedWarning: Dataset has no geotransform, gcps, or rpcs. The identity matrix will be returned.\n",
" warnings.warn(str(rio_warning.message), type(rio_warning.message)) # type: ignore\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"via_gdalvsi took 54.563sec\n"
]
}
],
"source": [
"# And here we go!\n",
"ds1 = via_gdalvsi()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "eefdf990-5435-411f-a04d-ff0015502ea8",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Opening 31 granules, approx size: 11.63 GB\n",
"using endpoint: https://archive.podaac.earthdata.nasa.gov/s3credentials\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "022baa89fcb748ad9da897e77e8ef117",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"QUEUEING TASKS | : 0%| | 0/31 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "09902662053945048ad8afbaa5f05caf",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"PROCESSING TASKS | : 0%| | 0/31 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9407c01469da4e33a086a397636417d1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"COLLECTING RESULTS | : 0%| | 0/31 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"via_earthaccess took 12.264sec\n"
]
}
],
"source": [
"ds2 = via_earthaccess()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "93835307-1c2a-4aff-ab02-2dad31f97cc6",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"via_earthaccess_kerchunk took 3.393sec\n"
]
}
],
"source": [
"ds3 = via_earthaccess_kerchunk()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment