lets download all stac files for a particular polarisation product
s5cmd cp s3://deant-data-public-dev/experimental/for_zhengshu/ga_s1_nrb_iw_hh_0/*/*/*/*/*stac-item.json .let's use stac_geoparquet
>>> import stac_geoparquet
>>> import pystac
>>> import glob
>>> dirs = glob.glob('for_zhengshu/**', recursive=True)
>>> import os
>>> leaf_dirs = [d for d in dirs if not glob.glob(os.path.join(d, "*"))]
>>> items = [pystac.read_file(x) for x in leaf_dirs]
>>> record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow(items)
Traceback (most recent call last):
File "<python-input-75>", line 1, in <module>
record_batch_reader = stac_geoparquet.arrow.parse_stac_items_to_arrow(items)
File "/home/jonathan/micromamba/envs/stac-geoparquet/lib/python3.13/site-packages/stac_geoparquet/arrow/_api.py", line 63, in parse_stac_items_to_arrow
batch = stac_items_to_arrow(items)
File "/home/jonathan/micromamba/envs/stac-geoparquet/lib/python3.13/site-packages/stac_geoparquet/arrow/_api.py", line 192, in stac_items_to_arrow
raw_batch = StacJsonBatch.from_dicts(items, schema=schema)
File "/home/jonathan/micromamba/envs/stac-geoparquet/lib/python3.13/site-packages/stac_geoparquet/arrow/_batch.py", line 119, in from_dicts
array = pa.array(wkb_items)
File "pyarrow/array.pxi", line 370, in pyarrow.lib.array
File "pyarrow/array.pxi", line 42, in pyarrow.lib._sequence_to_array
File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: cannot mix list and non-list, non-null valuesok lets just get rid of all potentially inconsistent fields
find . -type f -name '*.json' -print0 \
| xargs -0 -n1 -P"$(nproc)" -I{} sh -c '
f="{}"
t=$(mktemp)
jq '\''del(
.properties."proj:wkt2",
.properties."sarard:geometric_accuracy_ALE",
.properties."sarard:geometric_accuracy_range",
.properties."sarard:geometric_accuracy_azimuth",
.properties."sarard:geometric_accuracy_absolute",
.properties."sarard:geometric_accuracy_north_bias",
.properties."sarard:geometric_accuracy_east_bias",
.properties."sarard:geometric_accuracy_north_std",
.properties."sarard:geometric_accuracy_east_std",
.properties."sarard:pixel_coordinate_convention",
.properties."sarard:near_range_incidence_angle",
.properties."sarard:far_range_incidence_angle",
.properties."sarard:source_id"
) | (.assets[]? |= del(."raster:pixel_coordinate_convention"))'\'' "$f" > "$t" && mv "$t" "$f"
'ok lets keep going
>>> table = record_batch_reader.read_all()
>>> pyarrow.parquet.write_table(table, "ga_s1_nrb_iw_hh_0.parquet")for some reason geometries weren't getting placed properly
>>> df = geopandas.read_parquet("ga_s1_nrb_iw_hh_0.parquet")
Traceback (most recent call last):
File "<python-input-134>", line 1, in <module>
df = geopandas.read_parquet("ga_s1_nrb_iw_hh_0.parquet")
File "/home/jonathan/micromamba/envs/stac-geoparquet/lib/python3.13/site-packages/geopandas/io/arrow.py", line 762, in _read_parquet
geo_metadata = _validate_and_decode_metadata(metadata)
File "/home/jonathan/micromamba/envs/stac-geoparquet/lib/python3.13/site-packages/geopandas/io/arrow.py", line 619, in _validate_and_decode_metadata
raise ValueError(
...<2 lines>...
)
ValueError: Missing geo metadata in Parquet/Feather file.
Use pandas.read_parquet/read_feather() instead.even though it seems to be in there
$ duckdb
D select * from read_parquet('ga_s1_nrb_iw_hh_0.parquet');
┌──────────────────────┬──────────────────────┬───────────────────┬──────────────────────┬───┬─────────────────┬─────────────────┬────────────────────┬──────────────────────┬──────────────────────┐
│ assets │ bbox │ collection │ geometry │ … │ sat:orbit_cycle │ sat:orbit_state │ sat:relative_orbit │ start_datetime │ storage:schemes │
│ struct(hh_gamma0 s… │ struct(xmin double… │ varchar │ blob │ │ int64 │ varchar │ int64 │ timestamp with tim… │ struct(aws struct(… │
├──────────────────────┼──────────────────────┼───────────────────┼──────────────────────┼───┼─────────────────┼─────────────────┼────────────────────┼──────────────────────┼──────────────────────┤
│ {'HH_gamma0': {'de… │ {'xmin': 150.64981… │ ga_s1_nrb_iw_hh_0 │ \x01\x03\x00\x00\x… │ … │ 12 │ ascending │ 111 │ 2024-08-09 18:32:4… │ {'aws': {'bucket':… │
│ {'HH_gamma0': {'de… │ {'xmin': 150.64981… │ ga_s1_nrb_iw_hh_0 │ \x01\x03\x00\x00\x… │ … │ 12 │ ascending │ 111 │ 2024-07-16 18:32:4… │ {'aws': {'bucket':… │
│ {'HH_gamma0': {'de… │ {'xmin': 150.64981… │ ga_s1_nrb_iw_hh_0 │ \x01\x03\x00\x00\x… │ … │ 12 │ ascending │ 111 │ 2024-10-20 19:32:4… │ {'aws': {'bucket':… │
│ {'HH_gamma0': {'de… │ {'xmin': 150.64981… │ ga_s1_nrb_iw_hh_0 │ \x01\x03\x00\x00\x… │ … │ 12 │ ascending │ 111 │ 2025-04-06 18:32:3… │ {'aws': {'bucket':… │
...so get geopandas to fix it
>>> import geopandas as gpd
>>> import pandas as pd
>>> df = pd.read_parquet("ga_s1_nrb_iw_hh_0.parquet")
>>> gdf = gpd.GeoDataFrame(
... df,
... geometry=gpd.GeoSeries.from_wkb(df["geometry"]),
... crs="EPSG:4326",
... )
>>> gdf.to_parquet("ga_s1_nrb_iw_hh_0_geo.parquet", index=False)