# Requirements:
#   HDF5 library version 1.10.5 or later
#   h5py version 3.0 or later
#   pip install git+https://github.com/HDFGroup/zarr-python.git@hdf5

import logging
from urllib.parse import urlparse, urlunparse

import numpy as np
import h5py
import zarr
from zarr.storage import FileChunkStore
from zarr.meta import encode_fill_value
from numcodecs import Zlib
import fsspec

lggr = logging.getLogger('h5-to-zarr')
lggr.addHandler(logging.NullHandler())


class Hdf5ToZarr:
    """Translate the content of one HDF5 file into Zarr metadata.

    HDF5 groups become Zarr groups. HDF5 datasets become Zarr arrays. Zarr array
    chunks remain in the HDF5 file.

    Parameters
    ----------
    h5f : file-like or str
        Input HDF5 file as a string or file-like Python object.
    store : MutableMapping
        Zarr store.
    xarray : bool, optional
        Produce attributes required by the `xarray <http://xarray.pydata.org>`_
        package to correctly identify dimensions (HDF5 dimension scales) of a
        Zarr array. Default is ``False``.
    """

    def __init__(self, h5f, store, xarray=False):
        # Open HDF5 file in read mode...
        lggr.debug(f'HDF5 file: {h5f}')
        lggr.debug(f'Zarr store: {store}')
        lggr.debug(f'xarray: {xarray}')
        self._h5f = h5py.File(h5f, mode='r')
        self._xr = xarray

        # Create Zarr store's root group...
        self._zroot = zarr.group(store=store, overwrite=True)

        # Figure out HDF5 file's URI...
        if hasattr(h5f, 'name'):
            self._uri = h5f.name
        elif hasattr(h5f, 'url'):
            parts = urlparse(h5f.url())
            self._uri = urlunparse(parts[:3] + ('',) * 3)
        else:
            self._uri = None
        lggr.debug(f'Source URI: {self._uri}')

    def translate(self):
        """Translate content of one HDF5 file into Zarr storage format.

        No data is copied out of the HDF5 file.
        """
        lggr.debug('Translation begins')
        self.transfer_attrs(self._h5f, self._zroot)
        self._h5f.visititems(self.translator)

    def transfer_attrs(self, h5obj, zobj):
        """Transfer attributes from an HDF5 object to its equivalent Zarr object.

        Parameters
        ----------
        h5obj : h5py.Group or h5py.Dataset
            An HDF5 group or dataset.
        zobj : zarr.hierarchy.Group or zarr.core.Array
            An equivalent Zarr group or array to the HDF5 group or dataset with
            attributes.
        """
        for n, v in h5obj.attrs.items():
            if n in ('REFERENCE_LIST', 'DIMENSION_LIST'):
                continue

            # Fix some attribute values to avoid JSON encoding exceptions...
            if isinstance(v, bytes):
                v = v.decode('utf-8')
            elif isinstance(v, (np.ndarray, np.number)):
                if n == '_FillValue':
                    v = encode_fill_value(v, v.dtype)
                elif v.size == 1:
                    v = v.flatten()[0].tolist()
                else:
                    v = v.tolist()
            if self._xr and v == 'DIMENSION_SCALE':
                continue
            try:
                zobj.attrs[n] = v
            except TypeError:
                print(f'Caught TypeError: {n}@{h5obj.name} = {v} ({type(v)})')

    def translator(self, name, h5obj):
        """Produce Zarr metadata for all groups and datasets in the HDF5 file.
        """
        if isinstance(h5obj, h5py.Dataset):
            lggr.debug(f'Dataset: {h5obj.name}')

            if h5obj.id.get_create_plist().get_layout() == h5py.h5d.COMPACT:
                # Compact datasets are not supported; warn and skip...
                lggr.warning(
                    f'Compact HDF5 datasets not yet supported: <{h5obj.name} '
                    f'{h5obj.shape} {h5obj.dtype} {h5obj.nbytes} bytes>')
                return

            if (h5obj.scaleoffset or h5obj.fletcher32 or h5obj.shuffle or
                    h5obj.compression in ('szip', 'lzf')):
                raise RuntimeError(
                    f'{h5obj.name} uses unsupported HDF5 filters')
            if h5obj.compression == 'gzip':
                compression = Zlib(level=h5obj.compression_opts)
            else:
                compression = None

            # Get storage info of this HDF5 dataset...
            cinfo = self.storage_info(h5obj)
            if self._xr and h5py.h5ds.is_scale(h5obj.id) and not cinfo:
                return

            # Create a Zarr array equivalent to this HDF5 dataset...
            za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
                                            dtype=h5obj.dtype,
                                            chunks=h5obj.chunks or False,
                                            fill_value=h5obj.fillvalue,
                                            compression=compression,
                                            overwrite=True)
            lggr.debug(f'Created Zarr array: {za}')
            self.transfer_attrs(h5obj, za)

            if self._xr:
                # Do this for xarray...
                adims = self._get_array_dims(h5obj)
                za.attrs['_ARRAY_DIMENSIONS'] = adims
                lggr.debug(f'_ARRAY_DIMENSIONS = {adims}')

            # Store chunk location metadata...
            if cinfo:
                cinfo['source'] = {'uri': self._uri,
                                   'array_name': h5obj.name}
                FileChunkStore.chunks_info(za, cinfo)

        elif isinstance(h5obj, h5py.Group):
            lggr.debug(f'Group: {h5obj.name}')
            zgrp = self._zroot.create_group(h5obj.name)
            self.transfer_attrs(h5obj, zgrp)

    def _get_array_dims(self, dset):
        """Get a list of dimension scale names attached to input HDF5 dataset.

        This is required by the xarray package to work with Zarr arrays. Only
        one dimension scale per dataset dimension is allowed. If the dataset is
        itself a dimension scale, it is treated as its own dimension.

        Parameters
        ----------
        dset : h5py.Dataset
            HDF5 dataset.

        Returns
        -------
        list
            List with HDF5 path names of dimension scales attached to input
            dataset.
        """
        dims = list()
        rank = len(dset.shape)
        if rank:
            for n in range(rank):
                num_scales = len(dset.dims[n])
                if num_scales == 1:
                    dims.append(dset.dims[n][0].name[1:])
                elif h5py.h5ds.is_scale(dset.id):
                    dims.append(dset.name[1:])
                elif num_scales > 1:
                    raise RuntimeError(
                        f'{dset.name}: {len(dset.dims[n])} '
                        f'dimension scales attached to dimension #{n}')
        return dims

    def storage_info(self, dset):
        """Get storage information of an HDF5 dataset in the HDF5 file.

        Storage information consists of file offset and size (length) for every
        chunk of the HDF5 dataset.

        Parameters
        ----------
        dset : h5py.Dataset
            HDF5 dataset for which to collect storage information.

        Returns
        -------
        dict
            HDF5 dataset storage information. Dict keys are chunk array offsets
            as tuples. Dict values are pairs with chunk file offset and size
            integers.
        """
        # Empty (null) dataset...
        if dset.shape is None:
            return dict()

        dsid = dset.id
        if dset.chunks is None:
            # Contiguous dataset...
            if dsid.get_offset() is None:
                # No data ever written...
                return dict()
            else:
                key = (0,) * (len(dset.shape) or 1)
                return {key: {'offset': dsid.get_offset(),
                              'size': dsid.get_storage_size()}}
        else:
            # Chunked dataset...
            num_chunks = dsid.get_num_chunks()
            if num_chunks == 0:
                # No data ever written...
                return dict()

            # Go over all the dataset chunks...
            stinfo = dict()
            chunk_size = dset.chunks
            for index in range(num_chunks):
                blob = dsid.get_chunk_info(index)
                key = tuple(
                    [a // b for a, b in zip(blob.chunk_offset, chunk_size)])
                stinfo[key] = {'offset': blob.byte_offset,
                               'size': blob.size}
            return stinfo


if __name__ == '__main__':
    lggr.setLevel(logging.DEBUG)
    lggr_handler = logging.StreamHandler()
    lggr_handler.setFormatter(logging.Formatter(
        '%(levelname)s:%(name)s:%(funcName)s:%(message)s'))
    lggr.addHandler(lggr_handler)

    with fsspec.open('s3://pangeo-data-uswest2/esip/adcirc/adcirc_01d.nc',
                     mode='rb', anon=False, requester_pays=True,
                     default_fill_cache=False) as f:
        store = zarr.DirectoryStore('../adcirc_01d.nc.chunkstore')
        h5chunks = Hdf5ToZarr(f, store, xarray=True)
        h5chunks.translate()

        # Consolidate Zarr metadata...
        lggr.info('Consolidating Zarr dataset metadata')
        zarr.convenience.consolidate_metadata(store)

    lggr.info('Done')
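For reference, here is a minimal, hedged sketch of running the class above against a local NetCDF4/HDF5 file instead of S3. The names `example.nc` and `example.nc.chunkstore` are placeholders, and it assumes the modified zarr fork (with `FileChunkStore`) is installed and that the code runs in the same module that defines `Hdf5ToZarr`.

```python
# Minimal local-file sketch (assumes the HDFGroup zarr fork is installed and
# that this runs in the same module that defines Hdf5ToZarr above).
# 'example.nc' and 'example.nc.chunkstore' are placeholder names.
import zarr

store = zarr.DirectoryStore('example.nc.chunkstore')
h5chunks = Hdf5ToZarr('example.nc', store, xarray=True)
h5chunks.translate()

# Consolidate the generated Zarr metadata for faster opening...
zarr.convenience.consolidate_metadata(store)
```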
You have to use the modified zarr package and install it with this command:
pip install git+https://github.com/HDFGroup/zarr-python.git@hdf5. First remove the previous zarr package from your environment: pip uninstall zarr.
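A quick, hedged way to verify which zarr is active in your environment is to check whether `FileChunkStore` is importable (official zarr releases do not provide it):

```python
# Sanity check: the HDFGroup zarr fork exposes FileChunkStore;
# official zarr releases do not.
import zarr

try:
    from zarr.storage import FileChunkStore  # noqa: F401
    print(f'Modified zarr fork detected (zarr {zarr.__version__})')
except ImportError:
    print('Official zarr detected; reinstall from the HDFGroup hdf5 branch')
```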
Thank you. Got another error:
Cloning https://github.com/HDFGroup/zarr-python.git (to revision hdf5.) to /tmp/pip-req-build-x0lkywiy
Did not find branch or tag 'hdf5.', assuming revision or ref.
error: pathspec 'hdf5.' did not match any file(s) known to git
Command "git checkout -q hdf5." failed with error code 1 in /tmp/pip-req-build-x0lkywiy
My bad, there was an extra "." at the end; it worked fine, thank you.
I started getting this error:
AttributeError: 'h5py.h5d.DatasetID' object has no attribute 'get_num_chunks'
Name: h5py
Version: 2.10.0
Summary: Read and write HDF5 files from Python
Home-page: http://www.h5py.org
You need to install h5py from the master branch: pip install git+https://github.com/h5py/h5py.git. h5py version 2.10 does not support that method.
Also make sure that the HDF5 library version is at least 1.10.5: h5py.h5.get_libversion() should produce (1, 10, 5) or greater.
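As a quick diagnostic, the following standard h5py calls report both versions and whether the chunk-query method used by the script is available:

```python
# Report the h5py version, the underlying HDF5 library version, and whether
# the low-level chunk-query method used by the script is available.
import h5py

print('h5py version:', h5py.version.version)
print('HDF5 library version:', h5py.h5.get_libversion())
print('DatasetID.get_num_chunks available:',
      hasattr(h5py.h5d.DatasetID, 'get_num_chunks'))
```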
Thank you so much for the help, I will try that.
Building it from master didn't help :( and the version is still 2.10
https://github.com/h5py/h5py/blob/master/setup.py#L30
Still getting the same exact error.
What is your HDF5 library's version?
In [1]: import h5py
In [2]: h5py.h5.get_libversion()
Out[2]: (1, 10, 4)
I will try installing 1.10.5, but I thought `get_num_chunks` was not implemented in h5py version 2.10.
Installing h5py from the master branch is what matters; don't worry about the version number. Try installing HDF5 1.10.6 if you can, since it's the latest official 1.10 release.
I have implemented fsspec/filesystem_spec#464 to use the files generated by this, at the filesystem layer rather than in zarr (i.e., you could use the offset reference idea for other uses too).
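Roughly, that lets the chunk references be consumed through fsspec's `reference` filesystem. The sketch below is hedged: the reference file name and the S3 options are placeholders, and the exact keyword names may differ between fsspec versions.

```python
# Hedged sketch of reading chunk references via fsspec's ReferenceFileSystem
# (fsspec/filesystem_spec#464). 'references.json' and the S3 options are
# placeholders, not values from this thread.
import fsspec
import xarray as xr

fs = fsspec.filesystem(
    'reference',
    fo='references.json',            # mapping of Zarr keys to (url, offset, size)
    remote_protocol='s3',
    remote_options={'requester_pays': True},
)
ds = xr.open_zarr(fs.get_mapper(''), consolidated=False)
print(ds)
```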
Here's a gist that demonstrates cloud-performant access, reading both the Zarr dataset and the NetCDF4/HDF5 file in comparable times.
You can run this test yourself on the Pangeo AWS binder (it takes about 5 minutes for the cluster to spin up at the beginning):
Hello @rsignell-usgs. Thank you very much for sharing the binder. It is great to be able to read a chunked NetCDF file as if it were Zarr.
I tried the Pangeo binder, but at the line
cluster = gateway.new_cluster()
I get the following error:
GatewayClusterError: Cluster 'prod.1716d3af703f48bbb8c7428b13823e65' failed to start, see logs for more information
Sorry, I could not find the logs, so I couldn't fix it myself.
I skipped the Dask Gateway cluster and shortened the computation to
#max_var = ds['zeta'].max(dim='time')
max_var = ds['zeta'].sel(node=slice(0,425919)).max(dim='time').compute()
and I could verify the result, so it is OK for me.
If I should report this somewhere else (Pangeo Gitter?), please let me know.
@ajelenak Hello, I tried to use your code, and after getting past the initial errors (use of the modified zarr package, etc.), I get the following error:
module 'zarr' has no attribute 'DirectoryStore'
Thanks in advance
It was solved by re-installing git+https://github.com/HDFGroup/zarr-python.git@hdf5.
@Haris-auth Glad that you finally made it work. This idea/code now lives in the kerchunk package. It also enables using official zarr package releases.
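For anyone arriving here now, here is a hedged sketch of the equivalent kerchunk workflow; the S3 URL and the output reference file name are placeholders.

```python
# Hedged sketch of the kerchunk equivalent of this gist; the URL and the
# output reference file name are placeholders.
import json
import fsspec
from kerchunk.hdf import SingleHdf5ToZarr

url = 's3://some-bucket/some-file.nc'
with fsspec.open(url, mode='rb', anon=True) as f:
    refs = SingleHdf5ToZarr(f, url).translate()

with open('some-file.json', 'w') as out:
    json.dump(refs, out)
```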
Running this script, I got this error:
from zarr.storage import FileChunkStore
ImportError: cannot import name 'FileChunkStore' from 'zarr.storage'
I am using
zarr==2.4.0