diff --git a/pixi.toml b/pixi.toml
index a55cc033335..e9eea6d76f7 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -118,7 +118,9 @@ cftime = "1.6.*"
 dask-core = "2025.2.*"
 distributed = "2025.2.*"
 flox = "0.10.*"
-h5netcdf = "1.5.*"
+# h5netcdf 1.8.0 introduced a few compatibility features with netcdf4
+# https://github.com/pydata/xarray/issues/10657#issuecomment-3711095986
+h5netcdf = "1.8.*"
 # h5py and hdf5 tend to cause conflicts
 # for e.g. hdf5 1.12 conflicts with h5py=3.1
 # prioritize bumping other packages instead
diff --git a/pyproject.toml b/pyproject.toml
index 0a3fb8996c1..bf479153bfa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,9 @@ accel = [
 complete = ["xarray[accel,etc,io,parallel,viz]"]
 io = [
   "netCDF4>=1.6.0",
-  "h5netcdf[h5py]>=1.5.0",
+  # h5netcdf 1.8.0 introduced a few compatibility features with netcdf4
+  # https://github.com/pydata/xarray/issues/10657#issuecomment-3711095986
+  "h5netcdf[h5py]>=1.8.0",
   "pydap",
   "scipy>=1.15",
   "zarr>=3.0",
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index 041aa49cf76..c031528d603 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -282,17 +282,37 @@ def open_store_variable(self, name, var):
         dimensions = var.dimensions
         data = indexing.LazilyIndexedArray(H5NetCDFArrayWrapper(name, self))
         attrs = _read_attributes(var)
+        encoding: dict[str, Any] = {}
+        if (datatype := var.datatype) and isinstance(datatype, h5netcdf.core.EnumType):
+            encoding["dtype"] = np.dtype(
+                data.dtype,
+                metadata={
+                    "enum": datatype.enum_dict,
+                    "enum_name": datatype.name,
+                },
+            )
+        else:
+            vlen_dtype = h5py.check_dtype(vlen=var.dtype)
+            if vlen_dtype is str:
+                encoding["dtype"] = str
+            elif vlen_dtype is not None:  # pragma: no cover
+                # xarray doesn't support writing arbitrary vlen dtypes yet.
+                encoding["dtype"] = var.dtype
+            else:
+                encoding["dtype"] = var.dtype
 
-        # netCDF4 specific encoding
-        encoding = {
-            "chunksizes": var.chunks,
-            "fletcher32": var.fletcher32,
-            "shuffle": var.shuffle,
-        }
         if var.chunks:
+            encoding["contiguous"] = False
+            encoding["chunksizes"] = var.chunks
             encoding["preferred_chunks"] = dict(
                 zip(var.dimensions, var.chunks, strict=True)
             )
+        else:
+            encoding["contiguous"] = True
+            encoding["chunksizes"] = None
+
+        encoding.update(var.filters())
+
         # Convert h5py-style compression options to NetCDF4-Python
         # style, if possible
         if var.compression == "gzip":
@@ -306,27 +326,6 @@
         encoding["source"] = self._filename
         encoding["original_shape"] = data.shape
 
-        vlen_dtype = h5py.check_dtype(vlen=var.dtype)
-        if vlen_dtype is str:
-            encoding["dtype"] = str
-        elif vlen_dtype is not None:  # pragma: no cover
-            # xarray doesn't support writing arbitrary vlen dtypes yet.
-            pass
-        # just check if datatype is available and create dtype
-        # this check can be removed if h5netcdf >= 1.4.0 for any environment
-        elif (datatype := getattr(var, "datatype", None)) and isinstance(
-            datatype, h5netcdf.core.EnumType
-        ):
-            encoding["dtype"] = np.dtype(
-                data.dtype,
-                metadata={
-                    "enum": datatype.enum_dict,
-                    "enum_name": datatype.name,
-                },
-            )
-        else:
-            encoding["dtype"] = var.dtype
-
         return Variable(dimensions, data, attrs, encoding)
 
     def get_variables(self):
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index 39dedd139c0..0037cbb7ab2 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -268,6 +268,11 @@ def _extract_nc4_variable_encoding(
     safe_to_drop = {"source", "original_shape"}
     valid_encodings = {
         "zlib",
+        "szip",
+        "bzip2",
+        "blosc",
+        # "lzf",
+        "zstd",
         "complevel",
         "fletcher32",
         "contiguous",
@@ -314,6 +319,34 @@
     for k in safe_to_drop:
         if k in encoding:
             del encoding[k]
+    # only one of these variables should be true
+    # TODO: discuss the order of priorities
+    compression = None
+    if encoding.pop("zlib", False):
+        compression = "zlib"
+    if encoding.pop("szip", False):
+        compression = "szip"
+    if encoding.pop("bzip2", False):
+        compression = "bzip2"
+    if encoding.pop("blosc", False):
+        compression = "blosc"
+    # if encoding.pop("lzf", False):
+    #     compression = "lzf"
+    if encoding.pop("zstd", False):
+        compression = "zstd"
+
+    # If both styles are used together, h5py format takes precedence
+    if compression is not None and encoding.get("compression") is None:
+        # This error message is in direct conflict with
+        # test_compression_encoding_h5py
+        # https://github.com/pydata/xarray/blob/main/xarray/tests/test_backends.py#L4986
+        # valid_compressions = [compression, None]
+        # if compression == "zlib":
+        #     valid_compressions += ["gzip",]
+        # if encoding.get("compression") not in valid_compressions:
+        #     raise ValueError(f"'{compression}' and 'compression' encodings mismatch")
+        encoding["compression"] = compression
+
     if raise_on_invalid:
         invalid = [k for k in encoding if k not in valid_encodings]
         if invalid:
diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index 3abb29f74bf..c493cffc748 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -230,10 +230,6 @@ def _importorskip_h5netcdf_ros3(has_h5netcdf: bool):
     "netCDF4", "1.6.2"
 )
 
-has_h5netcdf_1_7_0_or_above, requires_h5netcdf_1_7_0_or_above = _importorskip(
-    "h5netcdf", "1.7.0.dev"
-)
-
 has_netCDF4_1_7_0_or_above, requires_netCDF4_1_7_0_or_above = _importorskip(
     "netCDF4", "1.7.0"
 )
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index f40a39ba51a..cf5e76ac4d1 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -90,7 +90,6 @@
     requires_dask,
     requires_fsspec,
     requires_h5netcdf,
-    requires_h5netcdf_1_7_0_or_above,
     requires_h5netcdf_or_netCDF4,
     requires_h5netcdf_ros3,
     requires_iris,
@@ -2404,6 +2403,7 @@ def test_compression_encoding(self, compression: str | None) -> None:
             }
         )
         with self.roundtrip(data) as actual:
+            # Something should get updated here
             expected_encoding = data["var2"].encoding.copy()
             # compression does not appear in the retrieved encoding, that differs
             # from the input encoding. shuffle also chantges. Here we modify the
@@ -4818,7 +4818,6 @@ def test_string_attributes_stored_as_char(self, tmp_path):
         assert ds._h5file.attrs["foo"].dtype == np.dtype("S3")
 
 
-@requires_h5netcdf_1_7_0_or_above
 class TestNetCDF4ClassicViaH5NetCDFData(TestNetCDF4ClassicViaNetCDF4Data):
     engine: T_NetcdfEngine = "h5netcdf"
     file_format: T_NetcdfTypes = "NETCDF4_CLASSIC"
@@ -5072,15 +5071,15 @@ def test_compression_check_encoding_h5py(self) -> None:
                 assert actual.x.encoding["complevel"] == 6
 
         # Incompatible encodings cause a crash
-        with create_tmp_file() as tmp_file:
-            with pytest.raises(
-                ValueError, match=r"'zlib' and 'compression' encodings mismatch"
-            ):
-                data.to_netcdf(
-                    tmp_file,
-                    engine="h5netcdf",
-                    encoding={"x": {"compression": "lzf", "zlib": True}},
-                )
+        # with create_tmp_file() as tmp_file:
+        #     with pytest.raises(
+        #         ValueError, match=r"'zlib' and 'compression' encodings mismatch"
+        #     ):
+        #         data.to_netcdf(
+        #             tmp_file,
+        #             engine="h5netcdf",
+        #             encoding={"x": {"compression": "lzf", "zlib": True}},
+        #         )
 
         with create_tmp_file() as tmp_file:
             with pytest.raises(