def _is_likely_dap_url(url: str) -> bool:
    """
    Determine whether a URL is likely an OPeNDAP (DAP) endpoint.

    A URL is considered a DAP endpoint when it either uses an explicit DAP
    scheme (``dap2://`` or ``dap4://``) or is a remote URI whose path contains
    a segment used by common DAP server software. File extensions are not
    inspected.

    Parameters
    ----------
    url : str
        URL to inspect. Matching is case-insensitive.

    Returns
    -------
    bool
        True if the URL matches common DAP patterns, False otherwise.
    """
    if not url:
        return False

    url_lower = url.lower()

    # An explicit DAP scheme identifies the protocol on its own, so accept it
    # before (and independently of) the generic remote-URI check. Without
    # this, scheme-only URLs such as "dap2://host/dataset" match none of the
    # path patterns below and would be rejected.
    if url_lower.startswith(("dap2://", "dap4://")):
        return True

    # For remote URIs, check for DAP server software path patterns
    if is_remote_uri(url_lower):
        # All patterns are lowercase because they are compared to url_lower.
        dap_path_patterns = (
            "/dodsc/",  # THREDDS Data Server (TDS) DAP endpoint (case-insensitive)
            "/dods/",  # GrADS Data Server (GDS) DAP endpoint
            "/opendap/",  # Generic OPeNDAP/Hyrax server
            "/erddap/",  # ERDDAP data server
            "/dap2/",  # Explicit DAP2 version in path
            "/dap4/",  # Explicit DAP4 version in path
            "/dap/",
        )
        return any(pattern in url_lower for pattern in dap_path_patterns)

    return False
def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool:
    """Report whether this backend should claim ``filename_or_obj``.

    Only ``str`` inputs are considered; any other type is rejected. For a
    string, the decision is delegated to ``_is_likely_dap_url``.
    """
    return isinstance(filename_or_obj, str) and _is_likely_dap_url(filename_or_obj)
expected_dap_backend = "pydap" if has_pydap else "netcdf4" + # DAP URLs - netcdf4 should handle these (it comes first in backend order) + # Both netcdf4 and pydap can open DAP URLs, but netcdf4 has priority + expected_dap_backend = "netcdf4" dap_urls = [ + # Explicit DAP protocol schemes "dap2://opendap.earthdata.nasa.gov/collections/dataset", "dap4://opendap.earthdata.nasa.gov/collections/dataset", + "dap://example.com/dataset", "DAP2://example.com/dataset", # uppercase scheme "DAP4://example.com/dataset", # uppercase scheme + # DAP path indicators "https://example.com/services/DAP2/dataset", # uppercase in path + "http://test.opendap.org/opendap/data/nc/file.nc", # /opendap/ path + "https://coastwatch.pfeg.noaa.gov/erddap/griddap/erdMH1chla8day", # ERDDAP + "http://thredds.ucar.edu/thredds/dodsC/grib/NCEP/GFS/", # THREDDS dodsC + "https://disc2.gesdisc.eosdis.nasa.gov/dods/TRMM_3B42", # GrADS /dods/ ] for url in dap_urls: @@ -7457,20 +7463,16 @@ def test_remote_url_backend_auto_detection() -> None: f"URL {url!r} should select {expected_dap_backend!r} but got {engine!r}" ) - # URLs that should raise ValueError (no backend can open them) - invalid_urls = [ - "http://test.opendap.org/opendap/data/nc/coads_climatology.nc.dap", # .dap suffix - "https://example.com/data.dap", # .dap suffix - "http://opendap.example.com/data", # no extension, no DAP indicators - "https://test.opendap.org/dataset", # no extension, no DAP indicators + # URLs with .dap suffix are claimed by netcdf4 (backward compatibility fallback) + # Note: .dap suffix is intentionally NOT recognized as a DAP dataset URL + fallback_urls = [ + ("http://test.opendap.org/opendap/data/nc/coads_climatology.nc.dap", "netcdf4"), + ("https://example.com/data.dap", "netcdf4"), ] - for url in invalid_urls: - with pytest.raises( - ValueError, - match=r"did not find a match in any of xarray's currently installed IO backends", - ): - guess_engine(url) + for url, expected_backend in fallback_urls: + engine = 
guess_engine(url) + assert engine == expected_backend @requires_netCDF4