diff --git a/doc/whats-new.rst b/doc/whats-new.rst index effb199f18e..52abc0d4219 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -29,6 +29,10 @@ Bug Fixes - Fix a major performance regression in :py:meth:`Coordinates.to_index` (and consequently :py:meth:`Dataset.to_dataframe`) caused by converting the cached code ndarrays into Python lists (:issue:`11305`). +- Fix :py:func:`concat` failing with ``TypeError: Cannot interpret + '<StringDtype(na_value=nan)>' as a data type`` when a ``pandas.Index`` with a + ``StringDtype`` is used as the new concat dimension and another input has a + numpy string-dtype coord (:issue:`11317`). Documentation diff --git a/xarray/structure/concat.py b/xarray/structure/concat.py index 9609dbcb5a9..d3cd3cf99fd 100644 --- a/xarray/structure/concat.py +++ b/xarray/structure/concat.py @@ -363,6 +363,13 @@ def _calc_concat_dim_index( else: (dim,) = dim_or_data.dims coord_dtype = getattr(dim_or_data, "dtype", None) + # pandas may produce a StringDtype-backed Index, which xarray does + # not treat as an allowed extension array dtype for coords. Let + # PandasIndex compute a valid numpy coord_dtype in that case so a + # subsequent concat against a numpy-string-dtype coord does not fail + # in ``np.result_type``. See GH#11317. + if isinstance(coord_dtype, pd.StringDtype): + coord_dtype = None index = PandasIndex(dim_or_data, dim, coord_dtype=coord_dtype) return dim, index diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index bc98d72d50c..5072d1989f0 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -1491,6 +1491,19 @@ def test_concat_index_not_same_dim() -> None: concat([ds1, ds2], dim="x") +def test_concat_pandas_index_string_dtype() -> None: + # Regression test for GH#11317. When pandas yields a StringDtype-backed + # Index (e.g. with ``future.infer_string`` enabled), passing it as the + # new concat dim must not break a subsequent concat against a coord with + # a numpy string dtype. 
+ with pd.option_context("future.infer_string", True): + da = DataArray([0], dims=["dim_a"], coords={"dim_a": ["a"]}) + db = DataArray([0], dims=["dim_b"], coords={"dim_b": ["b"]}) + db2 = concat([db], pd.Index(["b"], name="dim_a")) + result = concat([da, db2], dim="dim_a") + assert list(result["dim_a"].values) == ["a", "b"] + + class TestNewDefaults: def test_concat_second_empty_with_scalar_data_var_only_on_first(self) -> None: ds1 = Dataset(data_vars={"a": ("y", [0.1]), "b": 0.1}, coords={"x": 0.1})