diff --git a/requirements-test-minimal.txt b/requirements-test-minimal.txt index 106bb58fe1..2e75de4888 100644 --- a/requirements-test-minimal.txt +++ b/requirements-test-minimal.txt @@ -1,6 +1,7 @@ fsspec>=2022.11.0;sys_platform != "win32" numpy==1.19.3 -pyarrow==7.0.0 +pandas==1.1.3 +pyarrow==14.0.0 pytest>=6 pytest-cov pytest-xdist diff --git a/src/awkward/operations/str/__init__.py b/src/awkward/operations/str/__init__.py index 1b28aafcb6..6a770ab92b 100644 --- a/src/awkward/operations/str/__init__.py +++ b/src/awkward/operations/str/__init__.py @@ -236,48 +236,53 @@ def _get_split_action( bytestring_to_string=False, **kwargs, ): - from awkward._backends.typetracer import TypeTracerBackend - from awkward.forms import ListOffsetForm, NumpyForm - - typetracer = TypeTracerBackend.instance() - - # FIXME: this workaround for typetracer is required because - # split_XXX does not support length-zero arrays - # c.f. https://github.com/apache/arrow/issues/37437 def action(layout, **_): - if layout.backend is typetracer: - if layout.is_list and layout.parameter("__array__") == "string": - return ( - ListOffsetForm( - "i32", - ListOffsetForm( - layout.form.offsets, - NumpyForm("uint8", parameters={"__array__": "char"}), + if layout.is_list and layout.parameter("__array__") == "string": + return _drop_option_preserving_form( + _apply_through_arrow( + utf8_function, + layout, + *args, + generate_bitmasks=generate_bitmasks, + **kwargs, + ) + ) + + elif layout.is_list and layout.parameter("__array__") == "bytestring": + if bytestring_to_string: + out = _drop_option_preserving_form( + _apply_through_arrow( + ascii_function, + layout.copy( + content=layout.content.copy( + parameters={"__array__": "char"} + ), parameters={"__array__": "string"}, ), + *args, + generate_bitmasks=generate_bitmasks, + **kwargs, ) - .length_zero_array() - .to_typetracer(forget_length=True) ) + assert out.is_list - elif layout.is_list and layout.parameter("__array__") == "bytestring": - return ( - ListOffsetForm( - "i32", - ListOffsetForm( - layout.form.offsets, - NumpyForm("uint8", parameters={"__array__": "byte"}), - parameters={"__array__": "bytestring"}, + assert ( + out.content.is_list + and out.content.parameter("__array__") == "string" + ) + return out.copy( + content=out.content.copy( + content=out.content.content.copy( + parameters={"__array__": "byte"} ), - ) - .length_zero_array() - .to_typetracer(forget_length=True) + parameters={"__array__": "bytestring"}, + ), ) - else: - if layout.is_list and layout.parameter("__array__") == "string": + + else: return _drop_option_preserving_form( _apply_through_arrow( - utf8_function, + ascii_function, layout, *args, generate_bitmasks=generate_bitmasks, @@ -285,46 +290,4 @@ def action(layout, **_): ) ) - elif layout.is_list and layout.parameter("__array__") == "bytestring": - if bytestring_to_string: - out = _drop_option_preserving_form( - _apply_through_arrow( - ascii_function, - layout.copy( - content=layout.content.copy( - parameters={"__array__": "char"} - ), - parameters={"__array__": "string"}, - ), - *args, - generate_bitmasks=generate_bitmasks, - **kwargs, - ) - ) - assert out.is_list - - assert ( - out.content.is_list - and out.content.parameter("__array__") == "string" - ) - return out.copy( - content=out.content.copy( - content=out.content.content.copy( - parameters={"__array__": "byte"} - ), - parameters={"__array__": "bytestring"}, - ), - ) - - else: - return _drop_option_preserving_form( - _apply_through_arrow( - ascii_function, - layout, - *args, - generate_bitmasks=generate_bitmasks, - **kwargs, - ) - ) - return action diff --git a/tests/test_2616_use_pyarrow_for_strings.py b/tests/test_2616_use_pyarrow_for_strings.py index 2e58f73f78..c512f69cde 100644 --- a/tests/test_2616_use_pyarrow_for_strings.py +++ b/tests/test_2616_use_pyarrow_for_strings.py @@ -881,16 +881,15 @@ def test_slice(): == ak.str.slice(ak.to_backend(bytestring, "typetracer"), 1, 3).layout.form ) - # ArrowInvalid: Negative buffer resize: -40 (looks like an Arrow bug) - # assert ak.str.slice(string, 1).tolist() == [ - # ["αβγ"[1:], ""[1:]], - # [], - # ["→δε←"[1:], "ζz zζ"[1:], "abc"[1:]], - # ] - # assert ( - # ak.str.slice(string, 1).layout.form - # == ak.str.slice(ak.to_backend(string, "typetracer"), 1).layout.form - # ) + assert ak.str.slice(string, 1).tolist() == [ + ["αβγ"[1:], ""[1:]], + [], + ["→δε←"[1:], "ζz zζ"[1:], "abc"[1:]], + ] + assert ( + ak.str.slice(string, 1).layout.form + == ak.str.slice(ak.to_backend(string, "typetracer"), 1).layout.form + ) assert ak.str.slice(bytestring, 1).tolist() == [ ["αβγ".encode()[1:], b""[1:]], [],