Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion requirements-test-minimal.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
fsspec>=2022.11.0;sys_platform != "win32"
numpy==1.19.3
pyarrow==7.0.0
pandas==1.1.3
pyarrow==14.0.0
pytest>=6
pytest-cov
pytest-xdist
111 changes: 37 additions & 74 deletions src/awkward/operations/str/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,95 +236,58 @@ def _get_split_action(
bytestring_to_string=False,
**kwargs,
):
from awkward._backends.typetracer import TypeTracerBackend
from awkward.forms import ListOffsetForm, NumpyForm

typetracer = TypeTracerBackend.instance()

# FIXME: this workaround for typetracer is required because
# split_XXX does not support length-zero arrays
# c.f. https://github.com/apache/arrow/issues/37437
def action(layout, **_):
if layout.backend is typetracer:
if layout.is_list and layout.parameter("__array__") == "string":
return (
ListOffsetForm(
"i32",
ListOffsetForm(
layout.form.offsets,
NumpyForm("uint8", parameters={"__array__": "char"}),
if layout.is_list and layout.parameter("__array__") == "string":
return _drop_option_preserving_form(
_apply_through_arrow(
utf8_function,
layout,
*args,
generate_bitmasks=generate_bitmasks,
**kwargs,
)
)

elif layout.is_list and layout.parameter("__array__") == "bytestring":
if bytestring_to_string:
out = _drop_option_preserving_form(
_apply_through_arrow(
ascii_function,
layout.copy(
content=layout.content.copy(
parameters={"__array__": "char"}
),
parameters={"__array__": "string"},
),
*args,
generate_bitmasks=generate_bitmasks,
**kwargs,
)
.length_zero_array()
.to_typetracer(forget_length=True)
)
assert out.is_list

elif layout.is_list and layout.parameter("__array__") == "bytestring":
return (
ListOffsetForm(
"i32",
ListOffsetForm(
layout.form.offsets,
NumpyForm("uint8", parameters={"__array__": "byte"}),
parameters={"__array__": "bytestring"},
assert (
out.content.is_list
and out.content.parameter("__array__") == "string"
)
return out.copy(
content=out.content.copy(
content=out.content.content.copy(
parameters={"__array__": "byte"}
),
)
.length_zero_array()
.to_typetracer(forget_length=True)
parameters={"__array__": "bytestring"},
),
)
else:
if layout.is_list and layout.parameter("__array__") == "string":

else:
return _drop_option_preserving_form(
_apply_through_arrow(
utf8_function,
ascii_function,
layout,
*args,
generate_bitmasks=generate_bitmasks,
**kwargs,
)
)

elif layout.is_list and layout.parameter("__array__") == "bytestring":
if bytestring_to_string:
out = _drop_option_preserving_form(
_apply_through_arrow(
ascii_function,
layout.copy(
content=layout.content.copy(
parameters={"__array__": "char"}
),
parameters={"__array__": "string"},
),
*args,
generate_bitmasks=generate_bitmasks,
**kwargs,
)
)
assert out.is_list

assert (
out.content.is_list
and out.content.parameter("__array__") == "string"
)
return out.copy(
content=out.content.copy(
content=out.content.content.copy(
parameters={"__array__": "byte"}
),
parameters={"__array__": "bytestring"},
),
)

else:
return _drop_option_preserving_form(
_apply_through_arrow(
ascii_function,
layout,
*args,
generate_bitmasks=generate_bitmasks,
**kwargs,
)
)

return action
19 changes: 9 additions & 10 deletions tests/test_2616_use_pyarrow_for_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,16 +881,15 @@ def test_slice():
== ak.str.slice(ak.to_backend(bytestring, "typetracer"), 1, 3).layout.form
)

# ArrowInvalid: Negative buffer resize: -40 (looks like an Arrow bug)
# assert ak.str.slice(string, 1).tolist() == [
# ["αβγ"[1:], ""[1:]],
# [],
# ["→δε←"[1:], "ζz zζ"[1:], "abc"[1:]],
# ]
# assert (
# ak.str.slice(string, 1).layout.form
# == ak.str.slice(ak.to_backend(string, "typetracer"), 1).layout.form
# )
assert ak.str.slice(string, 1).tolist() == [
["αβγ"[1:], ""[1:]],
[],
["→δε←"[1:], "ζz zζ"[1:], "abc"[1:]],
]
assert (
ak.str.slice(string, 1).layout.form
== ak.str.slice(ak.to_backend(string, "typetracer"), 1).layout.form
)
assert ak.str.slice(bytestring, 1).tolist() == [
["αβγ".encode()[1:], b""[1:]],
[],
Expand Down
Loading