Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 38 additions & 21 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1957,8 +1957,17 @@ def _consolidate_check(self) -> None:
self._is_consolidated = True
self._known_consolidated = True
return
dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
self._is_consolidated = len(dtypes) == len(set(dtypes))
# Exit early on first duplicate dtype rather than collecting all dtypes
dtypes: set[DtypeObj] = set()
for blk in self.blocks:
if blk._can_consolidate:
dtype = blk.dtype
if dtype in dtypes:
self._is_consolidated = False
self._known_consolidated = True
return
dtypes.add(dtype)
self._is_consolidated = True
self._known_consolidated = True

def _consolidate_inplace(self) -> None:
Expand Down Expand Up @@ -2374,17 +2383,18 @@ def raise_construction_error(
# -----------------------------------------------------------------------


def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]:
def _grouping_key(tup: tuple[int, ArrayLike]) -> Hashable:
dtype = tup[1].dtype

if is_1d_only_ea_dtype(dtype):
# We know these won't be consolidated, so don't need to group these.
# This avoids expensive comparisons of CategoricalDtype objects
sep = id(dtype)
Comment thread
jorisvandenbossche marked this conversation as resolved.
if isinstance(dtype, np.dtype):
# Only numpy dtypes get stacked into 2D blocks in _form_blocks,
# so only they need real grouping by dtype.
return dtype.name
else:
sep = 0

return sep, dtype
# Extension dtypes each get their own block regardless, so grouping
# doesn't matter. Use id() to avoid potentially expensive __hash__
# (e.g. CategoricalDtype hashes all categories).
return id(dtype)


def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
Expand All @@ -2396,11 +2406,18 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list
# when consolidating, we can ignore refs (either stacking always copies,
# or the EA is already copied in the calling dict_to_mgr)

# group by dtype
grouper = itertools.groupby(tuples, _grouping_func)
# group by dtype using a dict, which is faster than the old itertools.groupby
groups: dict[Hashable, list[tuple[int, ArrayLike]]] = {}
for tup in tuples:
key = _grouping_key(tup)
try:
groups[key].append(tup)
except KeyError:
groups[key] = [tup]
Comment thread
jorisvandenbossche marked this conversation as resolved.

nbs: list[Block] = []
for (_, dtype), tup_block in grouper:
for tup_block in groups.values():
dtype = tup_block[0][1].dtype
block_type = get_block_type(dtype)

if isinstance(dtype, np.dtype):
Expand Down Expand Up @@ -2487,19 +2504,19 @@ def _merge_blocks(
new_values: ArrayLike

if isinstance(blocks[0].dtype, np.dtype):
# error: List comprehension has incompatible type List[Union[ndarray,
# ExtensionArray]]; expected List[Union[complex, generic,
# Sequence[Union[int, float, complex, str, bytes, generic]],
# Sequence[Sequence[Any]], SupportsArray]]
new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc]
# Use np.concatenate directly instead of np.vstack to avoid the
# overhead of atleast_2d calls (block values are always 2D)
new_values = np.concatenate([b.values for b in blocks], axis=0)
else:
bvals = [blk.values for blk in blocks]
bvals2 = cast("Sequence[NDArrayBackedExtensionArray]", bvals)
new_values = bvals2[0]._concat_same_type(bvals2, axis=0)

argsort = np.argsort(new_mgr_locs)
new_values = new_values[argsort]
new_mgr_locs = new_mgr_locs[argsort]
# Only sort if locations are not already in order
if not libalgos.is_monotonic(new_mgr_locs, False)[0]:
argsort = np.argsort(new_mgr_locs)
new_values = new_values[argsort]
new_mgr_locs = new_mgr_locs[argsort]

bp = BlockPlacement(new_mgr_locs)
return [new_block_2d(new_values, placement=bp)], True
Expand Down
18 changes: 11 additions & 7 deletions pandas/tests/frame/methods/test_equals.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@ def test_dataframe_not_equal(self):
df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]})
assert df1.equals(df2) is False

def test_equals_different_blocks(self, using_infer_string):
def test_equals_different_blocks(self):
# GH#9330
df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
df1 = df0.reset_index()[["A", "B", "C"]]
if not using_infer_string:
# this assert verifies that the above operations have
# induced a block rearrangement
assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype
df0 = DataFrame(
{"A": [1.0, 2.0], "B": np.array([1, 2], dtype=np.int64), "C": [3.0, 4.0]}
)
# build df1 via sequential __setitem__ so the float columns end up
# in separate blocks instead of being consolidated upfront
df1 = DataFrame({"A": [1.0, 2.0]})
df1["B"] = np.array([1, 2], dtype=np.int64)
df1["C"] = np.array([3.0, 4.0])
assert len(df0._mgr.blocks) == 2
assert len(df1._mgr.blocks) == 3

# do the real tests
tm.assert_frame_equal(df0, df1)
Expand Down
Loading