Skip to content

Commit 013a61f

Browse files
wangbinluo authored and claude committed
Add Magi Attention for varlen + Context Parallelism
Implements MagiAttention (arXiv:2505.13211) to enable varlen attention with Context Parallelism, resolving the existing NotImplementedError in context_parallel.py. New module: torchtitan/distributed/varlen_cp/ (8 files) - Dispatch Solver: LPT greedy load balancing across CP ranks - Group-Cast: per-doc packed AllToAll-V (zero-redundancy communication) - Attention kernel: PyTorch-native flex_attention with BlockMask - Cross-layer metadata cache for dispatch plan and FFA ranges - Backward K/V reuse: saves 1x AllToAll-V - NVSHMEM transport: auto-detect with NCCL fallback Integration: - context_parallel.py: varlen case sets _cp_mesh on attention modules - attention.py: VarlenAttentionWrapper._forward_cp() dispatches to Magi Attention Zero external dependencies — all attention uses PyTorch native APIs (flex_attention, varlen_attn, scaled_dot_product_attention). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c6856e3 commit 013a61f

15 files changed

Lines changed: 6191 additions & 34 deletions
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import unittest
8+
9+
import torch
10+
11+
from torchtitan.distributed.varlen_cp.dispatch_ops import (
12+
compute_local_cu_seqlens,
13+
shard_sequence,
14+
)
15+
16+
17+
class TestComputeLocalCuSeqlens(unittest.TestCase):
    """Tests for compute_local_cu_seqlens across chunk/document layouts."""

    def _check(self, boundaries, start, end, want_cu, want_max):
        # Shared driver: run the op on a chunk [start, end) and compare
        # both the local cu_seqlens and the max per-doc length.
        cu = torch.tensor(boundaries, dtype=torch.int32)
        got_cu, got_max = compute_local_cu_seqlens(cu, start, end)
        self.assertEqual(got_cu.tolist(), want_cu)
        self.assertEqual(got_max, want_max)

    def test_doc_within_chunk(self):
        """Document entirely within the chunk."""
        self._check([0, 128, 256], 0, 128, [0, 128], 128)

    def test_doc_spanning_chunk(self):
        """Document spans across the chunk boundary."""
        # Chunk [0, 256): contains part of doc 0 (0-256).
        self._check([0, 300, 512], 0, 256, [0, 256], 256)
        # Chunk [256, 512): rest of doc 0 (256-300) and all of doc 1
        # (300-512); doc 1 contributes 212 tokens — the max in this chunk.
        self._check([0, 300, 512], 256, 512, [0, 44, 256], 212)

    def test_multiple_docs_in_chunk(self):
        """Multiple documents fit within one chunk."""
        self._check([0, 64, 128, 192, 256], 0, 256, [0, 64, 128, 192, 256], 64)

    def test_chunk_with_no_doc_boundaries(self):
        """Chunk is entirely within a single document."""
        self._check([0, 512], 128, 384, [0, 256], 256)
51+
52+
53+
class TestShardSequence(unittest.TestCase):
    """Tests for shard_sequence, which splits a tensor into contiguous
    per-rank chunks along the given sequence dimension."""

    def test_basic_sharding(self):
        # 1-D: each of 2 ranks gets a contiguous half of the sequence.
        x = torch.arange(8).float()
        shard_0 = shard_sequence(x, cp_rank=0, cp_world_size=2, seq_dim=0)
        shard_1 = shard_sequence(x, cp_rank=1, cp_world_size=2, seq_dim=0)
        torch.testing.assert_close(shard_0, torch.tensor([0.0, 1, 2, 3]))
        torch.testing.assert_close(shard_1, torch.tensor([4.0, 5, 6, 7]))

    def test_2d_sharding(self):
        # 2-D: shard along dim 1; the batch dim (dim 0) stays untouched.
        x = torch.arange(16).float().reshape(2, 8)
        shard_0 = shard_sequence(x, cp_rank=0, cp_world_size=2, seq_dim=1)
        shard_1 = shard_sequence(x, cp_rank=1, cp_world_size=2, seq_dim=1)
        self.assertEqual(shard_0.shape, (2, 4))
        self.assertEqual(shard_1.shape, (2, 4))
        # Fix: the original test only checked shapes, so a wrong-content
        # shard would pass. Also verify the values are the contiguous
        # halves, matching the 1-D behavior pinned in test_basic_sharding.
        torch.testing.assert_close(shard_0, x[:, :4])
        torch.testing.assert_close(shard_1, x[:, 4:])
67+
68+
69+
# Allow running this test file directly: `python <file>.py`.
if __name__ == "__main__":
    unittest.main()
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import unittest
8+
9+
from torchtitan.distributed.varlen_cp.dispatch_solver import solve_dispatch
10+
from torchtitan.distributed.varlen_cp.mask_primitives import (
11+
cu_seqlens_to_attn_slices,
12+
)
13+
14+
15+
class TestSolveDispatch(unittest.TestCase):
    """Tests for the LPT greedy dispatch solver used by varlen CP."""

    def test_uniform_docs(self):
        """Uniform document lengths should be balanced by default."""
        plan = solve_dispatch(
            cu_seqlens_to_attn_slices([0, 128, 256, 384, 512]),
            total_seqlen=512,
            chunk_size=256,
            cp_world_size=2,
        )

        # Plan metadata mirrors the solver inputs.
        self.assertEqual(plan.cp_world_size, 2)
        self.assertEqual(plan.chunk_size, 256)
        self.assertEqual(plan.total_seqlen, 512)
        self.assertEqual(plan.pad_size, 0)

        # Every rank should be assigned some work.
        for rank in (0, 1):
            self.assertGreater(plan.get_rank_work(rank), 0)

    def test_skewed_docs(self):
        """Skewed document lengths should still be distributed."""
        plan = solve_dispatch(
            cu_seqlens_to_attn_slices([0, 400, 450, 500, 512]),
            total_seqlen=512,
            chunk_size=256,
            cp_world_size=2,
        )
        # Neither rank is left idle even with one dominant document.
        self.assertGreater(plan.get_rank_work(0), 0)
        self.assertGreater(plan.get_rank_work(1), 0)

    def test_all_slices_covered(self):
        """All global slices should appear in some rank's assignment."""
        plan = solve_dispatch(
            cu_seqlens_to_attn_slices([0, 128, 300, 512]),
            total_seqlen=512,
            chunk_size=256,
            cp_world_size=2,
        )
        # Flatten the per-rank assignments and count sub-slices.
        total_sub_slices = 0
        for rank_assignments in plan.assignments:
            for entry in rank_assignments:
                total_sub_slices += len(entry.slices)
        self.assertGreater(total_sub_slices, 0)

    def test_single_rank(self):
        """Single CP rank should get all work."""
        plan = solve_dispatch(
            cu_seqlens_to_attn_slices([0, 256]),
            total_seqlen=256,
            chunk_size=256,
            cp_world_size=1,
        )
        self.assertEqual(len(plan.assignments), 1)
        self.assertGreater(plan.get_rank_work(0), 0)

    def test_minimax_property(self):
        """Greedy min-heap should produce reasonable load balance."""
        plan = solve_dispatch(
            cu_seqlens_to_attn_slices([0, 100, 200, 300, 400, 512]),
            total_seqlen=512,
            chunk_size=128,
            cp_world_size=4,
        )
        loads = [plan.get_rank_work(r) for r in range(4)]
        # Max work should be within 3x of min work (loose bound).
        if min(loads) > 0:
            self.assertLess(max(loads) / min(loads), 3.0)

    def test_num_chunks(self):
        # 512 tokens at chunk_size=128 -> 4 chunks.
        plan = solve_dispatch(
            cu_seqlens_to_attn_slices([0, 512]),
            total_seqlen=512,
            chunk_size=128,
            cp_world_size=4,
        )
        self.assertEqual(plan.num_chunks, 4)

    def test_pair_has_work(self):
        """pair_has_work returns True for valid pairs and False for above-diagonal pairs."""
        # Single doc of length 256 with chunk_size=128 gives 2 chunks.
        # Valid pairs: (0,0) and (1,1) on the diagonal, (1,0) below it.
        # (0,1) sits above the diagonal and must carry no work.
        plan = solve_dispatch(
            cu_seqlens_to_attn_slices([0, 256]),
            total_seqlen=256,
            chunk_size=128,
            cp_world_size=2,
        )

        self.assertTrue(plan.pair_has_work(0, 0))
        self.assertTrue(plan.pair_has_work(1, 1))
        self.assertTrue(plan.pair_has_work(1, 0))
        self.assertFalse(plan.pair_has_work(0, 1))

    def test_pair_has_work_no_spanning_docs(self):
        """pair_has_work returns False for off-diagonal when no doc spans chunks."""
        # Two docs, each exactly one chunk; no doc crosses a boundary.
        plan = solve_dispatch(
            cu_seqlens_to_attn_slices([0, 128, 256]),
            total_seqlen=256,
            chunk_size=128,
            cp_world_size=2,
        )

        # Diagonal pairs have work; off-diagonal pairs do not.
        self.assertTrue(plan.pair_has_work(0, 0))
        self.assertTrue(plan.pair_has_work(1, 1))
        self.assertFalse(plan.pair_has_work(1, 0))
        self.assertFalse(plan.pair_has_work(0, 1))
120+
121+
122+
# Allow running this test file directly: `python <file>.py`.
if __name__ == "__main__":
    unittest.main()
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import unittest
8+
9+
import torch
10+
11+
from torchtitan.distributed.varlen_cp.mask_primitives import (
12+
AttnSlice,
13+
cu_seqlens_to_attn_slices,
14+
make_slice_mask,
15+
MaskType,
16+
split_slice_at_chunk_boundary,
17+
)
18+
19+
20+
class TestMaskType(unittest.TestCase):
    def test_mask_type_values(self):
        # Each mask kind maps onto a stable integer code.
        for mask_type, code in (
            (MaskType.FULL, 0),
            (MaskType.CAUSAL, 1),
            (MaskType.INVCAUSAL, 2),
            (MaskType.BICAUSAL, 3),
        ):
            self.assertEqual(mask_type, code)
27+
28+
class TestAttnSlice(unittest.TestCase):
    """Tests for AttnSlice geometry and work estimation."""

    @staticmethod
    def _square(n, mask_type):
        # Build an n x n slice anchored at the origin.
        return AttnSlice(q_start=0, q_end=n, k_start=0, k_end=n, mask_type=mask_type)

    def test_basic_properties(self):
        s = self._square(10, MaskType.FULL)
        self.assertEqual(s.q_len, 10)
        self.assertEqual(s.k_len, 10)

    def test_work_estimate_full(self):
        # FULL covers the whole 100x100 block.
        self.assertAlmostEqual(
            self._square(100, MaskType.FULL).work_estimate, 10000.0
        )

    def test_work_estimate_causal(self):
        # CAUSAL keeps half of the square block.
        self.assertAlmostEqual(
            self._square(100, MaskType.CAUSAL).work_estimate, 5000.0
        )

    def test_work_estimate_invcausal(self):
        # INVCAUSAL keeps the complementary half.
        self.assertAlmostEqual(
            self._square(100, MaskType.INVCAUSAL).work_estimate, 5000.0
        )

    def test_work_estimate_bicausal(self):
        # BICAUSAL keeps a quarter of the block.
        self.assertAlmostEqual(
            self._square(100, MaskType.BICAUSAL).work_estimate, 2500.0
        )

    def test_work_estimate_minimum(self):
        """Tiny (1x1) slices should still report work_estimate >= 1.0."""
        self.assertGreaterEqual(
            self._square(1, MaskType.CAUSAL).work_estimate, 1.0
        )
58+
59+
60+
class TestCuSeqlensToAttnSlices(unittest.TestCase):
    """Tests for converting cumulative sequence lengths into attention slices."""

    def test_single_doc(self):
        result = cu_seqlens_to_attn_slices([0, 256], is_causal=True)
        self.assertEqual(len(result), 1)
        only = result[0]
        self.assertEqual(only.q_start, 0)
        self.assertEqual(only.q_end, 256)
        self.assertEqual(only.mask_type, MaskType.CAUSAL)

    def test_multi_doc(self):
        result = cu_seqlens_to_attn_slices([0, 128, 300, 512], is_causal=True)
        self.assertEqual(len(result), 3)
        # Each doc becomes a diagonal causal slice over its own token range.
        expected = [
            AttnSlice(0, 128, 0, 128, MaskType.CAUSAL),
            AttnSlice(128, 300, 128, 300, MaskType.CAUSAL),
            AttnSlice(300, 512, 300, 512, MaskType.CAUSAL),
        ]
        for got, want in zip(result, expected):
            self.assertEqual(got, want)

    def test_full_mask(self):
        result = cu_seqlens_to_attn_slices([0, 256], is_causal=False)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0].mask_type, MaskType.FULL)

    def test_tensor_input(self):
        # cu_seqlens may arrive as a torch tensor instead of a Python list.
        result = cu_seqlens_to_attn_slices(
            torch.tensor([0, 128, 256]), is_causal=True
        )
        self.assertEqual(len(result), 2)

    def test_empty_doc(self):
        """Adjacent equal values in cu_seqlens create zero-length docs."""
        result = cu_seqlens_to_attn_slices([0, 128, 128, 256])
        # Zero-length docs should be skipped, leaving two real slices.
        self.assertEqual(len(result), 2)
94+
95+
96+
class TestSplitSliceAtChunkBoundary(unittest.TestCase):
    """Tests for splitting an attention slice along chunk boundaries."""

    def test_doc_within_one_chunk(self):
        """Document fits entirely within one chunk."""
        src = AttnSlice(
            q_start=10, q_end=50, k_start=10, k_end=50, mask_type=MaskType.CAUSAL
        )
        parts = split_slice_at_chunk_boundary(src, chunk_size=64, total_seqlen=128)
        # No boundary crossed -> the slice comes back intact.
        self.assertEqual(len(parts), 1)
        self.assertEqual(parts[0].mask_type, MaskType.CAUSAL)
        self.assertEqual(parts[0].q_start, 10)
        self.assertEqual(parts[0].q_end, 50)

    def test_doc_spanning_two_chunks(self):
        """Document spans two chunks: diagonal blocks are CAUSAL, below-diagonal are FULL."""
        src = AttnSlice(
            q_start=48, q_end=80, k_start=48, k_end=80, mask_type=MaskType.CAUSAL
        )
        parts = split_slice_at_chunk_boundary(src, chunk_size=64, total_seqlen=128)

        # Expected decomposition:
        #   (chunk 0, chunk 0): q=[48,64), k=[48,64), CAUSAL (diagonal)
        #   (chunk 1, chunk 0): q=[64,80), k=[48,64), FULL   (below diagonal)
        #   (chunk 1, chunk 1): q=[64,80), k=[64,80), CAUSAL (diagonal)
        self.assertEqual(len(parts), 3)

        # Index each sub-slice by its (q-chunk, k-chunk) coordinates.
        by_chunk = {(p.q_start // 64, p.k_start // 64): p.mask_type for p in parts}
        self.assertEqual(by_chunk[(0, 0)], MaskType.CAUSAL)
        self.assertEqual(by_chunk[(1, 0)], MaskType.FULL)
        self.assertEqual(by_chunk[(1, 1)], MaskType.CAUSAL)

    def test_full_mask_stays_full(self):
        """FULL mask type sub-blocks are all FULL."""
        src = AttnSlice(
            q_start=48, q_end=80, k_start=48, k_end=80, mask_type=MaskType.FULL
        )
        parts = split_slice_at_chunk_boundary(src, chunk_size=64, total_seqlen=128)
        self.assertTrue(all(p.mask_type == MaskType.FULL for p in parts))
129+
130+
131+
class TestMakeSliceMask(unittest.TestCase):
    """Tests for make_slice_mask boolean mask construction."""

    def _assert_mask(self, q_len, k_len, mask_type, expected_rows):
        # Shared driver: build the mask and compare against a row list.
        mask = make_slice_mask(q_len, k_len, mask_type)
        self.assertTrue(torch.equal(mask, torch.tensor(expected_rows)))

    def test_full_mask(self):
        # FULL attends everywhere.
        self.assertTrue(make_slice_mask(4, 4, MaskType.FULL).all())

    def test_causal_square(self):
        # Lower-triangular, inclusive of the diagonal.
        self._assert_mask(
            4,
            4,
            MaskType.CAUSAL,
            [
                [True, False, False, False],
                [True, True, False, False],
                [True, True, True, False],
                [True, True, True, True],
            ],
        )

    def test_causal_rectangular(self):
        """Bottom-right aligned causal for q_len < k_len."""
        # q_len=2, k_len=4 -> offset = k_len - q_len = 2:
        #   row 0 sees keys j <= 0+2 -> {0,1,2}
        #   row 1 sees keys j <= 1+2 -> {0,1,2,3}
        self._assert_mask(
            2,
            4,
            MaskType.CAUSAL,
            [
                [True, True, True, False],
                [True, True, True, True],
            ],
        )

    def test_invcausal_mask(self):
        # Upper-triangular, inclusive of the diagonal.
        self._assert_mask(
            4,
            4,
            MaskType.INVCAUSAL,
            [
                [True, True, True, True],
                [False, True, True, True],
                [False, False, True, True],
                [False, False, False, True],
            ],
        )

    def test_bicausal_mask(self):
        # Intersection of causal and inverse-causal: the diagonal only.
        self._assert_mask(
            4,
            4,
            MaskType.BICAUSAL,
            [
                [True, False, False, False],
                [False, True, False, False],
                [False, False, True, False],
                [False, False, False, True],
            ],
        )
185+
186+
187+
# Allow running this test file directly: `python <file>.py`.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)