alibaba
diff --git a/tzrec/models/dlrm_hstu.py b/tzrec/models/dlrm_hstu.py
Lines changed: 467 additions & 0 deletions
diff --git a/tzrec/models/dlrm_hstu_test.py b/tzrec/models/dlrm_hstu_test.py
Lines changed: 265 additions & 0 deletions
diff --git a/tzrec/models/rank_model.py b/tzrec/models/rank_model.py
Lines changed: 10 additions & 0 deletions
diff --git a/tzrec/modules/norm.py b/tzrec/modules/norm.py
Lines changed: 2 additions & 14 deletions
diff --git a/tzrec/ops/hstu_attention.py b/tzrec/ops/hstu_attention.py
Lines changed: 10 additions & 4 deletions
diff --git a/tzrec/ops/hstu_compute.py b/tzrec/ops/hstu_compute.py
Lines changed: 12 additions & 6 deletions
@@ -0,0 +1,265 @@
+# Copyright (c) 2025, Alibaba Group;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from parameterized import parameterized
+from torchrec import JaggedTensor, KeyedJaggedTensor
+
+from tzrec.datasets.utils import BASE_DATA_GROUP, Batch
+from tzrec.features.feature import create_features
+from tzrec.models.dlrm_hstu import DlrmHSTU
+from tzrec.ops import Kernel
+from tzrec.protos import (
+    feature_pb2,
+    loss_pb2,
+    model_pb2,
+    module_pb2,
+    tower_pb2,
+)
+from tzrec.protos.models import multi_task_rank_pb2
+from tzrec.utils.state_dict_util import init_parameters
+from tzrec.utils.test_util import TestGraphType, create_test_model
+
+
+class DlrmHSTUTest(unittest.TestCase):
+    """Forward-pass shape test for DlrmHSTU under several graph modes."""
+
+    @parameterized.expand(
+        [[TestGraphType.NORMAL], [TestGraphType.FX_TRACE], [TestGraphType.JIT_SCRIPT]]
+    )
+    def test_dlrm_hstu(self, graph_type) -> None:
+        """Build a small DlrmHSTU, run one batch and check output sizes.
+
+        Args:
+            graph_type: one of the TestGraphType values listed in the
+                parameterized.expand decorator; passed to create_test_model.
+        """
+        # Two plain id features, id/timestamp/action-weight sequence features
+        # for the history ("uih") side and the "item_*" candidate side, plus
+        # raw watch-time sequences for both sides.
+        feature_cfgs = [
+            feature_pb2.FeatureConfig(
+                id_feature=feature_pb2.IdFeature(
+                    feature_name="user_id", embedding_dim=16, num_buckets=100
+                )
+            ),
+            feature_pb2.FeatureConfig(
+                id_feature=feature_pb2.IdFeature(
+                    feature_name="user_active_degree",
+                    embedding_dim=16,
+                    num_buckets=1000,
+                )
+            ),
+            # video_id and item_video_id name the same embedding
+            # ("video_id_emb").
+            feature_pb2.FeatureConfig(
+                sequence_id_feature=feature_pb2.SequenceIdFeature(
+                    feature_name="video_id",
+                    embedding_dim=16,
+                    embedding_name="video_id_emb",
+                    num_buckets=1000,
+                )
+            ),
+            feature_pb2.FeatureConfig(
+                sequence_id_feature=feature_pb2.SequenceIdFeature(
+                    feature_name="item_video_id",
+                    embedding_dim=16,
+                    embedding_name="video_id_emb",
+                    num_buckets=1000,
+                )
+            ),
+            feature_pb2.FeatureConfig(
+                sequence_id_feature=feature_pb2.SequenceIdFeature(
+                    feature_name="action_timestamp"
+                )
+            ),
+            feature_pb2.FeatureConfig(
+                sequence_id_feature=feature_pb2.SequenceIdFeature(
+                    feature_name="item_query_time"
+                )
+            ),
+            feature_pb2.FeatureConfig(
+                sequence_id_feature=feature_pb2.SequenceIdFeature(
+                    feature_name="action_weight",
+                    num_buckets=1000,
+                )
+            ),
+            feature_pb2.FeatureConfig(
+                sequence_id_feature=feature_pb2.SequenceIdFeature(
+                    feature_name="item_action_weight",
+                    num_buckets=1000,
+                )
+            ),
+            feature_pb2.FeatureConfig(
+                sequence_raw_feature=feature_pb2.SequenceRawFeature(
+                    feature_name="watch_time"
+                )
+            ),
+            feature_pb2.FeatureConfig(
+                sequence_raw_feature=feature_pb2.SequenceRawFeature(
+                    feature_name="item_target_watchtime"
+                )
+            ),
+        ]
+        features = create_features(feature_cfgs)
+        # Only the id features appear in feature groups; the timestamp, action
+        # weight and watch-time features are referenced by name in the
+        # DlrmHSTU config below.
+        # NOTE(review): user_id / user_active_degree are declared as plain
+        # IdFeature but sit in a SEQUENCE-type group -- confirm this is the
+        # intended way to feed contextual features.
+        feature_groups = [
+            model_pb2.FeatureGroupConfig(
+                group_name="contextual",
+                feature_names=["user_id", "user_active_degree"],
+                group_type=model_pb2.FeatureGroupType.SEQUENCE,
+            ),
+            model_pb2.FeatureGroupConfig(
+                group_name="uih",
+                feature_names=[
+                    "video_id",
+                ],
+                group_type=model_pb2.FeatureGroupType.SEQUENCE,
+            ),
+            model_pb2.FeatureGroupConfig(
+                group_name="candidate",
+                feature_names=[
+                    "item_video_id",
+                ],
+                group_type=model_pb2.FeatureGroupType.SEQUENCE,
+            ),
+        ]
+
+        model_config = model_pb2.ModelConfig(
+            feature_groups=feature_groups,
+            dlrm_hstu=multi_task_rank_pb2.DlrmHSTU(
+                uih_id_feature_name="video_id",
+                uih_action_time_feature_name="action_timestamp",
+                uih_action_weight_feature_name="action_weight",
+                uih_watchtime_feature_name="watch_time",
+                candidates_id_feature_name="item_video_id",
+                candidates_query_time_feature_name="item_query_time",
+                candidates_action_weight_feature_name="item_action_weight",
+                candidates_watchtime_feature_name="item_target_watchtime",
+                hstu=module_pb2.HSTU(
+                    stu=module_pb2.STU(
+                        embedding_dim=512,
+                        num_heads=4,
+                        hidden_dim=128,
+                        attention_dim=128,
+                        output_dropout_ratio=0.2,
+                    ),
+                    positional_encoder=module_pb2.GRPositionalEncoder(
+                        num_position_buckets=8192,
+                        num_time_buckets=2048,
+                        use_time_encoding=True,
+                    ),
+                    input_preprocessor=module_pb2.GRInputPreprocessor(
+                        contextual_preprocessor=module_pb2.GRContextualPreprocessor(
+                            action_encoder=module_pb2.GRActionEncoder(
+                                action_embedding_dim=8,
+                                action_feature_name="action_weight",
+                                action_weights=[1, 2, 4],
+                            ),
+                            action_mlp=module_pb2.GRContextualizedMLP(
+                                simple_mlp=module_pb2.GRSimpleContextualizedMLP(
+                                    hidden_dim=256
+                                )
+                            ),
+                            content_mlp=module_pb2.GRContextualizedMLP(
+                                simple_mlp=module_pb2.GRSimpleContextualizedMLP(
+                                    hidden_dim=256
+                                )
+                            ),
+                        )
+                    ),
+                    output_postprocessor=module_pb2.GROutputPostprocessor(
+                        layernorm_postprocessor=module_pb2.GRLayerNormPostprocessor()
+                    ),
+                ),
+                # Three binary tasks share the "item_action_weight" label with
+                # task_bitmask 1 / 2 / 4 (presumably one bit of the label per
+                # task -- TODO confirm); the watchtime task uses an L2 loss on
+                # "item_target_watchtime".
+                fusion_mtl_tower=tower_pb2.FusionMTLTower(
+                    mlp=module_pb2.MLP(hidden_units=[512], activation="nn.SiLU"),
+                    task_configs=[
+                        tower_pb2.FusionSubTaskConfig(
+                            task_name="is_click",
+                            label_name="item_action_weight",
+                            task_bitmask=1,
+                            losses=[
+                                loss_pb2.LossConfig(
+                                    binary_cross_entropy=loss_pb2.BinaryCrossEntropy()
+                                )
+                            ],
+                        ),
+                        tower_pb2.FusionSubTaskConfig(
+                            task_name="is_like",
+                            label_name="item_action_weight",
+                            task_bitmask=2,
+                            losses=[
+                                loss_pb2.LossConfig(
+                                    binary_cross_entropy=loss_pb2.BinaryCrossEntropy()
+                                )
+                            ],
+                        ),
+                        tower_pb2.FusionSubTaskConfig(
+                            task_name="is_comment",
+                            label_name="item_action_weight",
+                            task_bitmask=4,
+                            losses=[
+                                loss_pb2.LossConfig(
+                                    binary_cross_entropy=loss_pb2.BinaryCrossEntropy()
+                                )
+                            ],
+                        ),
+                        tower_pb2.FusionSubTaskConfig(
+                            task_name="watchtime",
+                            label_name="item_target_watchtime",
+                            losses=[loss_pb2.LossConfig(l2_loss=loss_pb2.L2Loss())],
+                        ),
+                    ],
+                ),
+                max_seq_len=100,
+            ),
+        )
+        dlrm_hstu = DlrmHSTU(
+            model_config=model_config,
+            features=features,
+            labels=["item_action_weight", "item_target_watchtime"],
+        )
+        # Use the PyTorch kernel and materialize parameters on CPU before
+        # wrapping the model for the requested graph type.
+        dlrm_hstu.set_kernel(Kernel.PYTORCH)
+        init_parameters(dlrm_hstu, device=torch.device("cpu"))
+        dlrm_hstu = create_test_model(dlrm_hstu, graph_type)
+
+        # 8 keys x batch of 2 -> 16 lengths, two per key in key order:
+        #   user_id (1, 1), user_active_degree (1, 1),
+        #   video_id / action_weight / action_timestamp (2, 3),
+        #   item_video_id / item_action_weight / item_query_time (2, 4).
+        # The lengths sum to 37, matching the 37 values.
+        sparse_feature = KeyedJaggedTensor.from_lengths_sync(
+            keys=[
+                "user_id",
+                "user_active_degree",
+                "video_id",
+                "item_video_id",
+                "action_weight",
+                "item_action_weight",
+                "action_timestamp",
+                "item_query_time",
+            ],
+            values=torch.tensor(list(range(37))),
+            lengths=torch.tensor([1, 1, 1, 1, 2, 3, 2, 4, 2, 3, 2, 4, 2, 3, 2, 4]),
+        )
+        # Dense sequences use the same per-sample lengths as the sparse keys
+        # of their side: (2, 3) for history, (2, 4) for candidates.
+        sequence_dense_features = {
+            "watch_time": JaggedTensor(
+                values=torch.tensor([[0.1], [0.2], [0.3], [0.4], [0.5]]),
+                lengths=torch.tensor([2, 3]),
+            ),
+            "item_target_watchtime": JaggedTensor(
+                values=torch.tensor([[0.1], [0.2], [0.3], [0.4], [0.5], [0.6]]),
+                lengths=torch.tensor([2, 4]),
+            ),
+        }
+        batch = Batch(
+            sequence_dense_features=sequence_dense_features,
+            sparse_features={BASE_DATA_GROUP: sparse_feature},
+            labels={},
+        )
+        # The JIT-scripted model is called with the dict form of the batch;
+        # the other graph types take the Batch object itself.
+        if graph_type == TestGraphType.JIT_SCRIPT:
+            predictions = dlrm_hstu(batch.to_dict())
+        else:
+            predictions = dlrm_hstu(batch)
+        # The candidate sequences hold 2 + 4 = 6 items across the batch, and
+        # each binary task is expected to emit 6 logits and 6 probabilities.
+        # NOTE(review): nothing is asserted for the "watchtime" task output --
+        # consider adding a check once its prediction key is confirmed.
+        self.assertEqual(predictions["logits_is_click"].size(), (6,))
+        self.assertEqual(predictions["probs_is_click"].size(), (6,))
+        self.assertEqual(predictions["logits_is_like"].size(), (6,))
+        self.assertEqual(predictions["probs_is_like"].size(), (6,))
+        self.assertEqual(predictions["logits_is_comment"].size(), (6,))
+        self.assertEqual(predictions["probs_is_comment"].size(), (6,))
+
+
+# Run the test suite when this file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
@@ -37,6 +37,16 @@ def _update_tensor_dict(
     tensor_dict[key] = new_tensor
 
 
+def _is_classification_loss(loss_cfg: LossConfig) -> bool:
+    """Tell whether the configured loss is one of the classification losses.
+
+    Args:
+        loss_cfg (LossConfig): loss config whose `loss` oneof is inspected.
+
+    Returns:
+        True if the field set in the `loss` oneof is a classification loss.
+    """
+    classification_losses = (
+        "binary_cross_entropy",
+        "softmax_cross_entropy",
+        "jrc_loss",
+        "binary_focal_loss",
+    )
+    return loss_cfg.WhichOneof("loss") in classification_losses
+
+
 class RankModel(BaseModel):
     """Base model for ranking.
 
 
@@ -15,12 +15,7 @@
 import torch
 
 from tzrec.modules.utils import BaseModule
-from tzrec.ops import Kernel
-from tzrec.ops.layer_norm import (
-    layer_norm,
-    swish_layer_norm,
-)
-from tzrec.ops.triton.triton_layer_norm import triton_rms_norm
+from tzrec.ops.layer_norm import layer_norm, rms_norm, swish_layer_norm
 
 
 class LayerNorm(BaseModule):
@@ -78,16 +73,9 @@ def __init__(
         self._eps = eps
         self._weight = torch.nn.Parameter(torch.ones(dim))
 
-    def _norm(self, x: torch.Tensor) -> torch.Tensor:
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self._eps)
-
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward the module."""
-        if self.kernel() == Kernel.TRITON:
-            return triton_rms_norm(x, self._weight, self._eps)
-        else:
-            output = self._norm(x.float()).type_as(x)
-            return output * self._weight
+        # The selected kernel is passed through to rms_norm instead of being
+        # branched on in this module.
+        return rms_norm(x=x, weight=self._weight, eps=self._eps, kernel=self.kernel())
 
 
 class SwishLayerNorm(BaseModule):
 
@@ -17,16 +17,22 @@
 
 import torch
 from torch.fx._symbolic_trace import is_fx_tracing
+from torch.utils._triton import has_triton
 
 from tzrec.ops import Kernel
 from tzrec.ops.pytorch.pt_hstu_attention import (
     pytorch_cached_hstu_mha,
     pytorch_hstu_mha,
 )
-from tzrec.ops.triton.triton_hstu_attention import (
-    triton_cached_hstu_mha,
-    triton_hstu_mha,
-)
+
+# Import the triton attention kernels only when triton is available.
+if has_triton():
+    from tzrec.ops.triton.triton_hstu_attention import (
+        triton_cached_hstu_mha,
+        triton_hstu_mha,
+    )
+else:
+    # Without triton, bind the triton names to the PyTorch implementations so
+    # later references to them in this module still resolve.
+    # NOTE(review): presumably a Kernel.TRITON request then runs the PyTorch
+    # path without any warning -- confirm this silent fallback is intended.
+    triton_cached_hstu_mha = pytorch_cached_hstu_mha
+    triton_hstu_mha = pytorch_hstu_mha
 from tzrec.ops.utils import switch_to_contiguous_if_needed
 
 
 
@@ -18,6 +18,7 @@
 import torch
 import torch.nn.functional as F
 from torch.fx._symbolic_trace import is_fx_tracing
+from torch.utils._triton import has_triton
 
 from tzrec.ops import Kernel
 from tzrec.ops.hstu_attention import hstu_mha
@@ -26,12 +27,13 @@
 from tzrec.ops.pytorch.pt_hstu_linear import (
     pytorch_hstu_compute_output,
 )
-from tzrec.ops.triton.triton_hstu_linear import (
-    triton_hstu_compute_output,
-)
-from tzrec.ops.triton.triton_hstu_preprocess_and_attention import (
-    triton_hstu_preprocess_and_attention,
-)
+
+# Import the triton linear kernel only when triton is available.
+if has_triton():
+    from tzrec.ops.triton.triton_hstu_linear import (
+        triton_hstu_compute_output,
+    )
+else:
+    # Without triton, bind the triton name to the PyTorch implementation so
+    # later references to it in this module still resolve.
+    # NOTE(review): presumably a Kernel.TRITON request then runs the PyTorch
+    # path without any warning -- confirm this silent fallback is intended.
+    triton_hstu_compute_output = pytorch_hstu_compute_output
 
 
 def hstu_compute_uqvk(
@@ -164,6 +166,10 @@ def hstu_preprocess_and_attention(
             "uvqk_weight.shape[1] must equal 2 * num_heads * (hidden_dim + attn_dim)",
         )
     if kernel == Kernel.TRITON and prefill is False:
+        from tzrec.ops.triton.triton_hstu_preprocess_and_attention import (
+            triton_hstu_preprocess_and_attention,
+        )
+
         u, attn_output = triton_hstu_preprocess_and_attention(
             x=x,
             norm_weight=norm_weight,