[feat] support ignore unused features in negative sampler (#102)

tiankongdeguiji · web-flow · commit a52dcfad3df5 · 2025-02-11T19:07:11.000+08:00
diff --git a/tzrec/datasets/sampler.py b/tzrec/datasets/sampler.py
@@ -29,6 +29,7 @@
 from tzrec.protos import sampler_pb2
 from tzrec.utils.env_util import use_hash_node_id
 from tzrec.utils.load_class import get_register_class_meta
+from tzrec.utils.logging_util import logger
 from tzrec.utils.misc_util import get_free_port
 
 
@@ -205,12 +206,25 @@ def __init__(
         self._attr_types = []
         self._attr_gl_types = []
         self._attr_np_types = []
+        self._valid_attr_names = []
+        self._ignore_attr_names = set()
         for field_name in config.attr_fields:
-            field = input_fields[field_name]
+            if field_name in input_fields:
+                field = input_fields[field_name]
+                self._valid_attr_names.append(field.name)
+            else:
+                field = pa.field(name=field_name, type=pa.string())
+                self._ignore_attr_names.add(field_name)
             self._attr_names.append(field.name)
             self._attr_types.append(field.type)
             self._attr_gl_types.append(_get_gl_type(field.type))
             self._attr_np_types.append(_get_np_type(field.type))
+        if len(self._ignore_attr_names) > 0:
+            logger.warning(
+                f"Features {self._ignore_attr_names} in "
+                # pyre-ignore [16]
+                f"{self.__class__.__name__} will be ignored."
+            )
 
         if config.HasField("field_delimiter"):
             gl.set_field_delimiter(config.field_delimiter)
@@ -268,9 +282,12 @@ def _parse_nodes(self, nodes: gl.Nodes) -> List[pa.Array]:
         int_idx = 0
         float_idx = 0
         string_idx = 0
-        for attr_type, attr_gl_type, attr_np_type in zip(
-            self._attr_types, self._attr_gl_types, self._attr_np_types
+        for attr_name, attr_type, attr_gl_type, attr_np_type in zip(
+            self._attr_names, self._attr_types, self._attr_gl_types, self._attr_np_types
         ):
+            if attr_name in self._ignore_attr_names:
+                string_idx += 1
+                continue
             if attr_gl_type == "int":
                 feature = nodes.int_attrs[:, :, int_idx]
                 int_idx += 1
@@ -295,9 +312,12 @@ def _parse_sparse_nodes(
         int_idx = 0
         float_idx = 0
         string_idx = 0
-        for attr_type, attr_gl_type, attr_np_type in zip(
-            self._attr_types, self._attr_gl_types, self._attr_np_types
+        for attr_name, attr_type, attr_gl_type, attr_np_type in zip(
+            self._attr_names, self._attr_types, self._attr_gl_types, self._attr_np_types
         ):
+            if attr_name in self._ignore_attr_names:
+                string_idx += 1
+                continue
             if attr_gl_type == "int":
                 feature = nodes.int_attrs[:, int_idx]
                 int_idx += 1
@@ -379,7 +399,7 @@ def get(self, input_data: Dict[str, pa.Array]) -> Dict[str, pa.Array]:
         ids = np.pad(ids, (0, self._batch_size - len(ids)), "edge")
         nodes = self._sampler.get(ids)
         features = self._parse_nodes(nodes)
-        result_dict = dict(zip(self._attr_names, features))
+        result_dict = dict(zip(self._valid_attr_names, features))
         return result_dict
 
     @property
@@ -470,7 +490,7 @@ def get(self, input_data: Dict[str, pa.Array]) -> Dict[str, pa.Array]:
         dst_ids = np.pad(dst_ids, (0, self._batch_size - len(dst_ids)), "edge")
         nodes = self._sampler.get(src_ids, dst_ids)
         features = self._parse_nodes(nodes)
-        result_dict = dict(zip(self._attr_names, features))
+        result_dict = dict(zip(self._valid_attr_names, features))
         return result_dict
 
     @property
@@ -565,7 +585,7 @@ def get(self, input_data: Dict[str, pa.Array]) -> Dict[str, pa.Array]:
         for i, v in enumerate(hard_neg_features):
             results.append(pa.concat_arrays([neg_features[i], v]))
 
-        result_dict = dict(zip(self._attr_names, results))
+        result_dict = dict(zip(self._valid_attr_names, results))
         result_dict["hard_neg_indices"] = pa.array(hard_neg_indices)
         return result_dict
 
@@ -667,7 +687,7 @@ def get(self, input_data: Dict[str, pa.Array]) -> Dict[str, pa.Array]:
         for i, v in enumerate(hard_neg_features):
             results.append(pa.concat_arrays([neg_features[i], v]))
 
-        result_dict = dict(zip(self._attr_names, results))
+        result_dict = dict(zip(self._valid_attr_names, results))
         result_dict["hard_neg_indices"] = pa.array(hard_neg_indices)
         return result_dict
 
@@ -789,7 +809,7 @@ def get(self, input_data: Dict[str, pa.Array]) -> Dict[str, pa.Array]:
         """
         ids = _pa_ids_to_npy(input_data[self._item_id_field]).reshape(-1, 1)
         batch_size = len(ids)
-        num_fea = len(self._attr_names[1:])
+        num_fea = len(self._valid_attr_names[1:])
 
         # positive node.
         pos_nodes = self._pos_sampler.get(ids).layer_nodes(1)
@@ -859,8 +879,8 @@ def get(self, input_data: Dict[str, pa.Array]) -> Dict[str, pa.Array]:
             for i in range(num_fea)
         ]
 
-        pos_result_dict = dict(zip(self._attr_names[1:], pos_fea_result))
-        neg_result_dict = dict(zip(self._attr_names[1:], neg_fea_result))
+        pos_result_dict = dict(zip(self._valid_attr_names[1:], pos_fea_result))
+        neg_result_dict = dict(zip(self._valid_attr_names[1:], neg_fea_result))
 
         return pos_result_dict, neg_result_dict
 
@@ -941,7 +961,7 @@ def get(self, input_data: Dict[str, pa.Array]) -> Dict[str, pa.Array]:
 
         pos_nodes = self._pos_sampler.get(ids).layer_nodes(1)
         pos_fea_result = self._parse_nodes(pos_nodes)[1:]
-        pos_result_dict = dict(zip(self._attr_names[1:], pos_fea_result))
+        pos_result_dict = dict(zip(self._valid_attr_names[1:], pos_fea_result))
 
         return pos_result_dict
 
diff --git a/tzrec/datasets/sampler_test.py b/tzrec/datasets/sampler_test.py
@@ -174,6 +174,39 @@ def _sampler_worker(res):
         self.assertEqual(len(res["float_b"]), 8)
         self.assertEqual(len(res["str_c"]), 8)
 
+    def test_negative_sampler_with_ignore_feature(self):
+        f = self._create_item_gl_data()
+
+        def _sampler_worker(res):
+            config = sampler_pb2.NegativeSampler(
+                input_path=f.name,
+                num_sample=8,
+                attr_fields=["int_a", "float_b", "str_c"],
+                item_id_field="item_id",
+            )
+            sampler = NegativeSampler(
+                config=config,
+                fields=[
+                    pa.field(name="int_a", type=pa.int64()),
+                    pa.field(name="str_c", type=pa.string()),
+                ],
+                batch_size=4,
+            )
+            assert sampler.estimated_sample_num == 8
+            sampler.init_cluster()
+            sampler.launch_server()
+            sampler.init()
+            res.update(sampler.get({"item_id": pa.array([0, 1, 2, 3])}))
+
+        res = mp.Manager().dict()
+        p = mp.Process(target=_sampler_worker, args=(res,))
+        p.start()
+        p.join()
+        if p.exitcode != 0:
+            raise RuntimeError("worker failed.")
+        self.assertEqual(len(res["int_a"]), 8)
+        self.assertEqual(len(res["str_c"]), 8)
+
     @unittest.skip("accidental process defunct error")
     def test_negative_sampler_multi_node(self):
         f = self._create_item_gl_data()