
Commit 8f79e78

[bugfix] support remove bucketizer for sequence feature and add tests (#215)
1 parent 7caf7d4 commit 8f79e78

3 files changed

Lines changed: 205 additions & 24 deletions


tzrec/features/feature.py

Lines changed: 18 additions & 1 deletion
@@ -813,8 +813,21 @@ def _copy_assets(
     return feature


+def _remove_one_feature_bucketizer(fg_json: Dict[str, Any]) -> Dict[str, Any]:
+    fg_json.pop("hash_bucket_size", None)
+    fg_json.pop("vocab_dict", None)
+    fg_json.pop("vocab_list", None)
+    fg_json.pop("boundaries", None)
+    fg_json.pop("num_buckets", None)
+    if fg_json["feature_type"] != "tokenize_feature":
+        fg_json.pop("vocab_file", None)
+    return fg_json
+
+
 def create_fg_json(
-    features: List[BaseFeature], asset_dir: Optional[str] = None
+    features: List[BaseFeature],
+    asset_dir: Optional[str] = None,
+    remove_bucketizer: bool = False,
 ) -> Dict[str, Any]:
     """Create feature generate config for features."""
     results = []
@@ -835,10 +848,14 @@ def create_fg_json(
                )
                seq_to_idx[feature.sequence_name] = len(results) - 1
            fg_json = feature.fg_json()
+            if remove_bucketizer:
+                fg_json = [_remove_one_feature_bucketizer(x) for x in fg_json]
            idx = seq_to_idx[feature.sequence_name]
            results[idx]["features"].extend(fg_json)
        else:
            fg_json = feature.fg_json()
+            if remove_bucketizer:
+                fg_json = [_remove_one_feature_bucketizer(x) for x in fg_json]
            results.extend(fg_json)
    return {"features": results}
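The helper added above strips the bucketizer-related keys from a single fg json entry, and create_fg_json now applies it to the sub-features of grouped sequence features as well as to top-level features. A minimal, self-contained sketch of the intended effect, using plain dicts with illustrative values rather than the repo's BaseFeature objects:

from typing import Any, Dict


def _remove_one_feature_bucketizer(fg_json: Dict[str, Any]) -> Dict[str, Any]:
    # Same keys as the helper above: drop bucketizer settings,
    # keeping vocab_file only for tokenize_feature.
    for key in ("hash_bucket_size", "vocab_dict", "vocab_list", "boundaries", "num_buckets"):
        fg_json.pop(key, None)
    if fg_json["feature_type"] != "tokenize_feature":
        fg_json.pop("vocab_file", None)
    return fg_json


# A grouped sequence entry roughly as create_fg_json assembles it (illustrative values).
seq_entry = {
    "sequence_name": "click_seq",
    "features": [
        {"feature_type": "id_feature", "feature_name": "cat_a", "hash_bucket_size": 10},
        {"feature_type": "raw_feature", "feature_name": "int_a", "boundaries": [7.0, 8.0, 9.0]},
    ],
}

# With remove_bucketizer=True the sub-features are cleaned before being
# extended into the group's "features" list.
seq_entry["features"] = [_remove_one_feature_bucketizer(x) for x in seq_entry["features"]]
assert "hash_bucket_size" not in seq_entry["features"][0]
assert "boundaries" not in seq_entry["features"][1]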

tzrec/features/feature_test.py

Lines changed: 184 additions & 10 deletions
@@ -14,6 +14,7 @@
 import shutil
 import tempfile
 import unittest
+from collections import OrderedDict

 import numpy as np
 import pyarrow as pa
@@ -238,23 +239,27 @@ def _create_test_feature_cfgs(self):
         return [
             feature_pb2.FeatureConfig(
                 id_feature=feature_pb2.IdFeature(
-                    feature_name="cat_a", expression="item:cat_a", hash_bucket_size=100
+                    feature_name="cat_a", expression="item:cat_a", num_buckets=100
                 )
             ),
             feature_pb2.FeatureConfig(
                 raw_feature=feature_pb2.RawFeature(
-                    feature_name="int_a", expression="item:int_a"
+                    feature_name="int_a", expression="item:int_a", boundaries=[1, 2, 3]
                 )
             ),
             feature_pb2.FeatureConfig(
                 combo_feature=feature_pb2.ComboFeature(
                     feature_name="combo_c",
                     expression=["user:combo_uc", "item:combo_ic"],
+                    hash_bucket_size=1000,
                 )
             ),
             feature_pb2.FeatureConfig(
                 lookup_feature=feature_pb2.LookupFeature(
-                    feature_name="lookup_d", map="user:map_d", key="item:key_d"
+                    feature_name="lookup_d",
+                    map="user:map_d",
+                    key="item:key_d",
+                    vocab_list=["a", "b", "c"],
                 )
             ),
             feature_pb2.FeatureConfig(
@@ -263,13 +268,15 @@ def _create_test_feature_cfgs(self):
                     nested_map="user:nested_map",
                     pkey="item:key_e",
                     skey="item:key_f",
+                    vocab_dict={"e": 2, "f": 3, "g": 4},
                 )
             ),
             feature_pb2.FeatureConfig(
                 expr_feature=feature_pb2.ExprFeature(
                     feature_name="expr_f",
                     expression="int_g+int_h",
                     variables=["item:int_g", "item:int_h"],
+                    boundaries=[4, 5, 6],
                 )
             ),
             feature_pb2.FeatureConfig(
@@ -285,6 +292,8 @@ def _create_test_feature_cfgs(self):
                     expression="item:click_seq_cat_simple",
                     sequence_length=50,
                     sequence_delim=";",
+                    vocab_file="data/test/id_vocab_list_0",
+                    default_bucketize_value=0,
                 )
             ),
             feature_pb2.FeatureConfig(
@@ -304,12 +313,16 @@ def _create_test_feature_cfgs(self):
                     features=[
                         feature_pb2.SeqFeatureConfig(
                             id_feature=feature_pb2.IdFeature(
-                                feature_name="cat_a", expression="item:cat_a"
+                                feature_name="cat_a",
+                                expression="item:cat_a",
+                                hash_bucket_size=10,
                             )
                         ),
                         feature_pb2.SeqFeatureConfig(
                             raw_feature=feature_pb2.RawFeature(
-                                feature_name="int_a", expression="item:int_a"
+                                feature_name="int_a",
+                                expression="item:int_a",
+                                boundaries=[7, 8, 9],
                             )
                         ),
                     ],
@@ -321,10 +334,12 @@ def _create_test_feature_cfgs(self):
     def test_create_fg_json(self, with_asset_dir=False):
         asset_dir = None
         token_file = "data/test/tokenizer.json"
+        vocab_file = "data/test/id_vocab_list_0"
         if with_asset_dir:
             self.test_dir = tempfile.mkdtemp(prefix="tzrec_", dir="./tmp")
             asset_dir = self.test_dir
             token_file = "tokenizer_b2faab7921bbfb593973632993ca4c85.json"
+            vocab_file = "id_vocab_list_0_583794bd44eb2c6d83336c71258521e8"
         feature_cfgs = self._create_test_feature_cfgs()
         features = feature_lib.create_features(feature_cfgs, fg_mode=FgMode.FG_DAG)
         fg_json = feature_lib.create_fg_json(features, asset_dir=asset_dir)
@@ -340,15 +355,16 @@ def test_create_fg_json(self, with_asset_dir=False):
                     "expression": "item:cat_a",
                     "value_type": "string",
                     "need_prefix": False,
-                    "hash_bucket_size": 100,
                     "value_dim": 0,
+                    "num_buckets": 100,
                 },
                 {
                     "feature_type": "raw_feature",
                     "feature_name": "int_a",
                     "default_value": "0",
                     "expression": "item:int_a",
                     "value_type": "float",
+                    "boundaries": [1.0, 2.0, 3.0],
                 },
                 {
                     "feature_type": "combo_feature",
@@ -358,17 +374,21 @@ def test_create_fg_json(self, with_asset_dir=False):
                     "value_type": "string",
                     "need_prefix": False,
                     "value_dim": 0,
+                    "hash_bucket_size": 1000,
                 },
                 {
                     "feature_type": "lookup_feature",
                     "feature_name": "lookup_d",
                     "map": "user:map_d",
                     "key": "item:key_d",
                     "default_value": "0",
-                    "value_type": "float",
-                    "needDiscrete": False,
+                    "value_type": "string",
+                    "needDiscrete": True,
                     "needKey": False,
-                    "combiner": "sum",
+                    "combiner": "",
+                    "value_dim": 1,
+                    "vocab_list": ["0", "<OOV>", "a", "b", "c"],
+                    "default_bucketize_value": 1,
                 },
                 {
                     "feature_type": "match_feature",
@@ -378,10 +398,159 @@ def test_create_fg_json(self, with_asset_dir=False):
                     "item": "item:key_f",
                     "matchType": "hit",
                     "default_value": "0",
+                    "value_type": "string",
+                    "needDiscrete": True,
+                    "show_category": False,
+                    "show_item": False,
+                    "value_dim": 1,
+                    "vocab_dict": OrderedDict(
+                        [("e", 2), ("f", 3), ("g", 4), ("0", 0)]
+                    ),
+                    "default_bucketize_value": 1,
+                },
+                {
+                    "feature_name": "expr_f",
+                    "feature_type": "expr_feature",
+                    "expression": "int_g+int_h",
+                    "variables": ["item:int_g", "item:int_h"],
+                    "default_value": "0",
+                    "value_type": "float",
+                    "boundaries": [4.0, 5.0, 6.0],
+                },
+                {
+                    "feature_name": "token_g",
+                    "feature_type": "tokenize_feature",
+                    "expression": "item:token_g",
+                    "output_delim": "\x03",
+                    "output_type": "word_id",
+                    "tokenizer_type": "bpe",
+                    "vocab_file": token_file,
+                    "default_value": "",
+                },
+                {
+                    "feature_name": "click_seq_cat_simple",
+                    "feature_type": "sequence_id_feature",
+                    "sequence_delim": ";",
+                    "sequence_length": 50,
+                    "expression": "item:click_seq_cat_simple",
+                    "default_value": "0",
+                    "need_prefix": False,
+                    "value_type": "string",
+                    "value_dim": 1,
+                    "vocab_file": vocab_file,
+                    "default_bucketize_value": 0,
+                },
+                {
+                    "feature_name": "click_seq_int_simple",
+                    "feature_type": "sequence_raw_feature",
+                    "sequence_delim": ";",
+                    "sequence_length": 50,
+                    "expression": "user:click_seq_int_simple",
+                    "default_value": "0",
+                    "value_type": "float",
+                },
+                {
+                    "sequence_name": "click_seq",
+                    "sequence_length": 50,
+                    "sequence_delim": ";",
+                    "sequence_pk": "user:click_seq",
+                    "features": [
+                        {
+                            "feature_type": "id_feature",
+                            "feature_name": "cat_a",
+                            "default_value": "0",
+                            "expression": "item:cat_a",
+                            "value_type": "string",
+                            "need_prefix": False,
+                            "value_dim": 1,
+                            "hash_bucket_size": 10,
+                        },
+                        {
+                            "feature_type": "raw_feature",
+                            "feature_name": "int_a",
+                            "default_value": "0",
+                            "expression": "item:int_a",
+                            "value_type": "float",
+                            "boundaries": [7.0, 8.0, 9.0],
+                        },
+                    ],
+                },
+            ]
+        },
+    )
+        if with_asset_dir:
+            self.assertTrue(os.path.exists(os.path.join(asset_dir, token_file)))
+
+    @parameterized.expand([[False], [True]])
+    def test_create_fg_json_remove_bucketizer(self, with_asset_dir=False):
+        asset_dir = None
+        token_file = "data/test/tokenizer.json"
+        if with_asset_dir:
+            self.test_dir = tempfile.mkdtemp(prefix="tzrec_", dir="./tmp")
+            asset_dir = self.test_dir
+            token_file = "tokenizer_b2faab7921bbfb593973632993ca4c85.json"
+        feature_cfgs = self._create_test_feature_cfgs()
+        features = feature_lib.create_features(feature_cfgs, fg_mode=FgMode.FG_DAG)
+        fg_json = feature_lib.create_fg_json(
+            features, asset_dir=asset_dir, remove_bucketizer=True
+        )
+        self.maxDiff = None
+        self.assertEqual(
+            fg_json,
+            {
+                "features": [
+                    {
+                        "feature_type": "id_feature",
+                        "feature_name": "cat_a",
+                        "default_value": "",
+                        "expression": "item:cat_a",
+                        "value_type": "string",
+                        "need_prefix": False,
+                        "value_dim": 0,
+                    },
+                    {
+                        "feature_type": "raw_feature",
+                        "feature_name": "int_a",
+                        "default_value": "0",
+                        "expression": "item:int_a",
                     "value_type": "float",
-                    "needDiscrete": False,
+                },
+                {
+                    "feature_type": "combo_feature",
+                    "feature_name": "combo_c",
+                    "default_value": "",
+                    "expression": ["user:combo_uc", "item:combo_ic"],
+                    "value_type": "string",
+                    "need_prefix": False,
+                    "value_dim": 0,
+                },
+                {
+                    "feature_type": "lookup_feature",
+                    "feature_name": "lookup_d",
+                    "map": "user:map_d",
+                    "key": "item:key_d",
+                    "default_value": "0",
+                    "value_type": "string",
+                    "needDiscrete": True,
+                    "needKey": False,
+                    "combiner": "",
+                    "value_dim": 1,
+                    "default_bucketize_value": 1,
+                },
+                {
+                    "feature_type": "match_feature",
+                    "feature_name": "match_e",
+                    "user": "user:nested_map",
+                    "category": "item:key_e",
+                    "item": "item:key_f",
+                    "matchType": "hit",
+                    "default_value": "0",
+                    "value_type": "string",
+                    "needDiscrete": True,
                     "show_category": False,
                     "show_item": False,
+                    "value_dim": 1,
+                    "default_bucketize_value": 1,
                 },
                 {
                     "feature_name": "expr_f",
@@ -411,6 +580,7 @@ def test_create_fg_json(self, with_asset_dir=False):
                     "need_prefix": False,
                     "value_type": "string",
                     "value_dim": 1,
+                    "default_bucketize_value": 0,
                 },
                 {
                     "feature_name": "click_seq_int_simple",
@@ -458,10 +628,12 @@ def test_create_feauture_configs(self, with_asset_dir=False):

         asset_dir = None
         token_file = "data/test/tokenizer.json"
+        vocab_file = "data/test/id_vocab_list_0"
         if with_asset_dir:
             self.test_dir = tempfile.mkdtemp(prefix="tzrec_", dir="./tmp")
             asset_dir = self.test_dir
             token_file = "tokenizer_b2faab7921bbfb593973632993ca4c85.json"
+            vocab_file = "id_vocab_list_0_583794bd44eb2c6d83336c71258521e8"

         again_feature_cfgs = feature_lib.create_feature_configs(
             features, asset_dir=asset_dir
@@ -470,6 +642,8 @@ def test_create_feauture_configs(self, with_asset_dir=False):
         if with_asset_dir:
             feature_cfgs[6].tokenize_feature.vocab_file = token_file
             feature_cfgs[6].tokenize_feature.asset_dir = asset_dir
+            feature_cfgs[7].sequence_id_feature.vocab_file = vocab_file
+            feature_cfgs[7].sequence_id_feature.asset_dir = asset_dir
             self.assertTrue(os.path.exists(os.path.join(asset_dir, token_file)))
         self.assertEqual(repr(feature_cfgs), repr(again_feature_cfgs))
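The new test case reuses the @parameterized.expand pattern of the existing test_create_fg_json, so both the with- and without-asset_dir paths are covered. A minimal standalone illustration of that pattern, with generic names and assuming the same parameterized package this test file already imports:

import unittest

from parameterized import parameterized


class ExampleTest(unittest.TestCase):
    # Each argument list becomes its own test case, which is how
    # test_create_fg_json_remove_bucketizer covers with_asset_dir=False and True.
    @parameterized.expand([[False], [True]])
    def test_example(self, with_asset_dir=False):
        self.assertIn(with_asset_dir, (False, True))


if __name__ == "__main__":
    unittest.main()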

tzrec/tools/create_fg_json.py

Lines changed: 3 additions & 13 deletions
@@ -9,9 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-
 import argparse
-import copy
 import json
 import os
 import shutil
@@ -91,17 +89,9 @@
         _ = next(iterator)

     tmp_dir = tempfile.mkdtemp(prefix="tzrec_")
-    fg_json = create_fg_json(features, asset_dir=tmp_dir)
-    if args.remove_bucketizer:
-        fg_json = copy.copy(fg_json)
-        for feature in fg_json["features"]:
-            feature.pop("hash_bucket_size", None)
-            feature.pop("vocab_dict", None)
-            feature.pop("vocab_list", None)
-            feature.pop("boundaries", None)
-            feature.pop("num_buckets", None)
-            if feature["feature_type"] != "tokenize_feature":
-                feature.pop("vocab_file", None)
+    fg_json = create_fg_json(
+        features, asset_dir=tmp_dir, remove_bucketizer=args.remove_bucketizer
+    )

     if args.reserves is not None:
         reserves = []
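Delegating to create_fg_json also removes the likely source of the original bug: the old tool-level loop only visited the top-level entries of fg_json["features"], and a grouped sequence entry nests its sub-features under its own "features" list (and carries no feature_type key), so those nested entries were never cleaned. A compact sketch of that failure mode, using plain dicts with illustrative values rather than the repo's API:

# fg_json as produced for one top-level feature plus one grouped sequence feature.
fg_json = {
    "features": [
        {"feature_type": "id_feature", "feature_name": "cat_a", "num_buckets": 100},
        {
            "sequence_name": "click_seq",
            "features": [
                {"feature_type": "id_feature", "feature_name": "cat_a", "hash_bucket_size": 10},
            ],
        },
    ]
}

# Old behavior (simplified): only the top-level dicts were popped, so the
# nested sub-feature keeps its hash_bucket_size, and the group entry itself
# has no "feature_type" to check.
for feature in fg_json["features"]:
    feature.pop("num_buckets", None)
    feature.pop("hash_bucket_size", None)

assert "num_buckets" not in fg_json["features"][0]
assert "hash_bucket_size" in fg_json["features"][1]["features"][0]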
