1414import shutil
1515import tempfile
1616import unittest
17+ from collections import OrderedDict
1718
1819import numpy as np
1920import pyarrow as pa
@@ -238,23 +239,27 @@ def _create_test_feature_cfgs(self):
238239 return [
239240 feature_pb2 .FeatureConfig (
240241 id_feature = feature_pb2 .IdFeature (
241- feature_name = "cat_a" , expression = "item:cat_a" , hash_bucket_size = 100
242+ feature_name = "cat_a" , expression = "item:cat_a" , num_buckets = 100
242243 )
243244 ),
244245 feature_pb2 .FeatureConfig (
245246 raw_feature = feature_pb2 .RawFeature (
246- feature_name = "int_a" , expression = "item:int_a"
247+ feature_name = "int_a" , expression = "item:int_a" , boundaries = [ 1 , 2 , 3 ]
247248 )
248249 ),
249250 feature_pb2 .FeatureConfig (
250251 combo_feature = feature_pb2 .ComboFeature (
251252 feature_name = "combo_c" ,
252253 expression = ["user:combo_uc" , "item:combo_ic" ],
254+ hash_bucket_size = 1000 ,
253255 )
254256 ),
255257 feature_pb2 .FeatureConfig (
256258 lookup_feature = feature_pb2 .LookupFeature (
257- feature_name = "lookup_d" , map = "user:map_d" , key = "item:key_d"
259+ feature_name = "lookup_d" ,
260+ map = "user:map_d" ,
261+ key = "item:key_d" ,
262+ vocab_list = ["a" , "b" , "c" ],
258263 )
259264 ),
260265 feature_pb2 .FeatureConfig (
@@ -263,13 +268,15 @@ def _create_test_feature_cfgs(self):
263268 nested_map = "user:nested_map" ,
264269 pkey = "item:key_e" ,
265270 skey = "item:key_f" ,
271+ vocab_dict = {"e" : 2 , "f" : 3 , "g" : 4 },
266272 )
267273 ),
268274 feature_pb2 .FeatureConfig (
269275 expr_feature = feature_pb2 .ExprFeature (
270276 feature_name = "expr_f" ,
271277 expression = "int_g+int_h" ,
272278 variables = ["item:int_g" , "item:int_h" ],
279+ boundaries = [4 , 5 , 6 ],
273280 )
274281 ),
275282 feature_pb2 .FeatureConfig (
@@ -285,6 +292,8 @@ def _create_test_feature_cfgs(self):
285292 expression = "item:click_seq_cat_simple" ,
286293 sequence_length = 50 ,
287294 sequence_delim = ";" ,
295+ vocab_file = "data/test/id_vocab_list_0" ,
296+ default_bucketize_value = 0 ,
288297 )
289298 ),
290299 feature_pb2 .FeatureConfig (
@@ -304,12 +313,16 @@ def _create_test_feature_cfgs(self):
304313 features = [
305314 feature_pb2 .SeqFeatureConfig (
306315 id_feature = feature_pb2 .IdFeature (
307- feature_name = "cat_a" , expression = "item:cat_a"
316+ feature_name = "cat_a" ,
317+ expression = "item:cat_a" ,
318+ hash_bucket_size = 10 ,
308319 )
309320 ),
310321 feature_pb2 .SeqFeatureConfig (
311322 raw_feature = feature_pb2 .RawFeature (
312- feature_name = "int_a" , expression = "item:int_a"
323+ feature_name = "int_a" ,
324+ expression = "item:int_a" ,
325+ boundaries = [7 , 8 , 9 ],
313326 )
314327 ),
315328 ],
@@ -321,10 +334,12 @@ def _create_test_feature_cfgs(self):
321334 def test_create_fg_json (self , with_asset_dir = False ):
322335 asset_dir = None
323336 token_file = "data/test/tokenizer.json"
337+ vocab_file = "data/test/id_vocab_list_0"
324338 if with_asset_dir :
325339 self .test_dir = tempfile .mkdtemp (prefix = "tzrec_" , dir = "./tmp" )
326340 asset_dir = self .test_dir
327341 token_file = "tokenizer_b2faab7921bbfb593973632993ca4c85.json"
342+ vocab_file = "id_vocab_list_0_583794bd44eb2c6d83336c71258521e8"
328343 feature_cfgs = self ._create_test_feature_cfgs ()
329344 features = feature_lib .create_features (feature_cfgs , fg_mode = FgMode .FG_DAG )
330345 fg_json = feature_lib .create_fg_json (features , asset_dir = asset_dir )
@@ -340,15 +355,16 @@ def test_create_fg_json(self, with_asset_dir=False):
340355 "expression" : "item:cat_a" ,
341356 "value_type" : "string" ,
342357 "need_prefix" : False ,
343- "hash_bucket_size" : 100 ,
344358 "value_dim" : 0 ,
359+ "num_buckets" : 100 ,
345360 },
346361 {
347362 "feature_type" : "raw_feature" ,
348363 "feature_name" : "int_a" ,
349364 "default_value" : "0" ,
350365 "expression" : "item:int_a" ,
351366 "value_type" : "float" ,
367+ "boundaries" : [1.0 , 2.0 , 3.0 ],
352368 },
353369 {
354370 "feature_type" : "combo_feature" ,
@@ -358,17 +374,21 @@ def test_create_fg_json(self, with_asset_dir=False):
358374 "value_type" : "string" ,
359375 "need_prefix" : False ,
360376 "value_dim" : 0 ,
377+ "hash_bucket_size" : 1000 ,
361378 },
362379 {
363380 "feature_type" : "lookup_feature" ,
364381 "feature_name" : "lookup_d" ,
365382 "map" : "user:map_d" ,
366383 "key" : "item:key_d" ,
367384 "default_value" : "0" ,
368- "value_type" : "float " ,
369- "needDiscrete" : False ,
385+ "value_type" : "string " ,
386+ "needDiscrete" : True ,
370387 "needKey" : False ,
371- "combiner" : "sum" ,
388+ "combiner" : "" ,
389+ "value_dim" : 1 ,
390+ "vocab_list" : ["0" , "<OOV>" , "a" , "b" , "c" ],
391+ "default_bucketize_value" : 1 ,
372392 },
373393 {
374394 "feature_type" : "match_feature" ,
@@ -378,10 +398,159 @@ def test_create_fg_json(self, with_asset_dir=False):
378398 "item" : "item:key_f" ,
379399 "matchType" : "hit" ,
380400 "default_value" : "0" ,
401+ "value_type" : "string" ,
402+ "needDiscrete" : True ,
403+ "show_category" : False ,
404+ "show_item" : False ,
405+ "value_dim" : 1 ,
406+ "vocab_dict" : OrderedDict (
407+ [("e" , 2 ), ("f" , 3 ), ("g" , 4 ), ("0" , 0 )]
408+ ),
409+ "default_bucketize_value" : 1 ,
410+ },
411+ {
412+ "feature_name" : "expr_f" ,
413+ "feature_type" : "expr_feature" ,
414+ "expression" : "int_g+int_h" ,
415+ "variables" : ["item:int_g" , "item:int_h" ],
416+ "default_value" : "0" ,
417+ "value_type" : "float" ,
418+ "boundaries" : [4.0 , 5.0 , 6.0 ],
419+ },
420+ {
421+ "feature_name" : "token_g" ,
422+ "feature_type" : "tokenize_feature" ,
423+ "expression" : "item:token_g" ,
424+ "output_delim" : "\x03 " ,
425+ "output_type" : "word_id" ,
426+ "tokenizer_type" : "bpe" ,
427+ "vocab_file" : token_file ,
428+ "default_value" : "" ,
429+ },
430+ {
431+ "feature_name" : "click_seq_cat_simple" ,
432+ "feature_type" : "sequence_id_feature" ,
433+ "sequence_delim" : ";" ,
434+ "sequence_length" : 50 ,
435+ "expression" : "item:click_seq_cat_simple" ,
436+ "default_value" : "0" ,
437+ "need_prefix" : False ,
438+ "value_type" : "string" ,
439+ "value_dim" : 1 ,
440+ "vocab_file" : vocab_file ,
441+ "default_bucketize_value" : 0 ,
442+ },
443+ {
444+ "feature_name" : "click_seq_int_simple" ,
445+ "feature_type" : "sequence_raw_feature" ,
446+ "sequence_delim" : ";" ,
447+ "sequence_length" : 50 ,
448+ "expression" : "user:click_seq_int_simple" ,
449+ "default_value" : "0" ,
450+ "value_type" : "float" ,
451+ },
452+ {
453+ "sequence_name" : "click_seq" ,
454+ "sequence_length" : 50 ,
455+ "sequence_delim" : ";" ,
456+ "sequence_pk" : "user:click_seq" ,
457+ "features" : [
458+ {
459+ "feature_type" : "id_feature" ,
460+ "feature_name" : "cat_a" ,
461+ "default_value" : "0" ,
462+ "expression" : "item:cat_a" ,
463+ "value_type" : "string" ,
464+ "need_prefix" : False ,
465+ "value_dim" : 1 ,
466+ "hash_bucket_size" : 10 ,
467+ },
468+ {
469+ "feature_type" : "raw_feature" ,
470+ "feature_name" : "int_a" ,
471+ "default_value" : "0" ,
472+ "expression" : "item:int_a" ,
473+ "value_type" : "float" ,
474+ "boundaries" : [7.0 , 8.0 , 9.0 ],
475+ },
476+ ],
477+ },
478+ ]
479+ },
480+ )
481+ if with_asset_dir :
482+ self .assertTrue (os .path .exists (os .path .join (asset_dir , token_file )))
483+
484+ @parameterized .expand ([[False ], [True ]])
485+ def test_create_fg_json_remove_bucketizer (self , with_asset_dir = False ):
486+ asset_dir = None
487+ token_file = "data/test/tokenizer.json"
488+ if with_asset_dir :
489+ self .test_dir = tempfile .mkdtemp (prefix = "tzrec_" , dir = "./tmp" )
490+ asset_dir = self .test_dir
491+ token_file = "tokenizer_b2faab7921bbfb593973632993ca4c85.json"
492+ feature_cfgs = self ._create_test_feature_cfgs ()
493+ features = feature_lib .create_features (feature_cfgs , fg_mode = FgMode .FG_DAG )
494+ fg_json = feature_lib .create_fg_json (
495+ features , asset_dir = asset_dir , remove_bucketizer = True
496+ )
497+ self .maxDiff = None
498+ self .assertEqual (
499+ fg_json ,
500+ {
501+ "features" : [
502+ {
503+ "feature_type" : "id_feature" ,
504+ "feature_name" : "cat_a" ,
505+ "default_value" : "" ,
506+ "expression" : "item:cat_a" ,
507+ "value_type" : "string" ,
508+ "need_prefix" : False ,
509+ "value_dim" : 0 ,
510+ },
511+ {
512+ "feature_type" : "raw_feature" ,
513+ "feature_name" : "int_a" ,
514+ "default_value" : "0" ,
515+ "expression" : "item:int_a" ,
381516 "value_type" : "float" ,
382- "needDiscrete" : False ,
517+ },
518+ {
519+ "feature_type" : "combo_feature" ,
520+ "feature_name" : "combo_c" ,
521+ "default_value" : "" ,
522+ "expression" : ["user:combo_uc" , "item:combo_ic" ],
523+ "value_type" : "string" ,
524+ "need_prefix" : False ,
525+ "value_dim" : 0 ,
526+ },
527+ {
528+ "feature_type" : "lookup_feature" ,
529+ "feature_name" : "lookup_d" ,
530+ "map" : "user:map_d" ,
531+ "key" : "item:key_d" ,
532+ "default_value" : "0" ,
533+ "value_type" : "string" ,
534+ "needDiscrete" : True ,
535+ "needKey" : False ,
536+ "combiner" : "" ,
537+ "value_dim" : 1 ,
538+ "default_bucketize_value" : 1 ,
539+ },
540+ {
541+ "feature_type" : "match_feature" ,
542+ "feature_name" : "match_e" ,
543+ "user" : "user:nested_map" ,
544+ "category" : "item:key_e" ,
545+ "item" : "item:key_f" ,
546+ "matchType" : "hit" ,
547+ "default_value" : "0" ,
548+ "value_type" : "string" ,
549+ "needDiscrete" : True ,
383550 "show_category" : False ,
384551 "show_item" : False ,
552+ "value_dim" : 1 ,
553+ "default_bucketize_value" : 1 ,
385554 },
386555 {
387556 "feature_name" : "expr_f" ,
@@ -411,6 +580,7 @@ def test_create_fg_json(self, with_asset_dir=False):
411580 "need_prefix" : False ,
412581 "value_type" : "string" ,
413582 "value_dim" : 1 ,
583+ "default_bucketize_value" : 0 ,
414584 },
415585 {
416586 "feature_name" : "click_seq_int_simple" ,
@@ -458,10 +628,12 @@ def test_create_feauture_configs(self, with_asset_dir=False):
458628
459629 asset_dir = None
460630 token_file = "data/test/tokenizer.json"
631+ vocab_file = "data/test/id_vocab_list_0"
461632 if with_asset_dir :
462633 self .test_dir = tempfile .mkdtemp (prefix = "tzrec_" , dir = "./tmp" )
463634 asset_dir = self .test_dir
464635 token_file = "tokenizer_b2faab7921bbfb593973632993ca4c85.json"
636+ vocab_file = "id_vocab_list_0_583794bd44eb2c6d83336c71258521e8"
465637
466638 again_feature_cfgs = feature_lib .create_feature_configs (
467639 features , asset_dir = asset_dir
@@ -470,6 +642,8 @@ def test_create_feauture_configs(self, with_asset_dir=False):
470642 if with_asset_dir :
471643 feature_cfgs [6 ].tokenize_feature .vocab_file = token_file
472644 feature_cfgs [6 ].tokenize_feature .asset_dir = asset_dir
645+ feature_cfgs [7 ].sequence_id_feature .vocab_file = vocab_file
646+ feature_cfgs [7 ].sequence_id_feature .asset_dir = asset_dir
473647 self .assertTrue (os .path .exists (os .path .join (asset_dir , token_file )))
474648 self .assertEqual (repr (feature_cfgs ), repr (again_feature_cfgs ))
475649
0 commit comments