@@ -130,14 +130,27 @@ def group_text(self, tokenized_datasets, model_max_length):
130130 block_size = 1024
131131 else :
132132 if data_args .block_size > model_max_length :
133- logger .warning (
134- f"The block_size passed ({ data_args .block_size } ) is larger"
135- f" than the maximum length for the model"
136- f"({ model_max_length } )."
137- f" Using block_size={ model_max_length } ."
138- )
139- block_size = min (data_args .block_size , model_max_length )
140-
133+ if self .model_args .truncate_to_model_max_length :
134+ logger .warning (
135+ f"The block_size passed ({ data_args .block_size } ) is larger"
136+ f" than the maximum length for the model"
137+ f"({ model_max_length } )."
138+ f" Using block_size={ model_max_length } ."
139+ f"If you would like to use a longer 'block_size' that is"
140+ f" longer than the maximum length supported by the model,"
141+ f" you can override this behavior with"
142+ f"default with `--truncate_to_model_max_length False`."
143+ )
144+ block_size = model_max_length
145+ else :
146+ logger .warning (
147+ f"The block_size passed ({ data_args .block_size } ) is larger"
148+ f"than the maximum length for the model"
149+ f"({ model_max_length } )."
150+ f"Using block_size={ data_args .block_size } ." )
151+ block_size = data_args .block_size
152+ else :
153+ block_size = data_args .block_size
141154 # Main data processing function that will concatenate all texts from
142155 # our dataset and generate chunks of block_size.
143156 def group_texts (examples ):
0 commit comments