@@ -17,6 +17,42 @@ use journal_registry::File;
1717use std:: num:: NonZeroU64 ;
1818use tracing:: { error, warn} ;
1919
20+ /// Default for `IndexingLimits::max_unique_values_per_field`: the per-field cap on indexed unique values.
21+ pub const DEFAULT_MAX_UNIQUE_VALUES_PER_FIELD : usize = 500 ;
22+
23+ /// Default for `IndexingLimits::max_field_payload_size`: the payload-size threshold, in bytes, for indexed values.
24+ pub const DEFAULT_MAX_FIELD_PAYLOAD_SIZE : usize = 100 ;
25+
/// Configuration limits for the indexing process.
///
/// These limits protect against unbounded memory growth when indexing
/// journal files with high-cardinality fields or large payloads.
///
/// The type is a plain pair of `usize` values, so it is cheap to copy and
/// can be compared for equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// `FileIndexer` stores this type in a field and derives `Allocative` when the
// `allocative` feature is on, so this type needs the same derive.
#[cfg_attr(feature = "allocative", derive(allocative::Allocative))]
pub struct IndexingLimits {
    /// Maximum number of unique values to index per field.
    ///
    /// Once this many values have been indexed for a field, the remaining
    /// values of that field are not indexed and the field is reported as
    /// truncated. A limit of `0` therefore indexes no values at all. This
    /// protects against high-cardinality fields (e.g., MESSAGE with millions
    /// of unique values) causing memory exhaustion.
    pub max_unique_values_per_field: usize,

    /// Payload size threshold (in bytes) for field values to index.
    ///
    /// A value is skipped when its raw payload length is greater than or
    /// equal to this limit, or when the value is compressed (regardless of
    /// size). This prevents large binary data or encoded content from
    /// consuming excessive memory.
    pub max_field_payload_size: usize,
}
46+
47+ impl Default for IndexingLimits {
48+ fn default ( ) -> Self {
49+ Self {
50+ max_unique_values_per_field : DEFAULT_MAX_UNIQUE_VALUES_PER_FIELD ,
51+ max_field_payload_size : DEFAULT_MAX_FIELD_PAYLOAD_SIZE ,
52+ }
53+ }
54+ }
55+
2056/// Reusable indexer for creating searchable indexes from journal files.
2157///
2258/// # Indexing Process
@@ -39,9 +75,12 @@ use tracing::{error, warn};
3975/// The indexer captures the journal file's `tail_object_offset` at the start of indexing
4076/// to create a consistent snapshot. Any entries written to the file after indexing begins
4177/// are ignored, preventing race conditions with concurrent writers.
42- #[ derive( Debug , Default ) ]
78+ #[ derive( Debug ) ]
4379#[ cfg_attr( feature = "allocative" , derive( allocative:: Allocative ) ) ]
4480pub struct FileIndexer {
81+ /// Configuration limits for the indexing process.
82+ limits : IndexingLimits ,
83+
4584 // Associates a source timestamp value with its inlined cursor
4685 source_timestamp_cursor_pairs : Vec < ( Microseconds , InlinedCursor ) > ,
4786
@@ -64,8 +103,35 @@ pub struct FileIndexer {
64103 entry_offset_index : HashMap < NonZeroU64 , u64 > ,
65104}
66105
106+ impl Default for FileIndexer {
107+ fn default ( ) -> Self {
108+ Self :: new ( IndexingLimits :: default ( ) )
109+ }
110+ }
111+
112+ impl FileIndexer {
113+ /// Create a new indexer with the specified configuration limits.
114+ pub fn new ( limits : IndexingLimits ) -> Self {
115+ Self {
116+ limits,
117+ source_timestamp_cursor_pairs : Vec :: new ( ) ,
118+ entry_offsets : Vec :: new ( ) ,
119+ source_timestamp_entry_offset_pairs : Vec :: new ( ) ,
120+ realtime_entry_offset_pairs : Vec :: new ( ) ,
121+ entry_indices : Vec :: new ( ) ,
122+ entry_offset_index : HashMap :: default ( ) ,
123+ }
124+ }
125+ }
126+
67127impl FileIndexer {
68128 /// Create a searchable index from a journal file.
129+ ///
130+ /// # Arguments
131+ /// * `file` - The journal file to index
132+ /// * `source_timestamp_field` - Optional field to use for timestamps
133+ /// * `field_names` - Fields to create bitmap indexes for
134+ /// * `bucket_duration` - Duration of histogram buckets
69135 pub fn index (
70136 & mut self ,
71137 file : & File ,
@@ -168,6 +234,9 @@ impl FileIndexer {
168234 ///
169235 /// Only entries with offsets <= `tail_object_offset` are included in the
170236 /// bitmaps, ensuring a consistent snapshot.
237+ ///
238+ /// Fields with more than `self.limits.max_unique_values_per_field` unique values
239+ /// will have their indexing truncated to prevent unbounded memory growth.
171240 fn build_entries_index (
172241 & mut self ,
173242 journal_file : & JournalFile < Mmap > ,
@@ -176,6 +245,8 @@ impl FileIndexer {
176245 tail_object_offset : NonZeroU64 ,
177246 ) -> Result < HashMap < FieldValuePair , Bitmap > > {
178247 let mut entries_index = HashMap :: default ( ) ;
248+ let mut truncated_fields: Vec < & FieldName > = Vec :: new ( ) ;
249+ let mut fields_with_large_payloads: Vec < & FieldName > = Vec :: new ( ) ;
179250
180251 for field_name in field_names {
181252 let Some ( systemd_field) = field_map. get ( field_name. as_str ( ) ) else {
@@ -197,13 +268,32 @@ impl FileIndexer {
197268 }
198269 } ;
199270
271+ // Track the number of unique values indexed for this field
272+ let mut unique_values_count: usize = 0 ;
273+ let mut ignored_large_payloads: usize = 0 ;
274+ let mut was_truncated = false ;
275+
200276 for data_object in field_data_iterator {
277+ // Check cardinality limit before processing this value
278+ if unique_values_count >= self . limits . max_unique_values_per_field {
279+ was_truncated = true ;
280+ break ;
281+ }
282+
201283 // Get the payload and the inlined cursor for this data object
202284 let ( data_payload, inlined_cursor) = {
203285 let Ok ( data_object) = data_object else {
204286 continue ;
205287 } ;
206288
289+ // Skip values whose raw payload length is at or above the configured limit, and any compressed value.
290+ if data_object. raw_payload ( ) . len ( ) >= self . limits . max_field_payload_size
291+ || data_object. is_compressed ( )
292+ {
293+ ignored_large_payloads += 1 ;
294+ continue ;
295+ }
296+
207297 // Skip the remapping value
208298 if data_object. raw_payload ( ) . ends_with ( field_name. as_bytes ( ) ) {
209299 continue ;
@@ -262,7 +352,41 @@ impl FileIndexer {
262352 let field_name = FieldName :: new_unchecked ( field_name) ;
263353 let k = FieldValuePair :: new_unchecked ( field_name, String :: from ( pair. value ( ) ) ) ;
264354 entries_index. insert ( k, bitmap) ;
355+
356+ unique_values_count += 1 ;
357+ }
358+
359+ // Track fields that were truncated or had large payloads skipped
360+ if was_truncated {
361+ truncated_fields. push ( field_name) ;
265362 }
363+ if ignored_large_payloads > 0 {
364+ fields_with_large_payloads. push ( field_name) ;
365+ }
366+ }
367+
368+ // Log summary of indexing issues
369+ if !truncated_fields. is_empty ( ) {
370+ let field_names: Vec < & str > = truncated_fields. iter ( ) . map ( |f| f. as_str ( ) ) . collect ( ) ;
371+ warn ! (
372+ "File '{}': {} field(s) truncated due to cardinality limit ({}): {:?}" ,
373+ journal_file. file( ) . path( ) ,
374+ truncated_fields. len( ) ,
375+ self . limits. max_unique_values_per_field,
376+ field_names
377+ ) ;
378+ }
379+ if !fields_with_large_payloads. is_empty ( ) {
380+ let field_names: Vec < & str > = fields_with_large_payloads
381+ . iter ( )
382+ . map ( |f| f. as_str ( ) )
383+ . collect ( ) ;
384+ tracing:: info!(
385+ "File '{}': {} field(s) had values skipped due to large payloads: {:?}" ,
386+ journal_file. file( ) . path( ) ,
387+ fields_with_large_payloads. len( ) ,
388+ field_names
389+ ) ;
266390 }
267391
268392 Ok ( entries_index)
0 commit comments