Skip to content

Commit 80a5a9f

Browse files
authored
Handle fields with high-cardinality and big payloads. (netdata#21716)
* Remove verbose logs and fix log-levels. `tracing` maps debug logs to info on systemd; remove verbose logs and set the log level to trace for those we want to keep. * Add configurable journal file indexing limits. Add `max_unique_values_per_field` and `max_field_payload_size` options to protect against memory exhaustion when indexing high-cardinality fields or large payloads. Limits are configurable via journal-viewer.yaml. * Deduplicate incoming facets.
1 parent a6036df commit 80a5a9f

11 files changed

Lines changed: 240 additions & 34 deletions

File tree

src/crates/journal-engine/examples/index.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232

3333
use foundation::Timeout;
3434
use journal_engine::{
35-
Facets, FileIndexCacheBuilder, FileIndexKey, QueryTimeRange, batch_compute_file_indexes,
35+
Facets, FileIndexCacheBuilder, FileIndexKey, IndexingLimits, QueryTimeRange,
36+
batch_compute_file_indexes,
3637
};
3738
use journal_index::FieldName;
3839
use journal_registry::{Monitor, Registry};
@@ -120,8 +121,15 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
120121

121122
// Run batch indexing
122123
let start = std::time::Instant::now();
123-
let responses =
124-
batch_compute_file_indexes(&cache, &registry, keys, &time_range, timeout).await?;
124+
let responses = batch_compute_file_indexes(
125+
&cache,
126+
&registry,
127+
keys,
128+
&time_range,
129+
timeout,
130+
IndexingLimits::default(),
131+
)
132+
.await?;
125133

126134
let elapsed = start.elapsed();
127135

src/crates/journal-engine/src/facets.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,9 @@ impl Facets {
103103
.collect()
104104
};
105105

106-
// Sort in order to get the same hash for the same set of fields
106+
// Sort and deduplicate to get a canonical set of fields
107107
facets.sort();
108+
facets.dedup();
108109

109110
use std::hash::Hasher;
110111
let mut hasher = std::hash::DefaultHasher::new();

src/crates/journal-engine/src/indexing.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use crate::{
1010
query_time_range::QueryTimeRange,
1111
};
1212
use foundation::Timeout;
13-
use journal_index::{FileIndex, FileIndexer};
13+
use journal_index::{FileIndex, FileIndexer, IndexingLimits};
1414
use journal_registry::Registry;
1515
use tracing::{error, trace};
1616

@@ -136,8 +136,9 @@ impl Default for FileIndexCacheBuilder {
136136
/// * `cache` - The file index cache
137137
/// * `registry` - Registry to update with file metadata
138138
/// * `keys` - Vector of (file, facets, source_timestamp_field) to fetch/compute indexes for
139-
/// * `bucket_duration` - Duration of histogram buckets in seconds
139+
/// * `time_range` - Query time range for bucket duration calculation
140140
/// * `timeout` - Timeout for the entire operation (can be extended dynamically)
141+
/// * `indexing_limits` - Configuration limits for indexing (cardinality, payload size)
141142
///
142143
/// # Returns
143144
/// Vector of responses for each key. Successful responses contain the file index.
@@ -148,6 +149,7 @@ pub async fn batch_compute_file_indexes(
148149
keys: Vec<FileIndexKey>,
149150
time_range: &QueryTimeRange,
150151
timeout: Timeout,
152+
indexing_limits: IndexingLimits,
151153
) -> Result<Vec<(FileIndexKey, FileIndex)>> {
152154
let bucket_duration = time_range.bucket_duration_seconds();
153155
// Phase 1: Batch check cache for all keys upfront
@@ -240,7 +242,7 @@ pub async fn batch_compute_file_indexes(
240242
return (key, Err(EngineError::TimeBudgetExceeded));
241243
}
242244

243-
let mut file_indexer = FileIndexer::default();
245+
let mut file_indexer = FileIndexer::new(indexing_limits);
244246
let result = file_indexer
245247
.index(
246248
&key.file,

src/crates/journal-engine/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,6 @@ pub use histogram::{
3434
BucketRequest, BucketResponse, Histogram, HistogramEngine, calculate_bucket_duration,
3535
};
3636
pub use indexing::{FileIndexCacheBuilder, batch_compute_file_indexes};
37+
pub use journal_index::IndexingLimits;
3738
pub use logs::{CellValue, ColumnInfo, LogEntryData, LogQuery, Table, entry_data_to_table};
3839
pub use query_time_range::QueryTimeRange;

src/crates/journal-index/src/file_indexer.rs

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,42 @@ use journal_registry::File;
1717
use std::num::NonZeroU64;
1818
use tracing::{error, warn};
1919

20+
/// Default maximum number of unique values to index per field.
21+
pub const DEFAULT_MAX_UNIQUE_VALUES_PER_FIELD: usize = 500;
22+
23+
/// Default maximum payload size (in bytes) for field values to index.
24+
pub const DEFAULT_MAX_FIELD_PAYLOAD_SIZE: usize = 100;
25+
26+
/// Configuration limits for the indexing process.
27+
///
28+
/// These limits protect against unbounded memory growth when indexing
29+
/// journal files with high-cardinality fields or large payloads.
30+
#[derive(Debug, Clone, Copy)]
31+
pub struct IndexingLimits {
32+
/// Maximum number of unique values to index per field.
33+
///
34+
/// Fields with more unique values than this limit will have their indexing
35+
/// truncated. This protects against high-cardinality fields (e.g., MESSAGE
36+
/// with millions of unique values) causing memory exhaustion.
37+
pub max_unique_values_per_field: usize,
38+
39+
/// Maximum payload size (in bytes) for field values to index.
40+
///
41+
/// Field values with payloads larger than this limit (or compressed values)
42+
/// will be skipped. This prevents large binary data or encoded content
43+
/// from consuming excessive memory.
44+
pub max_field_payload_size: usize,
45+
}
46+
47+
impl Default for IndexingLimits {
48+
fn default() -> Self {
49+
Self {
50+
max_unique_values_per_field: DEFAULT_MAX_UNIQUE_VALUES_PER_FIELD,
51+
max_field_payload_size: DEFAULT_MAX_FIELD_PAYLOAD_SIZE,
52+
}
53+
}
54+
}
55+
2056
/// Reusable indexer for creating searchable indexes from journal files.
2157
///
2258
/// # Indexing Process
@@ -39,9 +75,12 @@ use tracing::{error, warn};
3975
/// The indexer captures the journal file's `tail_object_offset` at the start of indexing
4076
/// to create a consistent snapshot. Any entries written to the file after indexing begins
4177
/// are ignored, preventing race conditions with concurrent writers.
42-
#[derive(Debug, Default)]
78+
#[derive(Debug)]
4379
#[cfg_attr(feature = "allocative", derive(allocative::Allocative))]
4480
pub struct FileIndexer {
81+
/// Configuration limits for the indexing process.
82+
limits: IndexingLimits,
83+
4584
// Associates a source timestamp value with its inlined cursor
4685
source_timestamp_cursor_pairs: Vec<(Microseconds, InlinedCursor)>,
4786

@@ -64,8 +103,35 @@ pub struct FileIndexer {
64103
entry_offset_index: HashMap<NonZeroU64, u64>,
65104
}
66105

106+
impl Default for FileIndexer {
107+
fn default() -> Self {
108+
Self::new(IndexingLimits::default())
109+
}
110+
}
111+
112+
impl FileIndexer {
113+
/// Create a new indexer with the specified configuration limits.
114+
pub fn new(limits: IndexingLimits) -> Self {
115+
Self {
116+
limits,
117+
source_timestamp_cursor_pairs: Vec::new(),
118+
entry_offsets: Vec::new(),
119+
source_timestamp_entry_offset_pairs: Vec::new(),
120+
realtime_entry_offset_pairs: Vec::new(),
121+
entry_indices: Vec::new(),
122+
entry_offset_index: HashMap::default(),
123+
}
124+
}
125+
}
126+
67127
impl FileIndexer {
68128
/// Create a searchable index from a journal file.
129+
///
130+
/// # Arguments
131+
/// * `file` - The journal file to index
132+
/// * `source_timestamp_field` - Optional field to use for timestamps
133+
/// * `field_names` - Fields to create bitmap indexes for
134+
/// * `bucket_duration` - Duration of histogram buckets
69135
pub fn index(
70136
&mut self,
71137
file: &File,
@@ -168,6 +234,9 @@ impl FileIndexer {
168234
///
169235
/// Only entries with offsets <= `tail_object_offset` are included in the
170236
/// bitmaps, ensuring a consistent snapshot.
237+
///
238+
/// Fields with more than `self.limits.max_unique_values_per_field` unique values
239+
/// will have their indexing truncated to prevent unbounded memory growth.
171240
fn build_entries_index(
172241
&mut self,
173242
journal_file: &JournalFile<Mmap>,
@@ -176,6 +245,8 @@ impl FileIndexer {
176245
tail_object_offset: NonZeroU64,
177246
) -> Result<HashMap<FieldValuePair, Bitmap>> {
178247
let mut entries_index = HashMap::default();
248+
let mut truncated_fields: Vec<&FieldName> = Vec::new();
249+
let mut fields_with_large_payloads: Vec<&FieldName> = Vec::new();
179250

180251
for field_name in field_names {
181252
let Some(systemd_field) = field_map.get(field_name.as_str()) else {
@@ -197,13 +268,32 @@ impl FileIndexer {
197268
}
198269
};
199270

271+
// Track the number of unique values indexed for this field
272+
let mut unique_values_count: usize = 0;
273+
let mut ignored_large_payloads: usize = 0;
274+
let mut was_truncated = false;
275+
200276
for data_object in field_data_iterator {
277+
// Check cardinality limit before processing this value
278+
if unique_values_count >= self.limits.max_unique_values_per_field {
279+
was_truncated = true;
280+
break;
281+
}
282+
201283
// Get the payload and the inlined cursor for this data object
202284
let (data_payload, inlined_cursor) = {
203285
let Ok(data_object) = data_object else {
204286
continue;
205287
};
206288

289+
// Do not create indexes with fields that contain large payloads.
290+
if data_object.raw_payload().len() >= self.limits.max_field_payload_size
291+
|| data_object.is_compressed()
292+
{
293+
ignored_large_payloads += 1;
294+
continue;
295+
}
296+
207297
// Skip the remapping value
208298
if data_object.raw_payload().ends_with(field_name.as_bytes()) {
209299
continue;
@@ -262,7 +352,41 @@ impl FileIndexer {
262352
let field_name = FieldName::new_unchecked(field_name);
263353
let k = FieldValuePair::new_unchecked(field_name, String::from(pair.value()));
264354
entries_index.insert(k, bitmap);
355+
356+
unique_values_count += 1;
357+
}
358+
359+
// Track fields that were truncated or had large payloads skipped
360+
if was_truncated {
361+
truncated_fields.push(field_name);
265362
}
363+
if ignored_large_payloads > 0 {
364+
fields_with_large_payloads.push(field_name);
365+
}
366+
}
367+
368+
// Log summary of indexing issues
369+
if !truncated_fields.is_empty() {
370+
let field_names: Vec<&str> = truncated_fields.iter().map(|f| f.as_str()).collect();
371+
warn!(
372+
"File '{}': {} field(s) truncated due to cardinality limit ({}): {:?}",
373+
journal_file.file().path(),
374+
truncated_fields.len(),
375+
self.limits.max_unique_values_per_field,
376+
field_names
377+
);
378+
}
379+
if !fields_with_large_payloads.is_empty() {
380+
let field_names: Vec<&str> = fields_with_large_payloads
381+
.iter()
382+
.map(|f| f.as_str())
383+
.collect();
384+
tracing::info!(
385+
"File '{}': {} field(s) had values skipped due to large payloads: {:?}",
386+
journal_file.file().path(),
387+
fields_with_large_payloads.len(),
388+
field_names
389+
);
266390
}
267391

268392
Ok(entries_index)

src/crates/journal-index/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ pub use file_index::{
2020
};
2121

2222
pub mod file_indexer;
23-
pub use file_indexer::FileIndexer;
23+
pub use file_indexer::{
24+
FileIndexer, IndexingLimits, DEFAULT_MAX_FIELD_PAYLOAD_SIZE, DEFAULT_MAX_UNIQUE_VALUES_PER_FIELD,
25+
};
2426

2527
pub mod bitmap;
2628
pub use bitmap::Bitmap;

src/crates/netdata-log-viewer/journal-function/src/lib.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ pub mod netdata;
1111
// Re-export types from journal-engine for convenience
1212
pub use journal_engine::{
1313
BucketRequest, BucketResponse, CellValue, ColumnInfo, Facets, FileIndexCache,
14-
FileIndexCacheBuilder, FileIndexKey, Histogram, HistogramEngine, LogEntryData, LogQuery,
15-
QueryTimeRange, Result, Table, batch_compute_file_indexes, calculate_bucket_duration,
16-
entry_data_to_table,
14+
FileIndexCacheBuilder, FileIndexKey, Histogram, HistogramEngine, IndexingLimits,
15+
LogEntryData, LogQuery, QueryTimeRange, Result, Table, batch_compute_file_indexes,
16+
calculate_bucket_duration, entry_data_to_table,
1717
};
1818

1919
// Re-export Timeout from foundation (via rt for backward compatibility)

src/crates/netdata-log-viewer/journal-viewer-plugin/configs/journal-viewer.yaml.in

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,19 @@ cache:
4343
# Controls backpressure on the indexing system
4444
# Default: 100
4545
queue_capacity: 100
46+
47+
indexing:
48+
# Maximum number of unique values to index per field.
49+
# Fields with more unique values than this limit will have their indexing
50+
# truncated to prevent unbounded memory growth. This protects against
51+
# high-cardinality fields (e.g., MESSAGE with millions of unique values)
52+
# causing memory exhaustion during indexing.
53+
# Default: 500
54+
max_unique_values_per_field: 500
55+
56+
# Maximum payload size (in bytes) for field values to index.
57+
# Field values with payloads larger than this limit (or compressed values)
58+
# will be skipped. This prevents large binary data or encoded content
59+
# from consuming excessive memory.
60+
# Default: 100
61+
max_field_payload_size: 100

0 commit comments

Comments
 (0)