fix: normalize i16 to f32 in segmentation and flush trailing speech

gregoire22enpc · gregoire22enpc · commit ae9fbc1f6c60 · 2026-04-05T14:08:05.000+02:00
Two bugs in get_segments:

1. i16 samples are cast to f32 via `x as f32` without dividing by
   32768, feeding the ONNX segmentation model values in [-32768, 32767]
   when it expects [-1.0, 1.0]. This causes the model to misclassify
   all frames as non-speech for typical microphone input.

2. When speech extends to end-of-audio, the final segment is silently
   dropped because there is no flush when `is_speeching` is still true
   after all windows are processed.

Made-with: Cursor
diff --git a/src/segment.rs b/src/segment.rs
@@ -55,8 +55,9 @@ pub fn get_segments<P: AsRef<Path>>(
             let end = (start + window_size).min(padded_samples.len());
             let window = &padded_samples[start..end];
 
-            // Convert window to ndarray::Array1
-            let array = ndarray::Array1::from_iter(window.iter().map(|&x| x as f32));
+            // Convert window to ndarray::Array1, normalizing i16 to [-1.0, 1.0]
+            let array =
+                ndarray::Array1::from_iter(window.iter().map(|&x| x as f32 / 32768.0));
             let array = array.view().insert_axis(Axis(0)).insert_axis(Axis(1));
 
             // Handle potential errors during the session and input processing
@@ -126,6 +127,30 @@ pub fn get_segments<P: AsRef<Path>>(
                 }
             }
         }
+
+        // Flush trailing speech that extends to end-of-audio
+        if is_speeching {
+            let start = start_offset / sample_rate as f64;
+            let end = offset as f64 / sample_rate as f64;
+
+            let start_f64 = start * (sample_rate as f64);
+            let end_f64 = end * (sample_rate as f64);
+
+            let start_idx = start_f64.min((samples.len() - 1) as f64) as usize;
+            let end_idx = end_f64.min(samples.len() as f64) as usize;
+
+            let segment_samples = &padded_samples[start_idx..end_idx];
+
+            is_speeching = false;
+
+            let segment = Segment {
+                start,
+                end,
+                samples: segment_samples.to_vec(),
+            };
+            segments_queue.push_back(segment);
+        }
+
         segments_queue.pop_front().map(Ok)
     }))
 }