Skip to content

Commit ae9fbc1

Browse files
fix: normalize i16 to f32 in segmentation and flush trailing speech
Two bugs in `get_segments`:

1. i16 samples are cast to f32 via `x as f32` without dividing by 32768, so the ONNX segmentation model receives values in [-32768, 32767] when it expects [-1.0, 1.0]. This causes the model to misclassify all frames as non-speech for typical microphone input.
2. When speech extends to end-of-audio, the final segment is silently dropped because nothing flushes it when `is_speeching` is still true after all windows have been processed.

Made-with: Cursor
1 parent e23bd29 commit ae9fbc1

File tree

1 file changed

+27
-2
lines changed

1 file changed

+27
-2
lines changed

src/segment.rs

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,8 +55,9 @@ pub fn get_segments<P: AsRef<Path>>(
5555
let end = (start + window_size).min(padded_samples.len());
5656
let window = &padded_samples[start..end];
5757

58-
// Convert window to ndarray::Array1
59-
let array = ndarray::Array1::from_iter(window.iter().map(|&x| x as f32));
58+
// Convert window to ndarray::Array1, normalizing i16 to [-1.0, 1.0]
59+
let array =
60+
ndarray::Array1::from_iter(window.iter().map(|&x| x as f32 / 32768.0));
6061
let array = array.view().insert_axis(Axis(0)).insert_axis(Axis(1));
6162

6263
// Handle potential errors during the session and input processing
@@ -126,6 +127,30 @@ pub fn get_segments<P: AsRef<Path>>(
126127
}
127128
}
128129
}
130+
131+
// Flush trailing speech that extends to end-of-audio
132+
if is_speeching {
133+
let start = start_offset / sample_rate as f64;
134+
let end = offset as f64 / sample_rate as f64;
135+
136+
let start_f64 = start * (sample_rate as f64);
137+
let end_f64 = end * (sample_rate as f64);
138+
139+
let start_idx = start_f64.min((samples.len() - 1) as f64) as usize;
140+
let end_idx = end_f64.min(samples.len() as f64) as usize;
141+
142+
let segment_samples = &padded_samples[start_idx..end_idx];
143+
144+
is_speeching = false;
145+
146+
let segment = Segment {
147+
start,
148+
end,
149+
samples: segment_samples.to_vec(),
150+
};
151+
segments_queue.push_back(segment);
152+
}
153+
129154
segments_queue.pop_front().map(Ok)
130155
}))
131156
}

0 commit comments

Comments
 (0)