Skip to content

Commit d6d9a47

Browse files
committed
Add UTF-16/UTF-8 byte offset conversion utilities
Add unicode_utils module to i-slint-core with utility functions for converting between UTF-8 byte offsets and UTF-16 code unit offsets, and for snapping byte offsets to character boundaries. Replace duplicate inline implementations in the Android backend (javahelper.rs), Qt backend (qt_window.rs), and core text handling (text.rs) with calls to the shared module. The floor_byte_offset / ceil_byte_offset helpers are polyfills for str::floor_char_boundary / str::ceil_char_boundary (stabilized in Rust 1.91; the MSRV is currently 1.88).
1 parent c42349a commit d6d9a47

2 files changed

Lines changed: 15 additions & 216 deletions

File tree

internal/backends/android-activity/javahelper.rs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
use super::*;
55
use i_slint_core::SharedString;
66
use i_slint_core::api::{PhysicalPosition, PhysicalSize};
7+
use i_slint_core::unicode_utils::{byte_offset_to_utf16_offset, utf16_offset_to_byte_offset_clamped};
78
use i_slint_core::graphics::{Color, euclid};
89
use i_slint_core::items::{ColorScheme, InputType};
910
use i_slint_core::lengths::PhysicalEdges;
@@ -179,7 +180,7 @@ impl JavaHelper {
179180
}
180181
}
181182

182-
let to_utf16 = |x| convert_utf8_index_to_utf16(&text, x as usize);
183+
let to_utf16 = |x| byte_offset_to_utf16_offset(&text, x as usize);
183184
let text = &env.auto_local(env.new_string(text.as_str())?);
184185

185186
let class_it = env.find_class("android/text/InputType")?;
@@ -346,10 +347,10 @@ extern "system" fn Java_SlintAndroidJavaHelper_updateText(
346347
let decoded: std::borrow::Cow<str> = (&java_str).into();
347348
let text = SharedString::from(decoded.as_ref());
348349

349-
let cursor_position = convert_utf16_index_to_utf8(&text, cursor_position as usize);
350-
let anchor_position = convert_utf16_index_to_utf8(&text, anchor_position as usize);
351-
let preedit_start = convert_utf16_index_to_utf8(&text, preedit_start as usize);
352-
let preedit_end = convert_utf16_index_to_utf8(&text, preedit_end as usize);
350+
let cursor_position = utf16_offset_to_byte_offset_clamped(&text, cursor_position as usize);
351+
let anchor_position = utf16_offset_to_byte_offset_clamped(&text, anchor_position as usize);
352+
let preedit_start = utf16_offset_to_byte_offset_clamped(&text, preedit_start as usize);
353+
let preedit_end = utf16_offset_to_byte_offset_clamped(&text, preedit_end as usize);
353354

354355
i_slint_core::api::invoke_from_event_loop(move || {
355356
if let Some(adaptor) = CURRENT_WINDOW.with_borrow(|x| x.upgrade()) {
@@ -395,13 +396,8 @@ extern "system" fn Java_SlintAndroidJavaHelper_updateText(
395396
.unwrap()
396397
}
397398

398-
fn convert_utf16_index_to_utf8(in_str: &str, utf16_index: usize) -> usize {
399-
i_slint_core::unicode_utils::utf16_offset_to_byte_offset_clamped(in_str, utf16_index)
400-
}
401399

402-
fn convert_utf8_index_to_utf16(in_str: &str, utf8_index: usize) -> usize {
403-
i_slint_core::unicode_utils::byte_offset_to_utf16_offset(in_str, utf8_index)
404-
}
400+
405401

406402
#[unsafe(no_mangle)]
407403
extern "system" fn Java_SlintAndroidJavaHelper_setNightMode(

internal/core/unicode_utils.rs

Lines changed: 8 additions & 205 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,11 @@
11
// Copyright © SixtyFPS GmbH <info@slint.dev>
22
// SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-Slint-Royalty-free-2.0 OR LicenseRef-Slint-Software-3.0
33

4-
//! Byte offset and UTF-16 conversion utilities for text handling.
4+
//! UTF-16 ↔ UTF-8 byte offset conversion utilities for text handling.
55
//!
66
//! Slint uses UTF-8 byte offsets internally for text positions. Platform IME
77
//! protocols (Android InputConnection, iOS UITextInput) use UTF-16 code unit
8-
//! offsets. This module provides conversions between the two encodings, plus
9-
//! helpers for working with byte offsets safely.
10-
11-
/// Validates that a byte offset is on a UTF-8 character boundary.
12-
///
13-
/// Returns `true` if the offset is within bounds and on a character boundary.
14-
pub fn is_valid_byte_offset(text: &str, offset: usize) -> bool {
15-
offset <= text.len() && text.is_char_boundary(offset)
16-
}
17-
18-
/// Finds the nearest valid byte offset at or before the given offset.
19-
///
20-
/// If the offset is already valid, returns it unchanged.
21-
/// If the offset is beyond the string length, returns the string length.
22-
/// If the offset is in the middle of a UTF-8 character, returns the start of that character.
23-
pub fn floor_byte_offset(text: &str, offset: usize) -> usize {
24-
if offset >= text.len() {
25-
return text.len();
26-
}
27-
let mut pos = offset;
28-
while pos > 0 && !text.is_char_boundary(pos) {
29-
pos -= 1;
30-
}
31-
pos
32-
}
8+
//! offsets. This module provides conversions between the two encodings.
339
3410
/// Finds the nearest valid byte offset at or after the given offset.
3511
///
@@ -47,24 +23,6 @@ pub fn ceil_byte_offset(text: &str, offset: usize) -> usize {
4723
pos
4824
}
4925

50-
/// Converts a byte offset to a character (Unicode scalar value) count.
51-
///
52-
/// # Panics
53-
/// Panics if `byte_offset` is not on a valid UTF-8 character boundary.
54-
pub fn byte_offset_to_char_count(text: &str, byte_offset: usize) -> usize {
55-
text[..byte_offset].chars().count()
56-
}
57-
58-
/// Converts a character count to a byte offset.
59-
///
60-
/// Returns the byte offset after `char_count` characters, or the string length
61-
/// if `char_count` exceeds the number of characters in the string.
62-
pub fn char_count_to_byte_offset(text: &str, char_count: usize) -> usize {
63-
text.char_indices().nth(char_count).map(|(idx, _)| idx).unwrap_or(text.len())
64-
}
65-
66-
// ===== UTF-16 Offset Conversions =====
67-
//
6826
// Android (Java) and iOS (NSString) use UTF-16 code unit offsets, while Rust
6927
// strings are UTF-8. These functions convert between the two encodings.
7028
//
@@ -76,7 +34,8 @@ pub fn char_count_to_byte_offset(text: &str, char_count: usize) -> usize {
7634
/// Converts a UTF-16 code unit offset to a UTF-8 byte offset.
7735
///
7836
/// Returns `None` if the offset is beyond the string or falls inside a
79-
/// surrogate pair.
37+
/// surrogate pair. See [`utf16_offset_to_byte_offset_clamped`] for a
38+
/// variant that clamps instead of returning `None`.
8039
///
8140
/// # Examples
8241
/// ```
@@ -118,6 +77,9 @@ pub fn utf16_offset_to_byte_offset(text: &str, utf16_offset: usize) -> Option<us
11877

11978
/// Converts a UTF-8 byte offset to a UTF-16 code unit offset.
12079
///
80+
/// This function panics on invalid input because callers are expected to
81+
/// hold valid byte offsets (e.g. from `TextInput::cursor_position`).
82+
///
12183
/// # Panics
12284
/// Panics if `byte_offset` is not on a valid UTF-8 character boundary or is
12385
/// beyond the string length.
@@ -134,7 +96,7 @@ pub fn utf16_offset_to_byte_offset(text: &str, utf16_offset: usize) -> Option<us
13496
/// ```
13597
pub fn byte_offset_to_utf16_offset(text: &str, byte_offset: usize) -> usize {
13698
assert!(
137-
is_valid_byte_offset(text, byte_offset),
99+
byte_offset <= text.len() && text.is_char_boundary(byte_offset),
138100
"byte_offset {} is not a valid UTF-8 boundary in string of length {}",
139101
byte_offset,
140102
text.len()
@@ -178,74 +140,6 @@ pub fn utf16_offset_to_byte_offset_clamped(text: &str, utf16_offset: usize) -> u
178140
mod tests {
179141
use super::*;
180142

181-
// ===== Byte Offset Utility Tests =====
182-
183-
#[test]
184-
fn test_is_valid_byte_offset() {
185-
let text = "héllo"; // é is 2 bytes
186-
assert!(is_valid_byte_offset(text, 0));
187-
assert!(is_valid_byte_offset(text, 1));
188-
assert!(!is_valid_byte_offset(text, 2)); // middle of é
189-
assert!(is_valid_byte_offset(text, 3));
190-
assert!(is_valid_byte_offset(text, 6)); // end of string
191-
assert!(!is_valid_byte_offset(text, 7)); // beyond string
192-
}
193-
194-
#[test]
195-
fn test_is_valid_byte_offset_empty_string() {
196-
assert!(is_valid_byte_offset("", 0));
197-
assert!(!is_valid_byte_offset("", 1));
198-
}
199-
200-
#[test]
201-
fn test_is_valid_byte_offset_multibyte() {
202-
let text = "日本語"; // each kanji is 3 bytes
203-
assert!(is_valid_byte_offset(text, 0));
204-
assert!(!is_valid_byte_offset(text, 1));
205-
assert!(!is_valid_byte_offset(text, 2));
206-
assert!(is_valid_byte_offset(text, 3));
207-
assert!(is_valid_byte_offset(text, 6));
208-
assert!(is_valid_byte_offset(text, 9));
209-
}
210-
211-
#[test]
212-
fn test_is_valid_byte_offset_emoji() {
213-
let text = "a😀b"; // 'a'=1, '😀'=4, 'b'=1
214-
assert!(is_valid_byte_offset(text, 0));
215-
assert!(is_valid_byte_offset(text, 1));
216-
assert!(!is_valid_byte_offset(text, 2));
217-
assert!(!is_valid_byte_offset(text, 3));
218-
assert!(!is_valid_byte_offset(text, 4));
219-
assert!(is_valid_byte_offset(text, 5));
220-
assert!(is_valid_byte_offset(text, 6));
221-
}
222-
223-
#[test]
224-
fn test_floor_byte_offset() {
225-
let text = "héllo";
226-
assert_eq!(floor_byte_offset(text, 0), 0);
227-
assert_eq!(floor_byte_offset(text, 1), 1);
228-
assert_eq!(floor_byte_offset(text, 2), 1); // middle of é → start of é
229-
assert_eq!(floor_byte_offset(text, 3), 3);
230-
assert_eq!(floor_byte_offset(text, 10), 6); // beyond → end
231-
}
232-
233-
#[test]
234-
fn test_floor_byte_offset_multibyte() {
235-
let text = "日本語";
236-
assert_eq!(floor_byte_offset(text, 1), 0);
237-
assert_eq!(floor_byte_offset(text, 2), 0);
238-
assert_eq!(floor_byte_offset(text, 3), 3);
239-
assert_eq!(floor_byte_offset(text, 4), 3);
240-
assert_eq!(floor_byte_offset(text, 5), 3);
241-
}
242-
243-
#[test]
244-
fn test_floor_byte_offset_empty() {
245-
assert_eq!(floor_byte_offset("", 0), 0);
246-
assert_eq!(floor_byte_offset("", 5), 0);
247-
}
248-
249143
#[test]
250144
fn test_ceil_byte_offset() {
251145
let text = "héllo";
@@ -271,97 +165,6 @@ mod tests {
271165
assert_eq!(ceil_byte_offset("", 5), 0);
272166
}
273167

274-
#[test]
275-
fn test_floor_ceil_at_exact_boundary() {
276-
let text = "abc";
277-
for i in 0..=text.len() {
278-
assert_eq!(floor_byte_offset(text, i), i);
279-
assert_eq!(ceil_byte_offset(text, i), i);
280-
}
281-
}
282-
283-
#[test]
284-
fn test_byte_offset_to_char_count() {
285-
let text = "héllo";
286-
assert_eq!(byte_offset_to_char_count(text, 0), 0);
287-
assert_eq!(byte_offset_to_char_count(text, 1), 1);
288-
assert_eq!(byte_offset_to_char_count(text, 3), 2);
289-
assert_eq!(byte_offset_to_char_count(text, 6), 5);
290-
}
291-
292-
#[test]
293-
fn test_byte_offset_to_char_count_emoji() {
294-
let text = "a😀b";
295-
assert_eq!(byte_offset_to_char_count(text, 0), 0);
296-
assert_eq!(byte_offset_to_char_count(text, 1), 1);
297-
assert_eq!(byte_offset_to_char_count(text, 5), 2);
298-
assert_eq!(byte_offset_to_char_count(text, 6), 3);
299-
}
300-
301-
#[test]
302-
fn test_char_count_to_byte_offset() {
303-
let text = "héllo";
304-
assert_eq!(char_count_to_byte_offset(text, 0), 0);
305-
assert_eq!(char_count_to_byte_offset(text, 1), 1);
306-
assert_eq!(char_count_to_byte_offset(text, 2), 3);
307-
assert_eq!(char_count_to_byte_offset(text, 5), 6);
308-
assert_eq!(char_count_to_byte_offset(text, 10), 6); // beyond → end
309-
}
310-
311-
#[test]
312-
fn test_char_count_to_byte_offset_emoji() {
313-
let text = "a😀b";
314-
assert_eq!(char_count_to_byte_offset(text, 0), 0);
315-
assert_eq!(char_count_to_byte_offset(text, 1), 1);
316-
assert_eq!(char_count_to_byte_offset(text, 2), 5);
317-
assert_eq!(char_count_to_byte_offset(text, 3), 6);
318-
}
319-
320-
#[test]
321-
fn test_roundtrip_byte_char_conversion() {
322-
let text = "héllo 日本語 😀";
323-
for (idx, _) in text.char_indices() {
324-
let char_count = byte_offset_to_char_count(text, idx);
325-
let back = char_count_to_byte_offset(text, char_count);
326-
assert_eq!(back, idx, "Roundtrip failed for byte offset {}", idx);
327-
}
328-
let char_count = byte_offset_to_char_count(text, text.len());
329-
assert_eq!(char_count_to_byte_offset(text, char_count), text.len());
330-
}
331-
332-
#[test]
333-
fn test_byte_offset_conversions_empty() {
334-
assert_eq!(byte_offset_to_char_count("", 0), 0);
335-
assert_eq!(char_count_to_byte_offset("", 0), 0);
336-
assert_eq!(char_count_to_byte_offset("", 5), 0);
337-
}
338-
339-
#[test]
340-
fn test_surrogate_pairs() {
341-
let text = "𝄞"; // Musical G clef, 4 bytes in UTF-8
342-
assert_eq!(text.len(), 4);
343-
assert!(is_valid_byte_offset(text, 0));
344-
assert!(!is_valid_byte_offset(text, 1));
345-
assert!(!is_valid_byte_offset(text, 2));
346-
assert!(!is_valid_byte_offset(text, 3));
347-
assert!(is_valid_byte_offset(text, 4));
348-
assert_eq!(floor_byte_offset(text, 2), 0);
349-
assert_eq!(ceil_byte_offset(text, 2), 4);
350-
}
351-
352-
#[test]
353-
fn test_combining_characters() {
354-
let text = "e\u{0301}"; // 'e' + combining acute accent
355-
assert_eq!(text.chars().count(), 2);
356-
assert_eq!(text.len(), 3);
357-
assert!(is_valid_byte_offset(text, 0));
358-
assert!(is_valid_byte_offset(text, 1));
359-
assert!(!is_valid_byte_offset(text, 2));
360-
assert!(is_valid_byte_offset(text, 3));
361-
assert_eq!(byte_offset_to_char_count(text, 1), 1);
362-
assert_eq!(byte_offset_to_char_count(text, 3), 2);
363-
}
364-
365168
// ===== UTF-16 Conversion Tests =====
366169

367170
#[test]

0 commit comments

Comments
 (0)