11// Copyright © SixtyFPS GmbH <info@slint.dev>
22// SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-Slint-Royalty-free-2.0 OR LicenseRef-Slint-Software-3.0
33
4- //! Byte offset and UTF-16 conversion utilities for text handling.
4+ //! UTF-16 ↔ UTF-8 byte offset conversion utilities for text handling.
55//!
66//! Slint uses UTF-8 byte offsets internally for text positions. Platform IME
77//! protocols (Android InputConnection, iOS UITextInput) use UTF-16 code unit
8- //! offsets. This module provides conversions between the two encodings, plus
9- //! helpers for working with byte offsets safely.
10-
11- /// Validates that a byte offset is on a UTF-8 character boundary.
12- ///
13- /// Returns `true` if the offset is within bounds and on a character boundary.
14- pub fn is_valid_byte_offset ( text : & str , offset : usize ) -> bool {
15- offset <= text. len ( ) && text. is_char_boundary ( offset)
16- }
17-
18- /// Finds the nearest valid byte offset at or before the given offset.
19- ///
20- /// If the offset is already valid, returns it unchanged.
21- /// If the offset is beyond the string length, returns the string length.
22- /// If the offset is in the middle of a UTF-8 character, returns the start of that character.
23- pub fn floor_byte_offset ( text : & str , offset : usize ) -> usize {
24- if offset >= text. len ( ) {
25- return text. len ( ) ;
26- }
27- let mut pos = offset;
28- while pos > 0 && !text. is_char_boundary ( pos) {
29- pos -= 1 ;
30- }
31- pos
32- }
8+ //! offsets. This module provides conversions between the two encodings, plus a helper for rounding a byte offset up to a character boundary.
339
3410/// Finds the nearest valid byte offset at or after the given offset.
3511///
@@ -47,24 +23,6 @@ pub fn ceil_byte_offset(text: &str, offset: usize) -> usize {
4723 pos
4824}
4925
50- /// Converts a byte offset to a character (Unicode scalar value) count.
51- ///
52- /// # Panics
53- /// Panics if `byte_offset` is not on a valid UTF-8 character boundary.
54- pub fn byte_offset_to_char_count ( text : & str , byte_offset : usize ) -> usize {
55- text[ ..byte_offset] . chars ( ) . count ( )
56- }
57-
58- /// Converts a character count to a byte offset.
59- ///
60- /// Returns the byte offset after `char_count` characters, or the string length
61- /// if `char_count` exceeds the number of characters in the string.
62- pub fn char_count_to_byte_offset ( text : & str , char_count : usize ) -> usize {
63- text. char_indices ( ) . nth ( char_count) . map ( |( idx, _) | idx) . unwrap_or ( text. len ( ) )
64- }
65-
66- // ===== UTF-16 Offset Conversions =====
67- //
6826// Android (Java) and iOS (NSString) use UTF-16 code unit offsets, while Rust
6927// strings are UTF-8. These functions convert between the two encodings.
7028//
@@ -76,7 +34,8 @@ pub fn char_count_to_byte_offset(text: &str, char_count: usize) -> usize {
7634/// Converts a UTF-16 code unit offset to a UTF-8 byte offset.
7735///
7836/// Returns `None` if the offset is beyond the string or falls inside a
79- /// surrogate pair.
37+ /// surrogate pair. See [`utf16_offset_to_byte_offset_clamped`] for a
38+ /// variant that clamps instead of returning `None`.
8039///
8140/// # Examples
8241/// ```
@@ -118,6 +77,9 @@ pub fn utf16_offset_to_byte_offset(text: &str, utf16_offset: usize) -> Option<us
11877
11978/// Converts a UTF-8 byte offset to a UTF-16 code unit offset.
12079///
80+ /// This function panics on invalid input because callers are expected to
81+ /// hold valid byte offsets (e.g. from `TextInput::cursor_position`).
82+ ///
12183/// # Panics
12284/// Panics if `byte_offset` is not on a valid UTF-8 character boundary or is
12385/// beyond the string length.
@@ -134,7 +96,7 @@ pub fn utf16_offset_to_byte_offset(text: &str, utf16_offset: usize) -> Option<us
13496/// ```
13597pub fn byte_offset_to_utf16_offset ( text : & str , byte_offset : usize ) -> usize {
13698 assert ! (
137- is_valid_byte_offset ( text, byte_offset) ,
99+ byte_offset <= text. len ( ) && text . is_char_boundary ( byte_offset) ,
138100 "byte_offset {} is not a valid UTF-8 boundary in string of length {}" ,
139101 byte_offset,
140102 text. len( )
@@ -178,74 +140,6 @@ pub fn utf16_offset_to_byte_offset_clamped(text: &str, utf16_offset: usize) -> u
178140mod tests {
179141 use super :: * ;
180142
181- // ===== Byte Offset Utility Tests =====
182-
183- #[ test]
184- fn test_is_valid_byte_offset ( ) {
185- let text = "héllo" ; // é is 2 bytes
186- assert ! ( is_valid_byte_offset( text, 0 ) ) ;
187- assert ! ( is_valid_byte_offset( text, 1 ) ) ;
188- assert ! ( !is_valid_byte_offset( text, 2 ) ) ; // middle of é
189- assert ! ( is_valid_byte_offset( text, 3 ) ) ;
190- assert ! ( is_valid_byte_offset( text, 6 ) ) ; // end of string
191- assert ! ( !is_valid_byte_offset( text, 7 ) ) ; // beyond string
192- }
193-
194- #[ test]
195- fn test_is_valid_byte_offset_empty_string ( ) {
196- assert ! ( is_valid_byte_offset( "" , 0 ) ) ;
197- assert ! ( !is_valid_byte_offset( "" , 1 ) ) ;
198- }
199-
200- #[ test]
201- fn test_is_valid_byte_offset_multibyte ( ) {
202- let text = "日本語" ; // each kanji is 3 bytes
203- assert ! ( is_valid_byte_offset( text, 0 ) ) ;
204- assert ! ( !is_valid_byte_offset( text, 1 ) ) ;
205- assert ! ( !is_valid_byte_offset( text, 2 ) ) ;
206- assert ! ( is_valid_byte_offset( text, 3 ) ) ;
207- assert ! ( is_valid_byte_offset( text, 6 ) ) ;
208- assert ! ( is_valid_byte_offset( text, 9 ) ) ;
209- }
210-
211- #[ test]
212- fn test_is_valid_byte_offset_emoji ( ) {
213- let text = "a😀b" ; // 'a'=1, '😀'=4, 'b'=1
214- assert ! ( is_valid_byte_offset( text, 0 ) ) ;
215- assert ! ( is_valid_byte_offset( text, 1 ) ) ;
216- assert ! ( !is_valid_byte_offset( text, 2 ) ) ;
217- assert ! ( !is_valid_byte_offset( text, 3 ) ) ;
218- assert ! ( !is_valid_byte_offset( text, 4 ) ) ;
219- assert ! ( is_valid_byte_offset( text, 5 ) ) ;
220- assert ! ( is_valid_byte_offset( text, 6 ) ) ;
221- }
222-
223- #[ test]
224- fn test_floor_byte_offset ( ) {
225- let text = "héllo" ;
226- assert_eq ! ( floor_byte_offset( text, 0 ) , 0 ) ;
227- assert_eq ! ( floor_byte_offset( text, 1 ) , 1 ) ;
228- assert_eq ! ( floor_byte_offset( text, 2 ) , 1 ) ; // middle of é → start of é
229- assert_eq ! ( floor_byte_offset( text, 3 ) , 3 ) ;
230- assert_eq ! ( floor_byte_offset( text, 10 ) , 6 ) ; // beyond → end
231- }
232-
233- #[ test]
234- fn test_floor_byte_offset_multibyte ( ) {
235- let text = "日本語" ;
236- assert_eq ! ( floor_byte_offset( text, 1 ) , 0 ) ;
237- assert_eq ! ( floor_byte_offset( text, 2 ) , 0 ) ;
238- assert_eq ! ( floor_byte_offset( text, 3 ) , 3 ) ;
239- assert_eq ! ( floor_byte_offset( text, 4 ) , 3 ) ;
240- assert_eq ! ( floor_byte_offset( text, 5 ) , 3 ) ;
241- }
242-
243- #[ test]
244- fn test_floor_byte_offset_empty ( ) {
245- assert_eq ! ( floor_byte_offset( "" , 0 ) , 0 ) ;
246- assert_eq ! ( floor_byte_offset( "" , 5 ) , 0 ) ;
247- }
248-
249143 #[ test]
250144 fn test_ceil_byte_offset ( ) {
251145 let text = "héllo" ;
@@ -271,97 +165,6 @@ mod tests {
271165 assert_eq ! ( ceil_byte_offset( "" , 5 ) , 0 ) ;
272166 }
273167
274- #[ test]
275- fn test_floor_ceil_at_exact_boundary ( ) {
276- let text = "abc" ;
277- for i in 0 ..=text. len ( ) {
278- assert_eq ! ( floor_byte_offset( text, i) , i) ;
279- assert_eq ! ( ceil_byte_offset( text, i) , i) ;
280- }
281- }
282-
283- #[ test]
284- fn test_byte_offset_to_char_count ( ) {
285- let text = "héllo" ;
286- assert_eq ! ( byte_offset_to_char_count( text, 0 ) , 0 ) ;
287- assert_eq ! ( byte_offset_to_char_count( text, 1 ) , 1 ) ;
288- assert_eq ! ( byte_offset_to_char_count( text, 3 ) , 2 ) ;
289- assert_eq ! ( byte_offset_to_char_count( text, 6 ) , 5 ) ;
290- }
291-
292- #[ test]
293- fn test_byte_offset_to_char_count_emoji ( ) {
294- let text = "a😀b" ;
295- assert_eq ! ( byte_offset_to_char_count( text, 0 ) , 0 ) ;
296- assert_eq ! ( byte_offset_to_char_count( text, 1 ) , 1 ) ;
297- assert_eq ! ( byte_offset_to_char_count( text, 5 ) , 2 ) ;
298- assert_eq ! ( byte_offset_to_char_count( text, 6 ) , 3 ) ;
299- }
300-
301- #[ test]
302- fn test_char_count_to_byte_offset ( ) {
303- let text = "héllo" ;
304- assert_eq ! ( char_count_to_byte_offset( text, 0 ) , 0 ) ;
305- assert_eq ! ( char_count_to_byte_offset( text, 1 ) , 1 ) ;
306- assert_eq ! ( char_count_to_byte_offset( text, 2 ) , 3 ) ;
307- assert_eq ! ( char_count_to_byte_offset( text, 5 ) , 6 ) ;
308- assert_eq ! ( char_count_to_byte_offset( text, 10 ) , 6 ) ; // beyond → end
309- }
310-
311- #[ test]
312- fn test_char_count_to_byte_offset_emoji ( ) {
313- let text = "a😀b" ;
314- assert_eq ! ( char_count_to_byte_offset( text, 0 ) , 0 ) ;
315- assert_eq ! ( char_count_to_byte_offset( text, 1 ) , 1 ) ;
316- assert_eq ! ( char_count_to_byte_offset( text, 2 ) , 5 ) ;
317- assert_eq ! ( char_count_to_byte_offset( text, 3 ) , 6 ) ;
318- }
319-
320- #[ test]
321- fn test_roundtrip_byte_char_conversion ( ) {
322- let text = "héllo 日本語 😀" ;
323- for ( idx, _) in text. char_indices ( ) {
324- let char_count = byte_offset_to_char_count ( text, idx) ;
325- let back = char_count_to_byte_offset ( text, char_count) ;
326- assert_eq ! ( back, idx, "Roundtrip failed for byte offset {}" , idx) ;
327- }
328- let char_count = byte_offset_to_char_count ( text, text. len ( ) ) ;
329- assert_eq ! ( char_count_to_byte_offset( text, char_count) , text. len( ) ) ;
330- }
331-
332- #[ test]
333- fn test_byte_offset_conversions_empty ( ) {
334- assert_eq ! ( byte_offset_to_char_count( "" , 0 ) , 0 ) ;
335- assert_eq ! ( char_count_to_byte_offset( "" , 0 ) , 0 ) ;
336- assert_eq ! ( char_count_to_byte_offset( "" , 5 ) , 0 ) ;
337- }
338-
339- #[ test]
340- fn test_surrogate_pairs ( ) {
341- let text = "𝄞" ; // Musical G clef, 4 bytes in UTF-8
342- assert_eq ! ( text. len( ) , 4 ) ;
343- assert ! ( is_valid_byte_offset( text, 0 ) ) ;
344- assert ! ( !is_valid_byte_offset( text, 1 ) ) ;
345- assert ! ( !is_valid_byte_offset( text, 2 ) ) ;
346- assert ! ( !is_valid_byte_offset( text, 3 ) ) ;
347- assert ! ( is_valid_byte_offset( text, 4 ) ) ;
348- assert_eq ! ( floor_byte_offset( text, 2 ) , 0 ) ;
349- assert_eq ! ( ceil_byte_offset( text, 2 ) , 4 ) ;
350- }
351-
352- #[ test]
353- fn test_combining_characters ( ) {
354- let text = "e\u{0301}" ; // 'e' + combining acute accent
355- assert_eq ! ( text. chars( ) . count( ) , 2 ) ;
356- assert_eq ! ( text. len( ) , 3 ) ;
357- assert ! ( is_valid_byte_offset( text, 0 ) ) ;
358- assert ! ( is_valid_byte_offset( text, 1 ) ) ;
359- assert ! ( !is_valid_byte_offset( text, 2 ) ) ;
360- assert ! ( is_valid_byte_offset( text, 3 ) ) ;
361- assert_eq ! ( byte_offset_to_char_count( text, 1 ) , 1 ) ;
362- assert_eq ! ( byte_offset_to_char_count( text, 3 ) , 2 ) ;
363- }
364-
365168 // ===== UTF-16 Conversion Tests =====
366169
367170 #[ test]
0 commit comments