Skip to content

Commit 409a44d

Browse files
committed
Fix: Vietnamese odd-even fold in ZMM
1 parent 5948374 commit 409a44d

1 file changed

Lines changed: 21 additions & 6 deletions

File tree

include/stringzilla/utf8_case.h

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6244,14 +6244,29 @@ SZ_INTERNAL __m512i sz_utf8_case_insensitive_find_ice_vietnamese_fold_zmm_(__m51
62446244

62456245
result_zmm = _mm512_mask_add_epi8(result_zmm, is_c3_target, result_zmm, x_20_zmm);
62466246

6247-
// 2. Latin Extended-A (C4/C5): Even -> Odd (+1)
6248-
// Covers most chars including Đ (C4 90) -> đ (C4 91)
6249-
// Target is any byte following C4 or C5 which is even
6247+
// 2. Latin Extended-A (C4/C5): Even -> Odd (+1) for MOST characters
6248+
// Standard pattern (U+0100-U+0138, U+014A-U+017F): Even=uppercase, Odd=lowercase
6249+
// INVERTED pattern (U+0139-U+0148): Odd=uppercase, Even=lowercase
6250+
// - After C4: B9,BB,BD,BF are uppercase (odd), BA,BC,BE are lowercase (even)
6251+
// - After C5: 81,83,85,87 are uppercase (odd), 80,82,84,86,88 are lowercase (even)
6252+
// Note: C4 BF (Ŀ) -> C5 80 (ŀ) crosses lead bytes, handled specially by safety profile
62506253
__mmask64 is_c4_c5_target = is_after_c4 | is_after_c5;
6251-
// Check evenness: (val & 1) == 0
62526254
__mmask64 is_even = _mm512_cmpeq_epi8_mask(_mm512_and_si512(result_zmm, x_01_zmm), _mm512_setzero_si512());
6253-
6254-
result_zmm = _mm512_mask_add_epi8(result_zmm, is_c4_c5_target & is_even, result_zmm, x_01_zmm);
6255+
__mmask64 is_odd = ~is_even;
6256+
6257+
// Identify the inverted range where Even=lowercase (should NOT be transformed +1)
6258+
// After C4: B9-BE (U+0139-U+013E: Ĺ-ľ inverted pattern)
6259+
// Note: BF (Ŀ U+013F) excluded - its lowercase ŀ (U+0140) is C5 80 (different lead byte)
6260+
__mmask64 is_c4_inverted_range = is_after_c4 & _mm512_cmpge_epu8_mask(result_zmm, _mm512_set1_epi8((char)0xB9)) &
6261+
_mm512_cmple_epu8_mask(result_zmm, _mm512_set1_epi8((char)0xBE));
6262+
// After C5: 80-88 (even bytes 80, 82, 84, 86, 88 are lowercase)
6263+
__mmask64 is_c5_inverted_range = is_after_c5 & _mm512_cmple_epu8_mask(result_zmm, _mm512_set1_epi8((char)0x88));
6264+
__mmask64 is_inverted_range = is_c4_inverted_range | is_c5_inverted_range;
6265+
6266+
// Standard range: apply +1 to even bytes (uppercase -> lowercase)
6267+
result_zmm = _mm512_mask_add_epi8(result_zmm, is_c4_c5_target & is_even & ~is_inverted_range, result_zmm, x_01_zmm);
6268+
// Inverted range: apply +1 to odd bytes (uppercase -> lowercase)
6269+
result_zmm = _mm512_mask_add_epi8(result_zmm, is_inverted_range & is_odd, result_zmm, x_01_zmm);
62556270

62566271
// 3. Latin Extended-B (C6): Specific Vietnamese chars
62576272
// Ơ (C6 A0) -> ơ (C6 A1). Even->Odd.

0 commit comments

Comments
 (0)