diff --git a/data/data_generator.rb b/data/data_generator.rb index 37a37807..fa096178 100644 --- a/data/data_generator.rb +++ b/data/data_generator.rb @@ -104,7 +104,7 @@ $case_folding_string = File.open("CaseFolding.txt", :encoding => 'utf-8').read $case_folding = {} $case_folding_string.chomp.split("\n").each do |line| - next unless line =~ /([0-9A-F]+); [CFS]; ([0-9A-F ]+);/i + next unless line =~ /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i $case_folding[$1.hex] = $2.split(" ").collect { |e| e.hex } end diff --git a/utf8proc.c b/utf8proc.c index c14bbe13..002b79c7 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -422,7 +422,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, if (!category) return UTF8PROC_ERROR_NOTASSIGNED; } if (options & UTF8PROC_IGNORE) { - if (property->ignorable) return 0; + if (!category || property->ignorable) return 0; } if (options & UTF8PROC_LUMP) { if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); @@ -753,3 +753,10 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) UTF8PROC_COMPOSE | UTF8PROC_COMPAT); return retval; } + +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_CF(const utf8proc_uint8_t *str) { + utf8proc_uint8_t *retval; + utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | + UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); + return retval; +} diff --git a/utf8proc.h b/utf8proc.h index 495cd960..e2f9faf9 100644 --- a/utf8proc.h +++ b/utf8proc.h @@ -158,7 +158,10 @@ typedef enum { UTF8PROC_COMPOSE = (1<<3), /** Return a result with decomposed characters. */ UTF8PROC_DECOMPOSE = (1<<4), - /** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */ + /** + * Strip "default ignorable characters" such as SOFT-HYPHEN or + * ZERO-WIDTH-SPACE, along with unassigned codepoints. + */ UTF8PROC_IGNORE = (1<<5), /** Return an error, if the input contains unassigned codepoints. 
*/ UTF8PROC_REJECTNA = (1<<6), @@ -676,8 +679,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( /** @name Unicode normalization * - * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC - * normalized version of the null-terminated string `str`. These + * Returns a pointer to newly allocated memory of an NFD, NFC, NFKD, NFKC or + * NFKC_Casefold normalized version of the null-terminated string `str`. These * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM * combined with @ref UTF8PROC_STABLE and flags indicating the normalization. */ @@ -690,6 +693,11 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); +/** + * NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE, @ref UTF8PROC_COMPAT, + * @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE, so unassigned codepoints are stripped too). + */ +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_CF(const utf8proc_uint8_t *str); /** @} */ #ifdef __cplusplus