Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ utf8proc_data.c.new
printproperty
charwidth
valid
iterate
iterate
case
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,14 @@ test/valid: test/valid.c utf8proc.o utf8proc.h test/tests.h
test/iterate: test/iterate.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/iterate.c utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
test/case: test/case.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/case.c utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
test/charwidth
test/valid
test/iterate
test/case
50 changes: 50 additions & 0 deletions test/case.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include "tests.h"
#include <wctype.h>

int main(int argc, char **argv)
{
int error = 0, better = 0;
utf8proc_int32_t c;

(void) argc; /* unused */
(void) argv; /* unused */

/* some simple sanity tests of the character widths */
for (c = 0; c <= 0x110000; ++c) {
utf8proc_int32_t l = utf8proc_tolower(c);
utf8proc_int32_t u = utf8proc_toupper(c);

check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");

if (sizeof(wint_t) > 2 || c < (1<<16)) {
wint_t l0 = towlower(c), u0 = towupper(c);

/* OS unicode tables may be out of date. But if they
do have a lower/uppercase mapping, hopefully it
is correct? */
if (l0 != c && l0 != l) {
fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n",
l, c, l0);
++error;
}
else if (l0 != l) { /* often true for out-of-date OS unicode */
++better;
/* printf("%x != towlower(%x) == %x\n", l, c, l0); */
}
if (u0 != c && u0 != u) {
fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n",
u, c, u0);
++error;
}
else if (u0 != u) { /* often true for out-of-date OS unicode */
++better;
/* printf("%x != towupper(%x) == %x\n", u, c, u0); */
}
}
}
check(!error, "utf8proc case conversion FAILED %d tests.", error);
printf("More up-to-date than OS unicode tables for %d tests.\n", better);
printf("utf8proc case conversion tests SUCCEEDED.\n");
return 0;
}
12 changes: 12 additions & 0 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,18 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, ut
utf8proc_get_property(c2)->boundclass);
}

UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
return cl >= 0 ? cl : c;
}

UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
return cu >= 0 ? cu : c;
}

/* return a character width analogous to wcwidth (except portable and
hopefully less buggy than most system wcwidth functions). */
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
Expand Down
15 changes: 15 additions & 0 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
*/
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);


/**
* Given a codepoint `c`, return the codepoint of the corresponding
* lower-case character, if any; otherwise (if there is no lower-case
* variant, or if `c` is not a valid codepoint) return `c`.
*/
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);

/**
* Given a codepoint `c`, return the codepoint of the corresponding
* upper-case character, if any; otherwise (if there is no upper-case
* variant, or if `c` is not a valid codepoint) return `c`.
*/
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);

/**
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
* except that a width of 0 is returned for non-printable codepoints
Expand Down