Skip to content

Commit 36d4201

Browse files
committed
add toupper/tolower functions (for JuliaLang/julia#11471)
1 parent 7c14ef5 commit 36d4201

5 files changed

Lines changed: 83 additions & 1 deletion

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ graphemetest
2121
utf8proc_data.c.new
2222
printproperty
2323
charwidth
24+
case

Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,12 @@ test/printproperty: test/printproperty.c utf8proc.o utf8proc.h test/tests.h
105105
test/charwidth: test/charwidth.c utf8proc.o utf8proc.h test/tests.h
106106
$(cc) test/charwidth.c utf8proc.o -o $@
107107

108-
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth bench/bench.c bench/util.c bench/util.h utf8proc.o
108+
test/case: test/case.c utf8proc.o utf8proc.h test/tests.h
109+
$(cc) test/case.c utf8proc.o -o $@
110+
111+
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth test/case bench/bench.c bench/util.c bench/util.h utf8proc.o
109112
$(MAKE) -C bench
110113
test/normtest data/NormalizationTest.txt
111114
test/graphemetest data/GraphemeBreakTest.txt
112115
test/charwidth
116+
test/case

test/case.c

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#include "tests.h"
2+
#include <wctype.h>
3+
4+
int main(int argc, char **argv)
5+
{
6+
int error = 0, better = 0;
7+
utf8proc_int32_t c;
8+
9+
(void) argc; /* unused */
10+
(void) argv; /* unused */
11+
12+
/* some simple sanity tests of the character widths */
13+
for (c = 0; c <= 0x110000; ++c) {
14+
utf8proc_int32_t l = utf8proc_tolower(c);
15+
utf8proc_int32_t u = utf8proc_toupper(c);
16+
17+
check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
18+
check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
19+
20+
if (sizeof(wint_t) > 2 || c < (1<<16)) {
21+
wint_t l0 = towlower(c), u0 = towupper(c);
22+
23+
/* OS unicode tables may be out of date. But if they
24+
do have a lower/uppercase mapping, hopefully it
25+
is correct? */
26+
if (l0 != c && l0 != l) {
27+
fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n",
28+
l, c, l0);
29+
++error;
30+
}
31+
else if (l0 != l) { /* often true for out-of-date OS unicode */
32+
++better;
33+
/* printf("%x != towlower(%x) == %x\n", l, c, l0); */
34+
}
35+
if (u0 != c && u0 != u) {
36+
fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n",
37+
u, c, u0);
38+
++error;
39+
}
40+
else if (u0 != u) { /* often true for out-of-date OS unicode */
41+
++better;
42+
/* printf("%x != towupper(%x) == %x\n", u, c, u0); */
43+
}
44+
}
45+
}
46+
check(!error, "utf8proc case conversion FAILED %d tests.", error);
47+
printf("More up-to-date than OS unicode tables for %d tests.\n", better);
48+
printf("utf8proc case conversion tests SUCCEEDED.\n");
49+
return 0;
50+
}

utf8proc.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,18 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, ut
231231
utf8proc_get_property(c2)->boundclass);
232232
}
233233

234+
/* Return the lower-case counterpart of `c` as recorded in its property
   entry, or `c` itself when that entry has no lower-case mapping. */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
  const utf8proc_int32_t mapped = utf8proc_get_property(c)->lowercase_mapping;

  /* a negative lowercase_mapping is treated as "no mapping" */
  if (mapped < 0) {
    return c;
  }
  return mapped;
}
239+
240+
/* Return the upper-case counterpart of `c` as recorded in its property
   entry, or `c` itself when that entry has no upper-case mapping. */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
  const utf8proc_int32_t mapped = utf8proc_get_property(c)->uppercase_mapping;

  /* a negative uppercase_mapping is treated as "no mapping" */
  if (mapped < 0) {
    return c;
  }
  return mapped;
}
245+
234246
/* return a character width analogous to wcwidth (except portable and
235247
hopefully less buggy than most system wcwidth functions). */
236248
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {

utf8proc.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
506506
*/
507507
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
508508

509+
510+
/**
511+
* Given a codepoint `c`, return the codepoint of the corresponding
512+
* lower-case character, if any; otherwise (if there is no lower-case
513+
* variant, or if `c` is not a valid codepoint) return `c`.
514+
*/
515+
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
516+
517+
/**
518+
* Given a codepoint `c`, return the codepoint of the corresponding
519+
* upper-case character, if any; otherwise (if there is no upper-case
520+
* variant, or if `c` is not a valid codepoint) return `c`.
521+
*/
522+
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
523+
509524
/**
510525
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
511526
* except that a width of 0 is returned for non-printable codepoints

0 commit comments

Comments
 (0)