Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
/test/valid
/test/iterate
/test/case
/test/iscase
/test/custom
/tmp/
/mingw_static/
Expand Down
14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ clean:
ifneq ($(OS),Darwin)
rm -f libutf8proc.so.$(MAJOR)
endif
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase
rm -rf MANIFEST.new tmp
$(MAKE) -C bench clean
$(MAKE) -C data clean
Expand Down Expand Up @@ -129,6 +129,12 @@ data/NormalizationTest.txt:
data/GraphemeBreakTest.txt:
$(MAKE) -C data GraphemeBreakTest.txt

# Produce data/Lowercase.txt by delegating to the Makefile in the data/
# directory; the directory and file name are taken from the target itself.
data/Lowercase.txt:
	$(MAKE) -C $(@D) $(@F)

# Produce data/Uppercase.txt by delegating to the Makefile in the data/
# directory; the directory and file name are taken from the target itself.
data/Uppercase.txt:
	$(MAKE) -C $(@D) $(@F)

test/tests.o: test/tests.c test/tests.h utf8proc.h
$(CC) $(UCFLAGS) -c -o test/tests.o test/tests.c

Expand All @@ -150,6 +156,9 @@ test/valid: test/valid.c test/tests.o utf8proc.o utf8proc.h test/tests.h
test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) test/iterate.c test/tests.o utf8proc.o -o $@

# Link the islower/isupper test program. $< is the first prerequisite
# (test/iscase.c); the headers are listed only so that edits trigger a rebuild.
test/iscase: test/iscase.c test/tests.o utf8proc.o utf8proc.h test/tests.h
	$(CC) $(UCFLAGS) $(LDFLAGS) $< test/tests.o utf8proc.o -o $@

test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) test/case.c test/tests.o utf8proc.o -o $@

Expand All @@ -159,7 +168,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
Expand All @@ -168,4 +177,5 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
test/valid
test/iterate
test/case
test/iscase data/Lowercase.txt data/Uppercase.txt
test/custom
7 changes: 7 additions & 0 deletions data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ GraphemeBreakTest.txt:
emoji-data.txt:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt

# Extract the "Derived Property: Uppercase" section (up to its
# "# Total code points:" trailer) from DerivedCoreProperties.txt.
# The shell redirection creates $@ before Ruby runs, so on failure the partial
# file is removed; otherwise an empty Uppercase.txt would look up to date and
# never be regenerated.
Uppercase.txt: DerivedCoreProperties.txt
	$(RUBY) -e 'puts File.read("$<")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@ || { rm -f $@; exit 1; }

# Extract the "Derived Property: Lowercase" section (up to its
# "# Total code points:" trailer) from DerivedCoreProperties.txt.
# The shell redirection creates $@ before Ruby runs, so on failure the partial
# file is removed; otherwise an empty Lowercase.txt would look up to date and
# never be regenerated.
Lowercase.txt: DerivedCoreProperties.txt
	$(RUBY) -e 'puts File.read("$<")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@ || { rm -f $@; exit 1; }

clean:
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
rm -f Uppercase.txt Lowercase.txt
rm -f utf8proc_data.c.new
26 changes: 24 additions & 2 deletions data/data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,26 @@
end
end

# Build $uppercase: every code point listed in the "Derived Property: Uppercase"
# section of DerivedCoreProperties.txt. Lines are either a range
# ("XXXX..YYYY", expanded inclusively) or a single hex code point; any other
# line (comments, blanks) is ignored.
$uppercase_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]
$uppercase = []
$uppercase_list.each_line do |line|
  case line
  when /^([0-9A-F]+)\.\.([0-9A-F]+)/
    $uppercase.concat(($1.hex..$2.hex).to_a)
  when /^[0-9A-F]+/
    $uppercase << $&.hex
  end
end

# Build $lowercase: every code point listed in the "Derived Property: Lowercase"
# section of DerivedCoreProperties.txt. Lines are either a range
# ("XXXX..YYYY", expanded inclusively) or a single hex code point; any other
# line (comments, blanks) is ignored.
$lowercase_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]
$lowercase = []
$lowercase_list.each_line do |line|
  case line
  when /^([0-9A-F]+)\.\.([0-9A-F]+)/
    $lowercase.concat(($1.hex..$2.hex).to_a)
  when /^[0-9A-F]+/
    $lowercase << $&.hex
  end
end

$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
Expand Down Expand Up @@ -204,8 +224,10 @@ def initialize(line)
$8.split.collect { |element| element.hex }
@bidi_mirrored = ($13=='Y') ? true : false
# issue #130: use nonstandard uppercase ß -> ẞ
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
@lowercase_mapping = ($17=='') ? nil : $17.hex
# issue #195: if character is uppercase but has no lowercase mapping,
# then make lowercase mapping = itself (vice versa for lowercase)
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : ($17=='' && $lowercase.include?(code) ? code : nil)) : $16.hex
@lowercase_mapping = ($17=='') ? ($16=='' && $uppercase.include?(code) ? code : nil) : $17.hex
@titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
end
def case_folding
Expand Down
62 changes: 62 additions & 0 deletions test/iscase.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include "tests.h"

/* Read one line from the data file `f` and parse the code point, or the
 * "first..last" code point range, at its start.
 *
 * Returns 0 when the line is blank or starts (after leading spaces) with '#',
 * in which case *start and *end are left untouched. Otherwise returns 1 with
 * *start and *end set; for a single code point *end equals *start.
 *
 * NOTE(review): simple_getline, skipspaces, encode and check come from
 * tests.h; encode() presumably turns the hex text into UTF-8 in its first
 * argument and returns the number of input bytes consumed plus one (hence the
 * "- 1" below) -- confirm against tests.c. */
int read_range(FILE *f, utf8proc_int32_t *start, utf8proc_int32_t *end)
{
    unsigned char line[8192];
    size_t length = simple_getline(line, f);
    size_t cursor = skipspaces(line, 0);
    unsigned char encoded[16];

    /* Nothing to parse on empty lines and comment lines. */
    if (cursor == length || line[cursor] == '#') {
        return 0;
    }

    /* First (or only) code point of the entry. */
    cursor += encode(encoded, line + cursor) - 1;
    check(encoded[0], "invalid line %s in data", line);
    utf8proc_iterate((utf8proc_uint8_t*) encoded, -1, start);

    /* Without a ".." separator the entry is a single code point. */
    if (line[cursor] != '.' || line[cursor+1] != '.') {
        *end = *start;
        return 1;
    }

    /* Upper bound of the range follows the ".." separator. */
    encode(encoded, line + cursor + 2);
    check(encoded[0], "invalid line %s in data", line);
    utf8proc_iterate((utf8proc_uint8_t*) encoded, -1, end);
    return 1;
}

/* Check the predicate `iscase` against the code point ranges listed in the
 * data file `fname`.
 *
 * The ranges are consumed in file order while a running code point walks
 * upward from 0: every code point below the next range must fail `iscase`,
 * every code point inside a range must satisfy it and must also be left
 * unchanged by `thatcase`, and every code point after the last range, up to
 * and including 0x110000, must fail `iscase` again.
 *
 * Prints a summary line and returns `success`, which is initialised to 1 and
 * never cleared here; failures are reported through check().
 * NOTE(review): check() presumably aborts the program on failure -- confirm
 * in tests.h. */
int test_iscase(const char *fname, int (*iscase)(utf8proc_int32_t),
                utf8proc_int32_t (*thatcase)(utf8proc_int32_t))
{
    FILE *data = fopen(fname, "r");
    int lines = 0, tests = 0, success = 1;
    utf8proc_int32_t cp = 0;

    check(data != NULL, "error opening data file \"%s\"\n", fname);

    for (; success && !feof(data); ++lines) {
        utf8proc_int32_t first, last;

        /* Skip lines that carry no range (blank or comment). */
        if (!read_range(data, &first, &last)) {
            continue;
        }

        /* Gap before the range: the property must not hold. */
        while (cp < first) {
            check(!iscase(cp), "failed !iscase(%04x) in %s\n", cp, fname);
            ++cp;
        }

        /* Inside the range: the property holds and the mapping is a no-op. */
        while (cp <= last) {
            check(iscase(cp), "failed iscase(%04x) in %s\n", cp, fname);
            check(thatcase(cp) == cp, "inconsistent thatcase(%04x) in %s\n", cp, fname);
            ++tests;
            ++cp;
        }
    }

    /* Everything after the last listed range must not have the property. */
    while (cp <= 0x110000) {
        check(!iscase(cp), "failed !iscase(%04x) in %s\n", cp, fname);
        ++cp;
    }

    printf("Checked %d characters from %d lines of %s\n", tests, lines, fname);
    fclose(data);
    return success;
}

/* Entry point: expects exactly two arguments, the Lowercase data file and the
 * Uppercase data file, in that order. Runs the lower-case checks
 * (utf8proc_islower / utf8proc_tolower) on the first and the upper-case checks
 * (utf8proc_isupper / utf8proc_toupper) on the second. */
int main(int argc, char **argv)
{
    const char *lowercase_file;
    const char *uppercase_file;

    check(argc == 3, "Expected Lowercase.txt and Uppercase.txt as arguments");
    lowercase_file = argv[1];
    uppercase_file = argv[2];

    check(test_iscase(lowercase_file, utf8proc_islower, utf8proc_tolower), "Lowercase tests failed");
    check(test_iscase(uppercase_file, utf8proc_isupper, utf8proc_toupper), "Uppercase tests failed");

    printf("utf8proc iscase tests SUCCEEDED.\n");
    return 0;
}
12 changes: 6 additions & 6 deletions test/printproperty.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ int main(int argc, char **argv)
" combining_class = %d\n"
" bidi_class = %d\n"
" decomp_type = %d\n"
" uppercase_mapping = %x\n"
" lowercase_mapping = %x\n"
" titlecase_mapping = %x\n"
" uppercase_mapping = %04x (seqindex %04x)%s\n"
" lowercase_mapping = %04x (seqindex %04x)%s\n"
" titlecase_mapping = %04x (seqindex %04x)\n"
" casefold = %s\n"
" comb_index = %d\n"
" bidi_mirrored = %d\n"
Expand All @@ -43,9 +43,9 @@ int main(int argc, char **argv)
p->combining_class,
p->bidi_class,
p->decomp_type,
utf8proc_toupper(c),
utf8proc_tolower(c),
utf8proc_totitle(c),
utf8proc_toupper(c), p->uppercase_seqindex, utf8proc_isupper(c) ? " (isupper)" : "",
utf8proc_tolower(c), p->lowercase_seqindex, utf8proc_islower(c) ? " (islower)" : "",
utf8proc_totitle(c), p->titlecase_seqindex,
(char *) map,
p->comb_index,
p->bidi_mirrored,
Expand Down
12 changes: 12 additions & 0 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
}

/* Return 1 if `c` is a lower-case character, 0 otherwise.
 *
 * A code point counts as lower-case when its property record has no
 * lower-case mapping (lowercase_seqindex is UINT16_MAX) but does have an
 * upper-case mapping (uppercase_seqindex is not UINT16_MAX). */
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  if (prop->lowercase_seqindex != UINT16_MAX) {
    return 0;
  }
  return prop->uppercase_seqindex != UINT16_MAX;
}

/* Return 1 if `c` is an upper-case character, 0 otherwise.
 *
 * A code point counts as upper-case when its property record has no
 * upper-case mapping (uppercase_seqindex is UINT16_MAX) but does have a
 * lower-case mapping (lowercase_seqindex is not UINT16_MAX). Title-case
 * letters (category UTF8PROC_CATEGORY_LT) are explicitly excluded. */
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  if (prop->uppercase_seqindex != UINT16_MAX) {
    return 0;
  }
  if (prop->lowercase_seqindex == UINT16_MAX) {
    return 0;
  }
  return prop->category != UTF8PROC_CATEGORY_LT;
}

/* return a character width analogous to wcwidth (except portable and
hopefully less buggy than most system wcwidth functions). */
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
Expand Down
12 changes: 12 additions & 0 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
*/
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);

/**
 * Given a codepoint `c`, return `1` if the codepoint corresponds to a lower-case character
 * and `0` otherwise.
 *
 * The test suite (test/iscase.c) checks this predicate against the `Lowercase` derived
 * property extracted from DerivedCoreProperties.txt, and also checks that
 * `utf8proc_tolower(c) == c` for every codepoint where it returns `1`.
 */
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c);

/**
 * Given a codepoint `c`, return `1` if the codepoint corresponds to an upper-case character
 * and `0` otherwise.
 *
 * Title-case letters (category `UTF8PROC_CATEGORY_LT`) are not reported as upper-case.
 * The test suite (test/iscase.c) checks this predicate against the `Uppercase` derived
 * property extracted from DerivedCoreProperties.txt, and also checks that
 * `utf8proc_toupper(c) == c` for every codepoint where it returns `1`.
 */
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c);

/**
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
* except that a width of 0 is returned for non-printable codepoints
Expand Down
Loading