Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ project (utf8proc C)
# API version number (defined in utf8proc.h).
# Be sure to also update these in Makefile and MANIFEST!
set(SO_MAJOR 2)
set(SO_MINOR 0)
set(SO_PATCH 2)
set(SO_MINOR 1)
set(SO_PATCH 0)

add_definitions (
-DUTF8PROC_EXPORTS
Expand Down
6 changes: 3 additions & 3 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ include/
include/utf8proc.h
lib/
lib/libutf8proc.a
lib/libutf8proc.so -> libutf8proc.so.2.0.2
lib/libutf8proc.so.2 -> libutf8proc.so.2.0.2
lib/libutf8proc.so.2.0.2
lib/libutf8proc.so -> libutf8proc.so.2.1.0
lib/libutf8proc.so.2 -> libutf8proc.so.2.1.0
lib/libutf8proc.so.2.1.0
12 changes: 8 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
# The API version number is defined in utf8proc.h.
# Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
MAJOR=2
MINOR=0
PATCH=2
MINOR=1
PATCH=0

OS := $(shell uname)
ifeq ($(OS),Darwin) # MacOS X
Expand All @@ -49,7 +49,7 @@ clean:
ifneq ($(OS),Darwin)
rm -f libutf8proc.so.$(MAJOR)
endif
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom
rm -rf MANIFEST.new tmp
$(MAKE) -C bench clean
$(MAKE) -C data clean
Expand Down Expand Up @@ -136,11 +136,15 @@ test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
test/charwidth
test/valid
test/iterate
test/case
test/custom
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# utf8proc release history #

## Version 2.1 (not yet released) ##

- New functions `utf8proc_map_custom` and `utf8proc_decompose_custom`
to allow user-supplied transformations of codepoints, in conjunction
with other transformations ([#89]).

- New function `utf8proc_normalize_utf32` to apply normalizations
directly to UTF-32 data (not just UTF-8) ([#88]).

- Fixed stack overflow that could occur due to incorrect definition
of `UINT16_MAX` with some compilers ([#84]).

## Version 2.0.2 ##

2016-07-27:
Expand Down Expand Up @@ -279,3 +291,6 @@ Release of version 1.0.1
[#78]: https://github.com/JuliaLang/utf8proc/issues/78
[#79]: https://github.com/JuliaLang/utf8proc/issues/79
[#80]: https://github.com/JuliaLang/utf8proc/issues/80
[#84]: https://github.com/JuliaLang/utf8proc/pull/84
[#88]: https://github.com/JuliaLang/utf8proc/pull/88
[#89]: https://github.com/JuliaLang/utf8proc/pull/89
27 changes: 27 additions & 0 deletions test/custom.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include "tests.h"

static int thunk_test = 1;

static utf8proc_int32_t custom(utf8proc_int32_t codepoint, void *thunk)
{
check(((int *) thunk) == &thunk_test, "unexpected thunk passed");
if (codepoint == 'a')
return 'b';
if (codepoint == 'S')
return 0x00df; /* ß */
return codepoint;
}

int main(void)
{
utf8proc_uint8_t input[] = {0x41,0x61,0x53,0x62,0xef,0xbd,0x81,0x00}; /* "AaSb\uff41" */
utf8proc_uint8_t correct[] = {0x61,0x62,0x73,0x73,0x62,0x61,0x00}; /* "abssba" */
utf8proc_uint8_t *output;
utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM,
custom, &thunk_test);
printf("mapped \"%s\" -> \"%s\"\n", (char*)input, (char*)output);
check(strlen((char*) output) == 6, "incorrect output length");
check(!memcmp(correct, output, 7), "incorrect output data");
free(output);
return 0;
}
25 changes: 20 additions & 5 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -391,8 +391,6 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
return s[utf8proc_category(c)];
}



#define utf8proc_decompose_lump(replacement_uc) \
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
options & ~UTF8PROC_LUMP, last_boundclass)
Expand Down Expand Up @@ -485,6 +483,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) {
return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
}

UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
utf8proc_custom_func custom_func, void *custom_data
) {
/* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
utf8proc_ssize_t wpos = 0;
Expand All @@ -511,6 +517,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
}
if (custom_func != NULL) {
uc = custom_func(uc, custom_data); /* user-specified custom mapping */
}
decomp_result = utf8proc_decompose_char(
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
&boundclass
Expand Down Expand Up @@ -683,15 +692,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,

UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
}

UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
utf8proc_custom_func custom_func, void *custom_data
) {
utf8proc_int32_t *buffer;
utf8proc_ssize_t result;
*dstptr = NULL;
result = utf8proc_decompose(str, strlen, NULL, 0, options);
result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
if (result < 0) return result;
buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
if (!buffer) return UTF8PROC_ERROR_NOMEM;
result = utf8proc_decompose(str, strlen, buffer, result, options);
result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
if (result < 0) {
free(buffer);
return result;
Expand Down Expand Up @@ -737,4 +753,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
return retval;
}

41 changes: 36 additions & 5 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@
/** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 0
#define UTF8PROC_VERSION_MINOR 1
/** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 2
#define UTF8PROC_VERSION_PATCH 0
/** @} */

#include <stdlib.h>
Expand Down Expand Up @@ -373,6 +373,13 @@ typedef enum {
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */
} utf8proc_boundclass_t;

/**
* Function pointer type passed to @ref utf8proc_map_custom and
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
* mapping of codepoints to be applied in conjunction with other mappings.
*/
typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);

/**
* Array containing the byte lengths of a UTF-8 encoded codepoint based
* on the first byte.
Expand Down Expand Up @@ -480,6 +487,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
* `buffer` (which must contain at least `bufsize` entries). In case of
* success, the number of codepoints written is returned; in case of an
* error, a negative error code is returned (@ref utf8proc_errmsg).
* See @ref utf8proc_decompose_custom to supply additional transformations.
*
* If the number of written codepoints would be bigger than `bufsize`, the
* required buffer size is returned, while the buffer will be overwritten with
Expand All @@ -490,6 +498,18 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
);

/**
* The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
* that is called on each codepoint in `str` before any other transformations
* (along with a `custom_data` pointer that is passed through to `custom_func`).
* The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom.
*/
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
utf8proc_custom_func custom_func, void *custom_data
);

/**
* Normalizes the sequence of `length` codepoints pointed to by `buffer`
* in-place (i.e., the result is also stored in `buffer`).
Expand Down Expand Up @@ -595,7 +615,7 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
* except that a width of 0 is returned for non-printable codepoints
* instead of -1 as in `wcwidth`.
*
*
* @note
* If you want to check for particular types of non-printable characters,
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
Expand Down Expand Up @@ -623,7 +643,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
* in any case the result will be NULL terminated (though it might
* contain NULL characters with the string if `str` contained NULL
* characters). Other flags in the `options` field are passed to the
* functions defined above, and regarded as described.
* functions defined above, and regarded as described. See also
* @ref utfproc_map_custom to supply a custom codepoint transformation.
*
* In case of success the length of the new string is returned,
* otherwise a negative error code is returned.
Expand All @@ -635,6 +656,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
);

/**
* Like @ref utf8proc_map, but also takes a `custom_func` mapping function
* that is called on each codepoint in `str` before any other transformations
* (along with a `custom_data` pointer that is passed through to `custom_func`).
* The `custom_func` argument is ignored if it is `NULL`.
*/
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
utf8proc_custom_func custom_func, void *custom_data
);

/** @name Unicode normalization
*
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
Expand All @@ -658,4 +690,3 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
#endif

#endif