Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ disallow_intree_builds()
project (utf8proc C)

# Be sure to also update these in Makefile!
set(SO_MAJOR 2)
set(SO_MAJOR 3)
set(SO_MINOR 0)
set(SO_PATCH 1)
set(SO_PATCH 0)

add_definitions (
-DUTF8PROC_EXPORTS
Expand Down
6 changes: 3 additions & 3 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ include/
include/utf8proc.h
lib/
lib/libutf8proc.a
lib/libutf8proc.so -> libutf8proc.so.2.0.1
lib/libutf8proc.so.2 -> libutf8proc.so.2.0.1
lib/libutf8proc.so.2.0.1
lib/libutf8proc.so -> libutf8proc.so.3.0.0
lib/libutf8proc.so.3 -> libutf8proc.so.3.0.0
lib/libutf8proc.so.3.0.0
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
# not API compatibility: MAJOR should be incremented whenever *binary*
# compatibility is broken, even if the API is backward-compatible
# Be sure to also update these in MANIFEST and CMakeLists.txt!
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you forget to update CMakeLists.txt too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that you also need to update utf8proc.h for the API version change.

MAJOR=2
MAJOR=3
MINOR=0
PATCH=1
PATCH=0

OS := $(shell uname)
ifeq ($(OS),Darwin) # MacOS X
Expand Down
6 changes: 3 additions & 3 deletions data/data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ def c_entry(comb1_indicies, comb2_indicies)
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_boundclass[code]}, " <<
"#{$charwidth[code]}},\n"
"#{$charwidth[code]}, 0, " <<
"#{$grapheme_boundclass[code]}},\n"
end
end

Expand Down Expand Up @@ -306,7 +306,7 @@ def c_entry(comb1_indicies, comb2_indicies)
$stdout << "};\n\n"

$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false,0,0,UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line|
$stdout << line
}
Expand Down
110 changes: 80 additions & 30 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -233,36 +233,87 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
}

/* return whether there is a grapheme break between boundclasses lbc and tbc */
static utf8proc_bool grapheme_break(int lbc, int tbc) {
return
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
(lbc == UTF8PROC_BOUNDCLASS_CR &&
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
(lbc == UTF8PROC_BOUNDCLASS_L &&
(tbc == UTF8PROC_BOUNDCLASS_L ||
tbc == UTF8PROC_BOUNDCLASS_V ||
tbc == UTF8PROC_BOUNDCLASS_LV ||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
((lbc == UTF8PROC_BOUNDCLASS_LV ||
lbc == UTF8PROC_BOUNDCLASS_V) &&
(tbc == UTF8PROC_BOUNDCLASS_V ||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
lbc == UTF8PROC_BOUNDCLASS_T) &&
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
(tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
/* return whether there is a grapheme break between boundclasses lbc and tbc
(according to the definition of extended grapheme clusters)

Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
http://www.unicode.org/reports/tr29/tr29-29.html

CAVEATS:
Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
and GB 12/13 (regional indicator code points) require knowledge of previous characters
and are thus not handled by this function. This may result in an incorrect break before
an E_Modifier class codepoint and an incorrectly missing break between two
REGIONAL_INDICATOR class code points if such support does not exist in the caller.

See the special support in grapheme_break_extended, for required bookkeeping by the caller.
*/
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
return
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
(lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
(lbc == UTF8PROC_BOUNDCLASS_L && // GB6
(tbc == UTF8PROC_BOUNDCLASS_L || // ---
tbc == UTF8PROC_BOUNDCLASS_V || // ---
tbc == UTF8PROC_BOUNDCLASS_LV || // ---
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
lbc == UTF8PROC_BOUNDCLASS_V) && // ---
(tbc == UTF8PROC_BOUNDCLASS_V || // ---
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
lbc == UTF8PROC_BOUNDCLASS_T) && // ---
tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
(tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
true; // GB999
}

/* return whether there is a grapheme break between codepoints c1 and c2 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
return grapheme_break(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass);
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
int lbc_override = lbc;
if (state && *state != UTF8PROC_BOUNDCLASS_START)
lbc_override = *state;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Keno, I just noticed that lbc_override seems to never be used. Did you mean to pass lbc_override to grapheme_break_simple?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's been a while, but yes, that looks right, esp by looking at the first commit in this PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already seem to have found that bug though: https://github.com/JuliaLang/utf8proc/blob/master/utf8proc.c#L290

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#77

utf8proc_bool break_permitted = grapheme_break_simple(lbc, tbc);
if (state) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it really necessary to have general state like this? Wouldn't it be enough just to know the previous codepoint before lbc?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, there may be an infinite number of EXTEND characters between E_Base and E_Modifier.

// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
// after that character according to GB999 (unless of course such a break is
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB10. Fold any EXTEND codepoints into the previous
// boundclass if we're dealing with an emoji base boundclass.
else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
tbc == UTF8PROC_BOUNDCLASS_EXTEND)
*state = UTF8PROC_BOUNDCLASS_E_BASE;
else
*state = tbc;
}
return break_permitted;
}

UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {

return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass,
state);
}

UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
Expand Down Expand Up @@ -388,8 +439,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
if (options & UTF8PROC_CHARBOUND) {
utf8proc_bool boundary;
int tbc = property->boundclass;
boundary = grapheme_break(*last_boundclass, tbc);
*last_boundclass = tbc;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
if (boundary) {
if (bufsize >= 1) dst[0] = 0xFFFF;
if (bufsize >= 2) dst[1] = uc;
Expand Down
32 changes: 25 additions & 7 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@
*/
/** @{ */
/** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 1
#define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 3
#define UTF8PROC_VERSION_MINOR 0
/** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 0
/** @} */
Expand Down Expand Up @@ -259,13 +259,14 @@ typedef struct utf8proc_property_struct {
*/
unsigned ignorable:1;
unsigned control_boundary:1;
/** The width of the codepoint. */
unsigned charwidth:2;
unsigned pad:2;
/**
* Boundclass.
* @see utf8proc_boundclass_t.
*/
unsigned boundclass:4;
/** The width of the codepoint. */
unsigned charwidth:2;
unsigned boundclass:8;
} utf8proc_property_t;

/** Unicode categories. */
Expand Down Expand Up @@ -349,7 +350,7 @@ typedef enum {
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
} utf8proc_decomp_type_t;

/** Boundclass property. */
/** Boundclass property. (TR29) */
typedef enum {
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
Expand All @@ -364,6 +365,12 @@ typedef enum {
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */
} utf8proc_boundclass_t;

/**
Expand Down Expand Up @@ -513,8 +520,19 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
/**
* Given a pair of consecutive codepoints, return whether a grapheme break is
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
*
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
* state to break graphemes. This state can be passed in as a pointer
* in the `state` argument and should initially be set to 0. If the
* state is not passed in (i.e. a null pointer is passed), UAX#29 rules
* GB10/12/13 which require this state will not be applied, essentially
* matching the rules in Unicode 8.0.0.
*
* @warning If the state parameter is used, `utf8proc_grapheme_break` must be called
* IN ORDER on ALL potential breaks in a string.
*/
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);


/**
Expand Down
Loading