Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 59 additions & 4 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,9 +384,17 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
# Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)

# Subtract Default_Ignorable_Code_Point characters (they should be zero-width).
# Exception: U+115F HANGUL CHOSEONG FILLER remains wide for jamo composition.
# See https://github.com/jquast/wcwidth/issues/118
default_ignorable = parse_default_ignorable_code_points(
fname=UnicodeDataFile.DerivedCoreProperties(version))
default_ignorable.discard(0x115F) # Keep HANGUL CHOSEONG FILLER as wide
table[version].values = table[version].values.difference(default_ignorable)

# finally, join with atypical 'wide' characters defined by category 'Sk',
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=2).values)
fname = UnicodeDataFile.DerivedGeneralCategory(version)
table[version].values.update(parse_category(fname=fname, wide=2).values)
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)


Expand All @@ -399,15 +407,32 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
table: dict[UnicodeVersion, TableDef] = {}
for version in fetch_unicode_versions():
# Determine values of zero-width character lookup table by the following category codes
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0)
fname = UnicodeDataFile.DerivedGeneralCategory(version)
table[version] = parse_category(fname=fname, wide=0)

# Include NULL
table[version].values.add(0)

# Add Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values.update(HANGUL_JAMO_ZEROWIDTH)

# Add Default_Ignorable_Code_Point characters
# Per Unicode Standard (https://www.unicode.org/faq/unsup_char.html):
# "All default-ignorable characters should be rendered as completely invisible
# (and non advancing, i.e. 'zero width'), if not explicitly supported in rendering."
#
# See also:
# - https://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point
# - https://github.com/jquast/wcwidth/issues/118
table[version].values.update(parse_default_ignorable_code_points(
fname=UnicodeDataFile.DerivedCoreProperties(version)))

# Remove U+115F HANGUL CHOSEONG FILLER from zero-width table.
# Although it has Default_Ignorable_Code_Point property, it should remain
# width 2 because it combines with other Hangul Jamo to form width-2
# syllable blocks.
table[version].values.discard(0x115F)

# Remove u+00AD categoryCode=Cf name="SOFT HYPHEN",
# > https://www.unicode.org/faq/casemap_charprop.html
#
Expand All @@ -422,6 +447,7 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
# This value was wrongly measured as a width of '0' in this wcwidth
# versions 0.2.9 - 0.2.13. Fixed in 0.2.14
table[version].values.discard(0x00AD) # SOFT HYPHEN

return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand Down Expand Up @@ -711,6 +737,35 @@ def parse_indic_conjunct_breaks(fname: str) -> dict[str, TableDef]:
}


def parse_default_ignorable_code_points(fname: str) -> set[int]:
    """
    Parse DerivedCoreProperties.txt for Default_Ignorable_Code_Point property.

    Each data line is expected as ``XXXX ; Property`` or ``XXXX..YYYY ; Property``
    with hexadecimal code points, optionally followed by a ``#`` comment.

    :param fname: path of the file to read, decoded as UTF-8.
    :returns: every code point listed with ``Default_Ignorable_Code_Point``,
        with ranges expanded inclusively.
    """
    print(f'parsing {fname} for Default_Ignorable_Code_Point: ', end='', flush=True)
    values: set[int] = set()

    with open(fname, encoding='utf-8') as f:
        for line in f:
            # Drop the trailing '# ...' comment; blank and comment-only lines carry no data.
            data = line.partition('#')[0].strip()
            if not data:
                continue

            # Fields are ';'-separated: code point(s) first, property name second.
            parts = [p.strip() for p in data.split(';')]
            if len(parts) < 2 or parts[1] != 'Default_Ignorable_Code_Point':
                continue

            # A single code point is treated as a range of length one.
            start, sep, end = parts[0].partition('..')
            first = int(start, 16)
            last = int(end, 16) if sep else first
            values.update(range(first, last + 1))

    print('ok')
    return values


def fetch_table_grapheme_data() -> GraphemeTableRenderCtx:
"""Fetch grapheme break property tables for the latest Unicode version only."""
latest_version = fetch_unicode_versions()[-1]
Expand Down
3 changes: 3 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,8 @@ History
* **Performance** improvement in `wcswidth()`_. `PR #171`_.
* **New** argument ``ambiguous_width`` to all functions. `PR #172`_.
* **New** Functions `clip()`_ and `strip_sequences()`_. `PR #173`_.
* **Bugfix** Characters with ``Default_Ignorable_Code_Point`` property now
return width 0. `PR #174`_.

0.2.14 *2025-09-22*
* **Drop Support** for Python 2.7 and 3.5. `PR #117`_.
Expand Down Expand Up @@ -591,6 +593,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
.. _`PR #171`: https://github.com/jquast/wcwidth/pull/171
.. _`PR #172`: https://github.com/jquast/wcwidth/pull/172
.. _`PR #173`: https://github.com/jquast/wcwidth/pull/173
.. _`PR #174`: https://github.com/jquast/wcwidth/pull/174
.. _`Issue #101`: https://github.com/jquast/wcwidth/issues/101
.. _`jquast/blessed`: https://github.com/jquast/blessed
.. _`selectel/pyte`: https://github.com/selectel/pyte
Expand Down
6 changes: 6 additions & 0 deletions docs/specs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ return value of :func:`wcwidth.wcswidth` is -1.
Width of 0
----------

Any characters with the ``Default_Ignorable_Code_Point`` property in
`DerivedCoreProperties.txt`_ files (4,174 characters), excluding `U+00AD`_ SOFT HYPHEN
(width 1) and `U+115F`_ HANGUL CHOSEONG FILLER (width 2).

Any characters defined by category codes in `DerivedGeneralCategory.txt`_ files:

- 'Me': Enclosing Combining Mark, approx. 13 characters.
Expand Down Expand Up @@ -74,6 +78,8 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
.. _`U+2029`: https://codepoints.net/U+2029
.. _`U+D7B0`: https://codepoints.net/U+D7B0
.. _`U+FE0F`: https://codepoints.net/U+FE0F
.. _`U+115F`: https://codepoints.net/U+115F
.. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
.. _`DerivedCoreProperties.txt`: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
.. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
.. _`emoji-variation-sequences.txt`: https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-variation-sequences.txt
55 changes: 55 additions & 0 deletions tests/test_width.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,3 +323,58 @@ def test_modern_sequences(seq, expected_width, name):
"""Modern terminal sequences are recognized as zero-width."""
assert wcwidth.width(seq) == expected_width
assert wcwidth.width(seq, control_codes='strict') == expected_width


@pytest.mark.parametrize('codepoint,expected_width', [
    (0x3164, 0),  # HANGUL FILLER
    (0xFFA0, 0),  # HALFWIDTH HANGUL FILLER
    (0x2065, 0),
    (0xFFF0, 0),
    (0xFFF1, 0),
    (0xFFF8, 0),
    (0xE0000, 0),
    (0xE0002, 0),
    (0xE001F, 0),
    (0xE0080, 0),
    (0xE00FF, 0),
    (0xE01F0, 0),
    (0xE0FFF, 0),
])
def test_default_ignorable_zero_width(codepoint, expected_width):
    """Code points with Default_Ignorable_Code_Point measure as width 0."""
    assert wcwidth.wcwidth(chr(codepoint)) == expected_width


@pytest.mark.parametrize('codepoint,expected_width', [
    (0x00AD, 1),  # SOFT HYPHEN
    (0x115F, 2),  # HANGUL CHOSEONG FILLER
])
def test_default_ignorable_exceptions(codepoint, expected_width):
    """Default-ignorable code points exempted from the zero-width rule."""
    assert wcwidth.wcwidth(chr(codepoint)) == expected_width


def test_hangul_filler_zero_width():
    """HANGUL FILLER (U+3164) measures as width 0."""
    assert wcwidth.wcwidth('\u3164') == 0


def test_halfwidth_hangul_filler_zero_width():
    """HALFWIDTH HANGUL FILLER (U+FFA0) measures as width 0."""
    assert wcwidth.wcwidth('\uFFA0') == 0


def test_hangul_choseong_filler_exception():
    """HANGUL CHOSEONG FILLER (U+115F) keeps width 2 for jamo composition."""
    assert wcwidth.wcwidth('\u115F') == 2


def test_soft_hyphen_exception():
    """SOFT HYPHEN (U+00AD) keeps width 1 for ISO-8859-1 compatibility."""
    assert wcwidth.wcwidth('\u00AD') == 1
Loading
Loading