Skip to content

Commit e481099

Browse files
authored
Width 0 for Default_Ignorable_Code_Point characters (#174)
Closes #118. This also sets ``*`` positional/keyword argument enforcement for all of the new proposed API functions. Characters with the Unicode Default_Ignorable_Code_Point property should be rendered as zero-width per the Unicode Standard. This fixes U+3164 HANGUL FILLER, U+FFA0 HALFWIDTH HANGUL FILLER, and ~3700 reserved codepoints that were incorrectly returning non-zero width. Two exceptions are preserved: - U+00AD SOFT HYPHEN: remains width 1 (ISO-8859-1 compatibility) - U+115F HANGUL CHOSEONG FILLER: remains width 2 (Hangul jamo composition)
1 parent a3cb502 commit e481099

8 files changed

Lines changed: 325 additions & 165 deletions

File tree

bin/update-tables.py

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -384,9 +384,17 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
384384
# Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
385385
table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)
386386

387+
# Subtract Default_Ignorable_Code_Point characters (they should be zero-width).
388+
# Exception: U+115F HANGUL CHOSEONG FILLER remains wide for jamo composition.
389+
# See https://github.com/jquast/wcwidth/issues/118
390+
default_ignorable = parse_default_ignorable_code_points(
391+
fname=UnicodeDataFile.DerivedCoreProperties(version))
392+
default_ignorable.discard(0x115F) # Keep HANGUL CHOSEONG FILLER as wide
393+
table[version].values = table[version].values.difference(default_ignorable)
394+
387395
# finally, join with atypical 'wide' characters defined by category 'Sk',
388-
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
389-
wide=2).values)
396+
fname = UnicodeDataFile.DerivedGeneralCategory(version)
397+
table[version].values.update(parse_category(fname=fname, wide=2).values)
390398
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
391399

392400

@@ -399,15 +407,32 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
399407
table: dict[UnicodeVersion, TableDef] = {}
400408
for version in fetch_unicode_versions():
401409
# Determine values of zero-width character lookup table by the following category codes
402-
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
403-
wide=0)
410+
fname = UnicodeDataFile.DerivedGeneralCategory(version)
411+
table[version] = parse_category(fname=fname, wide=0)
404412

405413
# Include NULL
406414
table[version].values.add(0)
407415

408416
# Add Hangul Jamo Vowels and Hangul Trailing Consonants
409417
table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
410418

419+
# Add Default_Ignorable_Code_Point characters
420+
# Per Unicode Standard (https://www.unicode.org/faq/unsup_char.html):
421+
# "All default-ignorable characters should be rendered as completely invisible
422+
# (and non advancing, i.e. 'zero width'), if not explicitly supported in rendering."
423+
#
424+
# See also:
425+
# - https://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point
426+
# - https://github.com/jquast/wcwidth/issues/118
427+
table[version].values.update(parse_default_ignorable_code_points(
428+
fname=UnicodeDataFile.DerivedCoreProperties(version)))
429+
430+
# Remove U+115F HANGUL CHOSEONG FILLER from zero-width table.
431+
# Although it has Default_Ignorable_Code_Point property, it should remain
432+
# width 2 because it combines with other Hangul Jamo to form width-2
433+
# syllable blocks.
434+
table[version].values.discard(0x115F)
435+
411436
# Remove u+00AD categoryCode=Cf name="SOFT HYPHEN",
412437
# > https://www.unicode.org/faq/casemap_charprop.html
413438
#
@@ -422,6 +447,7 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
422447
# This value was wrongly measured as a width of '0' in this wcwidth
423448
# versions 0.2.9 - 0.2.13. Fixed in 0.2.14
424449
table[version].values.discard(0x00AD) # SOFT HYPHEN
450+
425451
return UnicodeTableRenderCtx('ZERO_WIDTH', table)
426452

427453

@@ -711,6 +737,35 @@ def parse_indic_conjunct_breaks(fname: str) -> dict[str, TableDef]:
711737
}
712738

713739

740+
def parse_default_ignorable_code_points(fname: str) -> set[int]:
    """
    Parse DerivedCoreProperties.txt for Default_Ignorable_Code_Point property.

    Each data line is expected to look like ``XXXX ; Property # comment`` or
    ``XXXX..YYYY ; Property # comment``, with code points in hexadecimal.
    Text after ``#`` is discarded, blank lines and lines without a ``;``
    separator are skipped, and only lines whose second field is exactly
    ``Default_Ignorable_Code_Point`` contribute to the result.

    A progress message is printed to stdout before and after parsing.

    :param fname: Path of the UTF-8 encoded properties file to read.
    :returns: Every code point listed for the property, ranges expanded
        inclusively.
    :raises OSError: If ``fname`` cannot be opened.
    :raises ValueError: If a matching line's code point field is not
        hexadecimal.
    """
    print(f'parsing {fname} for Default_Ignorable_Code_Point: ', end='', flush=True)
    values: set[int] = set()

    with open(fname, encoding='utf-8') as f:
        for line in f:
            # Only the part before '#' carries data; the rest is commentary.
            data = line.partition('#')[0].strip()
            if not data:
                continue

            parts = [p.strip() for p in data.split(';')]
            if len(parts) < 2 or parts[1] != 'Default_Ignorable_Code_Point':
                continue

            # A single code point is treated as the one-element range
            # start..start, so both forms share the same update path.
            first, sep, last = parts[0].partition('..')
            start = int(first, 16)
            end = int(last, 16) if sep else start
            values.update(range(start, end + 1))

    print('ok')
    return values
767+
768+
714769
def fetch_table_grapheme_data() -> GraphemeTableRenderCtx:
715770
"""Fetch grapheme break property tables for the latest Unicode version only."""
716771
latest_version = fetch_unicode_versions()[-1]

docs/intro.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,8 @@ History
453453
* **Performance** improvement in `wcswidth()`_. `PR #171`_.
454454
* **New** argument ``ambiguous_width`` to all functions. `PR #172`_.
455455
* **New** Functions `clip()`_ and `strip_sequences()`_. `PR #173`_.
456+
* **Bugfix** Characters with ``Default_Ignorable_Code_Point`` property now
457+
return width 0. `PR #174`_.
456458

457459
0.2.14 *2025-09-22*
458460
* **Drop Support** for Python 2.7 and 3.5. `PR #117`_.
@@ -591,6 +593,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
591593
.. _`PR #171`: https://github.com/jquast/wcwidth/pull/171
592594
.. _`PR #172`: https://github.com/jquast/wcwidth/pull/172
593595
.. _`PR #173`: https://github.com/jquast/wcwidth/pull/173
596+
.. _`PR #174`: https://github.com/jquast/wcwidth/pull/174
594597
.. _`Issue #101`: https://github.com/jquast/wcwidth/issues/101
595598
.. _`jquast/blessed`: https://github.com/jquast/blessed
596599
.. _`selectel/pyte`: https://github.com/selectel/pyte

docs/specs.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ return value of :func:`wcwidth.wcswidth` is -1.
2121
Width of 0
2222
----------
2323

24+
Any characters with the ``Default_Ignorable_Code_Point`` property in
25+
`DerivedCoreProperties.txt`_ files, 4,174 characters, excluding `U+00AD`_ SOFT HYPHEN
26+
(width 1) and `U+115F`_ HANGUL CHOSEONG FILLER (width 2).
27+
2428
Any characters defined by category codes in `DerivedGeneralCategory.txt`_ files:
2529

2630
- 'Me': Enclosing Combining Mark, approx. 13 characters.
@@ -74,6 +78,8 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
7478
.. _`U+2029`: https://codepoints.net/U+2029
7579
.. _`U+D7B0`: https://codepoints.net/U+D7B0
7680
.. _`U+FE0F`: https://codepoints.net/U+FE0F
81+
.. _`U+115F`: https://codepoints.net/U+115F
7782
.. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
83+
.. _`DerivedCoreProperties.txt`: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
7884
.. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
7985
.. _`emoji-variation-sequences.txt`: https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-variation-sequences.txt

tests/test_width.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,3 +323,58 @@ def test_modern_sequences(seq, expected_width, name):
323323
"""Modern terminal sequences are recognized as zero-width."""
324324
assert wcwidth.width(seq) == expected_width
325325
assert wcwidth.width(seq, control_codes='strict') == expected_width
326+
327+
328+
@pytest.mark.parametrize(
    'codepoint,expected_width',
    [
        # U+3164 HANGUL FILLER and U+FFA0 HALFWIDTH HANGUL FILLER
        (0x3164, 0), (0xFFA0, 0),
        # further code points expected to measure as zero-width
        (0x2065, 0), (0xFFF0, 0), (0xFFF1, 0), (0xFFF8, 0),
        (0xE0000, 0), (0xE0002, 0), (0xE001F, 0), (0xE0080, 0),
        (0xE00FF, 0), (0xE01F0, 0), (0xE0FFF, 0),
    ],
)
def test_default_ignorable_zero_width(codepoint, expected_width):
    """Verify wcwidth() reports width 0 for Default_Ignorable_Code_Point characters."""
    assert wcwidth.wcwidth(chr(codepoint)) == expected_width
347+
348+
349+
@pytest.mark.parametrize(
    'codepoint,expected_width',
    [
        (0x00AD, 1),  # SOFT HYPHEN
        (0x115F, 2),  # HANGUL CHOSEONG FILLER
    ],
)
def test_default_ignorable_exceptions(codepoint, expected_width):
    """Verify the two code points exempted from the zero-width rule keep their width."""
    assert wcwidth.wcwidth(chr(codepoint)) == expected_width
357+
358+
359+
def test_hangul_filler_zero_width():
    """Verify wcwidth() measures U+3164 HANGUL FILLER as 0 cells."""
    assert wcwidth.wcwidth('\u3164') == 0
363+
364+
365+
def test_halfwidth_hangul_filler_zero_width():
    """Verify wcwidth() measures U+FFA0 HALFWIDTH HANGUL FILLER as 0 cells."""
    assert wcwidth.wcwidth('\uFFA0') == 0
369+
370+
371+
def test_hangul_choseong_filler_exception():
    """Verify U+115F HANGUL CHOSEONG FILLER is still measured as 2 cells (jamo composition)."""
    assert wcwidth.wcwidth('\u115F') == 2
375+
376+
377+
def test_soft_hyphen_exception():
    """Verify U+00AD SOFT HYPHEN is still measured as 1 cell (ISO-8859-1 compatibility)."""
    assert wcwidth.wcwidth('\u00AD') == 1

0 commit comments

Comments
 (0)