jquast
diff --git a/‎bin/update-tables.py‎
Lines changed: 59 additions & 4 deletions b/‎bin/update-tables.py‎
Lines changed: 59 additions & 4 deletions
diff --git a/‎docs/intro.rst‎
Lines changed: 3 additions & 0 deletions b/‎docs/intro.rst‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎docs/specs.rst‎
Lines changed: 6 additions & 0 deletions b/‎docs/specs.rst‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎tests/test_width.py‎
Lines changed: 55 additions & 0 deletions b/‎tests/test_width.py‎
Lines changed: 55 additions & 0 deletions
@@ -384,9 +384,17 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
         # Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
         table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)
 
+        # Subtract Default_Ignorable_Code_Point characters (they should be zero-width).
+        # Exception: U+115F HANGUL CHOSEONG FILLER remains wide for jamo composition.
+        # See https://github.com/jquast/wcwidth/issues/118
+        default_ignorable = parse_default_ignorable_code_points(
+            fname=UnicodeDataFile.DerivedCoreProperties(version))
+        default_ignorable.discard(0x115F)  # Keep HANGUL CHOSEONG FILLER as wide
+        table[version].values = table[version].values.difference(default_ignorable)
+
         # finally, join with atypical 'wide' characters defined by category 'Sk',
-        table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                                    wide=2).values)
+        fname = UnicodeDataFile.DerivedGeneralCategory(version)
+        table[version].values.update(parse_category(fname=fname, wide=2).values)
     return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
 
 
@@ -399,15 +407,32 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
     table: dict[UnicodeVersion, TableDef] = {}
     for version in fetch_unicode_versions():
         # Determine values of zero-width character lookup table by the following category codes
-        table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                        wide=0)
+        fname = UnicodeDataFile.DerivedGeneralCategory(version)
+        table[version] = parse_category(fname=fname, wide=0)
 
         # Include NULL
         table[version].values.add(0)
 
         # Add Hangul Jamo Vowels and Hangul Trailing Consonants
         table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
 
+        # Add Default_Ignorable_Code_Point characters
+        # Per Unicode Standard (https://www.unicode.org/faq/unsup_char.html):
+        # "All default-ignorable characters should be rendered as completely invisible
+        # (and non advancing, i.e. 'zero width'), if not explicitly supported in rendering."
+        #
+        # See also:
+        # - https://www.unicode.org/reports/tr44/#Default_Ignorable_Code_Point
+        # - https://github.com/jquast/wcwidth/issues/118
+        table[version].values.update(parse_default_ignorable_code_points(
+            fname=UnicodeDataFile.DerivedCoreProperties(version)))
+
+        # Remove U+115F HANGUL CHOSEONG FILLER from zero-width table.
+        # Although it has Default_Ignorable_Code_Point property, it should remain
+        # width 2 because it combines with other Hangul Jamo to form width-2
+        # syllable blocks.
+        table[version].values.discard(0x115F)
+
         # Remove u+00AD categoryCode=Cf name="SOFT HYPHEN",
         # > https://www.unicode.org/faq/casemap_charprop.html
         #
@@ -422,6 +447,7 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
         # This value was wrongly measured as a width of '0' in this wcwidth
         # versions 0.2.9 - 0.2.13. Fixed in 0.2.14
         table[version].values.discard(0x00AD)  # SOFT HYPHEN
+
     return UnicodeTableRenderCtx('ZERO_WIDTH', table)
 
 
@@ -711,6 +737,35 @@ def parse_indic_conjunct_breaks(fname: str) -> dict[str, TableDef]:
     }
 
 
+def parse_default_ignorable_code_points(fname: str) -> set[int]:
+    """
+    Parse DerivedCoreProperties.txt for the Default_Ignorable_Code_Point property.
+
+    Progress is reported on stdout: a "parsing ..." prefix before reading and
+    "ok" once the whole file has been consumed.
+
+    :param fname: path of the DerivedCoreProperties.txt file to read (UTF-8).
+    :returns: every code point listed with the Default_Ignorable_Code_Point
+        property, with ``XXXX..YYYY`` ranges expanded inclusively.
+    :raises ValueError: if a matching line's code point field is not hexadecimal.
+    """
+    print(f'parsing {fname} for Default_Ignorable_Code_Point: ', end='', flush=True)
+    values: set[int] = set()
+
+    with open(fname, encoding='utf-8') as f:
+        for line in f:
+            # Drop the trailing '#' comment; blank and comment-only lines end up empty.
+            data = line.partition('#')[0].strip()
+            if not data:
+                continue
+
+            # Fields are ';'-delimited: code point (or range) first, property name second.
+            parts = [p.strip() for p in data.split(';')]
+            if len(parts) < 2 or parts[1] != 'Default_Ignorable_Code_Point':
+                continue
+
+            # 'XXXX..YYYY' is an inclusive range, a lone 'XXXX' is a single code point.
+            first, sep, last = parts[0].partition('..')
+            start = int(first, 16)
+            end = int(last, 16) if sep else start
+            values.update(range(start, end + 1))
+
+    print('ok')
+    return values
+
+
 def fetch_table_grapheme_data() -> GraphemeTableRenderCtx:
     """Fetch grapheme break property tables for the latest Unicode version only."""
     latest_version = fetch_unicode_versions()[-1]
 
@@ -453,6 +453,8 @@ History
   * **Performance** improvement in `wcswidth()`_. `PR #171`_.
   * **New** argument ``ambiguous_width`` to all functions. `PR #172`_.
   * **New** Functions `clip()`_ and `strip_sequences()`_. `PR #173`_.
+  * **Bugfix** Characters with ``Default_Ignorable_Code_Point`` property now
+    return width 0. `PR #174`_.
 
 0.2.14 *2025-09-22*
   * **Drop Support** for Python 2.7 and 3.5. `PR #117`_.
@@ -591,6 +593,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
 .. _`PR #171`: https://github.com/jquast/wcwidth/pull/171
 .. _`PR #172`: https://github.com/jquast/wcwidth/pull/172
 .. _`PR #173`: https://github.com/jquast/wcwidth/pull/173
+.. _`PR #174`: https://github.com/jquast/wcwidth/pull/174
 .. _`Issue #101`: https://github.com/jquast/wcwidth/issues/101
 .. _`jquast/blessed`: https://github.com/jquast/blessed
 .. _`selectel/pyte`: https://github.com/selectel/pyte
 
@@ -21,6 +21,10 @@ return value of :func:`wcwidth.wcswidth` is -1.
 Width of 0
 ----------
 
+Any characters with the ``Default_Ignorable_Code_Point`` property in
+`DerivedCoreProperties.txt`_ files (4,174 characters), excluding `U+00AD`_ SOFT HYPHEN
+(width 1) and `U+115F`_ HANGUL CHOSEONG FILLER (width 2).
+
 Any characters defined by category codes in `DerivedGeneralCategory.txt`_ files:
 
 - 'Me': Enclosing Combining Mark, aprox. 13 characters.
@@ -74,6 +78,8 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
 .. _`U+2029`: https://codepoints.net/U+2029
 .. _`U+D7B0`: https://codepoints.net/U+D7B0
 .. _`U+FE0F`: https://codepoints.net/U+FE0F
+.. _`U+115F`: https://codepoints.net/U+115F
 .. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
+.. _`DerivedCoreProperties.txt`: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
 .. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
 .. _`emoji-variation-sequences.txt`: https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-variation-sequences.txt
@@ -323,3 +323,58 @@ def test_modern_sequences(seq, expected_width, name):
     """Modern terminal sequences are recognized as zero-width."""
     assert wcwidth.width(seq) == expected_width
     assert wcwidth.width(seq, control_codes='strict') == expected_width
+
+
+@pytest.mark.parametrize('codepoint,expected_width', [
+    # HANGUL FILLER and HALFWIDTH HANGUL FILLER
+    (0x3164, 0),
+    (0xFFA0, 0),
+    # NOTE(review): the remaining samples are presumably reserved/unassigned code
+    # points carrying the property -- confirm against DerivedCoreProperties.txt
+    (0x2065, 0),
+    (0xFFF0, 0),
+    (0xFFF1, 0),
+    (0xFFF8, 0),
+    (0xE0000, 0),
+    (0xE0002, 0),
+    (0xE001F, 0),
+    (0xE0080, 0),
+    (0xE00FF, 0),
+    (0xE01F0, 0),
+    (0xE0FFF, 0),
+])
+def test_default_ignorable_zero_width(codepoint, expected_width):
+    """Code points with Default_Ignorable_Code_Point measure as width 0."""
+    assert wcwidth.wcwidth(chr(codepoint)) == expected_width
+
+
+@pytest.mark.parametrize('codepoint,expected_width', [
+    (0x00AD, 1),  # SOFT HYPHEN
+    (0x115F, 2),  # HANGUL CHOSEONG FILLER
+])
+def test_default_ignorable_exceptions(codepoint, expected_width):
+    """Default-ignorable code points that deliberately keep a non-zero width."""
+    assert wcwidth.wcwidth(chr(codepoint)) == expected_width
+
+
+def test_hangul_filler_zero_width():
+    """HANGUL FILLER (U+3164) measures as width 0."""
+    assert wcwidth.wcwidth(chr(0x3164)) == 0
+
+
+def test_halfwidth_hangul_filler_zero_width():
+    """HALFWIDTH HANGUL FILLER (U+FFA0) measures as width 0."""
+    assert wcwidth.wcwidth(chr(0xFFA0)) == 0
+
+
+def test_hangul_choseong_filler_exception():
+    """HANGUL CHOSEONG FILLER (U+115F) keeps width 2 for jamo composition."""
+    assert wcwidth.wcwidth(chr(0x115F)) == 2
+
+
+def test_soft_hyphen_exception():
+    """SOFT HYPHEN (U+00AD) keeps width 1 for ISO-8859-1 compatibility."""
+    assert wcwidth.wcwidth(chr(0x00AD)) == 1