159 changes: 158 additions & 1 deletion bin/update-tables.py
@@ -70,6 +70,12 @@
)

HEX_STR_VS16 = 'FE0F'
# Grapheme Break Property values from UAX #29
GRAPHEME_BREAK_PROPERTIES = (
'CR', 'LF', 'Control', 'Extend', 'ZWJ', 'Regional_Indicator',
'Prepend', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT'
)
INCB_VALUES = ('Linker', 'Consonant', 'Extend')


def _bisearch(ucs, table):
@@ -313,6 +319,26 @@ def new(cls, filename: str, context: UnicodeTableRenderCtx) -> Self:
)


@dataclass(frozen=True)
class GraphemeTableRenderCtx(RenderContext):
"""Render context for grapheme tables (latest version only)."""
unicode_version: str
tables: Mapping[str, TableDef]


@dataclass
class GraphemeTableRenderDef(RenderDefinition):
render_context: GraphemeTableRenderCtx

@classmethod
def new(cls, context: GraphemeTableRenderCtx) -> Self:
return cls(
jinja_filename='grapheme_table.py.j2',
output_filename=os.path.join(PATH_UP, 'wcwidth', 'table_grapheme.py'),
render_context=context,
)


@functools.cache
def fetch_unicode_versions() -> list[UnicodeVersion]:
"""Fetch, determine, and return Unicode Versions for processing."""
@@ -552,6 +578,105 @@ def parse_category(fname: str, wide: int) -> TableDef:
return TableDef(version, date, values)


def parse_grapheme_break_properties(fname: str) -> dict[str, TableDef]:
"""Parse GraphemeBreakProperty.txt for grapheme break properties needing tables."""
print(f'parsing {fname}: ', end='', flush=True)
values_by_prop: dict[str, set[int]] = {prop: set() for prop in GRAPHEME_BREAK_PROPERTIES}

with open(fname, encoding='utf-8') as f:
table_iter = parse_unicode_table(f)
version = next(table_iter).comment.strip()
date = next(table_iter).comment.split(':', 1)[1].strip()

for entry in table_iter:
if entry.code_range is None:
continue
if entry.properties and entry.properties[0] in values_by_prop:
values_by_prop[entry.properties[0]].update(
range(entry.code_range[0], entry.code_range[1])
)

print('ok')
return {
f'GRAPHEME_{prop.upper()}': TableDef(version, date, values)
for prop, values in values_by_prop.items()
}
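
Not part of this change, but for orientation: a short sketch of how the parser above is expected to be used, relying only on names defined in this file. A GraphemeBreakProperty.txt data line such as "200D ; ZWJ # ZERO WIDTH JOINER" lands in the 'ZWJ' bucket and is published under the generated key GRAPHEME_ZWJ.

# illustrative sketch only -- not part of this diff
_tables = parse_grapheme_break_properties(
    UnicodeDataFile.GraphemeBreakProperty(fetch_unicode_versions()[-1]))
# keys follow the GRAPHEME_<PROP> naming scheme of the return statement above
assert 'GRAPHEME_ZWJ' in _tables and 'GRAPHEME_LVT' in _tables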


def parse_extended_pictographic(fname: str) -> TableDef:
"""Parse emoji-data.txt for Extended_Pictographic property."""
print(f'parsing {fname} for Extended_Pictographic: ', end='', flush=True)
values: set[int] = set()

with open(fname, encoding='utf-8') as f:
table_iter = parse_unicode_table(f)
# pull "version string" from first line of source file
version = next(table_iter).comment.strip()
# and "date string" from second line
date = next(table_iter).comment.split(':', 1)[1].strip()

for entry in table_iter:
if entry.code_range is None:
continue
if entry.properties and entry.properties[0] == 'Extended_Pictographic':
values.update(range(entry.code_range[0], entry.code_range[1]))

print('ok')
return TableDef(version, date, values)


def parse_indic_conjunct_breaks(fname: str) -> dict[str, TableDef]:
"""Parse DerivedCoreProperties.txt for all Indic_Conjunct_Break properties."""
print(f'parsing {fname} for InCB: ', end='', flush=True)
values_by_incb: dict[str, set[int]] = {val: set() for val in INCB_VALUES}

with open(fname, encoding='utf-8') as f:
for line in f:
data, _, comment = line.partition('#')
data = data.strip()
if not data:
continue

parts = [p.strip() for p in data.split(';')]
if len(parts) < 3:
continue

code_points_str, prop_name, prop_value = parts[0], parts[1], parts[2]

if prop_name == 'InCB' and prop_value in values_by_incb:
if '..' in code_points_str:
start, end = code_points_str.split('..')
values_by_incb[prop_value].update(
range(int(start, 16), int(end, 16) + 1)
)
else:
values_by_incb[prop_value].add(int(code_points_str, 16))

print('ok')
return {
f'INCB_{val.upper()}': TableDef('DerivedCoreProperties', 'see file', values)
for val, values in values_by_incb.items()
}
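
A minimal runnable sketch (not part of this diff) exercising the parser above against a tiny synthetic excerpt in the DerivedCoreProperties.txt format; the two data lines below are illustrative only.

import tempfile

_sample = (
    "# synthetic excerpt, for illustration only\n"
    "094D          ; InCB; Linker    # Mn       DEVANAGARI SIGN VIRAMA\n"
    "0915..0939    ; InCB; Consonant # Lo  [37] DEVANAGARI LETTER KA..HA\n"
)
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as _tmp:
    _tmp.write(_sample)
_tables = parse_indic_conjunct_breaks(_tmp.name)
# keys follow the INCB_<VALUE> naming used in the return statement above
assert sorted(_tables) == ['INCB_CONSONANT', 'INCB_EXTEND', 'INCB_LINKER']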


def fetch_table_grapheme_data() -> GraphemeTableRenderCtx:
"""Fetch grapheme break property tables for the latest Unicode version only."""
latest_version = fetch_unicode_versions()[-1]

# makes a table definition for each break property
tables = parse_grapheme_break_properties(
UnicodeDataFile.GraphemeBreakProperty(latest_version)
)
tables['EXTENDED_PICTOGRAPHIC'] = parse_extended_pictographic(
UnicodeDataFile.EmojiData(latest_version)
)
tables.update(parse_indic_conjunct_breaks(
UnicodeDataFile.DerivedCoreProperties(latest_version)
))

return GraphemeTableRenderCtx(str(latest_version), tables)
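
As a sketch only (not part of this change, and requiring network access to fetch the data files), the render context built above is expected to hold exactly one table per generated name:

_ctx = fetch_table_grapheme_data()
_expected = {f'GRAPHEME_{prop.upper()}' for prop in GRAPHEME_BREAK_PROPERTIES}
_expected |= {'EXTENDED_PICTOGRAPHIC'}
_expected |= {f'INCB_{val.upper()}' for val in INCB_VALUES}
assert set(_ctx.tables) == _expected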


class UnicodeDataFile:
"""
Helper class for fetching Unicode Data Files.
@@ -569,6 +694,10 @@ class UnicodeDataFile:
URL_EMOJI_VARIATION = 'https://unicode.org/Public/{version}/ucd/emoji/emoji-variation-sequences.txt'
URL_LEGACY_VARIATION = 'https://unicode.org/Public/emoji/{version}/emoji-variation-sequences.txt'
URL_EMOJI_ZWJ = 'https://unicode.org/Public/{version}/emoji/emoji-zwj-sequences.txt'
URL_GRAPHEME_BREAK = 'https://www.unicode.org/Public/{version}/ucd/auxiliary/GraphemeBreakProperty.txt'
URL_EMOJI_DATA = 'https://www.unicode.org/Public/{version}/ucd/emoji/emoji-data.txt'
URL_DERIVED_CORE_PROPS = 'https://www.unicode.org/Public/{version}/ucd/DerivedCoreProperties.txt'
URL_GRAPHEME_BREAK_TEST = 'https://www.unicode.org/Public/{version}/ucd/auxiliary/GraphemeBreakTest.txt'

@classmethod
def DerivedAge(cls) -> str:
@@ -615,6 +744,32 @@ def TestEmojiZWJSequences(cls) -> str:
cls.do_retrieve(url=cls.URL_EMOJI_ZWJ.format(version=version), fname=fname)
return fname

@classmethod
def GraphemeBreakProperty(cls, version: str) -> str:
fname = os.path.join(PATH_DATA, f'GraphemeBreakProperty-{version}.txt')
cls.do_retrieve(url=cls.URL_GRAPHEME_BREAK.format(version=version), fname=fname)
return fname

@classmethod
def EmojiData(cls, version: UnicodeVersion) -> str:
"""Fetch emoji-data.txt for Extended_Pictographic property."""
fname = os.path.join(PATH_DATA, f'emoji-data-{version}.txt')
cls.do_retrieve(url=cls.URL_EMOJI_DATA.format(version=version), fname=fname)
return fname

@classmethod
def DerivedCoreProperties(cls, version: str) -> str:
fname = os.path.join(PATH_DATA, f'DerivedCoreProperties-{version}.txt')
cls.do_retrieve(url=cls.URL_DERIVED_CORE_PROPS.format(version=version), fname=fname)
return fname

@classmethod
def TestGraphemeBreakTest(cls) -> str:
version = fetch_unicode_versions()[-1]
fname = os.path.join(PATH_TESTS, 'GraphemeBreakTest.txt')
cls.do_retrieve(url=cls.URL_GRAPHEME_BREAK_TEST.format(version=version), fname=fname)
return fname

@staticmethod
def do_retrieve(url: str, fname: str, no_check_last_modified: bool = False) -> None:
"""Retrieve given url to target filepath fname."""
@@ -786,6 +941,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
yield UnicodeTableRenderDef.new('table_vs16.py', fetch_table_vs16_data())
yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
yield GraphemeTableRenderDef.new(fetch_table_grapheme_data())
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())

for render_def in get_codegen_definitions():
@@ -801,9 +957,10 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
assert render_def.output_filename != 'table_vs16.py', ('table_vs16 not expected to change!')
print('ok')

# fetch latest test data files
# fetch latest test data files, used by our automatic tests
UnicodeDataFile.TestEmojiVariationSequences()
UnicodeDataFile.TestEmojiZWJSequences()
UnicodeDataFile.TestGraphemeBreakTest()

# fetch all legacy emoji files if requested
if fetch_all_versions:
20 changes: 20 additions & 0 deletions code_templates/grapheme_table.py.j2
@@ -0,0 +1,20 @@
"""
Exports grapheme cluster break property tables for Unicode version {{ unicode_version }}.

This module provides lookup tables for Unicode grapheme cluster break properties
as defined in UAX #29: Unicode Text Segmentation.

This code generated by {{this_filepath}} on {{utc_now}}.
"""
# pylint: disable=duplicate-code
{%- for var_name, table_def in tables.items() %}

{{ var_name }} = (
# Source: {{ table_def.filename }}
# Date: {{ table_def.date }}
#
{%- for hex_start, hex_end, txt_description in table_def.hex_range_descriptions %}
({{ hex_start }}, {{ hex_end }},), # {{txt_description}}
{%- endfor %}
)
{%- endfor %}
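
For orientation only: rendering the template above is expected to produce a module shaped roughly like the sketch below. The version string, date, range, and description are placeholders, not values produced by this diff.

"""
Exports grapheme cluster break property tables for Unicode version 16.0.0.
...
"""
GRAPHEME_EXTEND = (
    # Source: GraphemeBreakProperty-16.0.0.txt
    # Date: <second header line of the source file>
    #
    (0x0300, 0x036F,),  # placeholder range of combining marks
)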
24 changes: 9 additions & 15 deletions docs/api.rst
@@ -3,31 +3,25 @@ Public API
==========

This package follows SEMVER_ rules. Therefore, for the functions of the below
list, you may safely use version dependency definition ``wcwidth<2`` in your
list, you may safely use version dependency definition ``wcwidth<1`` in your
requirements.txt or equivalent. Their signatures will never change.

.. autofunction:: wcwidth.wcwidth

.. autofunction:: wcwidth.wcswidth

.. autofunction:: wcwidth.list_versions
.. autofunction:: wcwidth.iter_graphemes

.. _SEMVER: https://semver.org
.. autofunction:: wcwidth.iter_sequences

===========
Private API
===========
.. autofunction:: wcwidth.width

These functions should only be used for wcwidth development, and not used by
dependent packages except with care and by use of frozen version dependency,
as these functions may change names, signatures, or disappear entirely at any
time in the future, and not reflected by SEMVER_ rules!
.. autofunction:: wcwidth.ljust

If stable public API for any of the given functions is needed, please suggest a
Pull Request!
.. autofunction:: wcwidth.rjust

.. autofunction:: wcwidth._bisearch
.. autofunction:: wcwidth.center

.. autofunction:: wcwidth._wcversion_value
.. autofunction:: wcwidth.list_versions

.. autofunction:: wcwidth._wcmatch_version
.. _SEMVER: https://semver.org
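
A brief usage sketch, not part of this diff: wcwidth() and wcswidth() behave as in prior releases, while the exact signatures of the newly documented width() and iter_graphemes() are not shown here, so those two calls are assumptions.

import wcwidth

print(wcwidth.wcwidth('力'))           # 2: one full-width character
print(wcwidth.wcswidth('コンニチハ'))  # 10: printable width of the string
# assumed signatures for functions newly documented above:
print(wcwidth.width('cafe\u0301'))                 # width of text with a combining mark
print(list(wcwidth.iter_graphemes('cafe\u0301')))  # grapheme cluster substrings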