Bugfix accounting for Variation Selector 16

- Add new table, VS16_NARROW_TO_WIDE. It has only one version, "9.0.0". This defines a set of characters that are otherwise Narrow, like '0', that become wide when combined with U+FE0F, "VARIATION SELECTOR 16".
- The wcswidth() function now recalls the "last-most measured character" and, on U+FE0F, checks that character in table VS16_NARROW_TO_WIDE; if it matches, 1 is added to the measured width.
- The latest 'emoji-zwj-sequences.txt' and 'emoji-variation-sequences.txt' files are fetched by update-tables.py and placed in the 'tests/' folder, and are now used by the automatic tests in test_emoji_zwj.py. This helps ensure 100% compatibility with all latest known emoji sequences.

A single "9.0.0" version is used because of ambiguity in legacy releases of unicode data files -- so ambiguous that very few terminals get it right. I will share results from the 'ucs-detect' project based on this branch next. All in all, U+FE0F appears to be something of a "fixup" that is only for these legacy emojis, many of which probably should have always been defined as WIDE. I don't expect any new FE0F sequences to be published.
jquast · Nov 7, 2023 · 3b47930 · 3b47930
1 parent 0e885e4
commit 3b47930
Show file tree

Hide file tree

Showing 17 changed files with 3,034 additions and 211 deletions.
diff --git a/bin/update-tables.py b/bin/update-tables.py
diff --git a/bin/verify-table-integrity.py b/bin/verify-table-integrity.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""
+This is a small script to make an inquiry into the version history of unicode data tables, and to
+validate conflicts in the tables as they are published:
+
+- check whether individual code point definitions change in subsequent releases;
+  these should be considered before attempting to reduce the size of our versioned
+  tables without a careful incremental change description.  Each "violation" is
+  logged as INFO.
+- check that a codepoint in the 'zero' table is not present in the 'wide' table
+  and vice versa. This is logged as ERROR and causes program to exit 1.
+
+Some examples of the first kind,
+
+1.
+
+    value 0x1f93b in table WIDE_EASTASIAN version 12.1.0 is not defined in 13.0.0 from range ('0x1f90d', '0x1f971')
+    value 0x1f946 in table WIDE_EASTASIAN version 12.1.0 is not defined in 13.0.0 from range ('0x1f90d', '0x1f971')
+
+two characters were changed from 'W' to 'N':
+
+    -EastAsianWidth-12.0.0.txt:1F90D..1F971;W   # So   [101] WHITE HEART..YAWNING FACE
+    +EastAsianWidth-12.1.0.txt:1F90C..1F93A;W   # So    [47] PINCHED FINGERS..FENCER
+    +EastAsianWidth-12.1.0.txt:1F93B;N          # So         MODERN PENTATHLON
+    +EastAsianWidth-12.1.0.txt:1F93C..1F945;W   # So    [10] WRESTLERS..GOAL NET
+    +EastAsianWidth-12.1.0.txt:1F946;N          # So         RIFLE
+    +EastAsianWidth-12.1.0.txt:1F947..1F978;W   # So    [50] FIRST PLACE MEDAL..DISGUISED FACE
+
+As well as for output,
+
+    value 0x11a3 in table WIDE_EASTASIAN version 6.1.0 is not defined in 6.2.0 from range ('0x11a3', '0x11a7')
+    ...
+    value 0x11fe in table WIDE_EASTASIAN version 6.1.0 is not defined in 6.2.0 from range ('0x11fa', '0x11ff')
+
+Category code was changed from 'W' to 'N':
+
+    -EastAsianWidth-6.1.0.txt:11A3;W # HANGUL JUNGSEONG A-EU
+    +EastAsianWidth-6.2.0.txt:11A3;N # HANGUL JUNGSEONG A-EU
+
+
+2.
+
+    value 0x1cf2 in table ZERO_WIDTH version 11.0.0 is not defined in 12.0.0 from range ('0x1cf2', '0x1cf4')
+    value 0x1cf3 in table ZERO_WIDTH version 11.0.0 is not defined in 12.0.0 from range ('0x1cf2', '0x1cf4')
+
+Category code was changed from 'Mc' to 'Lo':
+
+    -DerivedGeneralCategory-11.0.0.txt:1CF2..1CF3    ; Mc #   [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
+    +DerivedGeneralCategory-12.0.0.txt:1CEE..1CF3    ; Lo #   [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA
+
+As well as for output,
+
+     value 0x19b0 in table ZERO_WIDTH version 7.0.0 is not defined in 8.0.0 from range ('0x19b0', '0x19c0')
+     ...
+     value 0x19c8 in table ZERO_WIDTH version 7.0.0 is not defined in 8.0.0 from range ('0x19c8', '0x19c9')
+
+Category code was changed from 'Mc' to 'Lo':
+
+    -DerivedGeneralCategory-7.0.0.txt:19B0..19C0    ; Mc #  [17] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE VOWEL SIGN IY
+    +DerivedGeneralCategory-8.0.0.txt:19B0..19C9    ; Lo #  [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2
+"""
+# std imports
+import logging
+
+
+def main(log: logging.Logger):
+    # local
+    from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
+    reversed_uni_versions = list(reversed(list_versions()))
+    tables = {'ZERO_WIDTH': ZERO_WIDTH,
+              'WIDE_EASTASIAN': WIDE_EASTASIAN}
+    errors = 0
+    for idx, version in enumerate(reversed_uni_versions):
+        if idx == 0:
+            continue
+        next_version = reversed_uni_versions[idx - 1]
+        for table_name, table in tables.items():
+            next_table = table[next_version]
+            curr_table = table[version]
+            other_table_name = 'WIDE_EASTASIAN' if table_name == 'ZERO_WIDTH' else 'ZERO_WIDTH'
+            other_table = tables[other_table_name][version]
+            for start_range, stop_range in curr_table:
+                for unichar_n in range(start_range, stop_range):
+                    if not _bisearch(unichar_n, next_table):
+                        log.info(f'value {hex(unichar_n)} in table_name={table_name}'
+                                 f' version={version} is not defined in next_version={next_version}'
+                                 f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
+                    if _bisearch(unichar_n, other_table):
+                        log.error(f'value {hex(unichar_n)} in table_name={table_name}'
+                                  f' version={version} is duplicated in other_table_name={other_table_name}'
+                                  f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
+                        errors += 1
+    if errors:
+        log.error(f'{errors} errors, exit 1')
+        exit(1)
+
+
+if __name__ == '__main__':
+    _logfmt = '%(levelname)s %(filename)s:%(lineno)d %(message)s'
+    logging.basicConfig(level="INFO", format=_logfmt, force=True)
+    log = logging.getLogger()
+    main(log)
diff --git a/docs/intro.rst b/docs/intro.rst
@@ -216,6 +216,10 @@ Other Languages
 =======
 History
 =======
+0.2.10 *2023-11-08*
+  * **Bugfix** account for Wide characters in wcswidth
+    when combined with U+FE0F Variation Selector 16 (`PR #XX`)
+
 0.2.9 *2023-10-30*
   * **Bugfix** zero-width characters used in Emoji ZWJ sequences, Balinese,
     Jamo, Devanagari, Tamil, Kannada and others (`PR #91`_).

diff --git a/docs/specs.rst b/docs/specs.rst
@@ -52,3 +52,7 @@ Category codes of Nonspacing Mark (``Mn``) and Spacing Mark (``Mc``).
 
 Any characters of Modifier Symbol category, ``'Sk'`` where ``'FULLWIDTH'`` is
 present in comment of unicode data file, aprox. 3 characters.
+
+Any character in sequence with U+FE0F (Variation Selector 16) defined by
+Emoji Variation Sequences txt as ``emoji style``.
+
diff --git a/docs/unicode_version.rst b/docs/unicode_version.rst
@@ -121,3 +121,9 @@ release files:
 ``EastAsianWidth-15.1.0.txt``
   *Date: 2023-07-28, 23:34:08 GMT*
 
+``emoji-variation-sequences-12.0.0.txt``
+  *Date: 2019-01-15, 12:10:05 GMT*
+
+``emoji-variation-sequences-15.1.0.txt``
+  *Date: 2023-02-01, 02:22:54 GMT*
+
diff --git a/setup.py b/setup.py
@@ -44,7 +44,7 @@ def main():
     setuptools.setup(
         name='wcwidth',
         # NOTE: manually manage __version__ in wcwidth/__init__.py !
-        version='0.2.9',
+        version='0.2.10',
         description=(
             "Measures the displayed width of unicode strings in a terminal"),
         long_description=codecs.open(