From 52fe9073a047feeca7367a0bb1adf2e362eda1ff Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 5 Jan 2024 18:52:43 -0600 Subject: [PATCH 1/5] =?UTF-8?q?chore(core):=20stacked=20markers=20tests=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - first we break #10320 --- core/src/ldml/ldml_transforms.cpp | 2 +- core/src/ldml/ldml_transforms.hpp | 15 +++++++-- core/tests/unit/ldml/test_transforms.cpp | 39 ++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 6f1ddcb0a5f..249d19c351c 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -1049,7 +1049,7 @@ bool normalize_nfc(std::u16string &str) { } void -prepend_marker(std::u32string &str, KMX_DWORD marker, marker_encoding encoding) { +prepend_marker(std::u32string &str, marker_num marker, marker_encoding encoding) { if (encoding == plain_sentinel) { km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker}; str.insert(0, markstr, 3); diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 70f8b7b1f73..0256fe6dd37 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -311,8 +311,14 @@ enum marker_encoding { regex_sentinel, }; -/** map from following-char to marker number. */ -typedef std::map marker_map; +/** a marker ID (1-based) */ +typedef KMX_DWORD marker_num; + +/** list of markers */ +typedef std::deque marker_list; + +/** map from following-char to marker numbers. */ +typedef std::map marker_map; /** Normalize a u32string inplace to NFD. @return false on failure */ bool normalize_nfd(std::u32string &str); @@ -338,17 +344,20 @@ inline bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding /** Normalize a u32string inplace to NFC. @return false on failure */ bool normalize_nfc(std::u32string &str); + /** Normalize a u16string inplace to NFC. @return false on failure */ bool normalize_nfc(std::u16string &str); + /** Remove markers and optionally note their glue characters in the map */ std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr, marker_encoding encoding = plain_sentinel); + /** same but with a reference */ inline std::u32string remove_markers(const std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel) { return remove_markers(str, &markers, encoding); } /** prepend the marker string in UC_SENTINEL format to the str */ -void prepend_marker(std::u32string &str, KMX_DWORD marker, marker_encoding encoding = plain_sentinel); +void prepend_marker(std::u32string &str, marker_num marker, marker_encoding encoding = plain_sentinel); /** format 'marker' as 0001...FFFF and put it at the beginning of the string */ void prepend_hex_quad(std::u32string &str, KMX_DWORD marker); diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index fae6eb33db4..146a1de9bef 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -839,6 +839,45 @@ int test_normalize() { assert_equal(map[MARKER_BEFORE_EOT], 0x1L); } + { + // from tests + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test 10 stack o' 2x2" << std::endl; + const std::u32string src = U"9ce\u0300\uFFFF\u0008\u0002\uFFFF\u0008\u0002\u0320"; + const std::u32string expect = U"9ce\uFFFF\u0008\u0002\uFFFF\u0008\u0002\u0320\u0300"; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + if (dst != expect) { + std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; + } + zassert_string_equal(dst, expect); + // TODO-LDML: map is going to be off + // assert_equal(map.size(), 2); + // assert_equal(map[0x0320], 0x2L); + // assert_equal(map[MARKER_BEFORE_EOT], 0x1L); + } + + + { + // from tests + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - complex test 10 stack o' 2x1x2" << std::endl; + const std::u32string src = U"9ce\u0300\uFFFF\u0008\u0002\uFFFF\u0008\u0001\uFFFF\u0008\u0002\u0320"; + const std::u32string expect = U"9ce\uFFFF\u0008\u0002\uFFFF\u0008\u0001\uFFFF\u0008\u0002\u0320\u0300"; + std::u32string dst = src; + assert(normalize_nfd_markers(dst, map)); + if (dst != expect) { + std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; + std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; + } + zassert_string_equal(dst, expect); + // TODO-LDML: map is going to be off + // assert_equal(map.size(), 2); + // assert_equal(map[0x0320], 0x2L); + // assert_equal(map[MARKER_BEFORE_EOT], 0x1L); + } + return EXIT_SUCCESS; } From f4be7856daad3202f36b2390b69919f2c39842c7 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Sat, 6 Jan 2024 10:17:10 -0600 Subject: [PATCH 2/5] =?UTF-8?q?feat(core):=20stacked=20markers=20tests=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - we can finally support single markers with the new structure --- core/src/ldml/ldml_transforms.cpp | 14 +++- core/src/ldml/ldml_transforms.hpp | 2 +- core/tests/unit/ldml/test_transforms.cpp | 97 +++++++++++++++--------- 3 files changed, 71 insertions(+), 42 deletions(-) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 249d19c351c..56832ee0719 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -970,7 +970,9 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con const auto ch = MARKER_BEFORE_EOT; const auto m = map2.find(ch); if (m != map2.end()) { - prepend_marker(str, m->second, encoding); + for (auto q = (m->second).rbegin(); q < (m->second).rend(); q++) { + prepend_marker(str, *q, encoding); + } map2.erase(ch); // remove it } } @@ -981,7 +983,9 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con const auto m = map2.find(ch); if (m != map2.end()) { - prepend_marker(str, m->second, encoding); + for (auto q = (m->second).rbegin(); q < (m->second).rend(); q++) { + prepend_marker(str, *q, encoding); + } map2.erase(ch); // remove it } } @@ -1247,11 +1251,13 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers, ma // record the marker if (marker_no >= LDML_MARKER_MIN_INDEX && markers != nullptr) { + char32_t marker_ch; if (i == str.end()) { - markers->emplace(MARKER_BEFORE_EOT, marker_no); + marker_ch = MARKER_BEFORE_EOT; } else { - markers->emplace(*i, marker_no); + marker_ch = *i; } + markers->emplace(marker_ch, (marker_list){marker_no}); // TODO-ldml: single! #10320 } } // get the suffix between the last marker and the end diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 0256fe6dd37..edba69c29ee 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -318,7 +318,7 @@ typedef KMX_DWORD marker_num; typedef std::deque marker_list; /** map from following-char to marker numbers. */ -typedef std::map marker_map; +typedef std::map marker_map; /** Normalize a u32string inplace to NFD. @return false on failure */ bool normalize_nfd(std::u32string &str); diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index 146a1de9bef..a49db921cb4 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -33,12 +33,35 @@ } #endif + // needed for streaming operators #include "utfcodec.hpp" using namespace km::core::ldml; using namespace km::core::kmx; +std::u32string marker_list_to_string(const marker_list &m) { + std::u32string s; + for (auto i = m.rbegin(); i < m.rend(); i++) { + prepend_hex_quad(s, *i); + s.insert(0, U" \\m0x"); + } + return s; +} + +bool _assert_marker_list_equal(const char *f, int l, const marker_list a, const marker_list x) { + if (a == x) return true; + std::wcerr << f << ":" << l << ": " << console_color::fg(console_color::BRIGHT_RED); + std::wcerr << "got: " << marker_list_to_string(a); + std::wcerr << " expected: " << marker_list_to_string(x); + std::wcerr << console_color::reset() << std::endl; + return false; +} + +#define assert_marker_list_equal(actual, expected) \ + if (!_assert_marker_list_equal(__FILE__, __LINE__, (actual), (expected))) \ + return EXIT_FAILURE; + // using km::core::kmx::u16cmp; int @@ -606,7 +629,8 @@ int test_strutils() { const std::u32string expect = U"6e"; zassert_string_equal(dst, expect); assert_equal(map.size(), 1); - assert_equal(map[U'e'], 0x1L); // marker 1 @ e + marker_list exp_e = { 0x1L }; + assert_marker_list_equal(map[U'e'], exp_e); // marker 1 @ e } { marker_map map; @@ -643,7 +667,8 @@ int test_strutils() { const std::u32string expect = U"6"; zassert_string_equal(dst, expect); assert_equal(map.size(), 1); - assert_equal(map[MARKER_BEFORE_EOT], 0x1L); // marker 1 @ e + marker_list exp_end = { 0x1L }; + assert_marker_list_equal(map[MARKER_BEFORE_EOT], exp_end); // marker 1 @ e } { marker_map map; @@ -653,10 +678,14 @@ int test_strutils() { const std::u32string expect = U"6e\U00000320\U00000300"; zassert_string_equal(dst, expect); assert_equal(map.size(), 4); - assert_equal(map[U'e'], 0x1L); - assert_equal(map[0x0320], 0x2L); - assert_equal(map[0x0300], 0x3L); - assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + marker_list exp_e = { 0x1L }; + assert_marker_list_equal(map[U'e'], exp_e); + marker_list exp_320 = { 0x2L }; + assert_marker_list_equal(map[0x0320], exp_320); + marker_list exp_300 = { 0x3L }; + assert_marker_list_equal(map[0x0300], exp_300); + marker_list exp_end = { 0x4L }; + assert_marker_list_equal(map[MARKER_BEFORE_EOT], exp_end); } { std::cout << __FILE__ << ":" << __LINE__ << " - prepend hex quad" << std::endl; @@ -725,10 +754,10 @@ int test_normalize() { assert(normalize_nfd_markers(dst, map)); zassert_string_equal(dst, expect); assert_equal(map.size(), 4); - assert_equal(map[U'e'], 0x1L); - assert_equal(map[0x0320], 0x2L); - assert_equal(map[0x0300], 0x3L); - assert_equal(map[MARKER_BEFORE_EOT], 0x4L); + assert_marker_list_equal(map[U'e'], marker_list({0x1L})); + assert_marker_list_equal(map[0x0320], marker_list({0x2L})); + assert_marker_list_equal(map[0x0300], marker_list({0x3L})); + assert_marker_list_equal(map[MARKER_BEFORE_EOT],marker_list({0x4L})); } { @@ -741,11 +770,10 @@ int test_normalize() { assert(normalize_nfd_markers(dst, map)); zassert_string_equal(dst, expect); assert_equal(map.size(), 4); - assert_equal(map[U'e'], 0x1L); - assert_equal(map[0x0320], 0x2L); - assert_equal(map[0x0300], 0x3L); - assert_equal(map[MARKER_BEFORE_EOT], 0x4L); - + assert_marker_list_equal(map[U'e'], (marker_list){0x1L}); + assert_marker_list_equal(map[0x0320], (marker_list){0x2L}); + assert_marker_list_equal(map[0x0300], (marker_list){0x3L}); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x4L}); } { marker_map map; @@ -762,11 +790,10 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 4); - assert_equal(map[U'e'], 0x1L); - assert_equal(map[0x0320], 0x3L); - assert_equal(map[0x0300], 0x2L); - assert_equal(map[MARKER_BEFORE_EOT], 0x4L); - + assert_marker_list_equal(map[U'e'], (marker_list){0x1L}); + assert_marker_list_equal(map[0x0320], (marker_list){0x3L}); + assert_marker_list_equal(map[0x0300], (marker_list){0x2L}); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x4L}); } { @@ -783,7 +810,7 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 1); - assert_equal(map[0x0320], 0x1L); + assert_marker_list_equal(map[0x0320], (marker_list){0x1L}); } { @@ -800,8 +827,8 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 2); - assert_equal(map[0x0320], 0x2L); - assert_equal(map[MARKER_BEFORE_EOT], 0x1L); + assert_marker_list_equal(map[0x0320], (marker_list){0x2L}); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x1L}); } { @@ -818,8 +845,8 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 2); - assert_equal(map[0x0320], 0x2L); - assert_equal(map[MARKER_BEFORE_EOT], 0x1L); + assert_marker_list_equal(map[0x0320], (marker_list){0x2L}); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x1L}); } { // from tests - regex edition @@ -835,8 +862,8 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 2); - assert_equal(map[0x0320], LDML_MARKER_ANY_INDEX); - assert_equal(map[MARKER_BEFORE_EOT], 0x1L); + assert_marker_list_equal(map[0x0320], (marker_list){LDML_MARKER_ANY_INDEX}); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x1L}); } { @@ -851,11 +878,9 @@ int test_normalize() { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; } + assert_equal(map.size(), 1); + assert_marker_list_equal(map[0x0320], ((marker_list){0x2L, 0x2L})); zassert_string_equal(dst, expect); - // TODO-LDML: map is going to be off - // assert_equal(map.size(), 2); - // assert_equal(map[0x0320], 0x2L); - // assert_equal(map[MARKER_BEFORE_EOT], 0x1L); } @@ -863,8 +888,8 @@ int test_normalize() { // from tests marker_map map; std::cout << __FILE__ << ":" << __LINE__ << " - complex test 10 stack o' 2x1x2" << std::endl; - const std::u32string src = U"9ce\u0300\uFFFF\u0008\u0002\uFFFF\u0008\u0001\uFFFF\u0008\u0002\u0320"; - const std::u32string expect = U"9ce\uFFFF\u0008\u0002\uFFFF\u0008\u0001\uFFFF\u0008\u0002\u0320\u0300"; + const std::u32string src = U"9ce\u0300\uFFFF\u0008\u0002\uFFFF\u0008\u0001\uFFFF\u0008\u0003\u0320"; + const std::u32string expect = U"9ce\uFFFF\u0008\u0002\uFFFF\u0008\u0001\uFFFF\u0008\u0003\u0320\u0300"; std::u32string dst = src; assert(normalize_nfd_markers(dst, map)); if (dst != expect) { @@ -872,10 +897,8 @@ int test_normalize() { std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; } zassert_string_equal(dst, expect); - // TODO-LDML: map is going to be off - // assert_equal(map.size(), 2); - // assert_equal(map[0x0320], 0x2L); - // assert_equal(map[MARKER_BEFORE_EOT], 0x1L); + assert_equal(map.size(), 1); + assert_marker_list_equal(map[0x0320], ((marker_list){0x2L, 0x1L, 0x3L})); } From b2b897970df9298ca58a7e09b1df08910594d0be Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Sat, 6 Jan 2024 11:33:51 -0600 Subject: [PATCH 3/5] =?UTF-8?q?feat(core):=20stacked=20markers=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - renamed the single-segment functions to normalize_nfd_markers_segment() since they should only be run on a single segment - commented out NFC for now, dead code - add some additional checks/DebugLog around normalization #10320 --- core/src/ldml/ldml_processor.cpp | 17 ++- core/src/ldml/ldml_transforms.cpp | 114 ++++++++++++------ core/src/ldml/ldml_transforms.hpp | 62 +++++----- .../keyboards/k_008_transform_norm-test.xml | 29 ++++- .../ldml/keyboards/k_008_transform_norm.xml | 4 +- core/tests/unit/ldml/test_transforms.cpp | 34 ++++-- 6 files changed, 172 insertions(+), 88 deletions(-) diff --git a/core/src/ldml/ldml_processor.cpp b/core/src/ldml/ldml_processor.cpp index 0609b2c4589..20f6af30f29 100644 --- a/core/src/ldml/ldml_processor.cpp +++ b/core/src/ldml/ldml_processor.cpp @@ -316,13 +316,26 @@ size_t ldml_processor::process_output(km_core_state *state, const std::u32string // drop last 'matchedContext': ctxtstr.resize(ctxtstr.length() - matchedContext); ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here. - ldml::marker_map markers; - assert(ldml::normalize_nfd_markers(ctxtstr, markers)); // TODO-LDML: Need marker-safe normalize here. + { + const auto normalize_ok = ldml::normalize_nfd_markers(ctxtstr); + assert(normalize_ok); + if(!normalize_ok) { + DebugLog("ldml_processor::process_output: failed ldml::normalize_nfd_markers(ctxtstr)"); + } + } // Ok. We've done all the happy manipulations. /** NFD w/ markers */ std::u32string ctxtstr_cleanedup = ctxtstr; + { + const auto normalize_ok = ldml::normalize_nfd_markers(ctxtstr_cleanedup); + assert(normalize_ok); + if(!normalize_ok) { + DebugLog("ldml_processor::process_output: failed ldml::normalize_nfd_markers(ctxtstr_cleanedup)"); + } + } + assert(ldml::normalize_nfd_markers(ctxtstr_cleanedup)); // find common prefix. diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 56832ee0719..5ff5f28e451 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -643,6 +643,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons output.assign(s.get(), out32len); // NOW do a marker-safe normalize if (!normalize_nfd_markers(output)) { + DebugLog("normalize_nfd_markers(output) failed"); return 0; // TODO-LDML: normalization failed. } } @@ -950,9 +951,9 @@ bool normalize_nfd(std::u16string &str) { return normalize(nfd, str, status); } -bool normalize_nfd_markers(std::u16string &str, marker_map &map, marker_encoding encoding) { +bool normalize_nfd_markers_segment(std::u16string &str, marker_map &map, marker_encoding encoding) { std::u32string rstr = km::core::kmx::u16string_to_u32string(str); - if(!normalize_nfd_markers(rstr, map, encoding)) { + if(!normalize_nfd_markers_segment(rstr, map, encoding)) { return false; } else { str = km::core::kmx::u32string_to_u16string(rstr); @@ -996,7 +997,7 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con * - doesn't support >1 marker per char - may need a set instead of a map! * - ideally this should be used on a normalization safe subsequence */ -bool normalize_nfd_markers(std::u32string &str, marker_map &map, marker_encoding encoding) { +bool normalize_nfd_markers_segment(std::u32string &str, marker_map &map, marker_encoding encoding) { /** original string, but no markers */ std::u32string str_unmarked = remove_markers(str, map, encoding); /** original string, no markers, NFD */ @@ -1015,42 +1016,54 @@ bool normalize_nfd_markers(std::u32string &str, marker_map &map, marker_encoding return true; // all OK } -bool normalize_nfc_markers(std::u32string &str, marker_map &map, marker_encoding encoding) { - /** original string, but no markers */ - std::u32string str_unmarked = remove_markers(str, map, encoding); - /** original string, no markers, NFC */ - std::u32string str_unmarked_nfc = str_unmarked; - if(!normalize_nfc(str_unmarked_nfc)) { - return false; // normalize failed. - } else if (map.size() == 0) { - // no markers. Return the normalized unmarked str - str = str_unmarked_nfc; - } else if (str_unmarked_nfc == str_unmarked) { - // Normalization produced no change when markers were removed. - // So, we'll call this a no-op. - } else { - add_back_markers(str, str_unmarked_nfc, map, encoding); - } - return true; // all OK +bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding) { + marker_map m; + // TODO-LDML: split segments + return normalize_nfd_markers_segment(str, m, encoding); } - -bool normalize_nfc(std::u32string &str) { - std::u16string rstr = km::core::kmx::u32string_to_u16string(str); - if(!normalize_nfc(rstr)) { - return false; - } else { - str = km::core::kmx::u16string_to_u32string(rstr); - return true; - } +bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding) { + marker_map m; + // TODO-LDML: split segments + return normalize_nfd_markers_segment(str, m, encoding); } -bool normalize_nfc(std::u16string &str) { - UErrorCode status = U_ZERO_ERROR; - const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); - UASSERT_SUCCESS(status); - return normalize(nfc, str, status); -} +// bool normalize_nfc_markers(std::u32string &str, marker_map &map, marker_encoding encoding) { +// /** original string, but no markers */ +// std::u32string str_unmarked = remove_markers(str, map, encoding); +// /** original string, no markers, NFC */ +// std::u32string str_unmarked_nfc = str_unmarked; +// if(!normalize_nfc(str_unmarked_nfc)) { +// return false; // normalize failed. +// } else if (map.size() == 0) { +// // no markers. Return the normalized unmarked str +// str = str_unmarked_nfc; +// } else if (str_unmarked_nfc == str_unmarked) { +// // Normalization produced no change when markers were removed. +// // So, we'll call this a no-op. +// } else { +// add_back_markers(str, str_unmarked_nfc, map, encoding); +// } +// return true; // all OK +// } + + +// bool normalize_nfc(std::u32string &str) { +// std::u16string rstr = km::core::kmx::u32string_to_u16string(str); +// if(!normalize_nfc(rstr)) { +// return false; +// } else { +// str = km::core::kmx::u16string_to_u32string(rstr); +// return true; +// } +// } + +// bool normalize_nfc(std::u16string &str) { +// UErrorCode status = U_ZERO_ERROR; +// const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status); +// UASSERT_SUCCESS(status); +// return normalize(nfc, str, status); +// } void prepend_marker(std::u32string &str, marker_num marker, marker_encoding encoding) { @@ -1123,10 +1136,24 @@ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) { return mark_no; } +/** add the list to the map */ +void add_markers_to_map(marker_map &markers, char32_t marker_ch, const marker_list &list) { + auto rep = markers.emplace(marker_ch, list); + if (!rep.second) { + // already existed. + auto existing = rep.first; + // append all additional ones + for(auto m = list.begin(); m < list.end(); m++) { + existing->second.emplace_back(*m); + } + } +} + std::u32string remove_markers(const std::u32string &str, marker_map *markers, marker_encoding encoding) { std::u32string out; auto i = str.begin(); auto last = i; + marker_list last_markers; for (i = find(i, str.end(), LDML_UC_SENTINEL); i != str.end(); i = find(i, str.end(), LDML_UC_SENTINEL)) { // append any prefix (from prior pos'n to here) out.append(last, i); @@ -1247,21 +1274,34 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers, ma } } assert(marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_ANY_INDEX); + // The marker number is good, add it to the list last = i; - // record the marker if (marker_no >= LDML_MARKER_MIN_INDEX && markers != nullptr) { + // add it to the list + last_markers.emplace_back(marker_no); char32_t marker_ch; if (i == str.end()) { + // Hit end, so mark it as the end marker_ch = MARKER_BEFORE_EOT; + } else if (*i == LDML_UC_SENTINEL) { + // it's another marker (presumably) + continue; // loop around } else { marker_ch = *i; } - markers->emplace(marker_ch, (marker_list){marker_no}); // TODO-ldml: single! #10320 + add_markers_to_map(*markers, marker_ch, last_markers); + last_markers.clear(); // mark as already recorded } } // get the suffix between the last marker and the end out.append(last, str.end()); + if (!last_markers.empty() && markers != nullptr) { + // we had markers but couldn't find the base. + // it's possible that there was a malformed UC_SENTINEL string in between. + // Add it to the end. + add_markers_to_map(*markers, MARKER_BEFORE_EOT, last_markers); + } return out; } diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index edba69c29ee..027372b7d59 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -328,25 +328,25 @@ bool normalize_nfd(std::u16string &str); * @param markers will be populated with marker chars * @return false on failure **/ -bool normalize_nfd_markers(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); -bool normalize_nfd_markers(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); -inline bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding = plain_sentinel); -inline bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding = plain_sentinel); - -/** Normalize a u32string inplace to NFC, retaining markers. - * @param markers will be populated with marker chars - * @return false on failure - **/ -bool normalize_nfc_markers(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); -bool normalize_nfc_markers(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); -inline bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding = plain_sentinel); -inline bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding = plain_sentinel); - -/** Normalize a u32string inplace to NFC. @return false on failure */ -bool normalize_nfc(std::u32string &str); - -/** Normalize a u16string inplace to NFC. @return false on failure */ -bool normalize_nfc(std::u16string &str); +bool normalize_nfd_markers_segment(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); +bool normalize_nfd_markers_segment(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); +bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding = plain_sentinel); +bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding = plain_sentinel); + +// /** Normalize a u32string inplace to NFC, retaining markers. +// * @param markers will be populated with marker chars +// * @return false on failure +// **/ +// bool normalize_nfd_markers_segment(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); +// bool normalize_nfd_markers_segment(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel); +// inline bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding = plain_sentinel); +// inline bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding = plain_sentinel); + +// /** Normalize a u32string inplace to NFC. @return false on failure */ +// bool normalize_nfc(std::u32string &str); + +// /** Normalize a u16string inplace to NFC. @return false on failure */ +// bool normalize_nfc(std::u16string &str); /** Remove markers and optionally note their glue characters in the map */ std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr, marker_encoding encoding = plain_sentinel); @@ -365,25 +365,17 @@ void prepend_hex_quad(std::u32string &str, KMX_DWORD marker); /** parse 0001...FFFF into a KMX_DWORD. Returns 0 on failure */ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]); -bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding) { - marker_map m; - return normalize_nfd_markers(str, m, encoding); -} -bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding) { - marker_map m; - return normalize_nfc_markers(str, m, encoding); -} +// bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding) { +// marker_map m; +// return normalize_nfc_markers_segment(str, m, encoding); +// } -bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding) { - marker_map m; - return normalize_nfd_markers(str, m, encoding); -} -bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding) { - marker_map m; - return normalize_nfc_markers(str, m, encoding); -} +// bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding) { +// marker_map m; +// return normalize_nfc_markers_segment(str, m, encoding); +// } } // namespace ldml diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml index e73a01c8f8f..1611688b43c 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml @@ -245,16 +245,43 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml index a158cc5147e..2ed6a84bfea 100644 --- a/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml +++ b/core/tests/unit/ldml/keyboards/k_008_transform_norm.xml @@ -75,8 +75,8 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar - - + + diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index a49db921cb4..3dc79d4a031 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -728,7 +728,7 @@ int test_normalize() { const std::u32string src = U"6e\U00000320\U00000300"; // already NFD const std::u32string expect = src; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); zassert_string_equal(dst, expect); assert_equal(map.size(), 0); } @@ -738,7 +738,7 @@ int test_normalize() { const std::u32string src = U"6e\U00000300\U00000320"; // swapped const std::u32string expect = U"6e\U00000320\U00000300"; // correct NFD std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); zassert_string_equal(dst, expect); assert_equal(map.size(), 0); } @@ -751,7 +751,7 @@ int test_normalize() { U"\U0000ffff\U00000008\U00000004"; const std::u32string expect = src; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); zassert_string_equal(dst, expect); assert_equal(map.size(), 4); assert_marker_list_equal(map[U'e'], marker_list({0x1L})); @@ -767,7 +767,7 @@ int test_normalize() { U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000002\U00000320\U0000ffff\U00000008\U00000003\U00000300\U0000ffff\U00000008\U00000004"; const std::u32string expect = src; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); zassert_string_equal(dst, expect); assert_equal(map.size(), 4); assert_marker_list_equal(map[U'e'], (marker_list){0x1L}); @@ -783,7 +783,7 @@ int test_normalize() { const std::u32string expect = U"6\U0000ffff\U00000008\U00000001e\U0000ffff\U00000008\U00000003\U00000320\U0000ffff\U00000008\U00000002\U00000300\U0000ffff\U00000008\U00000004"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; @@ -803,7 +803,7 @@ int test_normalize() { const std::u32string src = U"4e\u0300\uFFFF\u0008\u0001\u0320"; const std::u32string expect = U"4e\uFFFF\u0008\u0001\u0320\u0300"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; @@ -820,7 +820,7 @@ int test_normalize() { const std::u32string src = U"9ce\u0300\uFFFF\u0008\u0002\u0320\uFFFF\u0008\u0001"; const std::u32string expect = U"9ce\uFFFF\u0008\u0002\u0320\u0300\uFFFF\u0008\u0001"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; @@ -838,7 +838,7 @@ int test_normalize() { const std::u32string src = U"9ce\u0300\uFFFF\u0008\\u0002\u0320\uFFFF\u0008\\u0001"; const std::u32string expect = U"9ce\uFFFF\u0008\\u0002\u0320\u0300\uFFFF\u0008\\u0001"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map, regex_sentinel)); // TODO-LDML: need regex flag + assert(normalize_nfd_markers_segment(dst, map, regex_sentinel)); // TODO-LDML: need regex flag if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; @@ -855,7 +855,7 @@ int test_normalize() { const std::u32string src = U"9ce\u0300\uFFFF\u0008[\\u0001-\\uD7FE]\u0320\uFFFF\u0008\\u0001"; const std::u32string expect = U"9ce\uFFFF\u0008[\\u0001-\\uD7FE]\u0320\u0300\uFFFF\u0008\\u0001"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map, regex_sentinel)); + assert(normalize_nfd_markers_segment(dst, map, regex_sentinel)); if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; @@ -873,7 +873,7 @@ int test_normalize() { const std::u32string src = U"9ce\u0300\uFFFF\u0008\u0002\uFFFF\u0008\u0002\u0320"; const std::u32string expect = U"9ce\uFFFF\u0008\u0002\uFFFF\u0008\u0002\u0320\u0300"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; @@ -891,7 +891,7 @@ int test_normalize() { const std::u32string src = U"9ce\u0300\uFFFF\u0008\u0002\uFFFF\u0008\u0001\uFFFF\u0008\u0003\u0320"; const std::u32string expect = U"9ce\uFFFF\u0008\u0002\uFFFF\u0008\u0001\uFFFF\u0008\u0003\u0320\u0300"; std::u32string dst = src; - assert(normalize_nfd_markers(dst, map)); + assert(normalize_nfd_markers_segment(dst, map)); if (dst != expect) { std::cout << "dst: " << Debug_UnicodeString(dst) << std::endl; std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; @@ -902,6 +902,18 @@ int test_normalize() { } + { + marker_map map; + std::cout << __FILE__ << ":" << __LINE__ << " - dup-char test" << std::endl; + const std::u32string src = U"a\uFFFF\u0008\u0001\u0300e\uFFFF\u0008\u0002\u0300"; + const std::u32string dst = remove_markers(src, map); + const std::u32string expect = U"a\u0300e\u0300"; // U+0300 twice! This should be removed in 2 segments + zassert_string_equal(dst, expect); + assert_equal(map.size(), 1); + marker_list exp_ae = { 0x1L, 0x2L }; // Not what the user would see in practice. + assert_marker_list_equal(map[0x0300], exp_ae); // marker 1 @ e + } + return EXIT_SUCCESS; } From 776a902eb00c204040ee6429b37b7970fe1d0e0b Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Sat, 6 Jan 2024 13:56:37 -0600 Subject: [PATCH 4/5] =?UTF-8?q?feat(core):=20stacked=20markers=20?= =?UTF-8?q?=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - build fix for win #10320 --- core/tests/unit/ldml/test_transforms.cpp | 34 ++++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/core/tests/unit/ldml/test_transforms.cpp b/core/tests/unit/ldml/test_transforms.cpp index 3dc79d4a031..77075afd025 100644 --- a/core/tests/unit/ldml/test_transforms.cpp +++ b/core/tests/unit/ldml/test_transforms.cpp @@ -770,10 +770,10 @@ int test_normalize() { assert(normalize_nfd_markers_segment(dst, map)); zassert_string_equal(dst, expect); assert_equal(map.size(), 4); - assert_marker_list_equal(map[U'e'], (marker_list){0x1L}); - assert_marker_list_equal(map[0x0320], (marker_list){0x2L}); - assert_marker_list_equal(map[0x0300], (marker_list){0x3L}); - assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x4L}); + assert_marker_list_equal(map[U'e'], marker_list({0x1L})); + assert_marker_list_equal(map[0x0320], marker_list({0x2L})); + assert_marker_list_equal(map[0x0300], marker_list({0x3L})); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], marker_list({0x4L})); } { marker_map map; @@ -790,10 +790,10 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 4); - assert_marker_list_equal(map[U'e'], (marker_list){0x1L}); - assert_marker_list_equal(map[0x0320], (marker_list){0x3L}); - assert_marker_list_equal(map[0x0300], (marker_list){0x2L}); - assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x4L}); + assert_marker_list_equal(map[U'e'], marker_list({0x1L})); + assert_marker_list_equal(map[0x0320], marker_list({0x3L})); + assert_marker_list_equal(map[0x0300], marker_list({0x2L})); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], marker_list({0x4L})); } { @@ -810,7 +810,7 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 1); - assert_marker_list_equal(map[0x0320], (marker_list){0x1L}); + assert_marker_list_equal(map[0x0320], marker_list({0x1L})); } { @@ -827,8 +827,8 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 2); - assert_marker_list_equal(map[0x0320], (marker_list){0x2L}); - assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x1L}); + assert_marker_list_equal(map[0x0320], marker_list({0x2L})); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], marker_list({0x1L})); } { @@ -845,8 +845,8 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 2); - assert_marker_list_equal(map[0x0320], (marker_list){0x2L}); - assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x1L}); + assert_marker_list_equal(map[0x0320], marker_list({0x2L})); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], marker_list({0x1L})); } { // from tests - regex edition @@ -862,8 +862,8 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 2); - assert_marker_list_equal(map[0x0320], (marker_list){LDML_MARKER_ANY_INDEX}); - assert_marker_list_equal(map[MARKER_BEFORE_EOT], (marker_list){0x1L}); + assert_marker_list_equal(map[0x0320], marker_list({LDML_MARKER_ANY_INDEX})); + assert_marker_list_equal(map[MARKER_BEFORE_EOT], marker_list({0x1L})); } { @@ -879,7 +879,7 @@ int test_normalize() { std::cout << "exp: " << Debug_UnicodeString(expect) << std::endl; } assert_equal(map.size(), 1); - assert_marker_list_equal(map[0x0320], ((marker_list){0x2L, 0x2L})); + assert_marker_list_equal(map[0x0320], (marker_list({0x2L, 0x2L}))); zassert_string_equal(dst, expect); } @@ -898,7 +898,7 @@ int test_normalize() { } zassert_string_equal(dst, expect); assert_equal(map.size(), 1); - assert_marker_list_equal(map[0x0320], ((marker_list){0x2L, 0x1L, 0x3L})); + assert_marker_list_equal(map[0x0320], (marker_list({0x2L, 0x1L, 0x3L}))); } From 0d9f543a78cbc30b37030d4bce12ff71215f424a Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Wed, 10 Jan 2024 18:11:17 -0600 Subject: [PATCH 5/5] Apply suggestions from code review - cleanup NFC code later if it's entirely unneeded. Co-authored-by: Marc Durdin --- core/src/ldml/ldml_transforms.cpp | 1 + core/src/ldml/ldml_transforms.hpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/core/src/ldml/ldml_transforms.cpp b/core/src/ldml/ldml_transforms.cpp index 5ff5f28e451..560ea2c9a24 100644 --- a/core/src/ldml/ldml_transforms.cpp +++ b/core/src/ldml/ldml_transforms.cpp @@ -1028,6 +1028,7 @@ bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding) { return normalize_nfd_markers_segment(str, m, encoding); } +// TODO-LDML: cleanup // bool normalize_nfc_markers(std::u32string &str, marker_map &map, marker_encoding encoding) { // /** original string, but no markers */ // std::u32string str_unmarked = remove_markers(str, map, encoding); diff --git a/core/src/ldml/ldml_transforms.hpp b/core/src/ldml/ldml_transforms.hpp index 027372b7d59..751f423d258 100644 --- a/core/src/ldml/ldml_transforms.hpp +++ b/core/src/ldml/ldml_transforms.hpp @@ -333,6 +333,7 @@ bool normalize_nfd_markers_segment(std::u16string &str, marker_map &markers, mar bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding = plain_sentinel); bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding = plain_sentinel); +// TODO-LDML: Cleanup // /** Normalize a u32string inplace to NFC, retaining markers. // * @param markers will be populated with marker chars // * @return false on failure @@ -366,6 +367,7 @@ void prepend_hex_quad(std::u32string &str, KMX_DWORD marker); KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]); +// TODO-LDML: Cleanup // bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding) { // marker_map m; // return normalize_nfc_markers_segment(str, m, encoding);