Skip to content

Commit

Permalink
Merge pull request #10326 from keymanapp/feat/core/10320-stack-marker…
Browse files Browse the repository at this point in the history
…s-epic-ldml
  • Loading branch information
srl295 authored Jan 11, 2024
2 parents 64fb78d + 0d9f543 commit f68e9f3
Show file tree
Hide file tree
Showing 6 changed files with 282 additions and 118 deletions.
17 changes: 15 additions & 2 deletions core/src/ldml/ldml_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,13 +316,26 @@ size_t ldml_processor::process_output(km_core_state *state, const std::u32string
// drop last 'matchedContext':
ctxtstr.resize(ctxtstr.length() - matchedContext);
ctxtstr.append(outputString); // TODO-LDML: should be able to do a normalization-safe append here.
ldml::marker_map markers;
assert(ldml::normalize_nfd_markers(ctxtstr, markers)); // TODO-LDML: Need marker-safe normalize here.
{
const auto normalize_ok = ldml::normalize_nfd_markers(ctxtstr);
assert(normalize_ok);
if(!normalize_ok) {
DebugLog("ldml_processor::process_output: failed ldml::normalize_nfd_markers(ctxtstr)");
}
}

// Ok. We've done all the happy manipulations.

/** NFD w/ markers */
std::u32string ctxtstr_cleanedup = ctxtstr;
{
const auto normalize_ok = ldml::normalize_nfd_markers(ctxtstr_cleanedup);
assert(normalize_ok);
if(!normalize_ok) {
DebugLog("ldml_processor::process_output: failed ldml::normalize_nfd_markers(ctxtstr_cleanedup)");
}
}

assert(ldml::normalize_nfd_markers(ctxtstr_cleanedup));

// find common prefix.
Expand Down
129 changes: 88 additions & 41 deletions core/src/ldml/ldml_transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,7 @@ transform_entry::apply(const std::u32string &input, std::u32string &output) cons
output.assign(s.get(), out32len);
// NOW do a marker-safe normalize
if (!normalize_nfd_markers(output)) {
DebugLog("normalize_nfd_markers(output) failed");
return 0; // TODO-LDML: normalization failed.
}
}
Expand Down Expand Up @@ -950,9 +951,9 @@ bool normalize_nfd(std::u16string &str) {
return normalize(nfd, str, status);
}

bool normalize_nfd_markers(std::u16string &str, marker_map &map, marker_encoding encoding) {
bool normalize_nfd_markers_segment(std::u16string &str, marker_map &map, marker_encoding encoding) {
std::u32string rstr = km::core::kmx::u16string_to_u32string(str);
if(!normalize_nfd_markers(rstr, map, encoding)) {
if(!normalize_nfd_markers_segment(rstr, map, encoding)) {
return false;
} else {
str = km::core::kmx::u32string_to_u16string(rstr);
Expand All @@ -970,7 +971,9 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con
const auto ch = MARKER_BEFORE_EOT;
const auto m = map2.find(ch);
if (m != map2.end()) {
prepend_marker(str, m->second, encoding);
for (auto q = (m->second).rbegin(); q < (m->second).rend(); q++) {
prepend_marker(str, *q, encoding);
}
map2.erase(ch); // remove it
}
}
Expand All @@ -981,7 +984,9 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con

const auto m = map2.find(ch);
if (m != map2.end()) {
prepend_marker(str, m->second, encoding);
for (auto q = (m->second).rbegin(); q < (m->second).rend(); q++) {
prepend_marker(str, *q, encoding);
}
map2.erase(ch); // remove it
}
}
Expand All @@ -992,7 +997,7 @@ static void add_back_markers(std::u32string &str, const std::u32string &src, con
* - doesn't support >1 marker per char - may need a set instead of a map!
* - ideally this should be used on a normalization safe subsequence
*/
bool normalize_nfd_markers(std::u32string &str, marker_map &map, marker_encoding encoding) {
bool normalize_nfd_markers_segment(std::u32string &str, marker_map &map, marker_encoding encoding) {
/** original string, but no markers */
std::u32string str_unmarked = remove_markers(str, map, encoding);
/** original string, no markers, NFD */
Expand All @@ -1011,45 +1016,58 @@ bool normalize_nfd_markers(std::u32string &str, marker_map &map, marker_encoding
return true; // all OK
}

bool normalize_nfc_markers(std::u32string &str, marker_map &map, marker_encoding encoding) {
/** original string, but no markers */
std::u32string str_unmarked = remove_markers(str, map, encoding);
/** original string, no markers, NFC */
std::u32string str_unmarked_nfc = str_unmarked;
if(!normalize_nfc(str_unmarked_nfc)) {
return false; // normalize failed.
} else if (map.size() == 0) {
// no markers. Return the normalized unmarked str
str = str_unmarked_nfc;
} else if (str_unmarked_nfc == str_unmarked) {
// Normalization produced no change when markers were removed.
// So, we'll call this a no-op.
} else {
add_back_markers(str, str_unmarked_nfc, map, encoding);
}
return true; // all OK
bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding) {
marker_map m;
// TODO-LDML: split segments
return normalize_nfd_markers_segment(str, m, encoding);
}


bool normalize_nfc(std::u32string &str) {
std::u16string rstr = km::core::kmx::u32string_to_u16string(str);
if(!normalize_nfc(rstr)) {
return false;
} else {
str = km::core::kmx::u16string_to_u32string(rstr);
return true;
}
bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding) {
marker_map m;
// TODO-LDML: split segments
return normalize_nfd_markers_segment(str, m, encoding);
}

bool normalize_nfc(std::u16string &str) {
UErrorCode status = U_ZERO_ERROR;
const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status);
UASSERT_SUCCESS(status);
return normalize(nfc, str, status);
}
// TODO-LDML: cleanup
// bool normalize_nfc_markers(std::u32string &str, marker_map &map, marker_encoding encoding) {
// /** original string, but no markers */
// std::u32string str_unmarked = remove_markers(str, map, encoding);
// /** original string, no markers, NFC */
// std::u32string str_unmarked_nfc = str_unmarked;
// if(!normalize_nfc(str_unmarked_nfc)) {
// return false; // normalize failed.
// } else if (map.size() == 0) {
// // no markers. Return the normalized unmarked str
// str = str_unmarked_nfc;
// } else if (str_unmarked_nfc == str_unmarked) {
// // Normalization produced no change when markers were removed.
// // So, we'll call this a no-op.
// } else {
// add_back_markers(str, str_unmarked_nfc, map, encoding);
// }
// return true; // all OK
// }


// bool normalize_nfc(std::u32string &str) {
// std::u16string rstr = km::core::kmx::u32string_to_u16string(str);
// if(!normalize_nfc(rstr)) {
// return false;
// } else {
// str = km::core::kmx::u16string_to_u32string(rstr);
// return true;
// }
// }

// bool normalize_nfc(std::u16string &str) {
// UErrorCode status = U_ZERO_ERROR;
// const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(status);
// UASSERT_SUCCESS(status);
// return normalize(nfc, str, status);
// }

void
prepend_marker(std::u32string &str, KMX_DWORD marker, marker_encoding encoding) {
prepend_marker(std::u32string &str, marker_num marker, marker_encoding encoding) {
if (encoding == plain_sentinel) {
km_core_usv markstr[] = {LDML_UC_SENTINEL, LDML_MARKER_CODE, marker};
str.insert(0, markstr, 3);
Expand Down Expand Up @@ -1119,10 +1137,24 @@ KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]) {
return mark_no;
}

/** add the list to the map */
void add_markers_to_map(marker_map &markers, char32_t marker_ch, const marker_list &list) {
auto rep = markers.emplace(marker_ch, list);
if (!rep.second) {
// already existed.
auto existing = rep.first;
// append all additional ones
for(auto m = list.begin(); m < list.end(); m++) {
existing->second.emplace_back(*m);
}
}
}

std::u32string remove_markers(const std::u32string &str, marker_map *markers, marker_encoding encoding) {
std::u32string out;
auto i = str.begin();
auto last = i;
marker_list last_markers;
for (i = find(i, str.end(), LDML_UC_SENTINEL); i != str.end(); i = find(i, str.end(), LDML_UC_SENTINEL)) {
// append any prefix (from prior pos'n to here)
out.append(last, i);
Expand Down Expand Up @@ -1243,19 +1275,34 @@ std::u32string remove_markers(const std::u32string &str, marker_map *markers, ma
}
}
assert(marker_no >= LDML_MARKER_MIN_INDEX && marker_no <= LDML_MARKER_ANY_INDEX);
// The marker number is good, add it to the list
last = i;

// record the marker
if (marker_no >= LDML_MARKER_MIN_INDEX && markers != nullptr) {
// add it to the list
last_markers.emplace_back(marker_no);
char32_t marker_ch;
if (i == str.end()) {
markers->emplace(MARKER_BEFORE_EOT, marker_no);
// Hit end, so mark it as the end
marker_ch = MARKER_BEFORE_EOT;
} else if (*i == LDML_UC_SENTINEL) {
// it's another marker (presumably)
continue; // loop around
} else {
markers->emplace(*i, marker_no);
marker_ch = *i;
}
add_markers_to_map(*markers, marker_ch, last_markers);
last_markers.clear(); // mark as already recorded
}
}
// get the suffix between the last marker and the end
out.append(last, str.end());
if (!last_markers.empty() && markers != nullptr) {
// we had markers but couldn't find the base.
// it's possible that there was a malformed UC_SENTINEL string in between.
// Add it to the end.
add_markers_to_map(*markers, MARKER_BEFORE_EOT, last_markers);
}
return out;
}

Expand Down
75 changes: 39 additions & 36 deletions core/src/ldml/ldml_transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,14 @@ enum marker_encoding {
regex_sentinel,
};

/** map from following-char to marker number. */
typedef std::map<char32_t, KMX_DWORD> marker_map;
/** a marker ID (1-based) */
typedef KMX_DWORD marker_num;

/** list of markers */
typedef std::deque<marker_num> marker_list;

/** map from following-char to marker numbers. */
typedef std::map<char32_t, marker_list> marker_map;

/** Normalize a u32string inplace to NFD. @return false on failure */
bool normalize_nfd(std::u32string &str);
Expand All @@ -322,59 +328,56 @@ bool normalize_nfd(std::u16string &str);
* @param markers will be populated with marker chars
* @return false on failure
**/
bool normalize_nfd_markers(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
bool normalize_nfd_markers(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
inline bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding = plain_sentinel);
inline bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding = plain_sentinel);
bool normalize_nfd_markers_segment(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
bool normalize_nfd_markers_segment(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding = plain_sentinel);
bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding = plain_sentinel);

// TODO-LDML: Cleanup
// /** Normalize a u32string inplace to NFC, retaining markers.
// * @param markers will be populated with marker chars
// * @return false on failure
// **/
// bool normalize_nfd_markers_segment(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
// bool normalize_nfd_markers_segment(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
// inline bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding = plain_sentinel);
// inline bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding = plain_sentinel);

// /** Normalize a u32string inplace to NFC. @return false on failure */
// bool normalize_nfc(std::u32string &str);

// /** Normalize a u16string inplace to NFC. @return false on failure */
// bool normalize_nfc(std::u16string &str);

/** Normalize a u32string inplace to NFC, retaining markers.
* @param markers will be populated with marker chars
* @return false on failure
**/
bool normalize_nfc_markers(std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
bool normalize_nfc_markers(std::u16string &str, marker_map &markers, marker_encoding encoding = plain_sentinel);
inline bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding = plain_sentinel);
inline bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding = plain_sentinel);

/** Normalize a u32string inplace to NFC. @return false on failure */
bool normalize_nfc(std::u32string &str);
/** Normalize a u16string inplace to NFC. @return false on failure */
bool normalize_nfc(std::u16string &str);
/** Remove markers and optionally note their glue characters in the map */
std::u32string remove_markers(const std::u32string &str, marker_map *markers = nullptr, marker_encoding encoding = plain_sentinel);

/** same but with a reference */
inline std::u32string remove_markers(const std::u32string &str, marker_map &markers, marker_encoding encoding = plain_sentinel) {
return remove_markers(str, &markers, encoding);
}

/** prepend the marker string in UC_SENTINEL format to the str */
void prepend_marker(std::u32string &str, KMX_DWORD marker, marker_encoding encoding = plain_sentinel);
void prepend_marker(std::u32string &str, marker_num marker, marker_encoding encoding = plain_sentinel);

/** format 'marker' as 0001...FFFF and put it at the beginning of the string */
void prepend_hex_quad(std::u32string &str, KMX_DWORD marker);

/** parse 0001...FFFF into a KMX_DWORD. Returns 0 on failure */
KMX_DWORD parse_hex_quad(const km_core_usv hex_str[]);

bool normalize_nfd_markers(std::u16string &str, marker_encoding encoding) {
marker_map m;
return normalize_nfd_markers(str, m, encoding);
}

bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding) {
marker_map m;
return normalize_nfc_markers(str, m, encoding);
}
// TODO-LDML: Cleanup
// bool normalize_nfc_markers(std::u16string &str, marker_encoding encoding) {
// marker_map m;
// return normalize_nfc_markers_segment(str, m, encoding);
// }

bool normalize_nfd_markers(std::u32string &str, marker_encoding encoding) {
marker_map m;
return normalize_nfd_markers(str, m, encoding);
}

bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding) {
marker_map m;
return normalize_nfc_markers(str, m, encoding);
}
// bool normalize_nfc_markers(std::u32string &str, marker_encoding encoding) {
// marker_map m;
// return normalize_nfc_markers_segment(str, m, encoding);
// }


} // namespace ldml
Expand Down
29 changes: 28 additions & 1 deletion core/tests/unit/ldml/keyboards/k_008_transform_norm-test.xml
Original file line number Diff line number Diff line change
Expand Up @@ -245,16 +245,43 @@
<keystroke key="u-0320"/>
<check result="YES8b"/>
</test>
<test name="regex-test-8c-0">
<test name="regex-test-8e-0">
<!-- simple test, exactly as written-->
<keystroke key="8" />
<keystroke key="e" />
<keystroke key="u-0300"/> <!-- denorm-->
<keystroke key="u-0320"/>
<keystroke key="stampy"/>
<keystroke key="lgtm"/>
<check result="YES8c"/>
</test>
<test name="regex-test-8e-1">
<keystroke key="8" />
<keystroke key="e" />
<keystroke key="u-0320"/> <!-- NFD-->
<keystroke key="u-0300"/>
<keystroke key="stampy"/>
<keystroke key="lgtm"/>
<check result="YES8c"/>
</test>
<test name="regex-test-8f-0">
<keystroke key="8" />
<keystroke key="f" />
<keystroke key="u-0300"/> <!-- denorm-->
<keystroke key="u-0320"/>
<keystroke key="stampy"/>
<keystroke key="lgtm"/>
<check result="YES8d"/>
</test>
<test name="regex-test-8f-1">
<keystroke key="8" />
<keystroke key="f" />
<keystroke key="u-0320"/> <!-- NFD-->
<keystroke key="u-0300"/>
<keystroke key="stampy"/>
<keystroke key="lgtm"/>
<check result="YES8d"/>
</test>
</tests>
<tests name="regex-tests-9">
<test name="regex-test-9a">
Expand Down
4 changes: 2 additions & 2 deletions core/tests/unit/ldml/keyboards/k_008_transform_norm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ https://github.com/unicode-org/cldr/blob/keyboard-preview/docs/ldml/tr35-keyboar

<transform from="8e\m{stampy}\u{0300}\m{lgtm}\u{0320}" to="YES8a" />
<transform from="8e\u{0300}\m{stampy}\m{lgtm}\u{0320}" to="YES8b" />
<transform from="8e\u{0300}\u{0320}\m{stampy}\m{lgtm}" to="FAIL" /> <!-- denormalized, nomatch -->
<transform from="8e\u{0320}\u{0300}\m{stampy}\m{lgtm}" to="YES8d" /> <!-- NFD -->
<transform from="8e\u{0300}\u{0320}\m{stampy}\m{lgtm}" to="YES8c" /> <!-- denormalized, but matches -->
<transform from="8f\u{0320}\u{0300}\m{stampy}\m{lgtm}" to="YES8d" /> <!-- NFD -->

<!-- normalization in output -->
<transform from="9a" to="9e\u{0300}\u{0320}" /> <!-- no marker, denormalized -->
Expand Down
Loading

0 comments on commit f68e9f3

Please sign in to comment.