From c83c899750e305e75cde5766208e626f728f1b6e Mon Sep 17 00:00:00 2001 From: macchiati Date: Wed, 27 Nov 2024 09:42:59 -0800 Subject: [PATCH 1/6] CLDR-18129 Investigate and fix (where necessary) invalid codes --- common/dtd/ldml.dtd | 2 +- common/dtd/ldmlSupplemental.dtd | 8 +- common/main/en.xml | 8 +- common/main/fi.xml | 2 +- common/main/la.xml | 7 +- common/main/nl.xml | 2 +- common/supplemental/supplementalData.xml | 2 +- common/supplemental/supplementalMetadata.xml | 5 +- .../org/unicode/cldr/util/MatchValue.java | 73 +++++++++++++++++-- .../cldr/util/SupplementalDataInfo.java | 2 +- .../org/unicode/cldr/util/UnitConverter.java | 18 +++-- .../java/org/unicode/cldr/util/Validity.java | 9 ++- .../cldr/unittest/TestAttributeValues.java | 34 +++++---- .../org/unicode/cldr/unittest/TestBasic.java | 42 ++++++++++- .../cldr/unittest/TestSupplementalInfo.java | 45 ++++++++++++ 15 files changed, 213 insertions(+), 46 deletions(-) diff --git a/common/dtd/ldml.dtd b/common/dtd/ldml.dtd index 984e03305a5..fecb752848f 100644 --- a/common/dtd/ldml.dtd +++ b/common/dtd/ldml.dtd @@ -61,7 +61,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + diff --git a/common/dtd/ldmlSupplemental.dtd b/common/dtd/ldmlSupplemental.dtd index 0ce28c9755e..4678e29ea62 100644 --- a/common/dtd/ldmlSupplemental.dtd +++ b/common/dtd/ldmlSupplemental.dtd @@ -702,7 +702,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -962,9 +962,9 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + - + @@ -996,7 +996,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + diff --git a/common/main/en.xml b/common/main/en.xml index 70918a060d0..d1aaf9ed95c 100644 --- a/common/main/en.xml +++ b/common/main/en.xml @@ -152,7 +152,7 @@ annotations. Swampy Cree Church Slavic Chuvash - Woods Cree + Cree Welsh Danish Dakota @@ -256,7 +256,7 @@ annotations. Hakka Chinese Hawaiian Southern Haida - Northern Haida + Haida Hebrew Hindi Hindi (Latin) @@ -284,7 +284,7 @@ annotations. Igbo Sichuan Yi Inupiaq - Eastern Canadian Inuktitut + Inuktitut Western Canadian Inuktitut Iloko Ingush @@ -474,7 +474,7 @@ annotations. Ojibwa Northwestern Ojibwa Central Ojibwa - Eastern Ojibwa + Ojibwa Oji-Cree Western Ojibwa Okanagan diff --git a/common/main/fi.xml b/common/main/fi.xml index c56be83d76c..7bbe4e1dec6 100644 --- a/common/main/fi.xml +++ b/common/main/fi.xml @@ -31,7 +31,7 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/ afrihili aghem ainu - urduni + urduni akan akkadi alabama diff --git a/common/main/la.xml b/common/main/la.xml index ced058291d1..f7e50e9a286 100644 --- a/common/main/la.xml +++ b/common/main/la.xml @@ -24,7 +24,6 @@ CLDR data files are interpreted according to the LDML specification (http://unic Atropatenica Ruthenica Alba Bulgarica - Bihari Bengalica Tibetana Britonica @@ -66,12 +65,12 @@ CLDR data files are interpreted according to the LDML specification (http://unic Interlingua Interlingue Igbonica - Indonesia + Indonesia Islandica Italiana - Hebraica + Hebraica Iaponica - Iudaeogermanica + Iudaeogermanica Iavensis Georgiana Cazachica diff --git a/common/main/nl.xml b/common/main/nl.xml index b73d6893e20..bd1f8674903 100644 --- a/common/main/nl.xml +++ b/common/main/nl.xml @@ -31,7 +31,7 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/ Afrihili Aghem Aino - Zuid-Levantijns-Arabisch + Levantijns-Arabisch Akan Akkadisch Alabama diff --git a/common/supplemental/supplementalData.xml b/common/supplemental/supplementalData.xml index 6364125e27c..c3a04d97663 100644 --- a/common/supplemental/supplementalData.xml +++ b/common/supplemental/supplementalData.xml @@ -4928,7 +4928,7 @@ XXX Code for transations where no currency is involved - + diff --git a/common/supplemental/supplementalMetadata.xml b/common/supplemental/supplementalMetadata.xml index 7b3f94549a4..d8e5052adef 100644 --- a/common/supplemental/supplementalMetadata.xml +++ b/common/supplemental/supplementalMetadata.xml @@ -179,7 +179,7 @@ For terms of use, see http://www.unicode.org/copyright.html - + @@ -306,6 +306,9 @@ For terms of use, see http://www.unicode.org/copyright.html + + + diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java index 973186181c6..ec6ff314080 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java @@ -157,14 +157,22 @@ public static class LocaleMatchValue extends MatchValue { private final Predicate variant; public LocaleMatchValue() { - this(null); + this(Set.of(Status.regular, Status.special, Status.unknown, Status.macroregion)); } public LocaleMatchValue(Set statuses) { - lang = new ValidityMatchValue(LstrType.language, statuses, false); - script = new ValidityMatchValue(LstrType.script, statuses, false); - region = new ValidityMatchValue(LstrType.region, statuses, false); - variant = new ValidityMatchValue(LstrType.variant, statuses, false); + this(statuses, statuses, statuses, statuses); + } + + public LocaleMatchValue( + Set langStatus, + Set scriptStatus, + Set regionStatus, + Set variantStatus) { + lang = new ValidityMatchValue(LstrType.language, langStatus, false); + script = new ValidityMatchValue(LstrType.script, scriptStatus, false); + region = new ValidityMatchValue(LstrType.region, regionStatus, false); + variant = new ValidityMatchValue(LstrType.variant, variantStatus, false); } @Override @@ -174,8 +182,11 @@ public String getName() { @Override public boolean is(String item) { + if (item.equals("root")) { + item = "und"; + } if (!item.contains("_")) { - return lang.is(item); + return checkLang(item); } LanguageTagParser ltp; try { @@ -183,7 +194,7 @@ public boolean is(String item) { } catch (Exception e) { return false; } - return lang.is(ltp.getLanguage()) + return checkLang(ltp.getLanguage()) && (ltp.getScript().isEmpty() || script.is(ltp.getScript())) && (ltp.getRegion().isEmpty() || region.is(ltp.getRegion())) && (ltp.getVariants().isEmpty() || and(variant, ltp.getVariants())) @@ -191,12 +202,54 @@ public boolean is(String item) { && ltp.getLocaleExtensions().isEmpty(); } + public boolean checkLang(String language) { + return lang.is(language); + } + @Override public String getSample() { return "de"; } } + /** + * Check for the language OR certain backwards-compatible exceptions for data to support + * retaining variants, namely plural rules and likelySubtags: "in","iw","ji","jw","mo","tl" + */ + public static class XLocaleMatchValue extends LocaleMatchValue { + static final Set exceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl"); + + @Override + public boolean checkLang(String language) { + return super.checkLang(language) // first check normal + || exceptions.contains(language); + } + + @Override + public String getName() { + return "validity/xlocale"; + } + } + + /** + * Check for the language OR certain backwards-compatible exceptions for language names: "fat", + * "sh", "tl", "tw" + */ + public static class NLocaleMatchValue extends LocaleMatchValue { + static final Set exceptions = Set.of("fat", "sh", "tl", "tw"); + + @Override + public boolean checkLang(String language) { + return super.checkLang(language) // first check normal + || exceptions.contains(language); + } + + @Override + public String getName() { + return "validity/nlocale"; + } + } + // TODO remove these if possible — ticket/10120 static final Set SCRIPT_HACK = ImmutableSet.of( @@ -325,6 +378,12 @@ public static MatchValue of(String typeName) { if (typeName.equals("locale")) { return new LocaleMatchValue(); } + if (typeName.equals("xlocale")) { + return new XLocaleMatchValue(); + } + if (typeName.equals("nlocale")) { + return new NLocaleMatchValue(); + } if (typeName.equals("bcp47-wellformed")) { return new BCP47LocaleWellFormedMatchValue(); } diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java index 2caae47d1b2..1a5ebbbf906 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java @@ -1312,7 +1312,7 @@ private void makeStuffSafe() { if (unitAliases != null) { // don't load unless the information is there (for old releases); unitConverter.addAliases(unitAliases); } - unitConverter.freeze(); + unitConverter.freeze(new File(directory, "../validity").toString()); rationalParser.freeze(); unitPreferences.freeze(); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java index 789350e075d..79313794442 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java @@ -168,6 +168,10 @@ public boolean isFrozen() { @Override public UnitConverter freeze() { + return freeze(CLDRPaths.VALIDITY_DIRECTORY); + } + + public UnitConverter freeze(String validityDirectory) { if (!frozen) { frozen = true; rationalParser.freeze(); @@ -185,7 +189,7 @@ public UnitConverter freeze() { baseUnits = builder.build(); targetInfoComparator = new TargetInfoComparator(); - buildMapComparators(); + buildMapComparators(validityDirectory); // must be after building comparators idToUnitId = ImmutableMap.copyOf(buildIdToUnitId()); @@ -194,14 +198,19 @@ public UnitConverter freeze() { } public void buildMapComparators() { + buildMapComparators(CLDRPaths.VALIDITY_DIRECTORY); + } + + public void buildMapComparators(String validityDirectory) { Set> all = new TreeSet<>(); + final Validity validity = Validity.getInstance(validityDirectory); Set baseSeen = new HashSet<>(); + if (DEBUG) { UnitParser up = new UnitParser(componentTypeData); Output uict = new Output<>(); - for (String longUnit : - Validity.getInstance().getStatusToCodes(LstrType.unit).get(Status.regular)) { + for (String longUnit : validity.getStatusToCodes(LstrType.unit).get(Status.regular)) { String shortUnit = getShortId(longUnit); up.set(shortUnit); List items = new ArrayList<>(); @@ -219,8 +228,7 @@ public void buildMapComparators() { System.out.println(shortUnit + "\t" + Joiner.on('\t').join(items)); } } - for (String longUnit : - Validity.getInstance().getStatusToCodes(LstrType.unit).get(Status.regular)) { + for (String longUnit : validity.getStatusToCodes(LstrType.unit).get(Status.regular)) { Output base = new Output<>(); String shortUnit = getShortId(longUnit); ConversionInfo conversionInfo = parseUnitId(shortUnit, base, false); diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/Validity.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/Validity.java index 358fc099df8..ca56301fc01 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/Validity.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/Validity.java @@ -2,7 +2,9 @@ import com.google.common.base.Splitter; import com.google.common.collect.ImmutableSet; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.EnumMap; import java.util.LinkedHashMap; @@ -38,6 +40,11 @@ public static Validity getInstance() { } public static Validity getInstance(String validityDirectory) { + try { + validityDirectory = new File(validityDirectory).getCanonicalFile().toString(); + } catch (IOException e) { + throw new ICUUncheckedIOException(e); + } Validity result = cache.get(validityDirectory); if (result == null) { final Validity value = new Validity(validityDirectory); @@ -79,7 +86,7 @@ private Validity(String validityDirectory) { codeToStatus.put(type, subCodeToStatus = new LinkedHashMap<>()); } - XMLFileReader.loadPathValues(basePath + file, lineData, true); + XMLFileReader.loadPathValues(new File(basePath, file).toString(), lineData, true); for (Pair item : lineData) { XPathValue parts = SimpleXPathParts.getFrozenInstance(item.getFirst()); if (!"id".equals(parts.getElement(-1))) { diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAttributeValues.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAttributeValues.java index a288cafb6b5..e2bf827b68c 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAttributeValues.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestAttributeValues.java @@ -92,7 +92,7 @@ public void TestValid() { addXMLFiles(dtdType, mainDirs + stringDir, files); if (isVerbose()) synchronized (pathChecker.testLog) { - warnln(mainDirs + stringDir); + logln(mainDirs + stringDir); } } Stream stream = SERIAL ? files.stream() : files.parallelStream(); @@ -102,7 +102,7 @@ public void TestValid() { // checkFile(pathChecker, file); // } } - pathChecker.show(isVerbose(), showStatuses); + pathChecker.show(dtdType, isVerbose(), showStatuses); } // List localesToTest = Arrays.asList("en", "root"); // , "zh", "hi", "ja", // "ru", "cy" @@ -145,9 +145,9 @@ private void addXMLFiles(DtdType dtdType, String path, Set files) { } else { for (String file : dirFile.list()) { String localeID = file.replace(".xml", ""); - if (StandardCodes.isLocaleAtLeastBasic(localeID)) { - addXMLFiles(dtdType, path + "/" + file, files); - } + // if (StandardCodes.isLocaleAtLeastBasic(localeID)) { + addXMLFiles(dtdType, path + "/" + file, files); + // } } } } @@ -186,7 +186,8 @@ private void checkFile(PathChecker pathChecker, String fullFile) { ++_attributeCount; String attribute = r.getAttributeLocalName(i); String attributeValue = r.getAttributeValue(i); - pathChecker.checkAttribute(element, attribute, attributeValue); + pathChecker.checkAttribute( + fullFile, element, attribute, attributeValue); } break; } @@ -237,7 +238,7 @@ public PathChecker(TestFmwk testLog, DtdData dtdData) { matchValues = ImmutableMap.copyOf(_matchValues); } - private void checkPath(String path) { + private void checkPath(String fullFile, String path) { if (seen.contains(path)) { return; } @@ -251,19 +252,20 @@ private void checkPath(String path) { for (Entry entry : parts.getAttributes(elementIndex).entrySet()) { String attribute = entry.getKey(); String attrValue = entry.getValue(); - checkAttribute(element, attribute, attrValue); + checkAttribute(fullFile, element, attribute, attrValue); } } } - public void checkElement(String element, Attributes atts) { + public void checkElement(String fullFile, String element, Attributes atts) { int length = atts.getLength(); for (int i = 0; i < length; ++i) { - checkAttribute(element, atts.getQName(i), atts.getValue(i)); + checkAttribute(fullFile, element, atts.getQName(i), atts.getValue(i)); } } - private void checkAttribute(String element, String attribute, String attrValue) { + private void checkAttribute( + String fullFile, String element, String attribute, String attrValue) { // skip cases we know we don't need to test if (!needsTesting.containsEntry(element, attribute)) { return; @@ -296,16 +298,18 @@ private void checkAttribute(String element, String attribute, String attrValue) // Set breakpoint here for debugging (referenced from // http://cldr.unicode.org/development/testattributevalues) dtdData.getValueStatus(element, attribute, attrValue); + testLog.warnln( + Joiner.on('\t').join("Invalid", fullFile, element, attribute, attrValue)); } synchronized (valueStatusInfo) { valueStatusInfo.put(valueStatus, element, attribute, attrValue, Boolean.TRUE); } } - void show(boolean verbose, ImmutableSet retain) { + void show(DtdType dtdType, boolean verbose, ImmutableSet retain) { if (dtdData.dtdType == DtdType.keyboard3 && testLog.logKnownIssue("CLDR-14974", "skipping for keyboard")) { - testLog.warnln("Skipping for keyboard3"); + testLog.warnln("keyboard3 is missing validity checks"); } boolean haveProblems = false; for (ValueStatus valueStatus : ValueStatus.values()) { @@ -323,7 +327,9 @@ void show(boolean verbose, ImmutableSet retain) { } StringBuilder out = new StringBuilder(); out.append( - "\nIf the test fails, look at https://cldr.unicode.org/development/cldr-development-site/testattributevalues\n"); + "For " + + dtdType.directories + + "\nIf the test fails, use -v for details. Also look at https://cldr.unicode.org/development/updating-codes/testattributevalues for guidance.\n"); out.append("file\tCount:\t" + dtdData.dtdType + "\t" + fileCount + "\n"); out.append("element\tCount:\t" + dtdData.dtdType + "\t" + elementCount + "\n"); diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java index c21c6c5689c..1926c81e346 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java @@ -5,6 +5,8 @@ import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; +import com.google.common.collect.Sets.SetView; import com.google.common.collect.TreeMultimap; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row; @@ -41,6 +43,7 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; +import java.util.stream.Collectors; import org.unicode.cldr.test.DisplayAndInputProcessor; import org.unicode.cldr.tool.CldrVersion; import org.unicode.cldr.tool.LikelySubtags; @@ -195,7 +198,7 @@ private void checkDtds( } else if (fileName.getPath().contains("/keyboards/3.0/") && logKnownIssue( "CLDR-17574", "With v46, parsing issues for keyboard xml files")) { - ; // do nothing, skip test + // do nothing, skip test } else if (name.endsWith(".xml")) { data.add(check(fileName)); if (deepCheck // takes too long to do all the time @@ -1652,5 +1655,42 @@ public void sortPaths(Comparator dc, Collection paths) { public void sortPaths(Comparator dc, String... array) { Arrays.sort(array, 0, array.length, dc); } + // public void TestNewDtdData() moved to TestDtdData + + public void testBcp47Ids() { + if (!TestCLDRPaths.canUseArchiveDirectory()) { + return; + } + final File ARCHIVE = new File(CLDRPaths.ARCHIVE_DIRECTORY); + Set> seen = new LinkedHashSet<>(); + TreeSet sorted = new TreeSet<>(Collections.reverseOrder()); + sorted.addAll(Arrays.asList(ARCHIVE.listFiles())); + Set> newKeys = pairs(SUPPLEMENTAL_DATA_INFO.getBcp47Keys()); + + for (File file : sorted) { + if (!file.getName().startsWith("cldr-")) { + continue; + } + System.out.println(file); + File supplementalDir = new File(file, "common/supplemental"); + SupplementalDataInfo otherSupplementalData = + SupplementalDataInfo.getInstance(supplementalDir); + Set> oldKeys = pairs(otherSupplementalData.getBcp47Keys()); + if (!newKeys.containsAll(oldKeys)) { + SetView> oldButNotNew = Sets.difference(oldKeys, newKeys); + SetView> oldButNotNewMinusSeen = + Sets.difference(oldButNotNew, seen); + if (!assertEquals(file.toString(), Collections.emptySet(), oldButNotNewMinusSeen)) { + seen.addAll(oldButNotNewMinusSeen); + } + } + } + } + + private Set> pairs(Relation bcp47Keys) { + return bcp47Keys.entrySet().stream() + .map(x -> Pair.of(x.getKey(), x.getValue())) + .collect(Collectors.toCollection(TreeSet::new)); + } } diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java index fcf753bbccb..87ab3b0780f 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestSupplementalInfo.java @@ -78,6 +78,7 @@ import org.unicode.cldr.util.PreferredAndAllowedHour.HourStyle; import org.unicode.cldr.util.StandardCodes; import org.unicode.cldr.util.StandardCodes.CodeType; +import org.unicode.cldr.util.StandardCodes.LstrField; import org.unicode.cldr.util.StandardCodes.LstrType; import org.unicode.cldr.util.SupplementalDataInfo; import org.unicode.cldr.util.SupplementalDataInfo.BasicLanguageData; @@ -2176,4 +2177,48 @@ public void TestGrammarInfo() { } } } + + public void testPredominantEncompassed() { + // maybe check with lstreg instead? They should be in sync. + Map>> lstreg = StandardCodes.getEnumLstreg(); + + SupplementalDataInfo supp = SupplementalDataInfo.getInstance(); + // Returns type -> tag -> , like "language" -> "sh" -> <{"sr_Latn"}, reason> + Map, String>>> locAliases = supp.getLocaleAliasInfo(); + Map, String>> langAliases = locAliases.get("language"); + Set skip = Set.of("no", "sh"); + + Iso639Data.getNames("a"); // init (need to fix) + + Set macros = Iso639Data.getMacros(); + main: + for (String macro : macros) { + if (skip.contains(macro)) { + continue; + } + Set encompasseds = Iso639Data.getEncompassedForMacro(macro); + final List encompassedNames = + encompasseds.stream().map(x -> codeAndName(x)).collect(Collectors.toList()); + for (String encompassed : encompasseds) { + R2, String> data = langAliases.get(encompassed); + if (data != null) { + if (data.get0().contains(macro)) { + logln( + codeAndName(macro) + + "has predominant " + + codeAndName(encompassed) + + " in encompassed: " + + encompassedNames); + continue main; + } + } + } + errln("ERROR " + codeAndName(macro) + " missing predominent from " + encompassedNames); + } + } + + private String codeAndName(String macro) { + // TODO Auto-generated method stub + return CLDRConfig.getInstance().getEnglish().getName(macro) + " (" + macro + ")"; + } } From e4c3588ad298321cbdbe55e951874cf24b83c6fe Mon Sep 17 00:00:00 2001 From: macchiati Date: Thu, 28 Nov 2024 06:13:21 -0800 Subject: [PATCH 2/6] =?UTF-8?q?CLDR-18129=20Clean=20up=20validity=20tests,?= =?UTF-8?q?=20adding=20new=20@MATCH=20option=20validity/=E2=80=A6/all=20to?= =?UTF-8?q?=20get=20all=20Status=20values=20that=20are=20available=20for?= =?UTF-8?q?=20that=20LstrType?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- common/dtd/ldml.dtd | 2 +- common/dtd/ldmlSupplemental.dtd | 18 ++--- common/main/la.xml | 3 - .../org/unicode/cldr/util/MatchValue.java | 75 ++++++++++++++++--- 4 files changed, 73 insertions(+), 25 deletions(-) diff --git a/common/dtd/ldml.dtd b/common/dtd/ldml.dtd index fecb752848f..247f6595333 100644 --- a/common/dtd/ldml.dtd +++ b/common/dtd/ldml.dtd @@ -95,7 +95,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + diff --git a/common/dtd/ldmlSupplemental.dtd b/common/dtd/ldmlSupplemental.dtd index 4678e29ea62..8de3ae612c7 100644 --- a/common/dtd/ldmlSupplemental.dtd +++ b/common/dtd/ldmlSupplemental.dtd @@ -65,7 +65,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -113,7 +113,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -284,7 +284,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -297,7 +297,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -711,7 +711,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -720,9 +720,9 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + - + @@ -738,7 +738,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + @@ -914,7 +914,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + diff --git a/common/main/la.xml b/common/main/la.xml index f7e50e9a286..40a5e6ff4f8 100644 --- a/common/main/la.xml +++ b/common/main/la.xml @@ -212,7 +212,6 @@ CLDR data files are interpreted according to the LDML specification (http://unic Brasilia Insulae Bahamenses Butania - Birmania Insula Bouvet Botswana Ruthenia Alba @@ -236,7 +235,6 @@ CLDR data files are interpreted according to the LDML specification (http://unic Insula Christi Natalis Cyprus Cechia - Res publica Democratica Germanica Germania Gibutum Dania @@ -420,7 +418,6 @@ CLDR data files are interpreted according to the LDML specification (http://unic Kosovia Iemenia Maiotta - Iugoslavia Africa Australis Zambia Zimbabua diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java index ec6ff314080..46d31756ee2 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java @@ -4,7 +4,10 @@ import com.google.common.base.Splitter; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Multimap; +import com.google.common.collect.TreeMultimap; import com.ibm.icu.impl.Relation; import com.ibm.icu.impl.Row; import com.ibm.icu.impl.Row.R2; @@ -17,6 +20,7 @@ import com.vdurmont.semver4j.Semver.SemverType; import com.vdurmont.semver4j.SemverException; import java.text.ParseException; +import java.util.Collections; import java.util.Date; import java.util.EnumSet; import java.util.HashMap; @@ -157,7 +161,7 @@ public static class LocaleMatchValue extends MatchValue { private final Predicate variant; public LocaleMatchValue() { - this(Set.of(Status.regular, Status.special, Status.unknown, Status.macroregion)); + this(null, null, null, null); // use default status } public LocaleMatchValue(Set statuses) { @@ -306,6 +310,9 @@ public static EnumParser of(Class aClass) { } public Set parse(String text) { + if (text == null) { + return null; + } Set statuses = EnumSet.noneOf(aClass); boolean negative = text.startsWith("!"); if (negative) { @@ -346,18 +353,59 @@ public boolean isAll(Set statuses) { } public static class ValidityMatchValue extends MatchValue { + private static final Validity VALIDITY = Validity.getInstance(); + public static final Multimap DEFAULT_STATUS; + + static { + Multimap DEFAULT_STATUS_ = TreeMultimap.create(); + for (LstrType lstrType : LstrType.values()) { + switch (lstrType) { + case region: + DEFAULT_STATUS_.putAll( + lstrType, + Set.of( + Status.regular, + Status.unknown, + Status.macroregion, + Status.special)); + break; + case language: + case script: + DEFAULT_STATUS_.putAll( + lstrType, Set.of(Status.regular, Status.unknown, Status.special)); + break; + case subdivision: + case currency: + DEFAULT_STATUS_.putAll( + lstrType, + Set.of(Status.regular, Status.unknown, Status.deprecated)); + break; + default: + DEFAULT_STATUS_.putAll(lstrType, Set.of(Status.regular, Status.unknown)); + break; + } + } + DEFAULT_STATUS = ImmutableMultimap.copyOf(DEFAULT_STATUS_); + } + + private static Map shortCodeToStatus; + private static final EnumParser validityStatusParser = EnumParser.of(Status.class); + private final LstrType type; private final boolean shortId; private final Set statuses; - private static Map shortCodeToStatus; - private static final EnumParser enumParser = EnumParser.of(Status.class); @Override public String getName() { + Collections a; return "validity/" + (shortId ? "short-" : "") + type.toString() - + (enumParser.isAll(statuses) ? "" : "/" + enumParser.format(statuses)); + + (statuses.equals(Set.copyOf(DEFAULT_STATUS.get(type))) + ? "" + : statuses.equals(VALIDITY.getStatusToCodes(type).keySet()) + ? "/all" + : "/" + validityStatusParser.format(statuses)); } private ValidityMatchValue(LstrType type) { @@ -370,8 +418,9 @@ private ValidityMatchValue(LstrType type, Set statuses, boolean shortId) throw new IllegalArgumentException("short- not supported except for units"); } this.shortId = shortId; + // validForType = Validity.getInstance().getStatusToCodes(type).keySet(); this.statuses = - statuses == null ? EnumSet.allOf(Status.class) : ImmutableSet.copyOf(statuses); + ImmutableSet.copyOf(statuses == null ? DEFAULT_STATUS.get(type) : statuses); } public static MatchValue of(String typeName) { @@ -387,10 +436,10 @@ public static MatchValue of(String typeName) { if (typeName.equals("bcp47-wellformed")) { return new BCP47LocaleWellFormedMatchValue(); } + String statusPart = null; int slashPos = typeName.indexOf('/'); - Set statuses = null; if (slashPos > 0) { - statuses = enumParser.parse(typeName.substring(slashPos + 1)); + statusPart = typeName.substring(slashPos + 1); typeName = typeName.substring(0, slashPos); } boolean shortId = typeName.startsWith("short-"); @@ -398,6 +447,10 @@ public static MatchValue of(String typeName) { typeName = typeName.substring(6); } LstrType type = LstrType.fromString(typeName); + Set statuses = + "all".equals(statusPart) + ? VALIDITY.getStatusToCodes(type).keySet() + : validityStatusParser.parse(statusPart); return new ValidityMatchValue(type, statuses, shortId); } @@ -425,9 +478,7 @@ public boolean is(String item) { == null) { // lazy evaluation to avoid circular dependencies Map _shortCodeToStatus = new TreeMap<>(); for (Entry entry : - Validity.getInstance() - .getCodeToStatus(LstrType.unit) - .entrySet()) { + VALIDITY.getCodeToStatus(LstrType.unit).entrySet()) { String key = entry.getKey(); Status status = entry.getValue(); final String shortKey = key.substring(key.indexOf('-') + 1); @@ -448,13 +499,13 @@ public boolean is(String item) { default: break; } - final Status status = Validity.getInstance().getCodeToStatus(type).get(item); + final Status status = VALIDITY.getCodeToStatus(type).get(item); return status != null && statuses.contains(status); } @Override public String getSample() { - return Validity.getInstance().getCodeToStatus(type).keySet().iterator().next(); + return VALIDITY.getCodeToStatus(type).keySet().iterator().next(); } } From 8f6da1db6941e5099d34ed0d0d360e37fa95cfbd Mon Sep 17 00:00:00 2001 From: macchiati Date: Thu, 28 Nov 2024 12:22:45 -0800 Subject: [PATCH 3/6] CLDR-18129 Hack around SupplementalDataInfo.getInstance() on old versions (keyboard, etc) --- .../cldr/util/SupplementalDataInfo.java | 9 +++++-- .../org/unicode/cldr/util/UnitConverter.java | 19 +++++++++---- .../org/unicode/cldr/unittest/TestBasic.java | 27 ++++++++++++++----- 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java index 1a5ebbbf906..a298de5c959 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/SupplementalDataInfo.java @@ -2166,8 +2166,13 @@ private boolean handleMetadata(String level2, String value, XPathValue parts) { } return true; } else if (level3.equals("attributeValues")) { - AttributeValidityInfo.add( - parts.getAttributes(-1), value, attributeValidityInfo); + // the keyboard directory disappeared in new versions. + // supplementalData/metadata/validity/attributeValues[@dtds="keyboard"][@elements="keyMap"][@attributes="modifiers"][@type="TODO"] + final String dtdsValue = parts.getAttributeValue(-1, "dtds"); + if (!"keyboard".equals(dtdsValue) && !"platform".equals(dtdsValue)) { + AttributeValidityInfo.add( + parts.getAttributes(-1), value, attributeValidityInfo); + } return true; } } else if (level2.equals("serialElements")) { diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java index 79313794442..482dff86b1c 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/UnitConverter.java @@ -240,18 +240,23 @@ public void buildMapComparators(String validityDirectory) { conversionInfo = parseUnitId("kelvin", base, false); } } - String quantity; + String quantity = null; Integer quantityNumericOrder = null; try { quantity = getQuantityFromUnit(base.value, false); + if (quantity == null && "beaufort".equals(shortUnit)) { + quantity = "speed"; + } quantityNumericOrder = quantityComparator.getNumericOrder(quantity); } catch (Exception e) { System.out.println( - "Failed " + "Failed to build unit comparator for " + shortUnit + ", " + base + ", " + + quantity + + ", " + quantityNumericOrder + ", " + e); @@ -292,7 +297,11 @@ public void buildMapComparators(String validityDirectory) { "Add new unitSystem to a grouping: " + sortingSystem); } R4 sortKey = - Row.of(quantityNumericOrder, sortingSystem, conversionInfo.factor, shortUnit); + Row.of( + quantityNumericOrder, + sortingSystem, + conversionInfo == null ? Rational.INFINITY : conversionInfo.factor, + shortUnit); all.add(sortKey); } LongUnitIdOrder.setErrorOnMissing(true); @@ -1890,16 +1899,16 @@ public BiMap getBaseUnitToQuantity() { return (BiMap) baseUnitToQuantity; } + /** Returns null if unit can't be parsed */ public String getQuantityFromUnit(String unit, boolean showYourWork) { Output metricUnit = new Output<>(); unit = fixDenormalized(unit); try { ConversionInfo unitInfo = parseUnitId(unit, metricUnit, showYourWork); - return metricUnit.value == null ? null : getQuantityFromBaseUnit(metricUnit.value); } catch (Exception e) { - System.out.println("Failed with " + unit + ", " + metricUnit + "\t" + e); return null; } + return metricUnit.value == null ? null : getQuantityFromBaseUnit(metricUnit.value); } public String getQuantityFromBaseUnit(String baseUnit) { diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java index 1926c81e346..9a3f3773249 100644 --- a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java +++ b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestBasic.java @@ -1664,18 +1664,33 @@ public void testBcp47Ids() { } final File ARCHIVE = new File(CLDRPaths.ARCHIVE_DIRECTORY); Set> seen = new LinkedHashSet<>(); - TreeSet sorted = new TreeSet<>(Collections.reverseOrder()); - sorted.addAll(Arrays.asList(ARCHIVE.listFiles())); + + // get the archive directories in reverse order (latest first) + + TreeSet sortedArchiveDirectories = new TreeSet<>(Collections.reverseOrder()); + sortedArchiveDirectories.addAll(Arrays.asList(ARCHIVE.listFiles())); + + // get the BCP 47 keys to test against + Set> newKeys = pairs(SUPPLEMENTAL_DATA_INFO.getBcp47Keys()); - for (File file : sorted) { + for (File file : sortedArchiveDirectories) { if (!file.getName().startsWith("cldr-")) { continue; } - System.out.println(file); + if (file.getName().compareTo("cldr-44.0") < 0) { + break; + } + logln(file.toString()); File supplementalDir = new File(file, "common/supplemental"); - SupplementalDataInfo otherSupplementalData = - SupplementalDataInfo.getInstance(supplementalDir); + SupplementalDataInfo otherSupplementalData; + try { + otherSupplementalData = SupplementalDataInfo.getInstance(supplementalDir); + } catch (RuntimeException e) { + errln("Can't create SupplementalDataInfo for " + supplementalDir); + throw e; + // continue; + } Set> oldKeys = pairs(otherSupplementalData.getBcp47Keys()); if (!newKeys.containsAll(oldKeys)) { SetView> oldButNotNew = Sets.difference(oldKeys, newKeys); From a1e078edd57c8d3b6a1b74b027190c4831770caa Mon Sep 17 00:00:00 2001 From: macchiati Date: Thu, 28 Nov 2024 14:02:38 -0800 Subject: [PATCH 4/6] CLDR-18129 Fix outliers --- common/main/en.xml | 4 ---- common/main/nl.xml | 1 - common/validity/language.xml | 4 ++-- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/common/main/en.xml b/common/main/en.xml index d1aaf9ed95c..31cbb564d3e 100644 --- a/common/main/en.xml +++ b/common/main/en.xml @@ -152,7 +152,6 @@ annotations. Swampy Cree Church Slavic Chuvash - Cree Welsh Danish Dakota @@ -256,7 +255,6 @@ annotations. Hakka Chinese Hawaiian Southern Haida - Haida Hebrew Hindi Hindi (Latin) @@ -284,7 +282,6 @@ annotations. Igbo Sichuan Yi Inupiaq - Inuktitut Western Canadian Inuktitut Iloko Ingush @@ -474,7 +471,6 @@ annotations. Ojibwa Northwestern Ojibwa Central Ojibwa - Ojibwa Oji-Cree Western Ojibwa Okanagan diff --git a/common/main/nl.xml b/common/main/nl.xml index bd1f8674903..a8e9fa1d1ba 100644 --- a/common/main/nl.xml +++ b/common/main/nl.xml @@ -31,7 +31,6 @@ Warnings: All cp values have U+FE0F characters removed. See /annotationsDerived/ Afrihili Aghem Aino - Levantijns-Arabisch Akan Akkadisch Alabama diff --git a/common/validity/language.xml b/common/validity/language.xml index bfe60a517f1..912dac91d86 100644 --- a/common/validity/language.xml +++ b/common/validity/language.xml @@ -76,7 +76,7 @@ cia~e cih cik cim~n cip cir ciw ciy cja cje cjh~i cjk cjm~p cjs cjv cjy ckb ckh ckl~o ckq~v ckx~z - cla clc cle clh~m clo cls~u clw cly + cla clc cle clh~m clo clt~u clw cly cma cmc cme cmg cmi cml~m cmo cmr~t cna~c cng~i cnk~l cno~q cns~u cnw~x co coa~h coj~q cot~x coz @@ -628,7 +628,7 @@ aam adp agp ais ajp ajt~u als aoh arb asd aue ayr ayx~y azj baz bbz bcc bcl bgm bh bhk bic bij bjd bjq bkb blg bmy bpb btb btl bxk bxr bxx byy - cbe cbh cca ccq cdg cjr cka cld cmk cmn cnr coy cqu cug cum cwd + cbe cbh cca ccq cdg cjr cka cld cls cmk cmn cnr coy cqu cug cum cwd daf dap dgo dgu dha dhd dik diq dit djl dkl drh drr drw dud duj dwl ekc ekk elp emk emo esk fat fuc From 573fb1d3ddc4519ad06760b05e711d8193dd7441 Mon Sep 17 00:00:00 2001 From: macchiati Date: Fri, 29 Nov 2024 05:31:14 -0800 Subject: [PATCH 5/6] CLDR-18129 Delete rna.xml in exemplars directory (the language code is deprecated, with no replacement) --- exemplars/main/rna.xml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 exemplars/main/rna.xml diff --git a/exemplars/main/rna.xml b/exemplars/main/rna.xml deleted file mode 100644 index de04a86b277..00000000000 --- a/exemplars/main/rna.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - - - - - left-to-right - top-to-bottom - - - - [a b c d e f g h i j k m n o p r s t u v w y z] - [l q x] - - From fca637b853c35d30c858d9c400e80f10d800e39e Mon Sep 17 00:00:00 2001 From: macchiati Date: Fri, 29 Nov 2024 14:05:22 -0800 Subject: [PATCH 6/6] CLDR-18129 Give /[xn]locale/ better names --- common/dtd/ldml.dtd | 2 +- common/dtd/ldmlSupplemental.dtd | 6 +++--- common/main/en.xml | 1 + .../main/java/org/unicode/cldr/util/MatchValue.java | 13 +++++++------ 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/common/dtd/ldml.dtd b/common/dtd/ldml.dtd index 247f6595333..e02fff900ca 100644 --- a/common/dtd/ldml.dtd +++ b/common/dtd/ldml.dtd @@ -61,7 +61,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + diff --git a/common/dtd/ldmlSupplemental.dtd b/common/dtd/ldmlSupplemental.dtd index 8de3ae612c7..237fb957063 100644 --- a/common/dtd/ldmlSupplemental.dtd +++ b/common/dtd/ldmlSupplemental.dtd @@ -962,9 +962,9 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + - + @@ -996,7 +996,7 @@ CLDR data files are interpreted according to the LDML specification (http://unic - + diff --git a/common/main/en.xml b/common/main/en.xml index 31cbb564d3e..29542a24576 100644 --- a/common/main/en.xml +++ b/common/main/en.xml @@ -139,6 +139,7 @@ annotations. Coptic Capiznon Cree + Woods Cree Michif Crimean Tatar Southern East Cree diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java b/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java index 46d31756ee2..c0c9f17357e 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/util/MatchValue.java @@ -108,8 +108,9 @@ public static MatchValue of(String command) { throw new IllegalArgumentException( "Illegal/Unimplemented match type: " + originalArg); } + // check for errors in the MatchValue functions if (!originalArg.equals(result.getName())) { - System.err.println( + throw new IllegalArgumentException( "Non-standard form or error: " + originalArg + " ==> " + result.getName()); } return result; @@ -218,7 +219,7 @@ public String getSample() { /** * Check for the language OR certain backwards-compatible exceptions for data to support - * retaining variants, namely plural rules and likelySubtags: "in","iw","ji","jw","mo","tl" + * retaining variants, namely likelySubtags: "in","iw","ji","jw","mo","tl" */ public static class XLocaleMatchValue extends LocaleMatchValue { static final Set exceptions = Set.of("in", "iw", "ji", "jw", "mo", "tl"); @@ -231,7 +232,7 @@ public boolean checkLang(String language) { @Override public String getName() { - return "validity/xlocale"; + return "validity/locale-for-likely"; } } @@ -250,7 +251,7 @@ public boolean checkLang(String language) { @Override public String getName() { - return "validity/nlocale"; + return "validity/locale-for-names"; } } @@ -427,10 +428,10 @@ public static MatchValue of(String typeName) { if (typeName.equals("locale")) { return new LocaleMatchValue(); } - if (typeName.equals("xlocale")) { + if (typeName.equals("locale-for-likely")) { return new XLocaleMatchValue(); } - if (typeName.equals("nlocale")) { + if (typeName.equals("locale-for-names")) { return new NLocaleMatchValue(); } if (typeName.equals("bcp47-wellformed")) {