URL: run IdnaTestV2.txt in WPT

For whatwg/url#341.
web-platform-tests · Jan 20, 2023 · 4708e9a · 4708e9a
1 parent ba32d5c
commit 4708e9a
Show file tree

Hide file tree

Showing 5 changed files with 37,218 additions and 0 deletions.
diff --git a/url/IdnaTestV2.window.js b/url/IdnaTestV2.window.js
@@ -0,0 +1,23 @@
+// Load the converted IdnaTestV2 data, then register one subtest per entry.
+promise_test(async () => {
+  const response = await fetch("resources/IdnaTestV2.json");
+  const idnaTests = await response.json();
+  return runTests(idnaTests);
+}, "Loading data…");
+
+// Registers one synchronous subtest for each usable entry of the parsed JSON
+// data. Entries are either plain strings (comments) or objects carrying
+// `input`, `output` and an optional `comment`.
+function runTests(idnaTests) {
+  for (const entry of idnaTests) {
+    // String entries are comments; an empty input cannot be tested through new URL().
+    const isComment = typeof entry === "string";
+    if (isComment || entry.input === "") {
+      continue;
+    }
+
+    const { input, output, comment } = entry;
+    const suffix = comment ? " " + comment : "";
+
+    test(() => {
+      const address = `https://${input}/x`;
+
+      // A null output means parsing the URL is expected to throw.
+      if (output === null) {
+        assert_throws_js(TypeError, () => new URL(address));
+        return;
+      }
+
+      const url = new URL(address);
+      assert_equals(url.host, output);
+      assert_equals(url.hostname, output);
+      assert_equals(url.pathname, "/x");
+      assert_equals(url.href, `https://${output}/x`);
+    }, `ToASCII("${input}")${suffix}`);
+  }
+}
diff --git a/url/resources/IdnaTestV2-feedback.txt b/url/resources/IdnaTestV2-feedback.txt
@@ -0,0 +1,13 @@
+2023-01. Feedback by Anne van Kesteren on
+
+  https://unicode.org/Public/idna/latest/IdnaTestV2.txt
+  Date: 2022-05-26, 22:30:12 GMT
+
+(I have almost exclusively focused on ToASCII cases.)
+
+* VerifyDnsLength is not P4, but rather A4_1 and A4_2.
+* Tests that use trailing ASCII digit labels are not useful for browsers because such a label triggers the IPv4 parser. This is a problem for a number of the A4_1 and A4_2 tests, and also for a large number of tests later on, such as ToASCII("xn--gl0as212a.8.") or ToASCII("1.27").
+* Test for ToASCII("$") is marked P1 and V6, not U1. This might apply more widely.
+* NV8 is not used as a status.
+* A3 and X3 do not appear to be used as a status. (These are catered for by P4 presumably.)
+* CheckBidi is not V8. V8 does not appear to be used. You'd have to filter out all B1-6 statuses instead.
diff --git a/url/resources/IdnaTestV2-parser.py b/url/resources/IdnaTestV2-parser.py
@@ -0,0 +1,93 @@
+import os
+import json
+import requests
+
+# The source data is downloaded once and reused from the working directory on
+# later runs; delete the local IdnaTestV2.txt to fetch it again.
+if not os.path.exists("IdnaTestV2.txt"):
+    response = requests.get("https://unicode.org/Public/idna/latest/IdnaTestV2.txt")
+    # Fail loudly instead of caching an HTTP error page as test data.
+    response.raise_for_status()
+    # Store the raw bytes so the data is not re-encoded through a guessed
+    # response charset and the platform's default file encoding.
+    with open("IdnaTestV2.txt", "wb") as download:
+        download.write(response.content)
+
+# Read the data back as UTF-8 (the encoding the Unicode data files are
+# published in) and close the file as soon as the lines are loaded.
+with open("IdnaTestV2.txt", "r", encoding="utf-8") as source:
+    test_input = source.readlines()
+
+# The first entry of the output is a plain string, which serves as a comment.
+test_output = ["This resource is a conversion of IdnaTestV2 aimed to match the requirements of the URL Standard's domain to ASCII"]
+
+def remove_escapes(input):
+    """Return *input* with its JSON-style backslash escapes decoded.
+
+    The text is wrapped in double quotes and parsed as a JSON string literal,
+    so anything that is not valid inside a JSON string raises the JSON
+    decoder's error.
+    """
+    quoted = "".join(('"', input, '"'))
+    return json.loads(quoted)
+
+# Every status code seen in the source data, in order of first appearance.
+unique_statuses = []
+
+# Statuses that do not make a test an expected failure. The URL Standard has
+#
+# * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
+# * CheckHyphens=false; thus ignore V2, V3?
+# * VerifyDnsLength=false; thus ignore A4_1 and A4_2
+IGNORED_STATUSES = ["A4_1", "A4_2", "U1", "V2", "V3"]
+
+for test in test_input:
+    # Remove newlines
+    test = test.rstrip()
+
+    # Skip lines that are comments or empty
+    if test.startswith("#") or test == "":
+        continue
+
+    # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source)
+    test = remove_escapes(test)
+
+    # Normalize columns
+    #
+    # Since we are only interested in ToASCII and enforce Transitional_Processing=false we care
+    # about the following columns:
+    #
+    # * Column 1: source
+    # * Column 4: toAsciiN
+    # * Column 5: toAsciiNStatus
+    columns = [column.strip() for column in test.split(";")]
+
+    # Column 1
+    column_source = columns[0]
+
+    # Column 4 (if empty, use Column 2; if empty again, use Column 1)
+    column_to_ascii = columns[3] or columns[1] or column_source
+
+    # Column 5 (if empty, use Column 3; if empty again, assume empty list)
+    raw_status = columns[4] or columns[2]
+    if raw_status == "":
+        column_status = []
+    else:
+        if not raw_status.startswith("["):
+            raise ValueError("Unexpected status column: " + repr(raw_status))
+        # Drop empty items so that an explicit "[]" yields no statuses at all
+        # instead of a single empty status that would mark the test as failing.
+        column_status = [
+            status.strip()
+            for status in raw_status[1:-1].split(",")
+            if status.strip() != ""
+        ]
+
+    for status in column_status:
+        if status not in unique_statuses:
+            unique_statuses.append(status)
+
+    # Separate the statuses that are ignored from the ones that still count.
+    # Ignored ones are listed first in the comment, marked as such.
+    ignored = [status for status in IGNORED_STATUSES if status in column_status]
+    column_status = [status for status in column_status if status not in ignored]
+    comment = "; ".join([status + " (ignored)" for status in ignored] + column_status)
+
+    # Any status that is left means the conversion is expected to fail,
+    # which is recorded as a null output.
+    output = None if column_status else column_to_ascii
+
+    test_output_entry = {"input": column_source, "output": output}
+    if comment != "":
+        test_output_entry["comment"] = comment
+
+    test_output.append(test_output_entry)
+
+# Write the converted tests with sorted keys and fixed separators so that the
+# output is stable between runs; the context manager guarantees the file is
+# flushed and closed. json.dumps escapes non-ASCII characters by default.
+with open("IdnaTestV2.json", "w", encoding="utf-8") as handle:
+    handle.write(json.dumps(test_output, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': ')))
+    handle.write("\n")
+
+# Print every status code that was seen, sorted, as a quick check of which
+# statuses the source data actually uses.
+unique_statuses.sort()
+print(unique_statuses)