Skip to content

Commit

Permalink
URL: run IdnaTestV2.txt in WPT
Browse files Browse the repository at this point in the history
  • Loading branch information
annevk committed Jan 20, 2023
1 parent ba32d5c commit 4708e9a
Show file tree
Hide file tree
Showing 5 changed files with 37,218 additions and 0 deletions.
23 changes: 23 additions & 0 deletions url/IdnaTestV2.window.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Load the JSON conversion of IdnaTestV2.txt, then hand the parsed entries to runTests().
promise_test(async () => {
  const response = await fetch("resources/IdnaTestV2.json");
  const idnaTests = await response.json();
  return runTests(idnaTests);
}, "Loading data…");

/**
 * Registers one subtest per entry of the parsed IdnaTestV2 JSON data.
 *
 * Entries that are plain strings are comments and are skipped, as are entries
 * whose input is the empty string. Every other entry's input is used as the
 * host of an https: URL: a null expected output means constructing the URL
 * must throw a TypeError; otherwise host, hostname, pathname and href are
 * compared against the expected output.
 */
function runTests(idnaTests) {
  for (const entry of idnaTests) {
    const isComment = typeof entry === "string";
    if (isComment || entry.input === "") {
      // Comments carry no test data, and an empty string input cannot be
      // tested through new URL().
      continue;
    }

    const { input, output, comment } = entry;
    const suffix = comment ? " " + comment : "";
    const name = `ToASCII("${input}")${suffix}`;
    const parse = () => new URL(`https://${input}/x`);

    test(() => {
      if (output === null) {
        assert_throws_js(TypeError, parse);
        return;
      }
      const url = parse();
      assert_equals(url.host, output);
      assert_equals(url.hostname, output);
      assert_equals(url.pathname, "/x");
      assert_equals(url.href, `https://${output}/x`);
    }, name);
  }
}
13 changes: 13 additions & 0 deletions url/resources/IdnaTestV2-feedback.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
2023-01. Feedback by Anne van Kesteren on

https://unicode.org/Public/idna/latest/IdnaTestV2.txt
Date: 2022-05-26, 22:30:12 GMT

(I have almost exclusively focused on ToASCII cases.)

* VerifyDnsLength is not P4, but rather A4_1 and A4_2.
* Tests that use trailing ASCII digit labels are not useful for browsers as that will trigger the IPv4 parser. This is a problem for a number of the A4_1 and A4_2 tests, and also for a large number of tests later on, such as ToASCII("xn--gl0as212a.8.") or ToASCII("1.27").
* Test for ToASCII("$") is marked P1 and V6, not U1. This might apply more widely.
* NV8 is not used as a status.
* A3 and X3 do not appear to be used as a status. (These are catered for by P4 presumably.)
* CheckBidi is not V8. V8 does not appear to be used. You'd have to filter out all B1-6 statuses instead.
93 changes: 93 additions & 0 deletions url/resources/IdnaTestV2-parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import os
import json
import requests

if not os.path.exists("IdnaTestV2.txt"):
    # Download IdnaTestV2.txt if it doesn't exist yet
    response = requests.get("https://unicode.org/Public/idna/latest/IdnaTestV2.txt")
    # Fail loudly on an HTTP error: otherwise the error page would be cached as
    # IdnaTestV2.txt and silently reused on every later run.
    response.raise_for_status()
    # Store the bytes exactly as served so that no platform-dependent
    # re-encoding happens between download and parsing.
    with open("IdnaTestV2.txt", "wb") as handle:
        handle.write(response.content)

# The data contains literal non-ASCII characters, so decode with an explicit
# encoding instead of the locale default (assumes the file is UTF-8, as
# Unicode data files are).
with open("IdnaTestV2.txt", "r", encoding="utf-8") as handle:
    test_input = handle.readlines()

# The first entry of the output is a plain string that describes the resource.
test_output = ["This resource is a conversion of IdnaTestV2 aimed to match the requirements of the URL Standard's domain to ASCII"]

def remove_escapes(input):
    """Return *input* with its JSON-style backslash escapes decoded.

    The text is wrapped in double quotes and parsed as a JSON string literal,
    so escapes such as ``\\uXXXX`` become the characters they denote.
    """
    quoted = "".join(('"', input, '"'))
    return json.loads(quoted)

# The URL Standard has
#
# * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
# * CheckHyphens=false; thus ignore V2, V3?
# * VerifyDnsLength=false; thus ignore A4_1 and A4_2
#
# Statuses listed here are reported in the comment but do not turn a test into a failure.
IGNORED_STATUSES = ["A4_1", "A4_2", "U1", "V2", "V3"]


def _parse_status_field(field):
    """Return the status codes of one status column as a list of strings.

    An empty column yields an empty list. Otherwise the column must be a
    bracketed, comma-separated list such as "[B1, V6]". An explicit "[]"
    yields an empty list rather than a list holding one empty status, which
    would otherwise be counted as an error status.

    Raises ValueError if a non-empty column is not wrapped in brackets.
    """
    if field == "":
        return []
    if not (field.startswith("[") and field.endswith("]")):
        raise ValueError("Unexpected status column: " + repr(field))
    stripped = (status.strip() for status in field[1:-1].split(","))
    return [status for status in stripped if status != ""]


# Every status code encountered in the source, printed at the end as a diagnostic.
seen_statuses = set()

for test in test_input:
    # Remove newlines
    test = test.rstrip()

    # Skip lines that are comments or empty
    if test.startswith("#") or test == "":
        continue

    # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source)
    test = remove_escapes(test)

    # Normalize columns
    #
    # Since we are only interested in ToASCII and enforce Transitional_Processing=false we care
    # about the following columns:
    #
    # * Column 1: source
    # * Column 4: toAsciiN
    # * Column 5: toAsciiNStatus
    columns = [column.strip() for column in test.split(";")]

    # Column 1
    column_source = columns[0]

    # Column 4 (if empty, use Column 2; if empty again, use Column 1)
    column_to_ascii = columns[3] or columns[1] or column_source

    # Column 5 (if empty, use Column 3; if empty again, assume empty list)
    column_status = _parse_status_field(columns[4] or columns[2])

    # Record statuses before any are filtered out, so the diagnostic covers all of them.
    seen_statuses.update(column_status)

    # Split the statuses into those the URL Standard ignores and those that make
    # the test an expected failure. The comment lists the ignored ones first.
    ignored = [status for status in IGNORED_STATUSES if status in column_status]
    column_status = [status for status in column_status if status not in IGNORED_STATUSES]
    comment = "; ".join([status + " (ignored)" for status in ignored] + column_status)

    # Any remaining status means ToASCII is expected to fail.
    output = None if column_status else column_to_ascii

    test_output_entry = {"input": column_source, "output": output}
    if comment != "":
        test_output_entry["comment"] = comment

    test_output.append(test_output_entry)

# Use a context manager so the output is flushed and closed deterministically.
with open("IdnaTestV2.json", "w", encoding="utf-8") as handle:
    json.dump(test_output, handle, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': '))
    handle.write("\n")

unique_statuses = sorted(seen_statuses)
print(unique_statuses)
Loading

0 comments on commit 4708e9a

Please sign in to comment.