Skip to content

Commit

Permalink
Update to follow the TR46 rev 31
Browse files Browse the repository at this point in the history
See https://www.unicode.org/reports/tr46/tr46-31.html#Modifications.

This includes:

* Updating to Unicode 15.1.0.
* Changing processingOption to a transitionalProcessing boolean flag.
* Introducing the ignoreInvalidPunycode option.
* A few other algorithm changes.
  • Loading branch information
domenic committed Nov 5, 2023
1 parent 10a2405 commit 39e16f8
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 79 deletions.
38 changes: 23 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,26 @@ Converts a string of Unicode symbols to a case-folded Punycode string of ASCII s

Available options:

* [`checkBidi`](#checkBidi)
* [`checkHyphens`](#checkHyphens)
* [`checkJoiners`](#checkJoiners)
* [`processingOption`](#processingOption)
* [`useSTD3ASCIIRules`](#useSTD3ASCIIRules)
* [`verifyDNSLength`](#verifyDNSLength)
* [`checkBidi`](#checkbidi)
* [`checkHyphens`](#checkhyphens)
* [`checkJoiners`](#checkjoiners)
* [`ignoreInvalidPunycode`](#ignoreinvalidpunycode)
* [`transitionalProcessing`](#transitionalprocessing)
* [`useSTD3ASCIIRules`](#usestd3asciirules)
* [`verifyDNSLength`](#verifydnslength)

### `toUnicode(domainName[, options])`

Converts a case-folded Punycode string of ASCII symbols to a string of Unicode symbols.

Available options:

* [`checkBidi`](#checkBidi)
* [`checkHyphens`](#checkHyphens)
* [`checkJoiners`](#checkJoiners)
* [`processingOption`](#processingOption)
* [`useSTD3ASCIIRules`](#useSTD3ASCIIRules)
* [`checkBidi`](#checkbidi)
* [`checkHyphens`](#checkhyphens)
* [`checkJoiners`](#checkjoiners)
* [`ignoreInvalidPunycode`](#ignoreinvalidpunycode)
* [`transitionalProcessing`](#transitionalprocessing)
* [`useSTD3ASCIIRules`](#usestd3asciirules)

## Options

Expand All @@ -49,11 +51,17 @@ Type: `boolean`
Default value: `false`
When set to `true`, any joiner characters within the input will be validated against the ContextJ rules.

### `processingOption`
### `ignoreInvalidPunycode`

Type: `string`
Default value: `"nontransitional"`
When set to `"transitional"`, symbols within the input will be validated according to the older IDNA2003 protocol. When set to `"nontransitional"`, the current IDNA2008 protocol will be used.
Type: `boolean`
Default value: `false`
When set to `true`, invalid Punycode strings within the input will be allowed.

### `transitionalProcessing`

Type: `boolean`
Default value: `false`
When set to `true`, uses [transitional (compatibility) processing](https://unicode.org/reports/tr46/#Compatibility_Processing) of the deviation characters.

### `useSTD3ASCIIRules`

Expand Down
120 changes: 80 additions & 40 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,25 +41,27 @@ function findStatus(val, { useSTD3ASCIIRules }) {
return null;
}

function mapChars(domainName, { useSTD3ASCIIRules, processingOption }) {
let hasError = false;
function mapChars(domainName, { useSTD3ASCIIRules, transitionalProcessing }) {
let processed = "";

for (const ch of domainName) {
const [status, mapping] = findStatus(ch.codePointAt(0), { useSTD3ASCIIRules });

switch (status) {
case STATUS_MAPPING.disallowed:
hasError = true;
processed += ch;
break;
case STATUS_MAPPING.ignored:
break;
case STATUS_MAPPING.mapped:
processed += mapping;
if (transitionalProcessing && ch === "ẞ") {
processed += "ss";
} else {
processed += mapping;
}
break;
case STATUS_MAPPING.deviation:
if (processingOption === "transitional") {
if (transitionalProcessing) {
processed += mapping;
} else {
processed += ch;
Expand All @@ -71,40 +73,73 @@ function mapChars(domainName, { useSTD3ASCIIRules, processingOption }) {
}
}

return {
string: processed,
error: hasError
};
return processed;
}

function validateLabel(label, { checkHyphens, checkBidi, checkJoiners, processingOption, useSTD3ASCIIRules }) {
function validateLabel(label, {
checkHyphens,
checkBidi,
checkJoiners,
transitionalProcessing,
useSTD3ASCIIRules,
isBidi
}) {
// "must be satisfied for a non-empty label"
if (label.length === 0) {
return true;
}

// "1. The label must be in Unicode Normalization Form NFC."
if (label.normalize("NFC") !== label) {
return false;
}

const codePoints = Array.from(label);

// "2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character in both the
// third and fourth positions."
//
// "3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character."
if (checkHyphens) {
if ((codePoints[2] === "-" && codePoints[3] === "-") ||
(label.startsWith("-") || label.endsWith("-"))) {
return false;
}
}

if (label.includes(".") ||
(codePoints.length > 0 && regexes.combiningMarks.test(codePoints[0]))) {
// "4. If not CheckHyphens, the label must not begin with “xn--”."
// Disabled while we figure out https://github.com/whatwg/url/issues/803.
// if (!checkHyphens) {
// if (label.startsWith("xn--")) {
// return false;
// }
// }

// "5. The label must not contain a U+002E ( . ) FULL STOP."
if (label.includes(".")) {
return false;
}

// "6. The label must not begin with a combining mark, that is: General_Category=Mark."
if (regexes.combiningMarks.test(codePoints[0])) {
return false;
}

// "7. Each code point in the label must only have certain Status values according to Section 5"
for (const ch of codePoints) {
const [status] = findStatus(ch.codePointAt(0), { useSTD3ASCIIRules });
if ((processingOption === "transitional" && status !== STATUS_MAPPING.valid) ||
(processingOption === "nontransitional" &&
status !== STATUS_MAPPING.valid && status !== STATUS_MAPPING.deviation)) {
if (transitionalProcessing) {
// "For Transitional Processing (deprecated), each value must be valid."
if (status !== STATUS_MAPPING.valid) {
return false;
}
} else if (status !== STATUS_MAPPING.valid && status !== STATUS_MAPPING.deviation) {
// "For Nontransitional Processing, each value must be either valid or deviation."
return false;
}
}

// "8. If CheckJoiners, the label must satisify the ContextJ rules"
// https://tools.ietf.org/html/rfc5892#appendix-A
if (checkJoiners) {
let last = 0;
Expand All @@ -129,10 +164,9 @@ function validateLabel(label, { checkHyphens, checkBidi, checkJoiners, processin
}
}

// "9. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy..."
// https://tools.ietf.org/html/rfc5893#section-2
// For the codePoints length check, see discussion in https://github.com/jsdom/whatwg-url/pull/250 and the second item
// in https://github.com/whatwg/url/issues/744.
if (checkBidi && codePoints.length > 0) {
if (checkBidi && isBidi) {
let rtl;

// 1
Expand Down Expand Up @@ -175,10 +209,8 @@ function isBidiDomain(labels) {
}

function processing(domainName, options) {
const { processingOption } = options;

// 1. Map.
let { string, error } = mapChars(domainName, options);
let string = mapChars(domainName, options);

// 2. Normalize.
string = string.normalize("NFC");
Expand All @@ -188,18 +220,26 @@ function processing(domainName, options) {
const isBidi = isBidiDomain(labels);

// 4. Convert/Validate.
let error = false;
for (const [i, origLabel] of labels.entries()) {
let label = origLabel;
let curProcessing = processingOption;
let transitionalProcessingForThisLabel = options.transitionalProcessing;
if (label.startsWith("xn--")) {
try {
label = punycode.decode(label.substring(4));
labels[i] = label;
} catch (err) {
if (containsNonASCII(label)) {
error = true;
continue;
}
curProcessing = "nontransitional";

try {
label = punycode.decode(label.substring(4));
} catch {
if (!options.ignoreInvalidPunycode) {
error = true;
continue;
}
}
labels[i] = label;
transitionalProcessingForThisLabel = false;
}

// No need to validate if we already know there is an error.
Expand All @@ -208,8 +248,8 @@ function processing(domainName, options) {
}
const validation = validateLabel(label, {
...options,
processingOption: curProcessing,
checkBidi: options.checkBidi && isBidi
transitionalProcessing: transitionalProcessingForThisLabel,
isBidi
});
if (!validation) {
error = true;
Expand All @@ -227,19 +267,17 @@ function toASCII(domainName, {
checkBidi = false,
checkJoiners = false,
useSTD3ASCIIRules = false,
processingOption = "nontransitional",
verifyDNSLength = false
verifyDNSLength = false,
transitionalProcessing = false,
ignoreInvalidPunycode = false
} = {}) {
if (processingOption !== "transitional" && processingOption !== "nontransitional") {
throw new RangeError("processingOption must be either transitional or nontransitional");
}

const result = processing(domainName, {
processingOption,
checkHyphens,
checkBidi,
checkJoiners,
useSTD3ASCIIRules
useSTD3ASCIIRules,
transitionalProcessing,
ignoreInvalidPunycode
});
let labels = result.string.split(".");
labels = labels.map(l => {
Expand Down Expand Up @@ -278,14 +316,16 @@ function toUnicode(domainName, {
checkBidi = false,
checkJoiners = false,
useSTD3ASCIIRules = false,
processingOption = "nontransitional"
transitionalProcessing = false,
ignoreInvalidPunycode = false
} = {}) {
const result = processing(domainName, {
processingOption,
checkHyphens,
checkBidi,
checkJoiners,
useSTD3ASCIIRules
useSTD3ASCIIRules,
transitionalProcessing,
ignoreInvalidPunycode
});

return {
Expand Down
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
},
"devDependencies": {
"@domenic/eslint-config": "^3.0.0",
"@unicode/unicode-15.0.0": "^1.5.2",
"@unicode/unicode-15.1.0": "^1.5.2",
"eslint": "^8.53.0",
"regenerate": "^1.4.2"
},
"unicodeVersion": "15.0.0"
"unicodeVersion": "15.1.0"
}
24 changes: 6 additions & 18 deletions test/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,6 @@ const path = require("path");

const tr46 = require("../index.js");

// UTS #46 version 13.0.0 includes the following broken tests.
// They both include U+18C4E, which prior to 13.0.0 was treated as disallowed
// but became valid in 13.0.0. However, the IdnaTestV2.txt file does not appear
// to be updated for this change, and still considers them invalid.
const UNICODE_13_BROKEN_TO_UNICODE_TESTS = [
[0x3A1B, 0x18C4E, 0x2E, 0x3002, 0x37, 0x0D01].map(cp => String.fromCodePoint(cp)).join(""),
"xn--mbm8237g..xn--7-7hf"
];

function normalize(inp) {
let out = "";

Expand All @@ -33,13 +24,13 @@ function normalize(inp) {
return out;
}

function testConversionOption(source, expected, status, option) {
function testConversionOption(source, expected, status, transitionalProcessing) {
const out = tr46.toASCII(source, {
checkHyphens: true,
checkBidi: true,
checkJoiners: true,
useSTD3ASCIIRules: true,
processingOption: option,
transitionalProcessing,
verifyDNSLength: true
});

Expand All @@ -55,19 +46,16 @@ function testConversionOption(source, expected, status, option) {

function testConversion(testCase) {
return () => {
testConversionOption(testCase.source, testCase.toASCIIN, testCase.toASCIINStatus, "nontransitional");
testConversionOption(testCase.source, testCase.toASCIIT, testCase.toASCIITStatus, "transitional");
testConversionOption(testCase.source, testCase.toASCIIN, testCase.toASCIINStatus, false);
testConversionOption(testCase.source, testCase.toASCIIT, testCase.toASCIITStatus, true);

const res = tr46.toUnicode(testCase.source, {
checkHyphens: true,
checkBidi: true,
checkJoiners: true,
useSTD3ASCIIRules: true,
processingOption: "nontransitional"
useSTD3ASCIIRules: true
});
if (UNICODE_13_BROKEN_TO_UNICODE_TESTS.includes(testCase.source)) {
assert.ok(!res.error);
} else if (testCase.toUnicodeStatus) { // Error code
if (testCase.toUnicodeStatus) { // Error code
assert.ok(res.error, "ToUnicode should result in an error");
} else {
assert.equal(res.domain, testCase.toUnicode, "ToUnicode should equal the expected value");
Expand Down

0 comments on commit 39e16f8

Please sign in to comment.