Skip to content

Commit

Permalink
fix: Preserve invalid nested A tags in AST (see #215 for detail)
Browse files Browse the repository at this point in the history
  • Loading branch information
nonara authored Sep 8, 2022
1 parent 8a98795 commit 374188f
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 37 deletions.
39 changes: 39 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# These settings are for any web project

# Handle line endings automatically for files detected as text
# and leave all files detected as binary untouched.
* text=auto

# Force every file whose name contains a dot to use Unix (LF) line endings, so Windows checkouts do not break them
*.* text eol=lf

# Force Windows (CRLF) line endings for files directly inside the top-level .idea directory
/.idea/* text eol=crlf

#
## These files are binary and should be left untouched
#

# (binary is a macro for -text -diff)
*.png binary
*.jpg binary
*.jpeg binary
*.gif binary
*.ico binary
*.mov binary
*.mp4 binary
*.mp3 binary
*.flv binary
*.fla binary
*.swf binary
*.gz binary
*.zip binary
*.7z binary
*.ttf binary
*.eot binary
*.woff binary
*.pyc binary
*.pdf binary
*.ez binary
*.bz2 binary
*.swp binary
10 changes: 7 additions & 3 deletions src/nodes/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -982,6 +982,10 @@ const kElementsClosedByClosing = {
export interface Options {
lowerCaseTagName: boolean;
comment: boolean;
/**
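* When true, a nested A tag terminates the currently open A tag and starts a new one
* (see issue #144); when false or omitted, nested A tags are preserved in the AST as written.
* @see PR #215 for explanation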
*/
fixNestedATags?: boolean;
parseNoneClosedTags?: boolean;
blockTextElements: {
[tag: string]: boolean;
Expand Down Expand Up @@ -1036,7 +1040,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
let match: RegExpExecArray;
// https://github.com/taoqf/node-html-parser/issues/38
data = `<${frameflag}>${data}</${frameflag}>`;
const { lowerCaseTagName } = options;
const { lowerCaseTagName, fixNestedATags } = options;

const dataEndPos = data.length - (frameflag.length + 2);
const frameFlagOffset = frameflag.length + 2;
Expand Down Expand Up @@ -1097,7 +1101,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
}

// When the fixNestedATags option is enabled, prevent nested A tags by terminating the last A and starting a new one: see issue #144
if (tagName === 'a' || tagName === 'A') {
if (fixNestedATags && (tagName === 'a' || tagName === 'A')) {
if (noNestedTagIndex !== undefined) {
stack.splice(noNestedTagIndex);
currentParent = arr_back(stack);
Expand Down Expand Up @@ -1142,7 +1146,7 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
// Handle closing tags or self-closed elements (ie </tag> or <br>)
if (leadingSlash || closingSlash || kSelfClosingElements[tagName]) {
while (true) {
if (tagName === 'a' || tagName === 'A') noNestedTagIndex = undefined;
if (noNestedTagIndex != null && (tagName === 'a' || tagName === 'A')) noNestedTagIndex = undefined;
if (currentParent.rawTagName === tagName) {
// Update range end for closed tag
(<[number, number]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
Expand Down
34 changes: 0 additions & 34 deletions test/tests/issues/144.js

This file was deleted.

63 changes: 63 additions & 0 deletions test/tests/nested-a-tag.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
const { parse, NodeType } = require('@test/test-target');

/**
 * Checks how the parser handles an <a> element nested inside another <a>:
 * kept exactly as written by default, restructured when `fixNestedATags` is set.
 */
describe('Nested A Tags', function () {
  // Input shared by both cases: an outer <A> wrapping a <b> that contains an inner <a>.
  const markup = `<A href="#"><b>link <a href="#">nested link</a> end</b></A>`;

  // Asserts that `node` is an A element holding exactly one child node.
  const expectAnchor = (node) => {
    node.tagName.should.eql('A');
    node.nodeType.should.eql(NodeType.ELEMENT_NODE);
    node.childNodes.length.should.eql(1);
  };

  // Asserts that `node` is the inner link: an A element whose only child is text reading "nested link".
  const expectInnerLink = (node) => {
    expectAnchor(node);
    node.childNodes[0].nodeType.should.eql(NodeType.TEXT_NODE);
    node.text.should.eql('nested link');
  };

  // Asserts that `node` is the text node " end" that follows the inner link.
  const expectTrailingText = (node) => {
    node.nodeType.should.eql(NodeType.TEXT_NODE);
    node.textContent.should.eql(' end');
  };

  it('Tags preserved by default', function () {
    const doc = parse(markup);

    // Serialising the tree must reproduce the input unchanged.
    doc.innerHTML.should.eql(`<A href="#"><b>link <a href="#">nested link</a> end</b></A>`);
    doc.childNodes.length.should.eql(1);

    const outerLink = doc.childNodes[0];
    expectAnchor(outerLink);

    // The <b> keeps all three children: leading text, the inner link, trailing text.
    const bold = outerLink.childNodes[0];
    bold.tagName.should.eql('B');
    bold.childNodes.length.should.eql(3);
    bold.text.should.eql('link nested link end');

    expectInnerLink(bold.childNodes[1]);
    expectTrailingText(bold.childNodes[2]);
  });

  it('Tags fixed with fixNestedATags option', function () {
    const doc = parse(markup, { fixNestedATags: true });

    // The inner link and the trailing text are moved out to the root, after the outer link.
    doc.innerHTML.should.eql(`<A href="#"><b>link </b></A><a href="#">nested link</a> end`);
    doc.childNodes.length.should.eql(3);

    const outerLink = doc.childNodes[0];
    expectAnchor(outerLink);

    // Only the leading text remains inside the <b>.
    const bold = outerLink.childNodes[0];
    bold.tagName.should.eql('B');
    bold.childNodes.length.should.eql(1);
    bold.text.should.eql('link ');

    expectInnerLink(doc.childNodes[1]);
    expectTrailingText(doc.childNodes[2]);
  });
});

0 comments on commit 374188f

Please sign in to comment.