diff --git a/README.md b/README.md index 4412fb0a..52ad91fa 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Check [ThankYouBackers](https://github.com/NaturalIntelligence/ThankYouBackers) + @@ -106,7 +107,7 @@ In a HTML page 3. [XML Builder](./docs/v4/3.XMLBuilder.md) 4. [XML Validator](./docs/v4/4.XMLValidator.md) 5. [Entites](./docs/5.Entities.md) - +6. [HTML Document Parsing](./docs/v4/6.HTMLParsing.md) ## Performance ### XML Parser diff --git a/docs/v4/3.XMLBuilder.md b/docs/v4/3.XMLBuilder.md index 2163cc5a..b023ddfd 100644 --- a/docs/v4/3.XMLBuilder.md +++ b/docs/v4/3.XMLBuilder.md @@ -79,6 +79,11 @@ When you parse a XML using XMLParser with `preserveOrder: true`, the result JS o ## processEntities Set it to `true` (default) to process XML entities. Check [Entities](./5.Entities.md) section for more detail. If you don't have entities in your XML document then it is recommanded to disable it `processEntities: false` for better performance. +## stopNodes +Just as you set `stopNodes` in the XML parser configuration to skip parsing and processing of a tag's content, you can set it on the builder so that the text of those tags is written as-is, without tag value processing or entity replacement. Check [HTML Document Parsing](./6.HTMLParsing.md) for more detail. + +This property is currently supported only with the `preserveOrder: true` option. + ## suppressEmptyNode Tags with no text value would be parsed as empty tags. Input diff --git a/docs/v4/5.Entities.md b/docs/v4/5.Entities.md index 438bae9d..a5eaf985 100644 --- a/docs/v4/5.Entities.md +++ b/docs/v4/5.Entities.md @@ -132,4 +132,6 @@ Following HTML entities are supported by the parser by default when `htmlEntitie | ₹ | Indian Rupee | &inr; | ₹ | --- -In future version of FXP, we'll be supporting more features of DOCTYPE such as `ELEMENT`, reading content for an entity from a file etc. \ No newline at end of file +In future version of FXP, we'll be supporting more features of DOCTYPE such as `ELEMENT`, reading content for an entity from a file etc. 
+ +[> Next: HTML Document Parsing](./6.HTMLParsing.md) \ No newline at end of file diff --git a/docs/v4/6.HTMLParsing.md b/docs/v4/6.HTMLParsing.md new file mode 100644 index 00000000..aec23aac --- /dev/null +++ b/docs/v4/6.HTMLParsing.md @@ -0,0 +1,204 @@ +FXP supports parsing of HTML documents. Here is an example: + +Input HTML Document +```html + + + + + + Fast XMl Parser + + + + + + + + + + +

Heading

+
+

&inr;

+
+                    

Heading

+
+

&inr;

+
+ + + +``` + +Code and necessary configuration to parse it to JS object. + +```js +const parsingOptions = { + ignoreAttributes: false, + // preserveOrder: true, + unpairedTags: ["hr", "br", "link", "meta"], + stopNodes : [ "*.pre", "*.script"], + processEntities: true, + htmlEntities: true + }; + const parser = new XMLParser(parsingOptions); + parser.parse(html); +``` + +JS Object +```json +{ + "html": { + "head": { + "script": [ + "\n window.dataLayer = window.dataLayer || [];\n function gtag(){dataLayer.push(arguments);}\n gtag('js', new Date());\n \n gtag('config', 'UA-80202630-2');\n ", + { + "@_src": "static/js/jquery-3.2.1.min.js" + } + ], + "title": "Fast XMl Parser", + "meta": [ + { + "@_charset": "UTF-8" + }, + { + "@_name": "viewport", + "@_content": "width=device-width, initial-scale=1" + } + ], + "link": [ + { + "@_rel": "stylesheet", + "@_href": "static/css/bootstrap.min.css" + }, + { + "@_rel": "stylesheet", + "@_href": "static/css/jquery-confirm.min.css" + }, + { + "@_rel": "stylesheet", + "@_type": "text/css", + "@_href": "style.css" + } + ], + "style": ".CodeMirror{\n height: 100%;\n width: 100%;\n }" + }, + "body": { + "h1": "Heading", + "hr": "", + "h2": "₹", + "pre": "\n

Heading

\n
\n

&inr;

\n ", + "script": "\n let highlightedLine = null;\n let editor;\n \n function updateLength(){\n const xmlData = editor.getValue();\n $(\"#lengthxml\")[0].innerText = xmlData.replace(/>s*<\").length;\n }\n ", + "@_role": "document", + "@_style": "background-color: #2c3e50;" + }, + "@_lang": "en" + } +} +``` + +To build the HTML document back from JS object, you need to uncomment `preserveOrder: true` in above code. And pass the output to the XML builder; +```js +const parsingOptions = { + ignoreAttributes: false, + preserveOrder: true, + unpairedTags: ["hr", "br", "link", "meta"], + stopNodes : [ "*.pre", "*.script"], + processEntities: true, + htmlEntities: true + }; + const parser = new XMLParser(parsingOptions); + let result = parser.parse(html); + + const builderOptions = { + ignoreAttributes: false, + format: true, + preserveOrder: true, + suppressEmptyNode: true, + unpairedTags: ["hr", "br", "link", "meta"], + stopNodes : [ "*.pre", "*.script"], + } + const builder = new XMLBuilder(builderOptions); + const output = builder.build(result); +``` + +Output +```html + + + + + Fast XMl Parser + + + + + + + + + + +

+ Heading +

+
+

+ ₹ +

+
+      
+                    

Heading

+
+

&inr;

+ +
+ + + +``` \ No newline at end of file diff --git a/nexttodo.md b/nexttodo.md index 1c7573ce..4ee8bba5 100644 --- a/nexttodo.md +++ b/nexttodo.md @@ -1,28 +1,27 @@ P0 * OptionsBuilder: replace by Object.assign -* update README for main features +* Support setting entities externally as option configuration + * : https://github.com/NaturalIntelligence/fast-xml-parser/issues/342 +* Write UT for nested stop node +* support stop nodes expression like head.*.meta P1 -* special characters such as '&' - https://github.com/NaturalIntelligence/fast-xml-parser/issues/297 - https://github.com/NaturalIntelligence/fast-xml-parser/issues/343 - https://github.com/NaturalIntelligence/fast-xml-parser/issues/342 -* skip parsing of particular tag -* doctype support * Es6 modules -* Parse JSON string to XML. Currently it transforms JSON object to XML. Partially done. Need to work on performance. -* boolean tag to support HTML parsing - * https://github.com/NaturalIntelligence/fast-xml-parser/issues/206 P2 * Multiple roots * skip parsing of after some tag * validate XML stream data +* Parse JSON string to XML. Currently it transforms JSON object to XML. Partially done. Need to work on performance. * Accept streams, arrayBuffer https://github.com/NaturalIntelligence/fast-xml-parser/issues/347 * XML to JSON ML : https://en.wikipedia.org/wiki/JsonML + + + + ---- Entities diff --git a/spec/html_spec.js b/spec/html_spec.js new file mode 100644 index 00000000..ebbece14 --- /dev/null +++ b/spec/html_spec.js @@ -0,0 +1,80 @@ + +const {XMLParser, XMLBuilder} = require("../src/fxp"); + +describe("XMLParser", function() { + + it("should parse HTML with basic entities,
, 
+        
+                Fast XMl Parser
+                
+                
+                
+                
+                
+        
+                
+                
+            
+            
+            

Heading

+
+

&inr;

+
+                    

Heading

+
+

&inr;

+
+ + + `; + +const parsingOptions = { + ignoreAttributes: false, + preserveOrder: true, + unpairedTags: ["hr", "br", "link", "meta"], + stopNodes : [ "*.pre", "*.script"], + processEntities: true, + htmlEntities: true + }; + const parser = new XMLParser(parsingOptions); + let result = parser.parse(html); +// console.log(JSON.stringify(result, null,4)); + + const builderOptions = { + ignoreAttributes: false, + format: true, + preserveOrder: true, + suppressEmptyNode: true, + unpairedTags: ["hr", "br", "link", "meta"], + stopNodes : [ "*.pre", "*.script"], + } + const builder = new XMLBuilder(builderOptions); + let output = builder.build(result); +// console.log(output); + output = output.replace('₹','&inr;'); + expect(output.replace(/\s+/g, "")).toEqual(html.replace(/\s+/g, "")); + }); +}); \ No newline at end of file diff --git a/src/fxp.d.ts b/src/fxp.d.ts index fef5a7f8..1e5d2538 100644 --- a/src/fxp.d.ts +++ b/src/fxp.d.ts @@ -46,6 +46,7 @@ type XmlBuilderOptions = { suppressEmptyNode: boolean; preserveOrder: boolean; unpairedTags: string[]; + stopNodes: string[]; tagValueProcessor: (name: string, value: string) => string; attributeValueProcessor: (name: string, value: string) => string; processEntities: boolean; diff --git a/src/xmlbuilder/json2xml.js b/src/xmlbuilder/json2xml.js index e0fe2553..7b7c2c7d 100644 --- a/src/xmlbuilder/json2xml.js +++ b/src/xmlbuilder/json2xml.js @@ -27,7 +27,8 @@ const defaultOptions = { "sQuot" : { regex: new RegExp("\'", "g"), val: "'" }, "dQuot" : { regex: new RegExp("\"", "g"), val: """ } }, - processEntities: true + processEntities: true, + stopNodes: [] }; const props = [ @@ -47,6 +48,7 @@ const props = [ "unpairedTags", "entities", "processEntities", + "stopNodes", // 'rootNodeName', //when jsObject have multiple properties on root level ]; diff --git a/src/xmlbuilder/orderedJs2Xml.js b/src/xmlbuilder/orderedJs2Xml.js index 1c45084a..4cba7e39 100644 --- a/src/xmlbuilder/orderedJs2Xml.js +++ b/src/xmlbuilder/orderedJs2Xml.js 
@@ -7,10 +7,10 @@ const {EOL} = require('os'); * @returns */ function toXml(jArray, options){ - return arrToStr( jArray, options, 0); + return arrToStr( jArray, options, "", 0); } -function arrToStr(arr, options, level){ +function arrToStr(arr, options, jPath, level){ let xmlStr = ""; let indentation = ""; @@ -21,10 +21,16 @@ function arrToStr(arr, options, level){ for (let i = 0; i < arr.length; i++) { const tagObj = arr[i]; const tagName = propName(tagObj); + let newJPath = ""; + if(jPath.length === 0) newJPath = tagName + else newJPath = `${jPath}.${tagName}`; if(tagName === options.textNodeName){ - let tagText = options.tagValueProcessor( tagName, tagObj[tagName]); - tagText = replaceEntitiesValue(tagText, options); + let tagText = tagObj[tagName]; + if(!isStopNode(newJPath, options)){ + tagText = options.tagValueProcessor( tagName, tagText); + tagText = replaceEntitiesValue(tagText, options); + } xmlStr += indentation + tagText; continue; }else if( tagName === options.cdataPropName){ @@ -36,7 +42,7 @@ function arrToStr(arr, options, level){ } const attStr = attr_to_str(tagObj.attributes, options); let tagStart = indentation + `<${tagName}${attStr}`; - let tagValue = arrToStr(tagObj[tagName], options, level + 1); + let tagValue = arrToStr(tagObj[tagName], options, newJPath, level + 1); if( (!tagValue || tagValue.length === 0) && options.suppressEmptyNode){ if(options.unpairedTags.indexOf(tagName) !== -1){ xmlStr += tagStart + ">"; @@ -72,6 +78,15 @@ function attr_to_str(attrMap, options){ return attrStr; } +function isStopNode(jPath, options){ + jPath = jPath.substr(0,jPath.length - options.textNodeName.length - 1); + let tagName = jPath.substr(jPath.lastIndexOf(".") + 1); + for(let index in options.stopNodes){ + if(options.stopNodes[index] === jPath || options.stopNodes[index] === "*."+tagName) return true; + } + return false; +} + function replaceEntitiesValue(textValue, options){ if(textValue && textValue.length > 0 && options.processEntities){ for (const 
entityName in options.entities) {