From ba5f35e7680468acd7906eaabb2f69e28ed8b2aa Mon Sep 17 00:00:00 2001 From: amit kumar gupta Date: Sat, 24 Feb 2024 10:08:30 +0530 Subject: [PATCH] XML Parser v5 --- CHANGELOG.md | 3 + README.md | 21 +- docs/v5/1. Getting Started.md | 217 ++++++++++++++++ package-lock.json | 4 +- package.json | 2 +- spec/v5/test.js | 32 +++ src/v5/CharsSymbol.js | 16 ++ src/v5/EntitiesParser.js | 105 ++++++++ src/v5/OptionsBuilder.js | 73 ++++++ src/v5/OutputBuilders/BaseOutputBuilder.js | 69 +++++ src/v5/OutputBuilders/JsArrBuilder.js | 102 ++++++++ src/v5/OutputBuilders/JsMinArrBuilder.js | 101 ++++++++ src/v5/OutputBuilders/JsObjBuilder.js | 155 ++++++++++++ src/v5/OutputBuilders/ParserOptionsBuilder.js | 94 +++++++ src/v5/Report.js | 0 src/v5/TagPath.js | 81 ++++++ src/v5/TagPathMatcher.js | 15 ++ src/v5/XMLParser.js | 85 +++++++ src/v5/Xml2JsParser.js | 237 ++++++++++++++++++ src/v5/XmlPartReader.js | 212 ++++++++++++++++ src/v5/XmlSpecialTagsReader.js | 118 +++++++++ src/v5/inputSource/BufferSource.js | 118 +++++++++ src/v5/inputSource/StringSource.js | 123 +++++++++ src/v5/valueParsers/EntitiesParser.js | 105 ++++++++ src/v5/valueParsers/booleanParser.js | 23 ++ src/v5/valueParsers/booleanParserExt.js | 20 ++ src/v5/valueParsers/currency.js | 31 +++ src/v5/valueParsers/join.js | 14 ++ src/v5/valueParsers/number.js | 16 ++ src/v5/valueParsers/trim.js | 8 + 30 files changed, 2183 insertions(+), 17 deletions(-) create mode 100644 docs/v5/1. Getting Started.md create mode 100644 spec/v5/test.js create mode 100644 src/v5/CharsSymbol.js create mode 100644 src/v5/EntitiesParser.js create mode 100755 src/v5/OptionsBuilder.js create mode 100644 src/v5/OutputBuilders/BaseOutputBuilder.js create mode 100644 src/v5/OutputBuilders/JsArrBuilder.js create mode 100644 src/v5/OutputBuilders/JsMinArrBuilder.js create mode 100644 src/v5/OutputBuilders/JsObjBuilder.js create mode 100644 src/v5/OutputBuilders/ParserOptionsBuilder.js create mode 100644 src/v5/Report.js create mode 100644 src/v5/TagPath.js create mode 100644 src/v5/TagPathMatcher.js create mode 100755 src/v5/XMLParser.js create mode 100644 src/v5/Xml2JsParser.js create mode 100644 src/v5/XmlPartReader.js create mode 100644 src/v5/XmlSpecialTagsReader.js create mode 100644 src/v5/inputSource/BufferSource.js create mode 100644 src/v5/inputSource/StringSource.js create mode 100644 src/v5/valueParsers/EntitiesParser.js create mode 100644 src/v5/valueParsers/booleanParser.js create mode 100644 src/v5/valueParsers/booleanParserExt.js create mode 100644 src/v5/valueParsers/currency.js create mode 100644 src/v5/valueParsers/join.js create mode 100644 src/v5/valueParsers/number.js create mode 100644 src/v5/valueParsers/trim.js diff --git a/CHANGELOG.md b/CHANGELOG.md index 12cfb9d0..9caee3c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ Note: If you find missing information about particular minor version, that version must have been changed without any functional change in this library. +**4.3.5 / 2024-02-24** +* code for v5 is added for experimental use + **4.3.4 / 2024-01-10** * fix: Don't escape entities in CDATA sections (#633) (By [wackbyte](https://github.com/wackbyte)) diff --git a/README.md b/README.md index 968ae608..500e1f36 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,8 @@ Validate XML, Parse XML to JS Object, or Build XML from JS Object without C/C++ based libraries and no callback. -I need a Career advice. I've posted the query on my profile. Your support would be appreciable. +> XML Parser v5 is added for experimental use +> https://solothought.com Sponsor this project 👉 @@ -91,6 +92,11 @@ If you want to be an anonymous user of this application and don't want to be hig * Supports parsing of PI (Processing Instruction) tags with XML declaration tags * And many more other features. +## v5 +I developed v5 in Apr 2023. And I didn't get the chance to complete all the features. I've ensured that new features don't impact performance. With v5, you have more control on parsing output. Check [docs](./docs/v5) for syntax help and basic understanding. + +Please leave a comment in discussion forum for your suggestions and if you really need v5. + ## How to use To use as package dependency @@ -174,19 +180,6 @@ Check lib folder for different browser bundles [![](static/img/ni_ads_ads.gif)](https://github.com/NaturalIntelligence/ads/) -## Our other projects and research you must try - -* **[BigBit standard](https://github.com/amitguptagwl/bigbit)** : - * Single text encoding to replace UTF-8, UTF-16, UTF-32 and more with less memory. - * Single Numeric datatype alternative of integer, float, double, long, decimal and more without precision loss. -* **[Cytorus](https://github.com/NaturalIntelligence/cytorus)**: Be specific and flexible while running E2E tests. - * Run tests only for a particular User Story - * Run tests for a route or from a route - * Customizable reporting - * Central dashboard for better monitoring - * Options to integrate E2E tests with Jira, Github etc using Central dashboard `Tian`. -* **[Stubmatic](https://github.com/NaturalIntelligence/Stubmatic)** : Create fake webservices, DynamoDB or S3 servers, Manage fake/mock stub data, Or fake any HTTP(s) call. - ## Supporters ### Contributors diff --git a/docs/v5/1. Getting Started.md b/docs/v5/1. Getting Started.md new file mode 100644 index 00000000..5f192a4d --- /dev/null +++ b/docs/v5/1. Getting Started.md @@ -0,0 +1,217 @@ + + +Example + +```js +const options = { + preserveOrder: true, + removeNSPrefix: false, // remove NS from tag name or attribute name if true + stopNodes: [], //nested tags will not be parsed even for errors + htmlEntities: false, + tags:{ + unpaired: [], + nameFor:{ + cdata: false, + comment: false, + text: '#text' + }, + separateTextProperty: false, + //"join" only if preserveOrder: true + valueParsers: ["trim","entities","join","boolean","number","currency","date"] + }, + attributes: { + ignore: false, + booleanType:true, + entities: true, + //"groupBy": "att" + }, + OutputBuilder: new JsObjOutputBuilder() +}; +const parser = new XMLParser(options); +let result = parser.parse(xmlData, true); +``` + +- You can build your own Output Builder. FXP provides 3 builders + - JsObjOutputBuilder + - JsArrBuilder + - JsMinArrBuilder +- You can control the sequence of value parsing for a tag or attribute +- You can pass a string or bytes array as input. + +### Value Parser +You can change the sequence of value parsers or remove one or provide your own parser to control the parsing. + +### Output builders +You can use provided output builds or your own output builder. + +JsObjOutputBuilder +```js +{ + "soap:Envelope": { + "@_xmlns:soap": "http://schemas.xmlsoap.org/soap/envelope/", + "soap:Body": { + "rpt:loadReportFileResponseElem": { + "@_xmlns:s": "http://bus.x.com/common/support/v1", + "@_xmlns:rpt": "http://bus.x.com/service/statement/v1", + "s:code": 0, + "s:responseTime": 2588, + "s:responseDbTime": 1893, + "s:requestId": "6b408fd09eb211e7a0807e34820340ec", + "s:route": "172.16.x.x:9192", + "rpt:result": { + "rpt:file": "\n \n \n 0\n 2588\n 1893\n 6b408fd09eb211e7a0807e34820340ec\n 172.16.x.x:9192\n \n \n \n \n \n" + } + } + } + } +} +``` + +JsArrBuilder +```js +{ + "tagname": "soap:Envelope", + "child": [ + { + "tagname": "soap:Body", + "child": [ + { + "tagname": "rpt:loadReportFileResponseElem", + "child": [ + { + "tagname": "s:code", + "child": [ + { + "#text": 0 + } + ] + }, + { + "tagname": "s:responseTime", + "child": [ + { + "#text": 2588 + } + ] + }, + { + "tagname": "s:responseDbTime", + "child": [ + { + "#text": 1893 + } + ] + }, + { + "tagname": "s:requestId", + "child": [ + { + "#text": "6b408fd09eb211e7a0807e34820340ec" + } + ] + }, + { + "tagname": "s:route", + "child": [ + { + "#text": "172.16.x.x:9192" + } + ] + }, + { + "tagname": "rpt:result", + "child": [ + { + "tagname": "rpt:file", + "child": [ + { + "#text": "\n \n \n 0\n 2588\n 1893\n 6b408fd09eb211e7a0807e34820340ec\n 172.16.x.x:9192\n \n \n \n \n \n" + } + ] + } + ] + } + ], + ":@": { + "@_xmlns:s": "http://bus.x.com/common/support/v1", + "@_xmlns:rpt": "http://bus.x.com/service/statement/v1" + } + } + ] + } + ], + ":@": { + "@_xmlns:soap": "http://schemas.xmlsoap.org/soap/envelope/" + } +} +``` + +JsMinArrBuilder +```js +{ + "soap:Envelope": [ + { + "soap:Body": [ + { + "rpt:loadReportFileResponseElem": [ + { + "s:code": [ + { + "#text": 0 + } + ] + }, + { + "s:responseTime": [ + { + "#text": 2588 + } + ] + }, + { + "s:responseDbTime": [ + { + "#text": 1893 + } + ] + }, + { + "s:requestId": [ + { + "#text": "6b408fd09eb211e7a0807e34820340ec" + } + ] + }, + { + "s:route": [ + { + "#text": "172.16.x.x:9192" + } + ] + }, + { + "rpt:result": [ + { + "rpt:file": [ + { + "#text": "\n \n \n 0\n 2588\n 1893\n 6b408fd09eb211e7a0807e34820340ec\n 172.16.x.x:9192\n \n \n \n \n \n" + } + ] + } + ] + } + ], + ":@": { + "@_xmlns:s": "http://bus.x.com/common/support/v1", + "@_xmlns:rpt": "http://bus.x.com/service/statement/v1" + } + } + ] + } + ], + ":@": { + "@_xmlns:soap": "http://schemas.xmlsoap.org/soap/envelope/" + } +} +``` + diff --git a/package-lock.json b/package-lock.json index edf64ce0..15e334ef 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "fast-xml-parser", - "version": "4.3.4", + "version": "4.3.5", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "fast-xml-parser", - "version": "4.3.4", + "version": "4.3.5", "funding": [ { "type": "paypal", diff --git a/package.json b/package.json index 46edb97c..9f121a3b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fast-xml-parser", - "version": "4.3.4", + "version": "4.3.5", "description": "Validate XML, Parse XML, Build XML without C/C++ based libraries", "main": "./src/fxp.js", "scripts": { diff --git a/spec/v5/test.js b/spec/v5/test.js new file mode 100644 index 00000000..8a45e270 --- /dev/null +++ b/spec/v5/test.js @@ -0,0 +1,32 @@ +const XMLParser = require("../../src/v5/XMLParser"); +const JsObjOutputBuilder = require("../../src/v5/OutputBuilders/JsObjBuilder"); +const JsArrBuilder = require("../../src/v5/OutputBuilders/JsArrBuilder"); +const JsMinArrBuilder = require("../../src/v5/OutputBuilders/JsMinArrBuilder"); + +const fs = require("fs"); +const path = require("path"); +const fileNamePath = path.join(__dirname, "../assets/ptest.xml");//with CDATA +// const fileNamePath = path.join(__dirname, "../assets/ptest_with_prolog.xml");//with CDATA +// const fileNamePath = path.join(__dirname, "../assets/sample.xml");//1.5k +// const fileNamePath = path.join(__dirname, "../assets/midsize.xml");//13m +// const fileNamePath = path.join(__dirname, "../assets/large.xml");//98m +const xmlData = fs.readFileSync(fileNamePath).toString(); + +describe("XMLParser Entities", function() { + + it("should parse", function() { + + const options = { + attributes: { + ignore: false, + booleanType:true + }, + OutputBuilder: new JsMinArrBuilder() + }; + const parser = new XMLParser(options); + let result = parser.parse(xmlData); + + console.log(JSON.stringify(result,null,4)); + // expect(result).toEqual(expected); + }); +}); \ No newline at end of file diff --git a/src/v5/CharsSymbol.js b/src/v5/CharsSymbol.js new file mode 100644 index 00000000..fa5ce9e8 --- /dev/null +++ b/src/v5/CharsSymbol.js @@ -0,0 +1,16 @@ +modules.export = { + "<" : "<", //tag start + ">" : ">", //tag end + "/" : "/", //close tag + "!" : "!", //comment or docttype + "!--" : "!--", //comment + "-->" : "-->", //comment end + "?" : "?", //pi + "?>" : "?>", //pi end + "?xml" : "?xml", //pi end + "![" : "![", //cdata + "]]>" : "]]>", //cdata end + "[" : "[", + "-" : "-", + "D" : "D", +} \ No newline at end of file diff --git a/src/v5/EntitiesParser.js b/src/v5/EntitiesParser.js new file mode 100644 index 00000000..d7dc400c --- /dev/null +++ b/src/v5/EntitiesParser.js @@ -0,0 +1,105 @@ +const ampEntity = { regex: /&(amp|#38|#x26);/g, val : "&"}; +const htmlEntities = { + "space": { regex: /&(nbsp|#160);/g, val: " " }, + // "lt" : { regex: /&(lt|#60);/g, val: "<" }, + // "gt" : { regex: /&(gt|#62);/g, val: ">" }, + // "amp" : { regex: /&(amp|#38);/g, val: "&" }, + // "quot" : { regex: /&(quot|#34);/g, val: "\"" }, + // "apos" : { regex: /&(apos|#39);/g, val: "'" }, + "cent" : { regex: /&(cent|#162);/g, val: "¢" }, + "pound" : { regex: /&(pound|#163);/g, val: "£" }, + "yen" : { regex: /&(yen|#165);/g, val: "¥" }, + "euro" : { regex: /&(euro|#8364);/g, val: "€" }, + "copyright" : { regex: /&(copy|#169);/g, val: "©" }, + "reg" : { regex: /&(reg|#174);/g, val: "®" }, + "inr" : { regex: /&(inr|#8377);/g, val: "₹" }, +}; + +class EntitiesParser{ + constructor(replaceHtmlEntities) { + this.replaceHtmlEntities = replaceHtmlEntities; + this.docTypeEntities = {}; + this.lastEntities = { + "apos" : { regex: /&(apos|#39|#x27);/g, val : "'"}, + "gt" : { regex: /&(gt|#62|#x3E);/g, val : ">"}, + "lt" : { regex: /&(lt|#60|#x3C);/g, val : "<"}, + "quot" : { regex: /&(quot|#34|#x22);/g, val : "\""}, + }; + } + + addExternalEntities(externalEntities){ + const entKeys = Object.keys(externalEntities); + for (let i = 0; i < entKeys.length; i++) { + const ent = entKeys[i]; + this.addExternalEntity(ent,externalEntities[ent]) + } + } + addExternalEntity(key,val){ + validateEntityName(key); + if(val.indexOf("&") !== -1) { + reportWarning(`Entity ${key} is not added as '&' is found in value;`) + return; + }else{ + this.lastEntities[ent] = { + regex: new RegExp("&"+key+";","g"), + val : val + } + } + } + + addDocTypeEntities(entities){ + const entKeys = Object.keys(entities); + for (let i = 0; i < entKeys.length; i++) { + const ent = entKeys[i]; + this.docTypeEntities[ent] = { + regex: new RegExp("&"+ent+";","g"), + val : entities[ent] + } + } + } + + parse(val){ + return this.replaceEntitiesValue(val) + } + + /** + * 1. Replace DOCTYPE entities + * 2. Replace external entities + * 3. Replace HTML entities if asked + * @param {string} val + */ + replaceEntitiesValue(val){ + if(typeof val === "string" && val.length > 0){ + for(let entityName in this.docTypeEntities){ + const entity = this.docTypeEntities[entityName]; + val = val.replace( entity.regx, entity.val); + } + for(let entityName in this.lastEntities){ + const entity = this.lastEntities[entityName]; + val = val.replace( entity.regex, entity.val); + } + if(this.replaceHtmlEntities){ + for(let entityName in htmlEntities){ + const entity = htmlEntities[entityName]; + val = val.replace( entity.regex, entity.val); + } + } + val = val.replace( ampEntity.regex, ampEntity.val); + } + return val; + } +}; + +//an entity name should not contains special characters that may be used in regex +//Eg !?\\\/[]$%{}^&*()<> +const specialChar = "!?\\\/[]$%{}^&*()<>|+"; + +function validateEntityName(name){ + for (let i = 0; i < specialChar.length; i++) { + const ch = specialChar[i]; + if(name.indexOf(ch) !== -1) throw new Error(`Invalid character ${ch} in entity name`); + } + return name; +} + +module.exports = EntitiesParser; \ No newline at end of file diff --git a/src/v5/OptionsBuilder.js b/src/v5/OptionsBuilder.js new file mode 100755 index 00000000..5957ad04 --- /dev/null +++ b/src/v5/OptionsBuilder.js @@ -0,0 +1,73 @@ + +const JsArrBuilder = require("./OutputBuilders/JsArrBuilder"); + +const defaultOptions = { + preserveOrder: false, + removeNSPrefix: false, // remove NS from tag name or attribute name if true + //ignoreRootElement : false, + stopNodes: [], //nested tags will not be parsed even for errors + // isArray: () => false, //User will set it + htmlEntities: false, + // skipEmptyListItem: false + tags:{ + unpaired: [], + nameFor:{ + cdata: false, + comment: false, + text: '#text' + }, + separateTextProperty: false, + valueParsers: [] + }, + attributes:{ + ignore: false, + booleanType: true, + entities: true + }, + + // select: ["img[src]"], + // stop: ["anim", "[ads]"] + only: [], // rest tags will be skipped. It will result in flat array + hierarchy: false, //will be used when a particular tag is set to be parsed. + skip: [], // will be skipped from parse result. on('skip') will be triggered + + select: [], // on('select', tag => tag ) will be called if match + stop: [], //given tagPath will not be parsed. innerXML will be set as string value + OutputBuilder: new JsArrBuilder(), +}; + +const buildOptions = function(options) { + const finalOptions = { ... defaultOptions}; + finalOptions.tags.valueParsers.push("trim"); + finalOptions.tags.valueParsers.push("entities"); + if(!this.preserveOrder) + finalOptions.tags.valueParsers.push("join"); + finalOptions.tags.valueParsers.push("boolean"); + finalOptions.tags.valueParsers.push("number"); + finalOptions.tags.valueParsers.push("currency"); + finalOptions.tags.valueParsers.push("date"); + copyProperties(finalOptions,options) + return finalOptions; +}; + +function copyProperties(target, source) { + for (let key in source) { + if (source.hasOwnProperty(key)) { + if (key === 'OutputBuilder') { + target[key] = source[key]; + }else if (typeof source[key] === 'object' && !Array.isArray(source[key])) { + // Recursively copy nested properties + if (typeof target[key] === 'undefined') { + target[key] = {}; + } + copyProperties(target[key], source[key]); + } else { + // Copy non-nested properties + target[key] = source[key]; + } + } + } +} + +exports.buildOptions = buildOptions; +exports.defaultOptions = defaultOptions; \ No newline at end of file diff --git a/src/v5/OutputBuilders/BaseOutputBuilder.js b/src/v5/OutputBuilders/BaseOutputBuilder.js new file mode 100644 index 00000000..d076dadb --- /dev/null +++ b/src/v5/OutputBuilders/BaseOutputBuilder.js @@ -0,0 +1,69 @@ +class BaseOutputBuilder{ + constructor(){ + // this.attributes = {}; + } + + addAttribute(name, value){ + if(this.options.onAttribute){ + //TODO: better to pass tag path + const v = this.options.onAttribute(name, value, this.tagName); + if(!v) this.attributes[v.name] = v.value; + }else{ + name = this.options.attributes.prefix + name + this.options.attributes.suffix; + this.attributes[name] = this.parseValue(value, this.options.attributes.valueParsers); + } + } + + /** + * parse value by chain of parsers + * @param {string} val + * @returns {any} parsed value if matching parser found + */ + parseValue = function(val, valParsers){ + for (let i = 0; i < valParsers.length; i++) { + let valParser = this.registeredParsers[valParsers[i]]; + if(valParser){ + val = valParser.parse(val); + // if(!valParser.chainable) break; + } + } + return val; + } + + /** + * To add a nested empty tag. + * @param {string} key + * @param {any} val + */ + _addChild(key, val){} + + /** + * skip the comment if property is not set + */ + addComment(text){ + if(this.options.nameFor.comment) + this._addChild(this.options.nameFor.comment, text); + } + + //store CDATA separately if property is set + //otherwise add to tag's value + addCdata(text){ + if (this.options.nameFor.cdata) { + this._addChild(this.options.nameFor.cdata, text); + } else { + this.addRawValue(text || ""); + } + } + + addRawValue = text => this.addValue(text); + + addDeclaration(){ + if(!this.options.declaration){ + }else{ + this.addPi("?xml"); + } + this.attributes = {} + } +} + +module.exports = BaseOutputBuilder; \ No newline at end of file diff --git a/src/v5/OutputBuilders/JsArrBuilder.js b/src/v5/OutputBuilders/JsArrBuilder.js new file mode 100644 index 00000000..70fbe851 --- /dev/null +++ b/src/v5/OutputBuilders/JsArrBuilder.js @@ -0,0 +1,102 @@ +const {buildOptions,registerCommonValueParsers} = require("./ParserOptionsBuilder"); + +class OutputBuilder{ + constructor(options){ + this.options = buildOptions(options); + this.registeredParsers = registerCommonValueParsers(); + } + + registerValueParser(name,parserInstance){//existing name will override the parser without warning + this.registeredParsers[name] = parserInstance; + } + + getInstance(){ + return new JsArrBuilder(this.options, this.registeredParsers); + } +} + +const rootName = '!js_arr'; +const BaseOutputBuilder = require("./BaseOutputBuilder"); + +class JsArrBuilder extends BaseOutputBuilder{ + + constructor(options,registeredParsers) { + super(); + this.tagsStack = []; + this.options = options; + this.registeredParsers = registeredParsers; + + this.root = new Node(rootName); + this.currentNode = this.root; + this.attributes = {}; + } + + addTag(tag){ + //when a new tag is added, it should be added as child of current node + //TODO: shift this check to the parser + if(tag.name === "__proto__") tag.name = "#__proto__"; + + this.tagsStack.push(this.currentNode); + this.currentNode = new Node(tag.name, this.attributes); + this.attributes = {}; + } + + /** + * Check if the node should be added by checking user's preference + * @param {Node} node + * @returns boolean: true if the node should not be added + */ + closeTag(){ + const node = this.currentNode; + this.currentNode = this.tagsStack.pop(); //set parent node in scope + if(this.options.onClose !== undefined){ + //TODO TagPathMatcher + const resultTag = this.options.onClose(node, + new TagPathMatcher(this.tagsStack,node)); + + if(resultTag) return; + } + this.currentNode.child.push(node); //to parent node + } + + //Called by parent class methods + _addChild(key, val){ + // if(key === "__proto__") tagName = "#__proto__"; + this.currentNode.child.push( {[key]: val }); + // this.currentNode.leafType = false; + } + + /** + * Add text value child node + * @param {string} text + */ + addValue(text){ + this.currentNode.child.push( {[this.options.nameFor.text]: this.parseValue(text, this.options.tags.valueParsers) }); + } + + addPi(name){ + //TODO: set pi flag + if(!this.options.ignorePiTags){ + const node = new Node(name, this.attributes); + this.currentNode[":@"] = this.attributes; + this.currentNode.child.push(node); + } + this.attributes = {}; + } + getOutput(){ + return this.root.child[0]; + } +} + + + +class Node{ + constructor(tagname, attributes){ + this.tagname = tagname; + this.child = []; //nested tags, text, cdata, comments + if(attributes && Object.keys(attributes).length > 0) + this[":@"] = attributes; + } +} + +module.exports = OutputBuilder; \ No newline at end of file diff --git a/src/v5/OutputBuilders/JsMinArrBuilder.js b/src/v5/OutputBuilders/JsMinArrBuilder.js new file mode 100644 index 00000000..b0cb4b53 --- /dev/null +++ b/src/v5/OutputBuilders/JsMinArrBuilder.js @@ -0,0 +1,101 @@ +const {buildOptions,registerCommonValueParsers} = require("./ParserOptionsBuilder"); + +class OutputBuilder{ + constructor(options){ + this.options = buildOptions(options); + this.registeredParsers = registerCommonValueParsers(); + } + + registerValueParser(name,parserInstance){//existing name will override the parser without warning + this.registeredParsers[name] = parserInstance; + } + + getInstance(){ + return new JsMinArrBuilder(this.options, this.registeredParsers); + } +} + +const BaseOutputBuilder = require("./BaseOutputBuilder"); +const rootName = '^'; + +class JsMinArrBuilder extends BaseOutputBuilder{ + + constructor(options,registeredParsers) { + super(); + this.tagsStack = []; + this.options = options; + this.registeredParsers = registeredParsers; + + this.root = {[rootName]: []}; + this.currentNode = this.root; + this.currentNodeTagName = rootName; + this.attributes = {}; + } + + addTag(tag){ + //when a new tag is added, it should be added as child of current node + //TODO: shift this check to the parser + if(tag.name === "__proto__") tag.name = "#__proto__"; + + this.tagsStack.push([this.currentNodeTagName,this.currentNode]); //this.currentNode is parent node here + this.currentNodeTagName = tag.name; + this.currentNode = { [tag.name]:[]} + if(Object.keys(this.attributes).length > 0){ + this.currentNode[":@"] = this.attributes; + this.attributes = {}; + } + } + + /** + * Check if the node should be added by checking user's preference + * @param {Node} node + * @returns boolean: true if the node should not be added + */ + closeTag(){ + const node = this.currentNode; + const nodeName = this.currentNodeTagName; + const arr = this.tagsStack.pop(); //set parent node in scope + this.currentNodeTagName = arr[0]; + this.currentNode = arr[1]; + + if(this.options.onClose !== undefined){ + //TODO TagPathMatcher + const resultTag = this.options.onClose(node, + new TagPathMatcher(this.tagsStack,node)); + + if(resultTag) return; + } + this.currentNode[this.currentNodeTagName].push(node); //to parent node + } + + //Called by parent class methods + _addChild(key, val){ + // if(key === "__proto__") tagName = "#__proto__"; + this.currentNode.push( {[key]: val }); + // this.currentNode.leafType = false; + } + + /** + * Add text value child node + * @param {string} text + */ + addValue(text){ + this.currentNode[this.currentNodeTagName].push( {[this.options.nameFor.text]: this.parseValue(text, this.options.tags.valueParsers) }); + } + + addPi(name){ + if(!this.options.ignorePiTags){ + const node = { [name]:[]} + if(this.attributes){ + node[":@"] = this.attributes; + } + this.currentNode.push(node); + } + this.attributes = {}; + } + getOutput(){ + return this.root[rootName]; + } +} + +module.exports = OutputBuilder; \ No newline at end of file diff --git a/src/v5/OutputBuilders/JsObjBuilder.js b/src/v5/OutputBuilders/JsObjBuilder.js new file mode 100644 index 00000000..1b2b9f80 --- /dev/null +++ b/src/v5/OutputBuilders/JsObjBuilder.js @@ -0,0 +1,155 @@ + + +const {buildOptions,registerCommonValueParsers} = require("./ParserOptionsBuilder"); + +class OutputBuilder{ + constructor(options){ + this.options = buildOptions(options); + this.registeredParsers = registerCommonValueParsers(); + } + + registerValueParser(name,parserInstance){//existing name will override the parser without warning + this.registeredParsers[name] = parserInstance; + } + + getInstance(){ + return new JsObjBuilder(this.options, this.registeredParsers); + } +} + +const BaseOutputBuilder = require("./BaseOutputBuilder"); +const rootName = '^'; + +class JsObjBuilder extends BaseOutputBuilder{ + + constructor(options,registeredParsers) { + super(); + //hold the raw detail of a tag and sequence with reference to the output + this.tagsStack = []; + this.options = options; + this.registeredParsers = registeredParsers; + + this.root = {}; + this.parent = this.root; + this.tagName = rootName; + this.value = {}; + this.textValue = ""; + this.attributes = {}; + } + + addTag(tag){ + + let value = ""; + if( !isEmpty(this.attributes)){ + value = {}; + if(this.options.attributes.groupBy){ + value[this.options.attributes.groupBy] = this.attributes; + }else{ + value = this.attributes; + } + } + + this.tagsStack.push([this.tagName, this.textValue, this.value]); //parent tag, parent text value, parent tag value (jsobj) + this.tagName = tag.name; + this.value = value; + this.textValue = ""; + this.attributes = {}; + } + + /** + * Check if the node should be added by checking user's preference + * @param {Node} node + * @returns boolean: true if the node should not be added + */ + closeTag(){ + const tagName = this.tagName; + let value = this.value; + let textValue = this.textValue; + + //update tag text value + if(typeof value !== "object" && !Array.isArray(value)){ + value = this.parseValue(textValue.trim(), this.options.tags.valueParsers); + }else if(textValue.length > 0){ + value[this.options.nameFor.text] = this.parseValue(textValue.trim(), this.options.tags.valueParsers); + } + + + let resultTag= { + tagName: this.tagName, + value: value + }; + + if(this.options.onTagClose !== undefined){ + //TODO TagPathMatcher + resultTag = this.options.onClose(this.tagName, value, this.textValue, new TagPathMatcher(this.tagsStack,node)); + + if(!resultTag) return; + } + + //set parent node in scope + let arr = this.tagsStack.pop(); + let parentTag = arr[2]; + parentTag=this._addChildTo(resultTag.tagName, resultTag.value, parentTag); + + this.tagName = arr[0]; + this.textValue = arr[1]; + this.value = parentTag; + } + + _addChild(key, val){ + if(typeof this.value === "string"){ + this.value = { [this.options.nameFor.text] : this.value }; + } + + this._addChildTo(key, val, this.value); + // this.currentNode.leafType = false; + this.attributes = {}; + } + + _addChildTo(key, val, node){ + if(typeof node === 'string') node = {}; + if(!node[key]){ + node[key] = val; + }else{ //Repeated + if(!Array.isArray(node[key])){ //but not stored as array + node[key] = [node[key]]; + } + node[key].push(val); + } + return node; + } + + + /** + * Add text value child node + * @param {string} text + */ + addValue(text){ + //TODO: use bytes join + if(this.textValue.length > 0) this.textValue += " " + text; + else this.textValue = text; + } + + addPi(name){ + let value = ""; + if( !isEmpty(this.attributes)){ + value = {}; + if(this.options.attributes.groupBy){ + value[this.options.attributes.groupBy] = this.attributes; + }else{ + value = this.attributes; + } + } + this._addChild(name, value); + + } + getOutput(){ + return this.value; + } +} + +function isEmpty(obj) { + return Object.keys(obj).length === 0; +} + +module.exports = OutputBuilder; \ No newline at end of file diff --git a/src/v5/OutputBuilders/ParserOptionsBuilder.js b/src/v5/OutputBuilders/ParserOptionsBuilder.js new file mode 100644 index 00000000..5c20e72e --- /dev/null +++ b/src/v5/OutputBuilders/ParserOptionsBuilder.js @@ -0,0 +1,94 @@ +const trimParser = require("../valueParsers/trim") +const booleanParser = require("../valueParsers/booleanParser") +const currencyParser = require("../valueParsers/currency") +const numberParser = require("../valueParsers/number") + +const defaultOptions={ + nameFor:{ + text: "#text", + comment: "", + cdata: "", + }, + // onTagClose: () => {}, + // onAttribute: () => {}, + piTag: false, + declaration: false, //"?xml" + tags: { + valueParsers: [ + // "trim", + // "boolean", + // "number", + // "currency", + // "date", + ] + }, + attributes:{ + prefix: "@_", + suffix: "", + groupBy: "", + + valueParsers: [ + // "trim", + // "boolean", + // "number", + // "currency", + // "date", + ] + } +} +function buildOptions(options){ + //clone + const finalOptions = { ... defaultOptions}; + + //add config missed in cloning + finalOptions.tags.valueParsers.push("trim") + finalOptions.tags.valueParsers.push("boolean") + finalOptions.tags.valueParsers.push("number") + finalOptions.tags.valueParsers.push("currency") + + //add config missed in cloning + finalOptions.attributes.valueParsers.push("trim") + finalOptions.attributes.valueParsers.push("boolean") + finalOptions.attributes.valueParsers.push("number") + finalOptions.attributes.valueParsers.push("currency") + + copyProperties(finalOptions,options); + return finalOptions; +} + +function copyProperties(target, source) { + for (let key in source) { + if (source.hasOwnProperty(key)) { + if (typeof source[key] === 'object' && !Array.isArray(source[key])) { + // Recursively copy nested properties + if (typeof target[key] === 'undefined') { + target[key] = {}; + } + copyProperties(target[key], source[key]); + } else { + // Copy non-nested properties + target[key] = source[key]; + } + } + } +} + +function registerCommonValueParsers(){ + return { + "trim": new trimParser(), + // "join": this.entityParser.parse, + "boolean": new booleanParser(), + "number": new numberParser({ + hex: true, + leadingZeros: true, + eNotation: true + }), + "currency": new currencyParser(), + // "date": this.entityParser.parse, + } +} + +module.exports = { + buildOptions : buildOptions, + registerCommonValueParsers: registerCommonValueParsers +} \ No newline at end of file diff --git a/src/v5/Report.js b/src/v5/Report.js new file mode 100644 index 00000000..e69de29b diff --git a/src/v5/TagPath.js b/src/v5/TagPath.js new file mode 100644 index 00000000..d901cc3c --- /dev/null +++ b/src/v5/TagPath.js @@ -0,0 +1,81 @@ +class TagPath{ + constructor(pathStr){ + let text = ""; + let tName = ""; + let pos; + let aName = ""; + let aVal = ""; + this.stack = [] + + for (let i = 0; i < pathStr.length; i++) { + let ch = pathStr[i]; + if(ch === " ") { + if(text.length === 0) continue; + tName = text; text = ""; + }else if(ch === "["){ + if(tName.length === 0){ + tName = text; text = ""; + } + i++; + for (; i < pathStr.length; i++) { + ch = pathStr[i]; + if(ch=== "=") continue; + else if(ch=== "]") {aName = text.trim(); text=""; break; i--;} + else if(ch === "'" || ch === '"'){ + let attrEnd = pathStr.indexOf(ch,i+1); + aVal = pathStr.substring(i+1, attrEnd); + i = attrEnd; + }else{ + text +=ch; + } + } + }else if(ch !== " " && text.length === 0 && tName.length > 0){//reading tagName + //save previous tag + this.stack.push(new TagPathNode(tName,pos,aName,aVal)); + text = ch; tName = ""; aName = ""; aVal = ""; + }else{ + text+=ch; + } + } + + //last tag in the path + if(tName.length >0 || text.length>0){ + this.stack.push(new TagPathNode(text||tName,pos,aName,aVal)); + } + } + + match(tagStack,node){ + if(this.stack[0].name !== "*"){ + if(this.stack.length !== tagStack.length +1) return false; + + //loop through tagPath and tagStack and match + for (let i = 0; i < this.tagStack.length; i++) { + if(!this.stack[i].match(tagStack[i])) return false; + } + } + if(!this.stack[this.stack.length - 1].match(node)) return false; + return true; + } +} + +class TagPathNode{ + constructor(name,position,attrName,attrVal){ + this.name = name; + this.position = position; + this.attrName = attrName, + this.attrVal = attrVal; + } + + match(node){ + let matching = true; + matching = node.name === this.name; + if(this.position) matching = node.position === this.position; + if(this.attrName) matching = node.attrs[this.attrName !== undefined]; + if(this.attrVal) matching = node.attrs[this.attrName !== this.attrVal]; + return matching; + } +} + +// console.log((new TagPath("* b[b]")).stack); +// console.log((new TagPath("a[a] b[b] c")).stack); +// console.log((new TagPath(" b [ b= 'cf sdadwa' ] a ")).stack); \ No newline at end of file diff --git a/src/v5/TagPathMatcher.js b/src/v5/TagPathMatcher.js new file mode 100644 index 00000000..af236070 --- /dev/null +++ b/src/v5/TagPathMatcher.js @@ -0,0 +1,15 @@ +const TagPath = require("./TagPath"); + +class TagPathMatcher{ + constructor(stack,node){ + this.stack = stack; + this.node= node; + } + + match(path){ + const tagPath = new TagPath(path); + return tagPath.match(this.stack, this.node); + } +} + +module.exports = TagPathMatcher; \ No newline at end of file diff --git a/src/v5/XMLParser.js b/src/v5/XMLParser.js new file mode 100755 index 00000000..6de58ed1 --- /dev/null +++ b/src/v5/XMLParser.js @@ -0,0 +1,85 @@ +const { buildOptions} = require("./OptionsBuilder"); +const Xml2JsParser = require("./Xml2JsParser"); + +class XMLParser{ + + constructor(options){ + this.externalEntities = {}; + this.options = buildOptions(options); + // console.log(this.options) + } + /** + * Parse XML data string to JS object + * @param {string|Buffer} xmlData + * @param {boolean|Object} validationOption + */ + parse(xmlData){ + if(Array.isArray(xmlData) && xmlData.byteLength !== undefined){ + return this.parse(xmlData); + }else if( xmlData.toString){ + xmlData = xmlData.toString(); + }else{ + throw new Error("XML data is accepted in String or Bytes[] form.") + } + // if( validationOption){ + // if(validationOption === true) validationOption = {}; //validate with default options + + // const result = validator.validate(xmlData, validationOption); + // if (result !== true) { + // throw Error( `${result.err.msg}:${result.err.line}:${result.err.col}` ) + // } + // } + const parser = new Xml2JsParser(this.options); + parser.entityParser.addExternalEntities(this.externalEntities); + return parser.parse(xmlData); + } + /** + * Parse XML data buffer to JS object + * @param {string|Buffer} xmlData + * @param {boolean|Object} validationOption + */ + parseBytesArr(xmlData){ + if(Array.isArray(xmlData) && xmlData.byteLength !== undefined){ + }else{ + throw new Error("XML data is accepted in Bytes[] form.") + } + const parser = new Xml2JsParser(this.options); + parser.entityParser.addExternalEntities(this.externalEntities); + return parser.parseBytesArr(xmlData); + } + /** + * Parse XML data stream to JS object + * @param {fs.ReadableStream} xmlDataStream + */ + parseStream(xmlDataStream){ + if(!isStream(xmlDataStream)) throw new Error("FXP: Invalid stream input"); + + const orderedObjParser = new Xml2JsParser(this.options); + orderedObjParser.entityParser.addExternalEntities(this.externalEntities); + return orderedObjParser.parseStream(xmlDataStream); + } + + /** + * Add Entity which is not by default supported by this library + * @param {string} key + * @param {string} value + */ + addEntity(key, value){ + if(value.indexOf("&") !== -1){ + throw new Error("Entity value can't have '&'") + }else if(key.indexOf("&") !== -1 || key.indexOf(";") !== -1){ + throw new Error("An entity must be set without '&' and ';'. Eg. use '#xD' for ' '") + }else if(value === "&"){ + throw new Error("An entity with value '&' is not permitted"); + }else{ + this.externalEntities[key] = value; + } + } +} + +function isStream(stream){ + if(stream && typeof stream.read === "function" && typeof stream.on === "function" && typeof stream.readableEnded === "boolean") return true; + return false; +} + +module.exports = XMLParser; \ No newline at end of file diff --git a/src/v5/Xml2JsParser.js b/src/v5/Xml2JsParser.js new file mode 100644 index 00000000..c4baab45 --- /dev/null +++ b/src/v5/Xml2JsParser.js @@ -0,0 +1,237 @@ +const StringSource = require("./inputSource/StringSource"); +const BufferSource = require("./inputSource/BufferSource"); +const {readTagExp,readClosingTagName} = require("./XmlPartReader"); +const {readComment, readCdata,readDocType,readPiTag} = require("./XmlSpecialTagsReader"); +const TagPath = require("./TagPath"); +const TagPathMatcher = require("./TagPathMatcher"); +const EntitiesParser = require('./EntitiesParser'); + +//To hold the data of current tag +//This is usually used to compare jpath expression against current tag +class TagDetail{ + constructor(name){ + this.name = name; + this.position = 0; + // this.attributes = {}; + } +} + +class Xml2JsParser { + constructor(options) { + this.options = options; + + this.currentTagDetail = null; + this.tagTextData = ""; + this.tagsStack = []; + this.entityParser = new EntitiesParser(options.htmlEntities); + this.stopNodes = []; + for (let i = 0; i < this.options.stopNodes.length; i++) { + this.stopNodes.push(new TagPath(this.options.stopNodes[i])); + } + } + + parse(strData) { + this.source = new StringSource(strData); + this.parseXml(); + return this.outputBuilder.getOutput(); + } + parseBytesArr(data) { + this.source = new BufferSource(data ); + this.parseXml(); + return this.outputBuilder.getOutput(); + } + + parseXml() { + //TODO: Separate TagValueParser as separate class. So no scope issue in node builder class + + //OutputBuilder should be set in XML Parser + this.outputBuilder = this.options.OutputBuilder.getInstance(this.options); + this.root = { root: true}; + this.currentTagDetail = this.root; + + while(this.source.canRead()){ + let ch = this.source.readCh(); + if (ch === "") break; + + if(ch === "<"){//tagStart + let nextChar = this.source.readChAt(0); + if (nextChar === "" ) throw new Error("Unexpected end of source"); + + + if(nextChar === "!" || nextChar === "?"){ + this.source.updateBufferBoundary(); + //previously collected text should be added to current node + this.addTextNode(); + + this.readSpecialTag(nextChar);// Read DOCTYPE, comment, CDATA, PI tag + }else if(nextChar === "/"){ + this.source.updateBufferBoundary(); + this.readClosingTag(); + // console.log(this.source.buffer.length, this.source.readable); + // console.log(this.tagsStack.length); + }else{//opening tag + this.readOpeningTag(); + } + }else{ + this.tagTextData += ch; + } + }//End While loop + if(this.tagsStack.length > 0 || ( this.tagTextData !== "undefined" && this.tagTextData.trimEnd().length > 0) ) throw new Error("Unexpected data in the end of document"); + } + + /** + * read closing paired tag. Set parent tag in scope. + * skip a node on user's choice + */ + readClosingTag(){ + const tagName = this.processTagName(readClosingTagName(this.source)); + // console.log(tagName, this.tagsStack.length); + this.validateClosingTag(tagName); + // All the text data collected, belongs to current tag. + if(!this.currentTagDetail.root) this.addTextNode(); + this.outputBuilder.closeTag(); + // Since the tag is closed now, parent tag comes in scope + this.currentTagDetail = this.tagsStack.pop(); + } + + validateClosingTag(tagName){ + // This can't be unpaired tag, or a stop tag. + if(this.isUnpaired(tagName) || this.isStopNode(tagName)) throw new Error(`Unexpected closing tag '${tagName}'`); + // This must match with last opening tag + else if(tagName !== this.currentTagDetail.name) + throw new Error(`Unexpected closing tag '${tagName}' expecting '${this.currentTagDetail.name}'`) + } + + /** + * Read paired, unpaired, self-closing, stop and special tags. + * Create a new node + * Push paired tag in stack. + */ + readOpeningTag(){ + //save previously collected text data to current node + this.addTextNode(); + + //create new tag + let tagExp = readTagExp(this, ">" ); + + // process and skip from tagsStack For unpaired tag, self closing tag, and stop node + const tagDetail = new TagDetail(tagExp.tagName); + if(this.isUnpaired(tagExp.tagName)) { + //TODO: this will lead 2 extra stack operation + this.outputBuilder.addTag(tagDetail); + this.outputBuilder.closeTag(); + } else if(tagExp.selfClosing){ + this.outputBuilder.addTag(tagDetail); + this.outputBuilder.closeTag(); + } else if(this.isStopNode(this.currentTagDetail)){ + // TODO: let's user set a stop node boundary detector for complex contents like script tag + //TODO: pass tag name only to avoid string operations + const content = source.readUptoCloseTag(` 0){ + //TODO: shift parsing to output builder + + this.outputBuilder.addValue(this.replaceEntities(this.tagTextData)); + } + this.tagTextData = ""; + } + // } + } + + processAttrName(name){ + if(name === "__proto__") name = "#__proto__"; + name = resolveNameSpace(name, this.removeNSPrefix); + return name; + } + + processTagName(name){ + if(name === "__proto__") name = "#__proto__"; + name = resolveNameSpace(name, this.removeNSPrefix); + return name; + } + + /** + * Generate tags path from tagsStack + */ + tagsPath(tagName){ + //TODO: return TagPath Object. User can call match method with path + return ""; + } + + isUnpaired(tagName){ + return this.options.tags.unpaired.indexOf(tagName) !== -1; + } + + /** + * valid expressions are + * tag nested + * * nested + * tag nested[attribute] + * tag nested[attribute=""] + * tag nested[attribute!=""] + * tag nested:0 //for future + * @param {string} tagName + * @returns + */ + isStopNode(node){ + for (let i = 0; i < this.stopNodes.length; i++) { + const givenPath = this.stopNodes[i]; + if(givenPath.match(this.tagsStack, node)) return true; + } + return false + } + + replaceEntities(text){ + //TODO: if option is set then replace entities + return this.entityParser.parse(text) + } +} + +function resolveNameSpace(name, removeNSPrefix) { + if (removeNSPrefix) { + const parts = name.split(':'); + if(parts.length === 2){ + if (parts[0] === 'xmlns') return ''; + else return parts[1]; + }else reportError(`Multiple namespaces ${name}`) + } + return name; +} + +module.exports = Xml2JsParser; \ No newline at end of file diff --git a/src/v5/XmlPartReader.js b/src/v5/XmlPartReader.js new file mode 100644 index 00000000..56b180e9 --- /dev/null +++ b/src/v5/XmlPartReader.js @@ -0,0 +1,212 @@ +'use strict'; + +/** + * find paired tag for a stop node + * @param {string} xmlDoc + * @param {string} tagName + * @param {number} i : start index + */ +function readStopNode(xmlDoc, tagName, i){ + const startIndex = i; + // Starting at 1 since we already have an open tag + let openTagCount = 1; + + for (; i < xmlDoc.length; i++) { + if( xmlDoc[i] === "<"){ + if (xmlDoc[i+1] === "/") {//close tag + const closeIndex = findSubStrIndex(xmlDoc, ">", i, `${tagName} is not closed`); + let closeTagName = xmlDoc.substring(i+2,closeIndex).trim(); + if(closeTagName === tagName){ + openTagCount--; + if (openTagCount === 0) { + return { + tagContent: xmlDoc.substring(startIndex, i), + i : closeIndex + } + } + } + i=closeIndex; + } else if(xmlDoc[i+1] === '?') { + const closeIndex = findSubStrIndex(xmlDoc, "?>", i+1, "StopNode is not closed.") + i=closeIndex; + } else if(xmlDoc.substr(i + 1, 3) === '!--') { + const closeIndex = findSubStrIndex(xmlDoc, "-->", i+3, "StopNode is not closed.") + i=closeIndex; + } else if(xmlDoc.substr(i + 1, 2) === '![') { + const closeIndex = findSubStrIndex(xmlDoc, "]]>", i, "StopNode is not closed.") - 2; + i=closeIndex; + } else { + const tagData = readTagExp(xmlDoc, i, '>') + + if (tagData) { + const openTagName = tagData && tagData.tagName; + if (openTagName === tagName && tagData.tagExp[tagData.tagExp.length-1] !== "/") { + openTagCount++; + } + i=tagData.closeIndex; + } + } + } + }//end for loop +} + +/** + * Read closing tag name + * @param {Source} source + * @returns tag name + */ +function readClosingTagName(source){ + let text = ""; //temporary data + while(source.canRead()){ + let ch = source.readCh(); + // if (ch === null || ch === undefined) break; + // source.updateBuffer(); + + if (ch === ">") return text.trimEnd(); + else text += ch; + } + throw new Error(`Unexpected end of source. Reading '${substr}'`); +} + +/** + * Read XML tag and build attributes map + * This function can be used to read normal tag, pi tag. + * This function can't be used to read comment, CDATA, DOCTYPE. + * Eg + * @param {string} xmlDoc + * @param {number} startIndex starting index + * @returns tag expression includes tag name & attribute string + */ +function readTagExp(parser) { + let inSingleQuotes = false; + let inDoubleQuotes = false; + let i; + let EOE = false; + + for (i = 0; parser.source.canRead(i); i++) { + const char = parser.source.readChAt(i); + + if (char === "'" && !inDoubleQuotes) { + inSingleQuotes = !inSingleQuotes; + } else if (char === '"' && !inSingleQuotes) { + inDoubleQuotes = !inDoubleQuotes; + } else if (char === '>' && !inSingleQuotes && !inDoubleQuotes) { + // If not inside quotes, stop reading at '>' + EOE = true; + break; + } + + } + if(inSingleQuotes || inDoubleQuotes){ + throw new Error("Invalid attribute expression. Quote is not properly closed"); + }else if(!EOE) throw new Error("Unexpected closing of source. Waiting for '>'"); + + + const exp = parser.source.readStr(i); + parser.source.updateBufferBoundary(i + 1); + return buildTagExpObj(exp, parser) +} + +function readPiExp(parser) { + let inSingleQuotes = false; + let inDoubleQuotes = false; + let i; + let EOE = false; + + for (i = 0; parser.source.canRead(i) ; i++) { + const currentChar = parser.source.readChAt(i); + const nextChar = parser.source.readChAt(i+1); + + if (currentChar === "'" && !inDoubleQuotes) { + inSingleQuotes = !inSingleQuotes; + } else if (currentChar === '"' && !inSingleQuotes) { + inDoubleQuotes = !inDoubleQuotes; + } + + if (!inSingleQuotes && !inDoubleQuotes) { + if (currentChar === '?' && nextChar === '>') { + EOE = true; + break; // Exit the loop when '?>' is found + } + } + } + if(inSingleQuotes || inDoubleQuotes){ + throw new Error("Invalid attribute expression. Quote is not properly closed in PI tag expression"); + }else if(!EOE) throw new Error("Unexpected closing of source. Waiting for '?>'"); + + if(!parser.options.attributes.ignore){ + //TODO: use regex to verify attributes if not set to ignore + } + + const exp = parser.source.readStr(i); + parser.source.updateBufferBoundary(i + 1); + return buildTagExpObj(exp, parser) +} + +function buildTagExpObj(exp, parser){ + const tagExp = { + tagName: "", + selfClosing: false + }; + let attrsExp = ""; + + if(exp[exp.length -1] === "/") tagExp.selfClosing = true; + + //separate tag name + let i = 0; + for (; i < exp.length; i++) { + const char = exp[i]; + if(char === " "){ + tagExp.tagName = exp.substring(0, i); + attrsExp = exp.substring(i + 1); + break; + } + } + //only tag + if(tagExp.tagName.length === 0 && i === exp.length)tagExp.tagName = exp; + + tagExp.tagName = tagExp.tagName.trimEnd(); + + if(!parser.options.attributes.ignore && attrsExp.length > 0){ + parseAttributesExp(attrsExp,parser) + } + + return tagExp; +} + +const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm'); + +function parseAttributesExp(attrStr, parser) { + const matches = getAllMatches(attrStr, attrsRegx); + const len = matches.length; //don't make it inline + for (let i = 0; i < len; i++) { + let attrName = parser.processAttrName(matches[i][1]); + let attrVal = parser.replaceEntities(matches[i][4] || true); + + parser.outputBuilder.addAttribute(attrName, attrVal); + } +} + + +const getAllMatches = function(string, regex) { + const matches = []; + let match = regex.exec(string); + while (match) { + const allmatches = []; + allmatches.startIndex = regex.lastIndex - match[0].length; + const len = match.length; + for (let index = 0; index < len; index++) { + allmatches.push(match[index]); + } + matches.push(allmatches); + match = regex.exec(string); + } + return matches; +}; + +module.exports = { + readStopNode: readStopNode, + readClosingTagName: readClosingTagName, + readTagExp: readTagExp, + readPiExp: readPiExp, +} \ No newline at end of file diff --git a/src/v5/XmlSpecialTagsReader.js b/src/v5/XmlSpecialTagsReader.js new file mode 100644 index 00000000..0fba196a --- /dev/null +++ b/src/v5/XmlSpecialTagsReader.js @@ -0,0 +1,118 @@ +const {readPiExp} = require("./XmlPartReader"); + +function readCdata(parser){ + //"); + parser.outputBuilder.addCdata(text); +} +function readPiTag(parser){ + //"); + if(!tagExp) throw new Error("Invalid Pi Tag expression."); + + if (tagExp.tagName === "?xml") {//TODO: test if tagName is just xml + parser.outputBuilder.addDeclaration(); + } else { + parser.outputBuilder.addPi("?"+tagExp.tagName); + } +} + +function readComment(parser){ + //"); + parser.outputBuilder.addComment(text); +} + +const DOCTYPE_tags = { + "EL":/^EMENT\s+([^\s>]+)\s+(ANY|EMPTY|\(.+\)\s*$)/m, + "AT":/^TLIST\s+[^\s]+\s+[^\s]+\s+[^\s]+\s+[^\s]+\s+$/m, + "NO":/^TATION.+$/m +} +function readDocType(parser){ + //"); + const regx = DOCTYPE_tags[str]; + if(regx){ + const match = dTagExp.match(regx); + if(!match) throw new Error("Invalid DOCTYPE"); + }else throw new Error("Invalid DOCTYPE"); + } + }else if( ch === '>' && lastch === "]"){//end of doctype + return; + } + }else if( ch === '>'){//end of doctype + return; + }else if( ch === '['){ + hasBody = true; + }else{ + lastch = ch; + } + }//End While loop + +} + +function registerEntity(parser){ + //read Entity + let attrBoundary=""; + let name ="", val =""; + while(source.canRead()){ + let ch = source.readCh(); + + if(attrBoundary){ + if (ch === attrBoundary){ + val = text; + text = "" + } + }else if(ch === " " || ch === "\t"){ + if(!name){ + name = text.trimStart(); + text = ""; + } + }else if (ch === '"' || ch === "'") {//start of attrBoundary + attrBoundary = ch; + }else if(ch === ">"){ + parser.entityParser.addExternalEntity(name,val); + return; + }else{ + text+=ch; + } + } +} + +module.exports = { + readCdata: readCdata, + readComment:readComment, + readDocType:readDocType, + readPiTag:readPiTag +} \ No newline at end of file diff --git a/src/v5/inputSource/BufferSource.js b/src/v5/inputSource/BufferSource.js new file mode 100644 index 00000000..b83ce460 --- /dev/null +++ b/src/v5/inputSource/BufferSource.js @@ -0,0 +1,118 @@ +const Constants = { + space: 32, + tab: 9 +} +class BufferSource{ + constructor(bytesArr){ + this.line = 1; + this.cols = 0; + this.buffer = bytesArr; + this.startIndex = 0; + } + + + + readCh() { + return String.fromCharCode(this.buffer[this.startIndex++]); + } + + readChAt(index) { + return String.fromCharCode(this.buffer[this.startIndex+index]); + } + + readStr(n,from){ + if(typeof from === "undefined") from = this.startIndex; + return this.buffer.slice(from, from + n).toString(); + } + + readUpto(stopStr) { + const inputLength = this.buffer.length; + const stopLength = stopStr.length; + const stopBuffer = Buffer.from(stopStr); + + for (let i = this.startIndex; i < inputLength; i++) { + let match = true; + for (let j = 0; j < stopLength; j++) { + if (this.buffer[i + j] !== stopBuffer[j]) { + match = false; + break; + } + } + + if (match) { + const result = this.buffer.slice(this.startIndex, i).toString(); + this.startIndex = i + stopLength; + return result; + } + } + + throw new Error(`Unexpected end of source. Reading '${stopStr}'`); +} + +readUptoCloseTag(stopStr) { //stopStr: "'){ //TODO: if it should be equivalent ASCII + match = 2; + //tag boundary found + // this.startIndex + } + }else{ + match = 1; + for (let j = 0; j < stopLength; j++) { + if (this.buffer[i + j] !== stopBuffer[j]) { + match = 0; + break; + } + } + } + if (match === 2) {//matched closing part + const result = this.buffer.slice(this.startIndex, stopIndex - 1 ).toString(); + this.startIndex = i + 1; + return result; + } + } + + throw new Error(`Unexpected end of source. Reading '${stopStr}'`); +} + + readFromBuffer(n, shouldUpdate) { + let ch; + if (n === 1) { + ch = this.buffer[this.startIndex]; + if (ch === 10) { + this.line++; + this.cols = 1; + } else { + this.cols++; + } + ch = String.fromCharCode(ch); + } else { + this.cols += n; + ch = this.buffer.slice(this.startIndex, this.startIndex + n).toString(); + } + if (shouldUpdate) this.updateBuffer(n); + return ch; + } + + updateBufferBoundary(n = 1) { //n: number of characters read + this.startIndex += n; + } + + canRead(n){ + n = n || this.startIndex; + return this.buffer.length - n + 1 > 0; + } + +} + +module.exports = BufferSource; \ No newline at end of file diff --git a/src/v5/inputSource/StringSource.js b/src/v5/inputSource/StringSource.js new file mode 100644 index 00000000..a996528b --- /dev/null +++ b/src/v5/inputSource/StringSource.js @@ -0,0 +1,123 @@ +const whiteSpaces = [" ", "\n", "\t"]; + + +class StringSource{ + constructor(str){ + this.line = 1; + this.cols = 0; + this.buffer = str; + //a boundary pointer to indicate where from the buffer dat should be read + // data before this pointer can be deleted to free the memory + this.startIndex = 0; + } + + readCh() { + return this.buffer[this.startIndex++]; + } + + readChAt(index) { + return this.buffer[this.startIndex+index]; + } + + readStr(n,from){ + if(typeof from === "undefined") from = this.startIndex; + return this.buffer.substring(from, from + n); + } + + readUpto(stopStr) { + const inputLength = this.buffer.length; + const stopLength = stopStr.length; + + for (let i = this.startIndex; i < inputLength; i++) { + let match = true; + for (let j = 0; j < stopLength; j++) { + if (this.buffer[i + j] !== stopStr[j]) { + match = false; + break; + } + } + + if (match) { + const result = this.buffer.substring(this.startIndex, i); + this.startIndex = i + stopLength; + return result; + } + } + + throw new Error(`Unexpected end of source. Reading '${stopStr}'`); + } + + readUptoCloseTag(stopStr) { //stopStr: "'){ + match = 2; + //tag boundary found + // this.startIndex + } + }else{ + match = 1; + for (let j = 0; j < stopLength; j++) { + if (this.buffer[i + j] !== stopStr[j]) { + match = 0; + break; + } + } + } + if (match === 2) {//matched closing part + const result = this.buffer.substring(this.startIndex, stopIndex - 1 ); + this.startIndex = i + 1; + return result; + } + } + + throw new Error(`Unexpected end of source. Reading '${stopStr}'`); + } + + readFromBuffer(n, updateIndex){ + let ch; + if(n===1){ + ch = this.buffer[this.startIndex]; + // if(ch === "\n") { + // this.line++; + // this.cols = 1; + // }else{ + // this.cols++; + // } + }else{ + ch = this.buffer.substring(this.startIndex, this.startIndex + n); + // if("".indexOf("\n") !== -1){ + // //TODO: handle the scenario when there are multiple lines + // //TODO: col should be set to number of chars after last '\n' + // // this.cols = 1; + // }else{ + // this.cols += n; + + // } + } + if(updateIndex) this.updateBufferBoundary(n); + return ch; + } + + //TODO: rename to updateBufferReadIndex + + updateBufferBoundary(n = 1) { //n: number of characters read + this.startIndex += n; + } + + canRead(n){ + n = n || this.startIndex; + return this.buffer.length - n + 1 > 0; + } + +} + +module.exports = StringSource; \ No newline at end of file diff --git a/src/v5/valueParsers/EntitiesParser.js b/src/v5/valueParsers/EntitiesParser.js new file mode 100644 index 00000000..d7dc400c --- /dev/null +++ b/src/v5/valueParsers/EntitiesParser.js @@ -0,0 +1,105 @@ +const ampEntity = { regex: /&(amp|#38|#x26);/g, val : "&"}; +const htmlEntities = { + "space": { regex: /&(nbsp|#160);/g, val: " " }, + // "lt" : { regex: /&(lt|#60);/g, val: "<" }, + // "gt" : { regex: /&(gt|#62);/g, val: ">" }, + // "amp" : { regex: /&(amp|#38);/g, val: "&" }, + // "quot" : { regex: /&(quot|#34);/g, val: "\"" }, + // "apos" : { regex: /&(apos|#39);/g, val: "'" }, + "cent" : { regex: /&(cent|#162);/g, val: "¢" }, + "pound" : { regex: /&(pound|#163);/g, val: "£" }, + "yen" : { regex: /&(yen|#165);/g, val: "¥" }, + "euro" : { regex: /&(euro|#8364);/g, val: "€" }, + "copyright" : { regex: /&(copy|#169);/g, val: "©" }, + "reg" : { regex: /&(reg|#174);/g, val: "®" }, + "inr" : { regex: /&(inr|#8377);/g, val: "₹" }, +}; + +class EntitiesParser{ + constructor(replaceHtmlEntities) { + this.replaceHtmlEntities = replaceHtmlEntities; + this.docTypeEntities = {}; + this.lastEntities = { + "apos" : { regex: /&(apos|#39|#x27);/g, val : "'"}, + "gt" : { regex: /&(gt|#62|#x3E);/g, val : ">"}, + "lt" : { regex: /&(lt|#60|#x3C);/g, val : "<"}, + "quot" : { regex: /&(quot|#34|#x22);/g, val : "\""}, + }; + } + + addExternalEntities(externalEntities){ + const entKeys = Object.keys(externalEntities); + for (let i = 0; i < entKeys.length; i++) { + const ent = entKeys[i]; + this.addExternalEntity(ent,externalEntities[ent]) + } + } + addExternalEntity(key,val){ + validateEntityName(key); + if(val.indexOf("&") !== -1) { + reportWarning(`Entity ${key} is not added as '&' is found in value;`) + return; + }else{ + this.lastEntities[ent] = { + regex: new RegExp("&"+key+";","g"), + val : val + } + } + } + + addDocTypeEntities(entities){ + const entKeys = Object.keys(entities); + for (let i = 0; i < entKeys.length; i++) { + const ent = entKeys[i]; + this.docTypeEntities[ent] = { + regex: new RegExp("&"+ent+";","g"), + val : entities[ent] + } + } + } + + parse(val){ + return this.replaceEntitiesValue(val) + } + + /** + * 1. Replace DOCTYPE entities + * 2. Replace external entities + * 3. Replace HTML entities if asked + * @param {string} val + */ + replaceEntitiesValue(val){ + if(typeof val === "string" && val.length > 0){ + for(let entityName in this.docTypeEntities){ + const entity = this.docTypeEntities[entityName]; + val = val.replace( entity.regx, entity.val); + } + for(let entityName in this.lastEntities){ + const entity = this.lastEntities[entityName]; + val = val.replace( entity.regex, entity.val); + } + if(this.replaceHtmlEntities){ + for(let entityName in htmlEntities){ + const entity = htmlEntities[entityName]; + val = val.replace( entity.regex, entity.val); + } + } + val = val.replace( ampEntity.regex, ampEntity.val); + } + return val; + } +}; + +//an entity name should not contains special characters that may be used in regex +//Eg !?\\\/[]$%{}^&*()<> +const specialChar = "!?\\\/[]$%{}^&*()<>|+"; + +function validateEntityName(name){ + for (let i = 0; i < specialChar.length; i++) { + const ch = specialChar[i]; + if(name.indexOf(ch) !== -1) throw new Error(`Invalid character ${ch} in entity name`); + } + return name; +} + +module.exports = EntitiesParser; \ No newline at end of file diff --git a/src/v5/valueParsers/booleanParser.js b/src/v5/valueParsers/booleanParser.js new file mode 100644 index 00000000..f8f5d12a --- /dev/null +++ b/src/v5/valueParsers/booleanParser.js @@ -0,0 +1,23 @@ +class boolParser{ + constructor(trueList, falseList){ + if(trueList) + this.trueList = trueList; + else + this.trueList = ["true"]; + + if(falseList) + this.falseList = falseList; + else + this.falseList = ["false"]; + } + parse(val){ + if (typeof val === 'string') { + //TODO: performance: don't convert + const temp = val.toLowerCase(); + if(this.trueList.indexOf(temp) !== -1) return true; + else if(this.falseList.indexOf(temp) !== -1 ) return false; + } + return val; + } +} +module.exports = boolParser; \ No newline at end of file diff --git a/src/v5/valueParsers/booleanParserExt.js b/src/v5/valueParsers/booleanParserExt.js new file mode 100644 index 00000000..21b80502 --- /dev/null +++ b/src/v5/valueParsers/booleanParserExt.js @@ -0,0 +1,20 @@ +function boolParserExt(val){ + if(isArray(val)){ + for (let i = 0; i < val.length; i++) { + val[i] = parse(val[i]) + } + }else{ + val = parse(val) + } + return val; +} + +function parse(val){ + if (typeof val === 'string') { + const temp = val.toLowerCase(); + if(temp === 'true' || temp ==="yes" || temp==="1") return true; + else if(temp === 'false' || temp ==="no" || temp==="0") return false; + } + return val; +} +module.exports = boolParserExt; \ No newline at end of file diff --git a/src/v5/valueParsers/currency.js b/src/v5/valueParsers/currency.js new file mode 100644 index 00000000..5ec00006 --- /dev/null +++ b/src/v5/valueParsers/currency.js @@ -0,0 +1,31 @@ + +const localeMap = { + "$":"en-US", + "€":"de-DE", + "£":"en-GB", + "¥":"ja-JP", + "₹":"en-IN", +} + +const currencyCheckRegex = /^\s*(?:-|\+)?(?:\d+|\d{1,3}(?:,\d{3})+)?(?:\.\d{1,2})?\s*(?:\$|€|¥|₹)?\s*$/u; + +class CurrencyParser{ + constructor(options){ + this.options = options; + } + parse(val){ + if (typeof val === 'string') { + if(val.indexOf(",,") !== -1 && val.indexOf(".." !== -1)){ + const match = val.match(currencyCheckRegex); + if(match){ + const locale = this.options.locale || localeMap[match[2]||match[5]||"₹"]; + const formatter = new Intl.NumberFormat(locale) + val = val.replace(/[^0-9,.]/g, '').trim(); + val = Number(val.replace(formatter.format(1000)[1], '')); + } + } + } + return val; + } +} +module.exports = CurrencyParser; \ No newline at end of file diff --git a/src/v5/valueParsers/join.js b/src/v5/valueParsers/join.js new file mode 100644 index 00000000..d7f2027d --- /dev/null +++ b/src/v5/valueParsers/join.js @@ -0,0 +1,14 @@ +/** + * + * @param {array} val + * @param {string} by + * @returns + */ +function join(val, by=" "){ + if(isArray(val)){ + val.join(by) + } + return val; +} + +module.exports = join; \ No newline at end of file diff --git a/src/v5/valueParsers/number.js b/src/v5/valueParsers/number.js new file mode 100644 index 00000000..bef3803c --- /dev/null +++ b/src/v5/valueParsers/number.js @@ -0,0 +1,16 @@ +const toNumber = require("strnum"); + + +class numParser{ + constructor(options){ + this.options = options; + } + parse(val){ + if (typeof val === 'string') { + val = toNumber(val,this.options); + } + return val; + } +} + +module.exports = numParser; \ No newline at end of file diff --git a/src/v5/valueParsers/trim.js b/src/v5/valueParsers/trim.js new file mode 100644 index 00000000..ecce49a1 --- /dev/null +++ b/src/v5/valueParsers/trim.js @@ -0,0 +1,8 @@ +class trimmer{ + parse(val){ + if(typeof val === "string") return val.trim(); + else return val; + } +} + +module.exports = trimmer; \ No newline at end of file