From 33d7a012a7235b99535b05236d204e779a1cbf29 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Tue, 11 Sep 2018 16:10:12 -0700 Subject: [PATCH] Fix #42: Improve price string parsing. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update how we pull the price string from extracted Fathom price elements to provide main and subunits (e.g. dollars and cents) if available. * Add price string cleaning methods to remove extra characters (like commas) that were causing price parsing to fail. * Handle case when price string parsing still fails after cleaning by checking in the background script that the price string is formatted correctly before rendering the browserAction popup. * This will guarantee we never see the “blank panel” reported in #79 and #88. Price element innerText strings now supported as a result of these changes: * "$1327 /each" ([Home Depot example page](https://www.homedepot.com/p/KitchenAid-Classic-4-5-Qt-Tilt-Head-White-Stand-Mixer-K45SSWH/202546032)) * "$1,049.00" ([Amazon example page](https://www.amazon.com/Fujifilm-X-T2-Mirrorless-F2-8-4-0-Lens/dp/B01I3LNQ6M/ref=sr_1_2?ie=UTF8&qid=1535594119&sr=8-2&keywords=fuji+xt2+camera)) * "US $789.99" ([Ebay example page](https://www.ebay.com/itm/Dell-Inspiron-7570-15-6-Touch-Laptop-i7-8550U-1-8GHz-8GB-1TB-NVIDIA-940MX-W10/263827294291)) * "$4.99+" ([Etsy example page](https://www.etsy.com/listing/555504975/frankenstein-2-custom-stencil?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=&ref=sr_gallery-1-13)) Note: This does not handle the case where there is more than one price for the product page (e.g. if we see a range of prices such as "$19.92 - $38.00" or if the price changes based on size/color, etc.); that’s handled by Issue #86. --- src/background/index.js | 16 ++++++ src/extraction/fathom_extraction.js | 75 +++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/src/background/index.js b/src/background/index.js index 44e46f6..3cbb879 100644 --- a/src/background/index.js +++ b/src/background/index.js @@ -21,6 +21,17 @@ import {extractedProductShape} from 'commerce/state/products'; import {loadStateFromStorage} from 'commerce/state/sync'; import {validatePropType} from 'commerce/utils'; +/** + * Returns true if price string is of form "$NX.XX". + */ +function isPriceFormattedCorrectly(priceStr) { + const pricePattern = /^\$([0-9]+)\.([0-9]{2})$/; + if (pricePattern.test(priceStr)) { + return true; + } + return false; +} + /** * Triggers background tasks when a product is extracted from a webpage. Along * with normal page navigation, this is also run when the prices are being @@ -37,6 +48,11 @@ function handleExtractedProductData(extractedProduct, sender) { return; } + // Do nothing if the extracted product's price is not of the form "$NX.XX". + if (!isPriceFormattedCorrectly(extractedProduct.price)) { + return; + } + // Update the toolbar icon's URL with the current page's product if we can if (sender.tab) { const url = new URL(BROWSER_ACTION_URL); diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom_extraction.js index 00cebbc..eee1eb9 100644 --- a/src/extraction/fathom_extraction.js +++ b/src/extraction/fathom_extraction.js @@ -35,12 +35,76 @@ function runRuleset(doc) { fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. if (fnodesList.length >= 1) { - extractedElements[feature] = fnodesList[0].element; + const element = fnodesList[0].element; + // Check for price units and subunits + if (feature === 'price' && element.children.length > 0) { + extractedElements[feature] = getPriceUnitElements(element); + continue; + } + extractedElements[feature] = element; } } return extractedElements; } +/** + * Returns true if the string contains a number. + */ +function hasNumber(string) { + return /\d/.test(string); +} + +/** + * Get the main and sub unit elements for the product price. + * + * @returns {Object} A string:element object with 'mainUnit' and 'subUnit' keys. + */ +function getPriceUnitElements(element) { + let isMainUnit = true; + const priceElements = {}; + // Loop through children: first element containing a digit is main unit, + // second is subunit. + for (const priceSubEle of element.children) { + if (hasNumber(priceSubEle.innerText)) { + if (isMainUnit) { + priceElements.mainUnit = priceSubEle; + isMainUnit = false; + } else { + priceElements.subUnit = priceSubEle; + } + } + } + return priceElements; +} + +/** + * Checks if a price object has subunits and returns a price string. + * + * @param {Object} If the price has subunits, an object literal, else an HTML element + */ +function getPriceString(priceObj) { + // Check for subunits e.g. dollars and cents. + if ('mainUnit' in priceObj) { + const mainUnitStr = priceObj.mainUnit.innerText; + const subUnitStr = priceObj.subUnit.innerText; + return cleanPriceString(`$${mainUnitStr}.${subUnitStr}`); + } + return cleanPriceString(priceObj.innerText); +} + + +/** + * Reformats price string to be of form "$NX.XX". + */ +function cleanPriceString(priceStr) { + // Remove any commas + let cleanedPriceStr = priceStr.replace(/,/g, ''); + // Remove any characters preceding the '$' and following the '.XX' + cleanedPriceStr = cleanedPriceStr.substring(cleanedPriceStr.indexOf('$')); + cleanedPriceStr = cleanedPriceStr.substring(0, cleanedPriceStr.indexOf('.') + 3); + return cleanedPriceStr; +} + /** * Returns true if every key in PRODUCT_FEATURES has a truthy value. */ @@ -58,9 +122,14 @@ export default function extractProduct(doc) { for (const feature of PRODUCT_FEATURES) { if (feature === 'image') { extractedProduct[feature] = extractedElements[feature].src; - } else { - extractedProduct[feature] = extractedElements[feature].innerText; + continue; + // Clean up price string and check for subunits + } else if (feature === 'price') { + const priceStr = getPriceString(extractedElements[feature]); + extractedProduct[feature] = priceStr; + continue; } + extractedProduct[feature] = extractedElements[feature].innerText; } } return hasAllFeatures(extractedProduct) ? extractedProduct : null;