From 5808d412b2a17b4afbbbb0a344d0ad7e88d08216 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Thu, 13 Sep 2018 14:47:11 -0700 Subject: [PATCH] Fix #84: Improve fallback extraction --- src/extraction/fallback_extraction.js | 44 +++------ .../fallback_extraction_selectors.json | 88 +++++++++++++++++ src/extraction/fathom_extraction.js | 66 +------------ src/extraction/product_extraction_data.json | 99 ------------------- src/product_info.js | 1 + src/utils.js | 98 ++++++++++++++++++ 6 files changed, 205 insertions(+), 191 deletions(-) create mode 100644 src/extraction/fallback_extraction_selectors.json delete mode 100644 src/extraction/product_extraction_data.json diff --git a/src/extraction/fallback_extraction.js b/src/extraction/fallback_extraction.js index 5b3535f..3d5bbc1 100644 --- a/src/extraction/fallback_extraction.js +++ b/src/extraction/fallback_extraction.js @@ -10,7 +10,9 @@ * Features: title, image, price */ -import extractionData from 'commerce/extraction/product_extraction_data.json'; +import extractionData from 'commerce/extraction/fallback_extraction_selectors.json'; +import {getPriceString, extractValueFromElement} from 'commerce/utils'; + const OPEN_GRAPH_PROPERTY_VALUES = { title: 'og:title', @@ -23,34 +25,16 @@ const OPEN_GRAPH_PROPERTY_VALUES = { * for the page. */ function getProductAttributeInfo() { - const hostname = new URL(window.location.href).host; - for (const [vendor, attributeInfo] of Object.entries(extractionData)) { - if (hostname.includes(vendor)) { + const url = window.location.href; + for (const [regExpStr, attributeInfo] of Object.entries(extractionData)) { + const regExp = new RegExp(regExpStr); + if (regExp.test(url)) { return attributeInfo; } } return null; } -/** - * Extracts and returns the string value for a given element property or attribute. 
- * - * @param {HTMLElement} element - * @param {string} extractionProperty - */ -function extractValueFromElement(element, extractionProperty) { - switch (extractionProperty) { - case 'content': - return element.getAttribute('content'); - case 'innerText': - return element.innerText; - case 'src': - return element.src; - default: - throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`); - } -} - /** * Returns any product information available on the page from CSS * selectors if they exist, otherwise from Open Graph tags. @@ -59,18 +43,22 @@ export default function extractProduct() { const data = {}; const attributeInfo = getProductAttributeInfo(); if (attributeInfo) { - for (const [productAttribute, extractor] of Object.entries(attributeInfo)) { - const {selectors, extractUsing} = extractor; - for (const selector of selectors) { + for (const [productAttribute, tuples] of Object.entries(attributeInfo)) { + for (const tuple of tuples) { + const [selector, extractUsing] = tuple; const element = document.querySelector(selector); if (element) { - data[productAttribute] = extractValueFromElement(element, extractUsing); + if (productAttribute === 'price') { + data[productAttribute] = getPriceString(element, extractUsing); + } else { + data[productAttribute] = extractValueFromElement(element, extractUsing); + } if (data[productAttribute]) { break; } else { throw new Error(`Element found did not return a valid product ${productAttribute}.`); } - } else if (selector === selectors[selectors.length - 1]) { + } else if (tuple === tuples[tuples.length - 1]) { // None of the selectors matched an element on the page throw new Error(`No elements found with vendor data for product ${productAttribute}.`); } diff --git a/src/extraction/fallback_extraction_selectors.json b/src/extraction/fallback_extraction_selectors.json new file mode 100644 index 0000000..c5362c0 --- /dev/null +++ b/src/extraction/fallback_extraction_selectors.json @@ -0,0 +1,88 @@ 
+{ + "^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}amazon\\.com": { + "title": [ + ["#productTitle", "innerText"], + [".product-title", "innerText"] + ], + "price": [ + ["#priceblock_dealprice", "innerText"], + ["#priceblock_ourprice", "innerText"], + ["#price_inside_buybox", "innerText"], + ["#buybox .a-color-price", "innerText"], + ["input[name='displayedPrice']", "value"], + [".a-size-large.a-color-price.guild_priceblock_ourprice", "innerText"], + [".a-color-price.a-size-medium.a-align-bottom", "innerText"], + [".display-price", "innerText"], [".offer-price", "innerText"] + ], + "image": [ + ["#landingImage", "src"], + ["#imgBlkFront", "src"], + ["#ebooksImgBlkFront", "src"] + ] + }, + "^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}bestbuy\\.com": { + "title": [ + [".sku-title h1", "innerText"] + ], + "price": [ + [".priceView-hero-price.priceView-purchase-price", "innerText"] + ], + "image": [ + ["img.primary-image", "src"] + ] + }, + "^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}ebay\\.com": { + "title": [ + ["#itemTitle", "innerText"], + [".product-title", "innerText"] + ], + "price": [ + ["#prcIsum", "innerText"], + ["#orgPrc", "innerText"], + ["#mm-saleDscPrc", "innerText"], + [".display-price", "innerText"] + ], + "image": [ + ["#icImg", "src"], + [".vi-image-gallery__image.vi-image-gallery__image--absolute-center", "src"] + ] + }, + "^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}homedepot\\.com": { + "title": [ + ["h1.product-title__title", "innerText"] + ], + "price": [ + ["#ajaxPrice", "content"], + ["#ajaxPriceAlt", "innerText"] + ], + "image": [ + ["#mainImage", "src"] + ] + }, + "^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}walmart\\.com": { + "title": [ + ["h1.prod-ProductTitle", "content"], + ["h1.prod-ProductTitle", "innerText"] + ], + "price": [ + [".PriceRange.prod-PriceHero", "innerText"], + [".price-group", "aria-label"], + [".price-group", "innerText"] + ], + "image": [ + [".prod-hero-image-image", "src"], + [".prod-hero-image-carousel-image", "src"] + ] + }, + "www.mkelly.me": { + "title": [ 
+ ["#title", "innerText"] + ], + "price": [ + ["#price", "innerText"] + ], + "image": [ + ["img", "src"] + ] + } +} diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom_extraction.js index eee1eb9..f819ca7 100644 --- a/src/extraction/fathom_extraction.js +++ b/src/extraction/fathom_extraction.js @@ -12,6 +12,7 @@ import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json'; import RulesetFactory from 'commerce/extraction/ruleset_factory'; +import {getPriceString} from 'commerce/utils'; const PRODUCT_FEATURES = ['title', 'price', 'image']; // Minimum score to be considered the "correct" feature element extracted by Fathom @@ -36,75 +37,12 @@ function runRuleset(doc) { // It is possible for multiple elements to have the same highest score. if (fnodesList.length >= 1) { const element = fnodesList[0].element; - // Check for price units and subunits - if (feature === 'price' && element.children.length > 0) { - extractedElements[feature] = getPriceUnitElements(element); - continue; - } extractedElements[feature] = element; } } return extractedElements; } -/** - * Returns true if the string contains a number. - */ -function hasNumber(string) { - return /\d/.test(string); -} - -/** - * Get the main and sub unit elements for the product price. - * - * @returns {Object} A string:element object with 'mainUnit' and 'subUnit' keys. - */ -function getPriceUnitElements(element) { - let isMainUnit = true; - const priceElements = {}; - // Loop through children: first element containing a digit is main unit, - // second is subunit. - for (const priceSubEle of element.children) { - if (hasNumber(priceSubEle.innerText)) { - if (isMainUnit) { - priceElements.mainUnit = priceSubEle; - isMainUnit = false; - } else { - priceElements.subUnit = priceSubEle; - } - } - } - return priceElements; -} - -/** - * Checks if a price object has subunits and returns a price string. 
- * - * @param {Object} If the price has subunits, an object literal, else an HTML element - */ -function getPriceString(priceObj) { - // Check for subunits e.g. dollars and cents. - if ('mainUnit' in priceObj) { - const mainUnitStr = priceObj.mainUnit.innerText; - const subUnitStr = priceObj.subUnit.innerText; - return cleanPriceString(`$${mainUnitStr}.${subUnitStr}`); - } - return cleanPriceString(priceObj.innerText); -} - - -/** - * Reformats price string to be of form "$NX.XX". - */ -function cleanPriceString(priceStr) { - // Remove any commas - let cleanedPriceStr = priceStr.replace(/,/g, ''); - // Remove any characters preceding the '$' and following the '.XX' - cleanedPriceStr = cleanedPriceStr.substring(cleanedPriceStr.indexOf('$')); - cleanedPriceStr = cleanedPriceStr.substring(0, cleanedPriceStr.indexOf('.') + 3); - return cleanedPriceStr; -} - /** * Returns true if every key in PRODUCT_FEATURES has a truthy value. */ @@ -125,7 +63,7 @@ export default function extractProduct(doc) { continue; // Clean up price string and check for subunits } else if (feature === 'price') { - const priceStr = getPriceString(extractedElements[feature]); + const priceStr = getPriceString(extractedElements[feature], 'innerText'); extractedProduct[feature] = priceStr; continue; } diff --git a/src/extraction/product_extraction_data.json b/src/extraction/product_extraction_data.json deleted file mode 100644 index 77f745f..0000000 --- a/src/extraction/product_extraction_data.json +++ /dev/null @@ -1,99 +0,0 @@ -{ - "www.aliexpress.com": { - "title": { - "selectors": [".product-name"], - "extractUsing": "innerText" - }, - "price": { - "selectors": [ - "#j-sku-discount-price", - "#j-sku-price" - ], - "extractUsing": "innerText" - }, - "image": { - "selectors": [".ui-image-viewer-thumb-frame > img"], - "extractUsing": "src" - } - }, - "www.amazon.com": { - "title": { - "selectors": [ - "#productTitle", - ".product-title" - ], - "extractUsing": "innerText" - }, - "price": { - 
"selectors": [ - "#priceblock_ourprice", - "#priceblock_dealprice", - ".display-price", - ".offer-price" - ], - "extractUsing": "innerText" - }, - "image": { - "selectors": [ - "#landingImage", - "#imgBlkFront" - ], - "extractUsing": "src" - } - }, - "www.ebay.com": { - "title": { - "selectors": [ - "#itemTitle", - ".product-title" - ], - "extractUsing": "innerText" - }, - "price": { - "selectors": [ - ".display-price", - "#prcIsum", - "#orgPrc" - ], - "extractUsing": "innerText" - }, - "image": { - "selectors": [ - "#icImg", - ".vi-image-gallery__image.vi-image-gallery__image--absolute-center" - ], - "extractUsing": "src" - } - }, - "www.walmart.com": { - "title": { - "selectors": [".prod-ProductTitle"], - "extractUsing": "innerText" - }, - "price": { - "selectors": [".price-group"], - "extractUsing": "innerText" - }, - "image": { - "selectors": [ - ".prod-hero-image-image", - ".prod-hero-image-carousel-image" - ], - "extractUsing": "src" - } - }, - "www.mkelly.me": { - "title": { - "selectors": ["#title"], - "extractUsing": "innerText" - }, - "price": { - "selectors": ["#price"], - "extractUsing": "innerText" - }, - "image": { - "selectors": ["img"], - "extractUsing": "src" - } - } -} diff --git a/src/product_info.js b/src/product_info.js index 9aa01a2..0747ed7 100644 --- a/src/product_info.js +++ b/src/product_info.js @@ -19,6 +19,7 @@ async function getProductInfo() { extractProductWithFathom(window.document) || extractProductWithFallback() ); + await browser.runtime.sendMessage({ from: 'content', subject: 'ready', diff --git a/src/utils.js b/src/utils.js index 37ebc07..779c4e3 100644 --- a/src/utils.js +++ b/src/utils.js @@ -67,3 +67,101 @@ export function priceStringToAmount(priceString) { export function validatePropType(value, propType) { return checkPropTypes({value: propType}, {value}, 'prop', 'Validation'); } + +/** + * Returns true if the string contains a number. 
+ */ +function hasNumber(string) { + return /\d/.test(string); +} + +/** + * Returns true if the string contains a dollar sign. + */ +function hasDollarSign(string) { + return /\$/.test(string); +} + +/** + * Get the main and sub unit elements for the product price. + * + * @returns {Object} A string:element object with 'mainUnit' and 'subUnit' keys. + */ +export function getPriceUnitElements(element) { + let isMainUnit = true; + const priceElements = {}; + // Loop through children: first element containing a digit is main unit, + // second is subunit. + for (const priceSubEle of element.children) { + if (hasNumber(priceSubEle.innerText)) { + if (isMainUnit) { + priceElements.mainUnit = priceSubEle; + isMainUnit = false; + } else { + priceElements.subUnit = priceSubEle; + } + } + } + return priceElements; +} + +/** + * Reformats price string to be of form "$NX.XX". + */ +export function cleanPriceString(priceStr) { + // Remove any commas + let cleanedPriceStr = priceStr.replace(/,/g, ''); + // Add a '$' at the beginning if not present; common for strings pulled from element attributes + if (!hasDollarSign(cleanedPriceStr)) { + cleanedPriceStr = cleanedPriceStr.replace(/^/, '$'); + } + // Remove any characters preceding the '$' and following the '.XX' + cleanedPriceStr = cleanedPriceStr.substring(cleanedPriceStr.indexOf('$')); + cleanedPriceStr = cleanedPriceStr.substring(0, cleanedPriceStr.indexOf('.') + 3); + return cleanedPriceStr; +} + +/** + * Checks if a price object has subunits and returns a price string. + * + * @param {HTMLElement} element - The element containing the price + * @param {String} extractUsing - The property/attribute to use to get the product price + */ +export function getPriceString(element, extractUsing) { + if (element.children.length > 0) { + const priceObj = getPriceUnitElements(element); + // Check for subunits e.g. dollars and cents.
+ if ('mainUnit' in priceObj) { + const mainUnitStr = priceObj.mainUnit.innerText; + // If no subunits, then main units contain subunits + const subUnitStr = priceObj.subUnit ? `.${priceObj.subUnit.innerText}` : ''; + const priceStr = `${mainUnitStr}${subUnitStr}`; + return cleanPriceString(hasDollarSign(priceStr) ? priceStr : `$${priceStr}`); + } + } + const priceStr = extractValueFromElement(element, extractUsing); + return cleanPriceString(priceStr); +} + +/** + * Extracts and returns the string value for a given element property or attribute. + * + * @param {HTMLElement} element + * @param {string} extractionProperty + */ +export function extractValueFromElement(element, extractionProperty) { + switch (extractionProperty) { + case 'content': + return element.getAttribute('content'); + case 'innerText': + return element.innerText; + case 'src': + return element.src; + case 'value': + return element.getAttribute('value'); + case 'aria-label': + return element.getAttribute('aria-label'); + default: + throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`); + } +}