diff --git a/src/extraction/fallback_extraction.js b/src/extraction/fallback_extraction.js index 5b3535f..76706ca 100644 --- a/src/extraction/fallback_extraction.js +++ b/src/extraction/fallback_extraction.js @@ -10,7 +10,8 @@ * Features: title, image, price */ -import extractionData from 'commerce/extraction/product_extraction_data.json'; +import extractionData from 'commerce/extraction/fallback_extraction_selectors'; + const OPEN_GRAPH_PROPERTY_VALUES = { title: 'og:title', @@ -22,33 +23,33 @@ const OPEN_GRAPH_PROPERTY_VALUES = { * Returns any extraction data found for the vendor based on the URL * for the page. */ -function getProductAttributeInfo() { +function getFeatureInfo() { const hostname = new URL(window.location.href).host; - for (const [vendor, attributeInfo] of Object.entries(extractionData)) { - if (hostname.includes(vendor)) { - return attributeInfo; + for (const siteInfo of extractionData) { + for (const domain of siteInfo.domains) { + if (hostname.includes(domain)) { + return siteInfo.features; + } } } return null; } -/** - * Extracts and returns the string value for a given element property or attribute. - * - * @param {HTMLElement} element - * @param {string} extractionProperty - */ -function extractValueFromElement(element, extractionProperty) { - switch (extractionProperty) { - case 'content': - return element.getAttribute('content'); - case 'innerText': - return element.innerText; - case 'src': - return element.src; - default: - throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`); +function findValue(extractors) { + for (const [selector, extractionMethod] of extractors) { + const element = document.querySelector(selector); + if (element) { + const value = extractionMethod(element); + if (value) { + return value; + } + // eslint-disable-next-line no-console + console.warn('Element found did not return a valid value for the product feature.'); + } } + // eslint-disable-next-line no-console + console.warn('No elements found with vendor data for the product feature.'); + return null; } /** @@ -56,33 +57,19 @@ function extractValueFromElement(element, extractionProperty) { * selectors if they exist, otherwise from Open Graph tags. */ export default function extractProduct() { - const data = {}; - const attributeInfo = getProductAttributeInfo(); - if (attributeInfo) { - for (const [productAttribute, extractor] of Object.entries(attributeInfo)) { - const {selectors, extractUsing} = extractor; - for (const selector of selectors) { - const element = document.querySelector(selector); - if (element) { - data[productAttribute] = extractValueFromElement(element, extractUsing); - if (data[productAttribute]) { - break; - } else { - throw new Error(`Element found did not return a valid product ${productAttribute}.`); - } - } else if (selector === selectors[selectors.length - 1]) { - // None of the selectors matched an element on the page - throw new Error(`No elements found with vendor data for product ${productAttribute}.`); - } - } + const extractedProduct = {}; + const featureInfo = getFeatureInfo(); + if (featureInfo) { + for (const [feature, extractors] of Object.entries(featureInfo)) { + extractedProduct[feature] = findValue(extractors); } } else { - for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) { - const metaEle = document.querySelector(`meta[property='${value}']`); + for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) { + const metaEle = document.querySelector(`meta[property='${propertyValue}']`); if (metaEle) { - data[key] = metaEle.getAttribute('content'); + extractedProduct[feature] = metaEle.getAttribute('content'); } } } - return data; + return extractedProduct; } diff --git a/src/extraction/fallback_extraction_selectors.js b/src/extraction/fallback_extraction_selectors.js new file mode 100644 index 0000000..7d434dc --- /dev/null +++ b/src/extraction/fallback_extraction_selectors.js @@ -0,0 +1,135 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import {parsePrice} from 'commerce/extraction/utils'; + +function inUnits(fn) { + return (element) => { + const priceString = fn(element); + return parsePrice([priceString]); + }; +} + +function fromProperty(property) { + return (element => element[property]); +} + +function fromAttribute(attribute) { + return (element => element.getAttribute(attribute)); +} + + +/** + * CSS selector data by site, where each selector is paired with a method that + * extracts the value from the element returned by that selector. + */ +const fallbackExtractionData = [ + { + domains: ['amazon.com', 'www.amazon.com', 'smile.amazon.com'], + features: { + title: [ + ['#productTitle', fromProperty('innerText')], + ['.product-title', fromProperty('innerText')], + ], + price: [ + ['#priceblock_dealprice', inUnits(fromProperty('innerText'))], + ['#priceblock_ourprice', inUnits(fromProperty('innerText'))], + ['#price_inside_buybox', inUnits(fromProperty('innerText'))], + ['#buybox .a-color-price', inUnits(fromProperty('innerText'))], + ['input[name="displayedPrice"]', inUnits(fromAttribute('value'))], + ['.a-size-large.a-color-price.guild_priceblock_ourprice', inUnits(fromProperty('innerText'))], + ['.a-color-price.a-size-medium.a-align-bottom', inUnits(fromProperty('innerText'))], + ['.display-price', inUnits(fromProperty('innerText'))], + ['.offer-price', inUnits(fromProperty('innerText'))], + ], + image: [ + ['#landingImage', fromProperty('src')], + ['#imgBlkFront', fromProperty('src')], + ['#ebooksImgBlkFront', fromProperty('src')], + ], + }, + }, + { + domains: ['bestbuy.com', 'www.bestbuy.com'], + features: { + title: [ + ['.sku-title h1', fromProperty('innerText')], + ], + price: [ + ['.priceView-hero-price.priceView-purchase-price', inUnits(fromProperty('innerText'))], + ], + image: [ + ['img.primary-image', fromProperty('src')], + ], + }, + }, + { + domains: ['ebay.com', 'www.ebay.com'], + features: { + title: [ + ['#itemTitle', fromProperty('innerText')], + ['.product-title', fromProperty('innerText')], + ], + price: [ + ['#prcIsum', inUnits(fromProperty('innerText'))], + ['#orgPrc', inUnits(fromProperty('innerText'))], + ['#mm-saleDscPrc', inUnits(fromProperty('innerText'))], + ['.display-price', inUnits(fromProperty('innerText'))], + ], + image: [ + ['#icImg', fromProperty('src')], + ['.vi-image-gallery__image.vi-image-gallery__image--absolute-center', fromProperty('src')], + ], + }, + }, + { + domains: ['homedepot.com', 'www.homedepot.com'], + features: { + title: [ + ['h1.product-title__title', fromProperty('innerText')], + ], + price: [ + ['#ajaxPrice', inUnits(fromAttribute('content'))], + ['#ajaxPriceAlt', inUnits(fromProperty('innerText'))], + ], + image: [ + ['#mainImage', fromProperty('src')], + ], + }, + }, + { + domains: ['walmart.com', 'www.walmart.com'], + features: { + title: [ + ['h1.prod-ProductTitle', fromAttribute('content')], + ['h1.prod-ProductTitle', fromProperty('innerText')], + ], + price: [ + ['.PriceRange.prod-PriceHero', inUnits(fromProperty('innerText'))], + ['.price-group', inUnits(fromAttribute('aria-label'))], + ['.price-group', inUnits(fromProperty('innerText'))], + ], + image: [ + ['.prod-hero-image-image', fromProperty('src')], + ['.prod-hero-image-carousel-image', fromProperty('src')], + ], + }, + }, + { + domains: ['mkelly.me', 'www.mkelly.me'], + features: { + title: [ + ['#title', fromProperty('innerText')], + ], + price: [ + ['#price', inUnits(fromProperty('innerText'))], + ], + image: [ + ['img', fromProperty('src')], + ], + }, + }, +]; + +export default fallbackExtractionData; diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom_extraction.js index 8ff6141..66f70f4 100644 --- a/src/extraction/fathom_extraction.js +++ b/src/extraction/fathom_extraction.js @@ -12,7 +12,7 @@ import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json'; import RulesetFactory from 'commerce/extraction/ruleset_factory'; -import {getPriceInSubunits} from 'commerce/extraction/utils'; +import {parsePrice} from 'commerce/extraction/utils'; // Minimum score to be considered the "correct" feature element extracted by Fathom const SCORE_THRESHOLD = 4; @@ -42,7 +42,8 @@ const PRODUCT_FEATURES = { price: { ...FEATURE_DEFAULTS, getValueFromElement(element) { - return getPriceInSubunits(element); + const tokens = Array.from(element.childNodes).map(node => node.textContent); + return parsePrice(tokens); }, }, }; diff --git a/src/extraction/product_extraction_data.json b/src/extraction/product_extraction_data.json deleted file mode 100644 index 77f745f..0000000 --- a/src/extraction/product_extraction_data.json +++ /dev/null @@ -1,99 +0,0 @@ -{ - "www.aliexpress.com": { - "title": { - "selectors": [".product-name"], - "extractUsing": "innerText" - }, - "price": { - "selectors": [ - "#j-sku-discount-price", - "#j-sku-price" - ], - "extractUsing": "innerText" - }, - "image": { - "selectors": [".ui-image-viewer-thumb-frame > img"], - "extractUsing": "src" - } - }, - "www.amazon.com": { - "title": { - "selectors": [ - "#productTitle", - ".product-title" - ], - "extractUsing": "innerText" - }, - "price": { - "selectors": [ - "#priceblock_ourprice", - "#priceblock_dealprice", - ".display-price", - ".offer-price" - ], - "extractUsing": "innerText" - }, - "image": { - "selectors": [ - "#landingImage", - "#imgBlkFront" - ], - "extractUsing": "src" - } - }, - "www.ebay.com": { - "title": { - "selectors": [ - "#itemTitle", - ".product-title" - ], - "extractUsing": "innerText" - }, - "price": { - "selectors": [ - ".display-price", - "#prcIsum", - "#orgPrc" - ], - "extractUsing": "innerText" - }, - "image": { - "selectors": [ - "#icImg", - ".vi-image-gallery__image.vi-image-gallery__image--absolute-center" - ], - "extractUsing": "src" - } - }, - "www.walmart.com": { - "title": { - "selectors": [".prod-ProductTitle"], - "extractUsing": "innerText" - }, - "price": { - "selectors": [".price-group"], - "extractUsing": "innerText" - }, - "image": { - "selectors": [ - ".prod-hero-image-image", - ".prod-hero-image-carousel-image" - ], - "extractUsing": "src" - } - }, - "www.mkelly.me": { - "title": { - "selectors": ["#title"], - "extractUsing": "innerText" - }, - "price": { - "selectors": ["#price"], - "extractUsing": "innerText" - }, - "image": { - "selectors": ["img"], - "extractUsing": "src" - } - } -} diff --git a/src/extraction/utils.js b/src/extraction/utils.js index ff07896..8adb5e7 100644 --- a/src/extraction/utils.js +++ b/src/extraction/utils.js @@ -3,13 +3,25 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /** - * Converts a price element into a numerical price value in subunits (like cents). - * e.g. $10.00 returns 1000. If string parsing fails, returns NaN. - * @param {HTMLElement} priceEle + * Converts an array of price tokens into a numerical price value in subunits. + * E.g. ["$10.00"] and ["$", "10", "00", "/each"] both return 1000. + * If string parsing fails, returns NaN. + * @param {Array.String} The price token strings extracted from the page * @returns {Number} the price in subunits */ -export function getPriceInSubunits(priceEle) { - const priceUnits = getPriceUnits(priceEle.childNodes); +export function parsePrice(tokens) { + const priceUnits = ( + tokens + // Split tokens by $ and . to get the numbers between them + .flatMap(token => token.split(/[.$]/)) + // Filter out any tokens that do not contain a digit + .filter(token => /\d/g.test(token)) + // Remove any non-digit characters for each token in the list + .map(token => token.replace(/\D/g, '')) + // Convert price token strings to integers + .map(token => parseInt(token, 10)) + ); + // Convert units and subunits to a single integer value in subunits switch (priceUnits.length) { case 1: @@ -20,23 +32,3 @@ export function getPriceInSubunits(priceEle) { return NaN; } } - -/** - * Extracts price units by filtering and cleaning textContent from text and DOM nodes - * @param {Array.NodeList} nodes - * @returns {Array.Number} - */ -function getPriceUnits(nodes) { - const nodesArr = Array.from(nodes); - // Separate token strings in a list into substrings using '$' and '.' as separators - const allTokens = nodesArr.flatMap(token => token.textContent.split(/[.$]/)); - - // Filter out any tokens that do not contain a digit - const priceTokens = allTokens.filter(token => /\d/g.test(token)); - - // Remove any non-digit characters for each token in the list - const cleanedPriceTokens = priceTokens.map(token => token.replace(/\D/g, '')); - - // Convert price token strings to integers - return cleanedPriceTokens.map(token => parseInt(token, 10)); -}