diff --git a/src/extraction/fallback_extraction.js b/src/extraction/fallback_extraction.js
index 5b3535f..76706ca 100644
--- a/src/extraction/fallback_extraction.js
+++ b/src/extraction/fallback_extraction.js
@@ -10,7 +10,8 @@
* Features: title, image, price
*/
-import extractionData from 'commerce/extraction/product_extraction_data.json';
+import extractionData from 'commerce/extraction/fallback_extraction_selectors';
+
const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
@@ -22,33 +23,33 @@ const OPEN_GRAPH_PROPERTY_VALUES = {
* Returns any extraction data found for the vendor based on the URL
* for the page.
*/
-function getProductAttributeInfo() {
+function getFeatureInfo() {
const hostname = new URL(window.location.href).host;
- for (const [vendor, attributeInfo] of Object.entries(extractionData)) {
- if (hostname.includes(vendor)) {
- return attributeInfo;
+ for (const siteInfo of extractionData) {
+ for (const domain of siteInfo.domains) {
+ if (hostname.includes(domain)) {
+ return siteInfo.features;
+ }
}
}
return null;
}
-/**
- * Extracts and returns the string value for a given element property or attribute.
- *
- * @param {HTMLElement} element
- * @param {string} extractionProperty
- */
-function extractValueFromElement(element, extractionProperty) {
- switch (extractionProperty) {
- case 'content':
- return element.getAttribute('content');
- case 'innerText':
- return element.innerText;
- case 'src':
- return element.src;
- default:
- throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
+function findValue(extractors) {
+  for (const [cssSelector, extract] of extractors) { // pairs are tried in order
+    const match = document.querySelector(cssSelector);
+    const result = match ? extract(match) : null;
+    if (result) {
+      return result; // the first truthy extracted value wins
+    }
+    if (match) {
+      // eslint-disable-next-line no-console
+      console.warn('Element found did not return a valid value for the product feature.');
+    }
+  }
+  // eslint-disable-next-line no-console
+  console.warn('No elements found with vendor data for the product feature.');
+  return null;
+}
/**
@@ -56,33 +57,19 @@ function extractValueFromElement(element, extractionProperty) {
* selectors if they exist, otherwise from Open Graph tags.
*/
export default function extractProduct() {
- const data = {};
- const attributeInfo = getProductAttributeInfo();
- if (attributeInfo) {
- for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
- const {selectors, extractUsing} = extractor;
- for (const selector of selectors) {
- const element = document.querySelector(selector);
- if (element) {
- data[productAttribute] = extractValueFromElement(element, extractUsing);
- if (data[productAttribute]) {
- break;
- } else {
- throw new Error(`Element found did not return a valid product ${productAttribute}.`);
- }
- } else if (selector === selectors[selectors.length - 1]) {
- // None of the selectors matched an element on the page
- throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
- }
- }
+ const extractedProduct = {};
+ const featureInfo = getFeatureInfo();
+ if (featureInfo) {
+ for (const [feature, extractors] of Object.entries(featureInfo)) {
+ extractedProduct[feature] = findValue(extractors);
}
} else {
- for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
- const metaEle = document.querySelector(`meta[property='${value}']`);
+ for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
+ const metaEle = document.querySelector(`meta[property='${propertyValue}']`);
if (metaEle) {
- data[key] = metaEle.getAttribute('content');
+ extractedProduct[feature] = metaEle.getAttribute('content');
}
}
}
- return data;
+ return extractedProduct;
}
diff --git a/src/extraction/fallback_extraction_selectors.js b/src/extraction/fallback_extraction_selectors.js
new file mode 100644
index 0000000..7d434dc
--- /dev/null
+++ b/src/extraction/fallback_extraction_selectors.js
@@ -0,0 +1,135 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import {parsePrice} from 'commerce/extraction/utils';
+
+// Wraps an extractor so the string it returns is parsed into a price in subunits.
+// Non-string results (such as null) yield NaN instead of throwing inside parsePrice.
+function inUnits(fn) {
+  const toPrice = value => (typeof value === 'string' ? parsePrice([value]) : NaN);
+  return element => toPrice(fn(element));
+}
+
+function fromProperty(property) {
+  return function readProperty(element) { return element[property]; }; // plain property read
+}
+
+function fromAttribute(attribute) {
+  return function readAttribute(element) { return element.getAttribute(attribute); }; // via getAttribute
+}
+
+
+/**
+ * Per-site selector data: `features` maps each feature to [CSS selector, method] pairs in
+ * the order they are tried; the method extracts the value from the element the selector returns.
+ */
+const fallbackExtractionData = [
+ {
+ domains: ['amazon.com', 'www.amazon.com', 'smile.amazon.com'],
+ features: {
+ title: [
+ ['#productTitle', fromProperty('innerText')],
+ ['.product-title', fromProperty('innerText')],
+ ],
+ price: [
+ ['#priceblock_dealprice', inUnits(fromProperty('innerText'))],
+ ['#priceblock_ourprice', inUnits(fromProperty('innerText'))],
+ ['#price_inside_buybox', inUnits(fromProperty('innerText'))],
+ ['#buybox .a-color-price', inUnits(fromProperty('innerText'))],
+ ['input[name="displayedPrice"]', inUnits(fromAttribute('value'))],
+ ['.a-size-large.a-color-price.guild_priceblock_ourprice', inUnits(fromProperty('innerText'))],
+ ['.a-color-price.a-size-medium.a-align-bottom', inUnits(fromProperty('innerText'))],
+ ['.display-price', inUnits(fromProperty('innerText'))],
+ ['.offer-price', inUnits(fromProperty('innerText'))],
+ ],
+ image: [
+ ['#landingImage', fromProperty('src')],
+ ['#imgBlkFront', fromProperty('src')],
+ ['#ebooksImgBlkFront', fromProperty('src')],
+ ],
+ },
+ },
+ {
+ domains: ['bestbuy.com', 'www.bestbuy.com'],
+ features: {
+ title: [
+ ['.sku-title h1', fromProperty('innerText')],
+ ],
+ price: [
+ ['.priceView-hero-price.priceView-purchase-price', inUnits(fromProperty('innerText'))],
+ ],
+ image: [
+ ['img.primary-image', fromProperty('src')],
+ ],
+ },
+ },
+ {
+ domains: ['ebay.com', 'www.ebay.com'],
+ features: {
+ title: [
+ ['#itemTitle', fromProperty('innerText')],
+ ['.product-title', fromProperty('innerText')],
+ ],
+ price: [
+ ['#prcIsum', inUnits(fromProperty('innerText'))],
+ ['#orgPrc', inUnits(fromProperty('innerText'))],
+ ['#mm-saleDscPrc', inUnits(fromProperty('innerText'))],
+ ['.display-price', inUnits(fromProperty('innerText'))],
+ ],
+ image: [
+ ['#icImg', fromProperty('src')],
+ ['.vi-image-gallery__image.vi-image-gallery__image--absolute-center', fromProperty('src')],
+ ],
+ },
+ },
+ {
+ domains: ['homedepot.com', 'www.homedepot.com'],
+ features: {
+ title: [
+ ['h1.product-title__title', fromProperty('innerText')],
+ ],
+ price: [
+ ['#ajaxPrice', inUnits(fromAttribute('content'))],
+ ['#ajaxPriceAlt', inUnits(fromProperty('innerText'))],
+ ],
+ image: [
+ ['#mainImage', fromProperty('src')],
+ ],
+ },
+ },
+ {
+ domains: ['walmart.com', 'www.walmart.com'],
+ features: {
+ title: [
+ ['h1.prod-ProductTitle', fromAttribute('content')],
+ ['h1.prod-ProductTitle', fromProperty('innerText')],
+ ],
+ price: [
+ ['.PriceRange.prod-PriceHero', inUnits(fromProperty('innerText'))],
+ ['.price-group', inUnits(fromAttribute('aria-label'))],
+ ['.price-group', inUnits(fromProperty('innerText'))],
+ ],
+ image: [
+ ['.prod-hero-image-image', fromProperty('src')],
+ ['.prod-hero-image-carousel-image', fromProperty('src')],
+ ],
+ },
+ },
+ {
+ domains: ['mkelly.me', 'www.mkelly.me'],
+ features: {
+ title: [
+ ['#title', fromProperty('innerText')],
+ ],
+ price: [
+ ['#price', inUnits(fromProperty('innerText'))],
+ ],
+ image: [
+ ['img', fromProperty('src')],
+ ],
+ },
+ },
+];
+
+export default fallbackExtractionData;
diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom_extraction.js
index 8ff6141..66f70f4 100644
--- a/src/extraction/fathom_extraction.js
+++ b/src/extraction/fathom_extraction.js
@@ -12,7 +12,7 @@
import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json';
import RulesetFactory from 'commerce/extraction/ruleset_factory';
-import {getPriceInSubunits} from 'commerce/extraction/utils';
+import {parsePrice} from 'commerce/extraction/utils';
// Minimum score to be considered the "correct" feature element extracted by Fathom
const SCORE_THRESHOLD = 4;
@@ -42,7 +42,8 @@ const PRODUCT_FEATURES = {
price: {
...FEATURE_DEFAULTS,
getValueFromElement(element) {
- return getPriceInSubunits(element);
+ const tokens = Array.from(element.childNodes).map(node => node.textContent);
+ return parsePrice(tokens);
},
},
};
diff --git a/src/extraction/product_extraction_data.json b/src/extraction/product_extraction_data.json
deleted file mode 100644
index 77f745f..0000000
--- a/src/extraction/product_extraction_data.json
+++ /dev/null
@@ -1,99 +0,0 @@
-{
- "www.aliexpress.com": {
- "title": {
- "selectors": [".product-name"],
- "extractUsing": "innerText"
- },
- "price": {
- "selectors": [
- "#j-sku-discount-price",
- "#j-sku-price"
- ],
- "extractUsing": "innerText"
- },
- "image": {
- "selectors": [".ui-image-viewer-thumb-frame > img"],
- "extractUsing": "src"
- }
- },
- "www.amazon.com": {
- "title": {
- "selectors": [
- "#productTitle",
- ".product-title"
- ],
- "extractUsing": "innerText"
- },
- "price": {
- "selectors": [
- "#priceblock_ourprice",
- "#priceblock_dealprice",
- ".display-price",
- ".offer-price"
- ],
- "extractUsing": "innerText"
- },
- "image": {
- "selectors": [
- "#landingImage",
- "#imgBlkFront"
- ],
- "extractUsing": "src"
- }
- },
- "www.ebay.com": {
- "title": {
- "selectors": [
- "#itemTitle",
- ".product-title"
- ],
- "extractUsing": "innerText"
- },
- "price": {
- "selectors": [
- ".display-price",
- "#prcIsum",
- "#orgPrc"
- ],
- "extractUsing": "innerText"
- },
- "image": {
- "selectors": [
- "#icImg",
- ".vi-image-gallery__image.vi-image-gallery__image--absolute-center"
- ],
- "extractUsing": "src"
- }
- },
- "www.walmart.com": {
- "title": {
- "selectors": [".prod-ProductTitle"],
- "extractUsing": "innerText"
- },
- "price": {
- "selectors": [".price-group"],
- "extractUsing": "innerText"
- },
- "image": {
- "selectors": [
- ".prod-hero-image-image",
- ".prod-hero-image-carousel-image"
- ],
- "extractUsing": "src"
- }
- },
- "www.mkelly.me": {
- "title": {
- "selectors": ["#title"],
- "extractUsing": "innerText"
- },
- "price": {
- "selectors": ["#price"],
- "extractUsing": "innerText"
- },
- "image": {
- "selectors": ["img"],
- "extractUsing": "src"
- }
- }
-}
diff --git a/src/extraction/utils.js b/src/extraction/utils.js
index ff07896..8adb5e7 100644
--- a/src/extraction/utils.js
+++ b/src/extraction/utils.js
@@ -3,13 +3,25 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
- * Converts a price element into a numerical price value in subunits (like cents).
- * e.g. $10.00 returns 1000. If string parsing fails, returns NaN.
- * @param {HTMLElement} priceEle
+ * Converts an array of price tokens into a numerical price value in subunits.
+ * E.g. ["$10.00"] and ["$", "10", "00", "/each"] both return 1000.
+ * If string parsing fails, returns NaN.
+ * @param {Array.<String>} tokens The price token strings extracted from the page
* @returns {Number} the price in subunits
*/
-export function getPriceInSubunits(priceEle) {
- const priceUnits = getPriceUnits(priceEle.childNodes);
+export function parsePrice(tokens) {
+ const priceUnits = (
+ tokens
+ // Split tokens by $ and . to get the numbers between them
+ .flatMap(token => token.split(/[.$]/))
+ // Filter out any tokens that do not contain a digit
+ .filter(token => /\d/g.test(token))
+ // Remove any non-digit characters for each token in the list
+ .map(token => token.replace(/\D/g, ''))
+ // Convert price token strings to integers
+ .map(token => parseInt(token, 10))
+ );
+
// Convert units and subunits to a single integer value in subunits
switch (priceUnits.length) {
case 1:
@@ -20,23 +32,3 @@ export function getPriceInSubunits(priceEle) {
return NaN;
}
}
-
-/**
- * Extracts price units by filtering and cleaning textContent from text and DOM nodes
- * @param {Array.NodeList} nodes
- * @returns {Array.Number}
- */
-function getPriceUnits(nodes) {
- const nodesArr = Array.from(nodes);
- // Separate token strings in a list into substrings using '$' and '.' as separators
- const allTokens = nodesArr.flatMap(token => token.textContent.split(/[.$]/));
-
- // Filter out any tokens that do not contain a digit
- const priceTokens = allTokens.filter(token => /\d/g.test(token));
-
- // Remove any non-digit characters for each token in the list
- const cleanedPriceTokens = priceTokens.map(token => token.replace(/\D/g, ''));
-
- // Convert price token strings to integers
- return cleanedPriceTokens.map(token => parseInt(token, 10));
-}