Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
Fix #84: Improve fallback extraction
Browse files Browse the repository at this point in the history
Improve fallback extraction by CSS selectors:
* Update selectors for the top 5 sites. Add Home Depot and Best Buy.
* Rename selectors JSON file to be more descriptive (was 'product_extraction_data.json', now 'fallback_extraction_selectors.json').
* Represent supported sites in 'fallback_extraction_selectors.json' as regular expression strings so that fallback extraction works for any subdomain of the site (e.g. 'smile.amazon.com').
* Represent CSS selectors by tuples in 'fallback_extraction_selectors.json', so that each selector can specify which attribute or property to read for that selector.
* Clean price strings from fallback extraction using the same methods as used by Fathom extraction (PR #111); consolidate and move shared methods to 'utils.js'.
  • Loading branch information
biancadanforth committed Sep 28, 2018
1 parent ca3b02d commit 4f38438
Show file tree
Hide file tree
Showing 4 changed files with 202 additions and 127 deletions.
44 changes: 16 additions & 28 deletions src/extraction/fallback_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
* Features: title, image, price
*/

import extractionData from 'commerce/extraction/product_extraction_data.json';
import extractionData from 'commerce/extraction/fallback_extraction_selectors.json';
import {getPriceString, extractValueFromElement} from 'commerce/utils';


const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
Expand All @@ -23,34 +25,16 @@ const OPEN_GRAPH_PROPERTY_VALUES = {
* for the page.
*/
function getProductAttributeInfo() {
const hostname = new URL(window.location.href).host;
for (const [vendor, attributeInfo] of Object.entries(extractionData)) {
if (hostname.includes(vendor)) {
const url = window.location.href;
for (const [regExpStr, attributeInfo] of Object.entries(extractionData)) {
const regExp = new RegExp(regExpStr);
if (regExp.test(url)) {
return attributeInfo;
}
}
return null;
}

/**
* Extracts and returns the string value for a given element property or attribute.
*
* @param {HTMLElement} element
* @param {string} extractionProperty
*/
function extractValueFromElement(element, extractionProperty) {
switch (extractionProperty) {
case 'content':
return element.getAttribute('content');
case 'innerText':
return element.innerText;
case 'src':
return element.src;
default:
throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}
}

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
Expand All @@ -59,18 +43,22 @@ export default function extractProduct() {
const data = {};
const attributeInfo = getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
const {selectors, extractUsing} = extractor;
for (const selector of selectors) {
for (const [productAttribute, tuples] of Object.entries(attributeInfo)) {
for (const tuple of tuples) {
const [selector, extractUsing] = tuple;
const element = document.querySelector(selector);
if (element) {
data[productAttribute] = extractValueFromElement(element, extractUsing);
if (productAttribute === 'price') {
data[productAttribute] = getPriceString(element, extractUsing);
} else {
data[productAttribute] = extractValueFromElement(element, extractUsing);
}
if (data[productAttribute]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
}
} else if (selector === selectors[selectors.length - 1]) {
} else if (tuple === tuples[tuples.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}
Expand Down
88 changes: 88 additions & 0 deletions src/extraction/fallback_extraction_selectors.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}amazon\\.com": {
"title": [
["#productTitle", "innerText"],
[".product-title", "innerText"]
],
"price": [
["#priceblock_dealprice", "innerText"],
["#priceblock_ourprice", "innerText"],
["#price_inside_buybox", "innerText"],
["#buybox .a-color-price", "innerText"],
["input[name='displayedPrice']", "value"],
[".a-size-large.a-color-price.guild_priceblock_ourprice", "innerText"],
[".a-color-price.a-size-medium.a-align-bottom", "innerText"],
[".display-price", "innerText"], [".offer-price", "innerText"]
],
"image": [
["#landingImage", "src"],
["#imgBlkFront", "src"],
["#ebooksImgBlkFront", "src"]
]
},
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}bestbuy\\.com": {
"title": [
[".sku-title h1", "innerText"]
],
"price": [
[".priceView-hero-price.priceView-purchase-price", "innerText"]
],
"image": [
["img.primary-image", "src"]
]
},
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}ebay\\.com": {
"title": [
["#itemTitle", "innerText"],
[".product-title", "innerText"]
],
"price": [
["#prcIsum", "innerText"],
["#orgPrc", "innerText"],
["#mm-saleDscPrc", "innerText"],
[".display-price", "innerText"]
],
"image": [
["#icImg", "src"],
[".vi-image-gallery__image.vi-image-gallery__image--absolute-center", "src"]
]
},
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}homedepot\\.com": {
"title": [
["h1.product-title__title", "innerText"]
],
"price": [
["#ajaxPrice", "content"],
["#ajaxPriceAlt", "innerText"]
],
"image": [
["#mainImage", "src"]
]
},
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}walmart\\.com": {
"title": [
["h1.prod-ProductTitle", "content"],
["h1.prod-ProductTitle", "innerText"]
],
"price": [
[".PriceRange.prod-PriceHero", "innerText"],
[".price-group", "aria-label"],
[".price-group", "innerText"]
],
"image": [
[".prod-hero-image-image", "src"],
[".prod-hero-image-carousel-image", "src"]
]
},
"www.mkelly.me": {
"title": [
["#title", "innerText"]
],
"price": [
["#price", "innerText"]
],
"image": [
["img", "src"]
]
}
}
99 changes: 0 additions & 99 deletions src/extraction/product_extraction_data.json

This file was deleted.

98 changes: 98 additions & 0 deletions src/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,101 @@ export async function retry(callback, maxRetries = 5, delayFactor = 2, initialDe
export function validatePropType(value, propType) {
return checkPropTypes({value: propType}, {value}, 'prop', 'Validation');
}

/**
 * Tells whether the given value, once coerced to a string, contains at
 * least one decimal digit (0-9).
 *
 * @param {String} string - The text to inspect
 * @returns {Boolean} true when a digit is present, false otherwise
 */
function hasNumber(string) {
  const digitMatch = /[0-9]/.exec(string);
  return digitMatch !== null;
}

/**
 * Tells whether the given value, once coerced to a string, contains a
 * '$' character anywhere in it.
 *
 * @param {String} string - The text to inspect
 * @returns {Boolean} true when a dollar sign is present, false otherwise
 */
function hasDollarSign(string) {
  const dollarMatch = /[$]/.exec(string);
  return dollarMatch !== null;
}

/**
 * Picks out the child elements that hold the main unit and sub unit of a
 * product price (e.g. dollars and cents).
 *
 * Only direct children whose innerText contains a digit are considered. The
 * first such child becomes 'mainUnit'. Every later such child is stored as
 * 'subUnit', so when more than two children contain digits the last one wins.
 *
 * @param {HTMLElement} element - The element wrapping the price
 * @returns {Object} An object with optional 'mainUnit' and 'subUnit' element
 *   values; empty when no child contains a digit.
 */
export function getPriceUnitElements(element) {
  const units = {};
  for (const child of element.children) {
    if (!hasNumber(child.innerText)) {
      continue;
    }
    if ('mainUnit' in units) {
      units.subUnit = child;
    } else {
      units.mainUnit = child;
    }
  }
  return units;
}

/**
 * Reformats a raw price string to be of the form "$NX.XX".
 *
 * Commas are removed, a '$' is prepended when the string has none, any text
 * preceding the '$' is dropped, and, when a decimal point is present,
 * everything after its second following character is dropped.
 *
 * @param {String} priceStr - Raw price text, e.g. "Price: $1,299.99 each"
 * @returns {String} The cleaned price string, or '' when priceStr is empty or
 *   not a string (e.g. null from a missing attribute).
 */
export function cleanPriceString(priceStr) {
  // Attribute reads return null when the attribute is absent; report "no
  // price" as an empty string instead of crashing on .replace() or emitting
  // a bare '$'.
  if (typeof priceStr !== 'string' || priceStr === '') {
    return '';
  }
  // Remove any commas
  let cleanedPriceStr = priceStr.replace(/,/g, '');
  // Add a '$' at the beginning if not present; common for strings pulled from element attributes
  if (!cleanedPriceStr.includes('$')) {
    cleanedPriceStr = `$${cleanedPriceStr}`;
  }
  // Remove any characters preceding the '$'
  cleanedPriceStr = cleanedPriceStr.substring(cleanedPriceStr.indexOf('$'));
  // Remove any characters following the '.XX'. Whole-unit prices have no '.',
  // and must not be truncated (indexOf would yield -1 and cut "$15" to "$1").
  const decimalIndex = cleanedPriceStr.indexOf('.');
  if (decimalIndex !== -1) {
    cleanedPriceStr = cleanedPriceStr.substring(0, decimalIndex + 3);
  }
  return cleanedPriceStr;
}

/**
 * Builds a cleaned price string for a price element.
 *
 * When the element has children and at least one of them contains a digit,
 * the price is assembled from the main unit child and, if present, the sub
 * unit child (joined with a '.'), with a '$' prepended when missing.
 * Otherwise the price is read from the element itself using extractUsing.
 * Either way the result is passed through cleanPriceString.
 *
 * @param {HTMLElement} element - The element containing the price
 * @param {String} extractUsing - The property/attribute to use to get the product price
 * @returns {String} The cleaned price string
 */
export function getPriceString(element, extractUsing) {
  const units = element.children.length > 0 ? getPriceUnitElements(element) : {};
  if (!('mainUnit' in units)) {
    // No usable unit children: read the price straight off the element.
    return cleanPriceString(extractValueFromElement(element, extractUsing));
  }
  const {mainUnit, subUnit} = units;
  let combined = `${mainUnit.innerText}`;
  // Without a sub unit element, the main unit text already carries the sub units.
  if (subUnit) {
    combined = `${combined}.${subUnit.innerText}`;
  }
  const withSymbol = hasDollarSign(combined) ? combined : `$${combined}`;
  return cleanPriceString(withSymbol);
}

/**
 * Reads a single value off an element, either via getAttribute or as a
 * property, depending on the requested name.
 *
 * 'content', 'value' and 'aria-label' are read with getAttribute; 'innerText'
 * and 'src' are read as properties of the element.
 *
 * @param {HTMLElement} element - The element to read from
 * @param {String} extractUsing - The property/attribute name to read
 * @returns {String} The value read from the element
 * @throws {Error} When extractUsing is not one of the supported names
 */
export function extractValueFromElement(element, extractUsing) {
  const attributeNames = ['content', 'value', 'aria-label'];
  const propertyNames = ['innerText', 'src'];

  if (attributeNames.includes(extractUsing)) {
    return element.getAttribute(extractUsing);
  }
  if (propertyNames.includes(extractUsing)) {
    return element[extractUsing];
  }
  throw new Error(`Unrecognized extraction property or attribute '${extractUsing}'.`);
}

0 comments on commit 4f38438

Please sign in to comment.