Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
Fix #84: Improve fallback extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
biancadanforth committed Sep 14, 2018
1 parent 33d7a01 commit 5808d41
Show file tree
Hide file tree
Showing 6 changed files with 205 additions and 191 deletions.
44 changes: 16 additions & 28 deletions src/extraction/fallback_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
* Features: title, image, price
*/

import extractionData from 'commerce/extraction/product_extraction_data.json';
import extractionData from 'commerce/extraction/fallback_extraction_selectors.json';
import {getPriceString, extractValueFromElement} from 'commerce/utils';


const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
Expand All @@ -23,34 +25,16 @@ const OPEN_GRAPH_PROPERTY_VALUES = {
* for the page.
*/
function getProductAttributeInfo() {
// NOTE(review): this body appears to interleave two versions of the lookup
// loop (a hostname-substring match and a URL regular-expression match), the
// way a diff view would render them; the braces do not balance as written.
// Confirm against the actual file before relying on this block.
const hostname = new URL(window.location.href).host;
// Variant 1: each key of extractionData is a vendor string searched for
// inside the page's host.
for (const [vendor, attributeInfo] of Object.entries(extractionData)) {
if (hostname.includes(vendor)) {
const url = window.location.href;
// Variant 2: each key of extractionData is compiled as a RegExp and tested
// against the full page URL.
for (const [regExpStr, attributeInfo] of Object.entries(extractionData)) {
const regExp = new RegExp(regExpStr);
if (regExp.test(url)) {
// The first matching entry wins.
return attributeInfo;
}
}
// No entry matched the current page.
return null;
}

/**
 * Reads a single string value off an element, using the named property or
 * attribute as the source.
 *
 * Supported sources are the 'content' attribute and the 'innerText' and 'src'
 * properties; any other name is rejected.
 *
 * @param {HTMLElement} element Element to read from.
 * @param {string} extractionProperty One of 'content', 'innerText' or 'src'.
 * @returns {string} The value read from the element.
 * @throws {Error} If extractionProperty is not one of the supported names.
 */
function extractValueFromElement(element, extractionProperty) {
  // 'content' is read as an attribute rather than as a property.
  if (extractionProperty === 'content') {
    return element.getAttribute('content');
  }
  // The remaining supported sources are plain properties of the same name.
  if (extractionProperty === 'innerText' || extractionProperty === 'src') {
    return element[extractionProperty];
  }
  throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
Expand All @@ -59,18 +43,22 @@ export default function extractProduct() {
const data = {};
const attributeInfo = getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
const {selectors, extractUsing} = extractor;
for (const selector of selectors) {
for (const [productAttribute, tuples] of Object.entries(attributeInfo)) {
for (const tuple of tuples) {
const [selector, extractUsing] = tuple;
const element = document.querySelector(selector);
if (element) {
data[productAttribute] = extractValueFromElement(element, extractUsing);
if (productAttribute === 'price') {
data[productAttribute] = getPriceString(element, extractUsing);
} else {
data[productAttribute] = extractValueFromElement(element, extractUsing);
}
if (data[productAttribute]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
}
} else if (selector === selectors[selectors.length - 1]) {
} else if (tuple === tuples[tuples.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}
Expand Down
88 changes: 88 additions & 0 deletions src/extraction/fallback_extraction_selectors.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}amazon\\.com": {
"title": [
["#productTitle", "innerText"],
[".product-title", "innerText"]
],
"price": [
["#priceblock_dealprice", "innerText"],
["#priceblock_ourprice", "innerText"],
["#price_inside_buybox", "innerText"],
["#buybox .a-color-price", "innerText"],
["input[name='displayedPrice']", "value"],
[".a-size-large.a-color-price.guild_priceblock_ourprice", "innerText"],
[".a-color-price.a-size-medium.a-align-bottom", "innerText"],
[".display-price", "innerText"], [".offer-price", "innerText"]
],
"image": [
["#landingImage", "src"],
["#imgBlkFront", "src"],
["#ebooksImgBlkFront", "src"]
]
},
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}bestbuy\\.com": {
"title": [
[".sku-title h1", "innerText"]
],
"price": [
[".priceView-hero-price.priceView-purchase-price", "innerText"]
],
"image": [
["img.primary-image", "src"]
]
},
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}ebay\\.com": {
"title": [
["#itemTitle", "innerText"],
[".product-title", "innerText"]
],
"price": [
["#prcIsum", "innerText"],
["#orgPrc", "innerText"],
["#mm-saleDscPrc", "innerText"],
[".display-price", "innerText"]
],
"image": [
["#icImg", "src"],
[".vi-image-gallery__image.vi-image-gallery__image--absolute-center", "src"]
]
},
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}homedepot\\.com": {
"title": [
["h1.product-title__title", "innerText"]
],
"price": [
["#ajaxPrice", "content"],
["#ajaxPriceAlt", "innerText"]
],
"image": [
["#mainImage", "src"]
]
},
"^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}walmart\\.com": {
"title": [
["h1.prod-ProductTitle", "content"],
["h1.prod-ProductTitle", "innerText"]
],
"price": [
[".PriceRange.prod-PriceHero", "innerText"],
[".price-group", "aria-label"],
[".price-group", "innerText"]
],
"image": [
[".prod-hero-image-image", "src"],
[".prod-hero-image-carousel-image", "src"]
]
},
"www.mkelly.me": {
"title": [
["#title", "innerText"]
],
"price": [
["#price", "innerText"]
],
"image": [
["img", "src"]
]
}
}
66 changes: 2 additions & 64 deletions src/extraction/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json';
import RulesetFactory from 'commerce/extraction/ruleset_factory';
import {getPriceString} from 'commerce/utils';

const PRODUCT_FEATURES = ['title', 'price', 'image'];
// Minimum score to be considered the "correct" feature element extracted by Fathom
Expand All @@ -36,75 +37,12 @@ function runRuleset(doc) {
// It is possible for multiple elements to have the same highest score.
if (fnodesList.length >= 1) {
const element = fnodesList[0].element;
// Check for price units and subunits
if (feature === 'price' && element.children.length > 0) {
extractedElements[feature] = getPriceUnitElements(element);
continue;
}
extractedElements[feature] = element;
}
}
return extractedElements;
}

/**
 * Tells whether the given value, read as a string, contains at least one
 * decimal digit.
 *
 * @param {string} string Text to inspect.
 * @returns {boolean} True if a digit is present, false otherwise.
 */
function hasNumber(string) {
  const digitPattern = /\d/;
  return digitPattern.exec(string) !== null;
}

/**
 * Picks the main-unit and sub-unit elements of a price out of an element's
 * children.
 *
 * Only children whose innerText contains a digit are considered. The first
 * such child becomes 'mainUnit'. If there are more, the last one becomes
 * 'subUnit'. Keys are only set when a matching child exists.
 *
 * @param {HTMLElement} element Price element whose children are inspected.
 * @returns {Object} A string:element object with optional 'mainUnit' and
 *   'subUnit' keys.
 */
function getPriceUnitElements(element) {
  const numericChildren = [...element.children].filter(
    (child) => hasNumber(child.innerText),
  );
  const units = {};
  if (numericChildren.length > 0) {
    units.mainUnit = numericChildren[0];
  }
  if (numericChildren.length > 1) {
    // Every later numeric child overwrites the previous sub unit, so the
    // last one is the one that sticks.
    units.subUnit = numericChildren[numericChildren.length - 1];
  }
  return units;
}

/**
 * Builds a cleaned price string from either a single price element or a
 * main-unit/sub-unit pair.
 *
 * @param {Object} priceObj Either an object literal with 'mainUnit' and
 *   'subUnit' elements (price split into e.g. dollars and cents), or an HTML
 *   element whose innerText holds the whole price.
 * @returns {string} The price string after cleanPriceString has been applied.
 */
function getPriceString(priceObj) {
  // No sub units: the element's own text is the price.
  if (!('mainUnit' in priceObj)) {
    return cleanPriceString(priceObj.innerText);
  }
  // Sub units present: stitch main and sub unit text together.
  const {mainUnit, subUnit} = priceObj;
  return cleanPriceString(`$${mainUnit.innerText}.${subUnit.innerText}`);
}


/**
 * Reformats a price string to be of the form "$NX.XX".
 *
 * Commas are removed, anything before the first '$' is dropped, and anything
 * after the two characters following the first '.' is dropped. When the
 * string has no '.', nothing is trimmed from the end, so a whole-number price
 * such as "$1,299" is kept intact.
 *
 * @param {string} priceStr Raw price text.
 * @returns {string} The cleaned price string.
 */
function cleanPriceString(priceStr) {
  // Remove any commas (thousands separators).
  const withoutCommas = priceStr.replace(/,/g, '');
  // Remove any characters preceding the '$'. If there is no '$', indexOf
  // returns -1, which substring treats as 0, leaving the string untouched.
  const fromCurrencySign = withoutCommas.substring(withoutCommas.indexOf('$'));
  // Remove any characters following the '.XX', but only when a '.' exists.
  // Without this guard, indexOf's -1 plus 3 would cut the string to its
  // first two characters.
  const decimalIndex = fromCurrencySign.indexOf('.');
  if (decimalIndex === -1) {
    return fromCurrencySign;
  }
  return fromCurrencySign.substring(0, decimalIndex + 3);
}

/**
* Returns true if every key in PRODUCT_FEATURES has a truthy value.
*/
Expand All @@ -125,7 +63,7 @@ export default function extractProduct(doc) {
continue;
// Clean up price string and check for subunits
} else if (feature === 'price') {
const priceStr = getPriceString(extractedElements[feature]);
const priceStr = getPriceString(extractedElements[feature], 'innerText');
extractedProduct[feature] = priceStr;
continue;
}
Expand Down
99 changes: 0 additions & 99 deletions src/extraction/product_extraction_data.json

This file was deleted.

1 change: 1 addition & 0 deletions src/product_info.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ async function getProductInfo() {
extractProductWithFathom(window.document)
|| extractProductWithFallback()
);

await browser.runtime.sendMessage({
from: 'content',
subject: 'ready',
Expand Down
Loading

0 comments on commit 5808d41

Please sign in to comment.