Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
Merge pull request #114 from mozilla/84-fallback-extraction
Browse files Browse the repository at this point in the history
Fix #84: Improve fallback extraction
  • Loading branch information
biancadanforth authored Sep 28, 2018
2 parents 9622b29 + 9e546b7 commit 4667bb1
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 170 deletions.
75 changes: 31 additions & 44 deletions src/extraction/fallback_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
* Features: title, image, price
*/

import extractionData from 'commerce/extraction/product_extraction_data.json';
import extractionData from 'commerce/extraction/fallback_extraction_selectors';


const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
Expand All @@ -22,67 +23,53 @@ const OPEN_GRAPH_PROPERTY_VALUES = {
* Returns any extraction data found for the vendor based on the URL
* for the page.
*/
function getProductAttributeInfo() {
function getFeatureInfo() {
const hostname = new URL(window.location.href).host;
for (const [vendor, attributeInfo] of Object.entries(extractionData)) {
if (hostname.includes(vendor)) {
return attributeInfo;
for (const siteInfo of extractionData) {
for (const domain of siteInfo.domains) {
if (hostname.includes(domain)) {
return siteInfo.features;
}
}
}
return null;
}

/**
* Extracts and returns the string value for a given element property or attribute.
*
* @param {HTMLElement} element
* @param {string} extractionProperty
*/
function extractValueFromElement(element, extractionProperty) {
switch (extractionProperty) {
case 'content':
return element.getAttribute('content');
case 'innerText':
return element.innerText;
case 'src':
return element.src;
default:
throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
/**
 * Tries each [CSS selector, extraction method] pair in order and returns the
 * first truthy value extracted from an element matching its selector.
 *
 * Logs a warning for every matched element whose extraction method yields a
 * falsy value, and a final warning when no pair produced a value.
 *
 * @param {Array} extractors List of [selector, extractionMethod] pairs
 * @returns {*} The first truthy extracted value, or null if none was found
 */
function findValue(extractors) {
  for (const [cssSelector, extract] of extractors) {
    const match = document.querySelector(cssSelector);
    if (!match) {
      continue;
    }

    const extracted = extract(match);
    if (extracted) {
      return extracted;
    }
    // eslint-disable-next-line no-console
    console.warn('Element found did not return a valid value for the product feature.');
  }

  // eslint-disable-next-line no-console
  console.warn('No elements found with vendor data for the product feature.');
  return null;
}

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
*/
export default function extractProduct() {
const data = {};
const attributeInfo = getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
const {selectors, extractUsing} = extractor;
for (const selector of selectors) {
const element = document.querySelector(selector);
if (element) {
data[productAttribute] = extractValueFromElement(element, extractUsing);
if (data[productAttribute]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
}
} else if (selector === selectors[selectors.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}
}
const extractedProduct = {};
const featureInfo = getFeatureInfo();
if (featureInfo) {
for (const [feature, extractors] of Object.entries(featureInfo)) {
extractedProduct[feature] = findValue(extractors);
}
} else {
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${value}']`);
for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${propertyValue}']`);
if (metaEle) {
data[key] = metaEle.getAttribute('content');
extractedProduct[feature] = metaEle.getAttribute('content');
}
}
}
return data;
return extractedProduct;
}
135 changes: 135 additions & 0 deletions src/extraction/fallback_extraction_selectors.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import {parsePrice} from 'commerce/extraction/utils';

/**
 * Wraps an extraction method so that the string it extracts from an element
 * is handed to parsePrice as a single-item list.
 *
 * @param {function} fn Extraction method called with the element
 * @returns {function} Method that takes an element and returns whatever
 *   parsePrice returns for the extracted string
 */
function inUnits(fn) {
  return element => parsePrice([fn(element)]);
}

/**
 * Builds an extraction method that reads a named property from an element.
 *
 * @param {string} property Name of the property to read
 * @returns {function} Method that takes an element and returns
 *   element[property]
 */
function fromProperty(property) {
  return (element) => {
    const propertyValue = element[property];
    return propertyValue;
  };
}

/**
 * Builds an extraction method that reads a named attribute from an element
 * via getAttribute.
 *
 * @param {string} attribute Name of the attribute to read
 * @returns {function} Method that takes an element and returns
 *   element.getAttribute(attribute)
 */
function fromAttribute(attribute) {
  return (element) => {
    const attributeValue = element.getAttribute(attribute);
    return attributeValue;
  };
}


// Extraction methods that recur throughout the table below.
const readInnerText = fromProperty('innerText');
const readSrc = fromProperty('src');
const priceFromInnerText = inUnits(readInnerText);
const priceFromAttribute = attribute => inUnits(fromAttribute(attribute));

/**
 * Per-site CSS selector data. Each entry lists the domains it applies to and,
 * for every product feature (title, price, image), an ordered list of
 * [CSS selector, extraction method] pairs. The extraction method pulls the
 * feature's value out of the element returned by its selector.
 */
const fallbackExtractionData = [
  {
    domains: ['amazon.com', 'www.amazon.com', 'smile.amazon.com'],
    features: {
      title: [
        ['#productTitle', readInnerText],
        ['.product-title', readInnerText],
      ],
      price: [
        ['#priceblock_dealprice', priceFromInnerText],
        ['#priceblock_ourprice', priceFromInnerText],
        ['#price_inside_buybox', priceFromInnerText],
        ['#buybox .a-color-price', priceFromInnerText],
        ['input[name="displayedPrice"]', priceFromAttribute('value')],
        ['.a-size-large.a-color-price.guild_priceblock_ourprice', priceFromInnerText],
        ['.a-color-price.a-size-medium.a-align-bottom', priceFromInnerText],
        ['.display-price', priceFromInnerText],
        ['.offer-price', priceFromInnerText],
      ],
      image: [
        ['#landingImage', readSrc],
        ['#imgBlkFront', readSrc],
        ['#ebooksImgBlkFront', readSrc],
      ],
    },
  },
  {
    domains: ['bestbuy.com', 'www.bestbuy.com'],
    features: {
      title: [
        ['.sku-title h1', readInnerText],
      ],
      price: [
        ['.priceView-hero-price.priceView-purchase-price', priceFromInnerText],
      ],
      image: [
        ['img.primary-image', readSrc],
      ],
    },
  },
  {
    domains: ['ebay.com', 'www.ebay.com'],
    features: {
      title: [
        ['#itemTitle', readInnerText],
        ['.product-title', readInnerText],
      ],
      price: [
        ['#prcIsum', priceFromInnerText],
        ['#orgPrc', priceFromInnerText],
        ['#mm-saleDscPrc', priceFromInnerText],
        ['.display-price', priceFromInnerText],
      ],
      image: [
        ['#icImg', readSrc],
        ['.vi-image-gallery__image.vi-image-gallery__image--absolute-center', readSrc],
      ],
    },
  },
  {
    domains: ['homedepot.com', 'www.homedepot.com'],
    features: {
      title: [
        ['h1.product-title__title', readInnerText],
      ],
      price: [
        ['#ajaxPrice', priceFromAttribute('content')],
        ['#ajaxPriceAlt', priceFromInnerText],
      ],
      image: [
        ['#mainImage', readSrc],
      ],
    },
  },
  {
    domains: ['walmart.com', 'www.walmart.com'],
    features: {
      title: [
        ['h1.prod-ProductTitle', fromAttribute('content')],
        ['h1.prod-ProductTitle', readInnerText],
      ],
      price: [
        ['.PriceRange.prod-PriceHero', priceFromInnerText],
        ['.price-group', priceFromAttribute('aria-label')],
        ['.price-group', priceFromInnerText],
      ],
      image: [
        ['.prod-hero-image-image', readSrc],
        ['.prod-hero-image-carousel-image', readSrc],
      ],
    },
  },
  {
    domains: ['mkelly.me', 'www.mkelly.me'],
    features: {
      title: [
        ['#title', readInnerText],
      ],
      price: [
        ['#price', priceFromInnerText],
      ],
      image: [
        ['img', readSrc],
      ],
    },
  },
];

export default fallbackExtractionData;
5 changes: 3 additions & 2 deletions src/extraction/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json';
import RulesetFactory from 'commerce/extraction/ruleset_factory';
import {getPriceInSubunits} from 'commerce/extraction/utils';
import {parsePrice} from 'commerce/extraction/utils';

// Minimum score to be considered the "correct" feature element extracted by Fathom
const SCORE_THRESHOLD = 4;
Expand Down Expand Up @@ -42,7 +42,8 @@ const PRODUCT_FEATURES = {
price: {
...FEATURE_DEFAULTS,
getValueFromElement(element) {
return getPriceInSubunits(element);
const tokens = Array.from(element.childNodes).map(node => node.textContent);
return parsePrice(tokens);
},
},
};
Expand Down
99 changes: 0 additions & 99 deletions src/extraction/product_extraction_data.json

This file was deleted.

Loading

0 comments on commit 4667bb1

Please sign in to comment.