Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Fix #84: Improve fallback extraction #114

Merged
merged 4 commits into from
Sep 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 31 additions & 44 deletions src/extraction/fallback_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
* Features: title, image, price
*/

import extractionData from 'commerce/extraction/product_extraction_data.json';
import extractionData from 'commerce/extraction/fallback_extraction_selectors';


const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
Expand All @@ -22,67 +23,53 @@ const OPEN_GRAPH_PROPERTY_VALUES = {
* Returns any extraction data found for the vendor based on the URL
* for the page.
*/
function getProductAttributeInfo() {
function getFeatureInfo() {
const hostname = new URL(window.location.href).host;
for (const [vendor, attributeInfo] of Object.entries(extractionData)) {
if (hostname.includes(vendor)) {
return attributeInfo;
for (const siteInfo of extractionData) {
for (const domain of siteInfo.domains) {
if (hostname.includes(domain)) {
return siteInfo.features;
}
}
}
return null;
}

/**
* Extracts and returns the string value for a given element property or attribute.
*
* @param {HTMLElement} element
* @param {string} extractionProperty
*/
function extractValueFromElement(element, extractionProperty) {
switch (extractionProperty) {
case 'content':
return element.getAttribute('content');
case 'innerText':
return element.innerText;
case 'src':
return element.src;
default:
throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
/**
 * Tries each [selector, extractionMethod] pair in order and returns the first
 * truthy value produced by an extraction method for a matching element.
 *
 * A warning is logged for every matched element whose extraction method
 * yields a falsy value, and once more if no pair produced a value.
 *
 * @param {Iterable<[string, function(Element): *]>} extractors
 *   Ordered pairs of CSS selector and the function applied to the element
 *   that selector matches.
 * @returns {*} The first truthy extracted value, or null if there is none.
 */
function findValue(extractors) {
  for (const pair of extractors) {
    const [cssSelector, extract] = pair;
    const match = document.querySelector(cssSelector);
    if (!match) {
      // Selector is not present on this page; try the next one.
      continue;
    }
    const extracted = extract(match);
    if (extracted) {
      return extracted;
    }
    // eslint-disable-next-line no-console
    console.warn('Element found did not return a valid value for the product feature.');
  }
  // eslint-disable-next-line no-console
  console.warn('No elements found with vendor data for the product feature.');
  return null;
}

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
*/
export default function extractProduct() {
const data = {};
const attributeInfo = getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
const {selectors, extractUsing} = extractor;
for (const selector of selectors) {
const element = document.querySelector(selector);
if (element) {
data[productAttribute] = extractValueFromElement(element, extractUsing);
if (data[productAttribute]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
}
} else if (selector === selectors[selectors.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}
}
const extractedProduct = {};
const featureInfo = getFeatureInfo();
if (featureInfo) {
for (const [feature, extractors] of Object.entries(featureInfo)) {
extractedProduct[feature] = findValue(extractors);
}
} else {
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${value}']`);
for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${propertyValue}']`);
if (metaEle) {
data[key] = metaEle.getAttribute('content');
extractedProduct[feature] = metaEle.getAttribute('content');
}
}
}
return data;
return extractedProduct;
}
135 changes: 135 additions & 0 deletions src/extraction/fallback_extraction_selectors.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import {parsePrice} from 'commerce/extraction/utils';

/**
 * Wraps a text extractor so that the text it returns is passed through
 * parsePrice before being handed back.
 *
 * @param {function(HTMLElement): ?string} fn
 *   Extractor returning the raw price text for an element.
 * @returns {function(HTMLElement): *}
 *   Extractor returning parsePrice's result for that text, or null when fn
 *   produced no text at all (null or undefined).
 */
function inUnits(fn) {
  return (element) => {
    const priceString = fn(element);
    // Attribute-based extractors yield null when the attribute is absent.
    // Report "no value" rather than handing a non-string token to parsePrice,
    // so callers that treat a falsy result as a miss can try another source.
    if (priceString === null || priceString === undefined) {
      return null;
    }
    return parsePrice([priceString]);
  };
}

/**
 * Builds an extractor that reads a named property from an element.
 *
 * @param {string} property Name of the property to read, e.g. 'innerText'.
 * @returns {function(Object): *} Function returning element[property].
 */
function fromProperty(property) {
  return function readProperty(element) {
    return element[property];
  };
}

/**
 * Builds an extractor that reads a named attribute from an element via
 * its getAttribute method.
 *
 * @param {string} attribute Name of the attribute to read, e.g. 'content'.
 * @returns {function(Element): *} Function returning
 *   element.getAttribute(attribute).
 */
function fromAttribute(attribute) {
  const readAttribute = (node) => node.getAttribute(attribute);
  return readAttribute;
}


// Extraction methods shared by the table below, built once and reused.
const textOf = fromProperty('innerText');
const srcOf = fromProperty('src');
const priceFromText = inUnits(textOf);

/**
 * Per-site CSS selector table.
 *
 * Each entry lists the domains it applies to and, for the title, price and
 * image features, an ordered list of [selector, extractionMethod] pairs. The
 * extraction method pulls the feature's value out of the element that the
 * selector returns.
 */
const fallbackExtractionData = [
  // Amazon
  {
    domains: ['amazon.com', 'www.amazon.com', 'smile.amazon.com'],
    features: {
      title: [
        ['#productTitle', textOf],
        ['.product-title', textOf],
      ],
      price: [
        ['#priceblock_dealprice', priceFromText],
        ['#priceblock_ourprice', priceFromText],
        ['#price_inside_buybox', priceFromText],
        ['#buybox .a-color-price', priceFromText],
        ['input[name="displayedPrice"]', inUnits(fromAttribute('value'))],
        ['.a-size-large.a-color-price.guild_priceblock_ourprice', priceFromText],
        ['.a-color-price.a-size-medium.a-align-bottom', priceFromText],
        ['.display-price', priceFromText],
        ['.offer-price', priceFromText],
      ],
      image: [
        ['#landingImage', srcOf],
        ['#imgBlkFront', srcOf],
        ['#ebooksImgBlkFront', srcOf],
      ],
    },
  },
  // Best Buy
  {
    domains: ['bestbuy.com', 'www.bestbuy.com'],
    features: {
      title: [
        ['.sku-title h1', textOf],
      ],
      price: [
        ['.priceView-hero-price.priceView-purchase-price', priceFromText],
      ],
      image: [
        ['img.primary-image', srcOf],
      ],
    },
  },
  // eBay
  {
    domains: ['ebay.com', 'www.ebay.com'],
    features: {
      title: [
        ['#itemTitle', textOf],
        ['.product-title', textOf],
      ],
      price: [
        ['#prcIsum', priceFromText],
        ['#orgPrc', priceFromText],
        ['#mm-saleDscPrc', priceFromText],
        ['.display-price', priceFromText],
      ],
      image: [
        ['#icImg', srcOf],
        ['.vi-image-gallery__image.vi-image-gallery__image--absolute-center', srcOf],
      ],
    },
  },
  // Home Depot
  {
    domains: ['homedepot.com', 'www.homedepot.com'],
    features: {
      title: [
        ['h1.product-title__title', textOf],
      ],
      price: [
        ['#ajaxPrice', inUnits(fromAttribute('content'))],
        ['#ajaxPriceAlt', priceFromText],
      ],
      image: [
        ['#mainImage', srcOf],
      ],
    },
  },
  // Walmart
  {
    domains: ['walmart.com', 'www.walmart.com'],
    features: {
      title: [
        ['h1.prod-ProductTitle', fromAttribute('content')],
        ['h1.prod-ProductTitle', textOf],
      ],
      price: [
        ['.PriceRange.prod-PriceHero', priceFromText],
        ['.price-group', inUnits(fromAttribute('aria-label'))],
        ['.price-group', priceFromText],
      ],
      image: [
        ['.prod-hero-image-image', srcOf],
        ['.prod-hero-image-carousel-image', srcOf],
      ],
    },
  },
  // mkelly.me
  {
    domains: ['mkelly.me', 'www.mkelly.me'],
    features: {
      title: [
        ['#title', textOf],
      ],
      price: [
        ['#price', priceFromText],
      ],
      image: [
        ['img', srcOf],
      ],
    },
  },
];

export default fallbackExtractionData;
5 changes: 3 additions & 2 deletions src/extraction/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json';
import RulesetFactory from 'commerce/extraction/ruleset_factory';
import {getPriceInSubunits} from 'commerce/extraction/utils';
import {parsePrice} from 'commerce/extraction/utils';

// Minimum score to be considered the "correct" feature element extracted by Fathom
const SCORE_THRESHOLD = 4;
Expand Down Expand Up @@ -42,7 +42,8 @@ const PRODUCT_FEATURES = {
price: {
...FEATURE_DEFAULTS,
getValueFromElement(element) {
return getPriceInSubunits(element);
const tokens = Array.from(element.childNodes).map(node => node.textContent);
return parsePrice(tokens);
},
},
};
Expand Down
99 changes: 0 additions & 99 deletions src/extraction/product_extraction_data.json

This file was deleted.

Loading