Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
Update with changes from dependency: PR#111
Browse files Browse the repository at this point in the history
* Replaces ‘fallback_extraction_selectors.json’ with a JS version. This lets each tuple that pairs a CSS selector with extraction instructions point instead to a site-specific extraction method, which returns the desired value when executed.
* Renamed some variables in various functions in ‘fallback_extraction.js’ so that it more closely matches its sister functions in ‘fathom_extraction.js’ for improved readability.
* Modified ‘getPriceInSubunits’ function in './src/extraction/utils.js' to take in an HTML element (from Fathom extraction) OR a string (from fallback extraction).
  * Refactored some of the supporting functions.
  • Loading branch information
biancadanforth committed Sep 21, 2018
1 parent 0ba13fc commit 200bca9
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 122 deletions.
39 changes: 17 additions & 22 deletions src/extraction/fallback_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
* Features: title, image, price
*/

import extractionData from 'commerce/extraction/fallback_extraction_selectors.json';
import {getPriceString, extractValueFromElement} from 'commerce/utils';
import extractionData from 'commerce/extraction/fallback_extraction_selectors';


const OPEN_GRAPH_PROPERTY_VALUES = {
Expand All @@ -24,12 +23,12 @@ const OPEN_GRAPH_PROPERTY_VALUES = {
* Returns any extraction data found for the vendor based on the URL
* for the page.
*/
function getProductAttributeInfo() {
function getFeatureInfo() {
const url = window.location.href;
for (const [regExpStr, attributeInfo] of Object.entries(extractionData)) {
for (const [regExpStr, featureInfo] of Object.entries(extractionData)) {
const regExp = new RegExp(regExpStr);
if (regExp.test(url)) {
return attributeInfo;
return featureInfo;
}
}
return null;
Expand All @@ -40,37 +39,33 @@ function getProductAttributeInfo() {
* selectors if they exist, otherwise from Open Graph <meta> tags.
*/
export default function extractProduct() {
const data = {};
const attributeInfo = getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, tuples] of Object.entries(attributeInfo)) {
for (const tuple of tuples) {
const [selector, extractUsing] = tuple;
const extractedProduct = {};
const featureInfo = getFeatureInfo();
if (featureInfo) {
for (const [feature, routines] of Object.entries(featureInfo)) {
for (const routine of routines) {
const [selector, extractionMethod] = routine;
const element = document.querySelector(selector);
if (element) {
if (productAttribute === 'price') {
data[productAttribute] = getPriceString(element, extractUsing);
} else {
data[productAttribute] = extractValueFromElement(element, extractUsing);
}
if (data[productAttribute]) {
extractedProduct[feature] = extractionMethod(element);
if (extractedProduct[feature]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
throw new Error(`Element found did not return a valid product ${feature}.`);
}
} else if (tuple === tuples[tuples.length - 1]) {
} else if (routine === routines[routines.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
throw new Error(`No elements found with vendor data for product ${feature}.`);
}
}
}
} else {
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${value}']`);
if (metaEle) {
data[key] = metaEle.getAttribute('content');
extractedProduct[key] = metaEle.getAttribute('content');
}
}
}
return data;
return extractedProduct;
}
114 changes: 114 additions & 0 deletions src/extraction/fallback_extraction_selectors.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import {getPriceInSubunits} from 'commerce/extraction/utils';

/**
 * Wraps an extraction method so that the raw value it reads from an element is
 * converted by getPriceInSubunits.
 *
 * @param {Function} fn - Takes an element and returns the raw price value read
 *   from it (for example its innerText or an attribute value).
 * @returns {Function} Takes an element and returns the result of
 *   getPriceInSubunits for the extracted value, or NaN when nothing was extracted.
 */
function inUnits(fn) {
  return (element) => {
    const rawPrice = fn(element);
    // getAttribute() yields null for a missing attribute and a property lookup can
    // yield undefined. getPriceInSubunits treats any non-string as an element and
    // reads its childNodes, which would throw a TypeError here, so report the
    // failed extraction as NaN instead.
    if (rawPrice == null) {
      return NaN;
    }
    return getPriceInSubunits(rawPrice);
  };
}

/**
 * Builds an extraction method that reads one named property from an element.
 *
 * @param {string} property - Name of the property to read (e.g. 'innerText', 'src').
 * @returns {Function} Takes an element and returns element[property].
 */
function fromProperty(property) {
  return function readProperty(element) {
    return element[property];
  };
}

/**
 * Builds an extraction method that reads one named attribute from an element.
 *
 * @param {string} attribute - Name of the attribute to read (e.g. 'value', 'content').
 * @returns {Function} Takes an element and returns element.getAttribute(attribute).
 */
function fromAttribute(attribute) {
  return function readAttribute(element) {
    return element.getAttribute(attribute);
  };
}


/**
 * CSS selector data by site, where each selector is paired with a method that extracts the
 * value from the element returned by that selector.
 *
 * Each top-level key is the source of a regular expression that is tested against the page URL.
 * Every key is anchored to the start of the URL and escapes its literal dots, so that a pattern
 * cannot match unrelated text elsewhere in a URL (such as inside a query string).
 *
 * Each value maps a feature (title, price, image) to an ordered list of
 * [selector, extractionMethod] pairs; extractionMethod is called with the matched element.
 */
const fallbackExtractionData = {
  '^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}amazon\\.com': {
    title: [
      ['#productTitle', fromProperty('innerText')],
      ['.product-title', fromProperty('innerText')],
    ],
    price: [
      ['#priceblock_dealprice', inUnits(fromProperty('innerText'))],
      ['#priceblock_ourprice', inUnits(fromProperty('innerText'))],
      ['#price_inside_buybox', inUnits(fromProperty('innerText'))],
      ['#buybox .a-color-price', inUnits(fromProperty('innerText'))],
      ['input[name="displayedPrice"]', inUnits(fromAttribute('value'))],
      ['.a-size-large.a-color-price.guild_priceblock_ourprice', inUnits(fromProperty('innerText'))],
      ['.a-color-price.a-size-medium.a-align-bottom', inUnits(fromProperty('innerText'))],
      ['.display-price', inUnits(fromProperty('innerText'))],
      ['.offer-price', inUnits(fromProperty('innerText'))],
    ],
    image: [
      ['#landingImage', fromProperty('src')],
      ['#imgBlkFront', fromProperty('src')],
      ['#ebooksImgBlkFront', fromProperty('src')],
    ],
  },
  '^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}bestbuy\\.com': {
    title: [
      ['.sku-title h1', fromProperty('innerText')],
    ],
    price: [
      ['.priceView-hero-price.priceView-purchase-price', inUnits(fromProperty('innerText'))],
    ],
    image: [
      ['img.primary-image', fromProperty('src')],
    ],
  },
  '^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}ebay\\.com': {
    title: [
      ['#itemTitle', fromProperty('innerText')],
      ['.product-title', fromProperty('innerText')],
    ],
    price: [
      ['#prcIsum', inUnits(fromProperty('innerText'))],
      ['#orgPrc', inUnits(fromProperty('innerText'))],
      ['#mm-saleDscPrc', inUnits(fromProperty('innerText'))],
      ['.display-price', inUnits(fromProperty('innerText'))],
    ],
    image: [
      ['#icImg', fromProperty('src')],
      ['.vi-image-gallery__image.vi-image-gallery__image--absolute-center', fromProperty('src')],
    ],
  },
  '^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}homedepot\\.com': {
    title: [
      ['h1.product-title__title', fromProperty('innerText')],
    ],
    price: [
      ['#ajaxPrice', inUnits(fromAttribute('content'))],
      ['#ajaxPriceAlt', inUnits(fromProperty('innerText'))],
    ],
    image: [
      ['#mainImage', fromProperty('src')],
    ],
  },
  '^https?:\\/\\/([a-zA-Z0-9]+\\.){0,}walmart\\.com': {
    title: [
      ['h1.prod-ProductTitle', fromAttribute('content')],
      ['h1.prod-ProductTitle', fromProperty('innerText')],
    ],
    price: [
      ['.PriceRange.prod-PriceHero', inUnits(fromProperty('innerText'))],
      ['.price-group', inUnits(fromAttribute('aria-label'))],
      ['.price-group', inUnits(fromProperty('innerText'))],
    ],
    image: [
      ['.prod-hero-image-image', fromProperty('src')],
      ['.prod-hero-image-carousel-image', fromProperty('src')],
    ],
  },
  // Anchored and escaped like the other keys: the bare string 'www.mkelly.me' is
  // compiled as a regular expression, where each '.' matches any character and the
  // pattern may match at any position in a URL.
  '^https?:\\/\\/www\\.mkelly\\.me': {
    title: [
      ['#title', fromProperty('innerText')],
    ],
    price: [
      ['#price', inUnits(fromProperty('innerText'))],
    ],
    image: [
      ['img', fromProperty('src')],
    ],
  },
};

export default fallbackExtractionData;
88 changes: 0 additions & 88 deletions src/extraction/fallback_extraction_selectors.json

This file was deleted.

51 changes: 39 additions & 12 deletions src/extraction/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Converts a price element into a numerical price value in subunits (like cents).
* e.g. <span>$10.00</span> returns 1000. If string parsing fails, returns NaN.
* @param {HTMLElement} priceEle
* Converts a price element (from Fathom extraction) or string (from fallback extraction) into
* a numerical price value in subunits (like cents); e.g. <span>$10.00</span> and "$10.00" both
* return 1000. If string parsing fails, returns NaN.
* @param {HTMLElement|string} price
* @returns {Number} the price in subunits
*/
export function getPriceInSubunits(priceEle) {
const priceUnits = getPriceUnits(priceEle.childNodes);
export function getPriceInSubunits(price) {
let priceUnits = [];
if (typeof price === 'string') {
priceUnits = getPriceUnitsFromStr(price);
} else {
priceUnits = getPriceUnitsFromArr(Array.from(price.childNodes));
}
// Convert units and subunits to a single integer value in subunits
switch (priceUnits.length) {
case 1:
Expand All @@ -22,21 +28,42 @@ export function getPriceInSubunits(priceEle) {
}

/**
* Extracts price units by filtering and cleaning textContent from text and DOM nodes
* @param {Array.NodeList} nodes
* Extracts price units from textContent from text and/or DOM nodes
* @param {Array} Array of DOM nodes
* @returns {Array.Number}
*/
function getPriceUnits(nodes) {
const nodesArr = Array.from(nodes);
// Separate token strings in a list into substrings using '$' and '.' as separators
const allTokens = nodesArr.flatMap(token => token.textContent.split(/[.$]/));
function getPriceUnitsFromArr(arr) {
  // Split each node's textContent into substrings and flatten them into one token list.
  const tokens = arr.flatMap(node => splitString(node.textContent));
  // Reduce the token list to numeric price units.
  return cleanPriceTokens(tokens);
}

/**
 * Extracts price units from a string by splitting it into substrings and
 * cleaning the resulting tokens.
 * @param {String} str the price text to parse
 * @returns {Array.Number}
 */
function getPriceUnitsFromStr(str) {
  const tokens = splitString(str);
  return cleanPriceTokens(tokens);
}

/**
* Filters and cleans string tokens
* @param {Array.String}
* @returns {Array.Number}
*/
function cleanPriceTokens(tokens) {
// Filter out any tokens that do not contain a digit
const priceTokens = allTokens.filter(token => /\d/g.test(token));
const priceTokens = tokens.filter(token => /\d/g.test(token));

// Remove any non-digit characters for each token in the list
const cleanedPriceTokens = priceTokens.map(token => token.replace(/\D/g, ''));

// Convert price token strings to integers
return cleanedPriceTokens.map(token => parseInt(token, 10));
}

/**
 * Breaks a string into an array of substrings, treating every '$' and '.'
 * character as a separator.
 * @param {String} str the string to split
 * @returns {Array.String} the substrings found between separators
 */
function splitString(str) {
  const separators = /[.$]/;
  return str.split(separators);
}

0 comments on commit 200bca9

Please sign in to comment.