Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
Merge pull request #38 from mozilla/36-fathom
Browse files Browse the repository at this point in the history
#36: Integrate Fathom-based page extraction with a simple ruleset.
  • Loading branch information
biancadanforth authored Aug 3, 2018
2 parents 5ed5552 + 505c16d commit 6391a7d
Show file tree
Hide file tree
Showing 10 changed files with 2,808 additions and 2,526 deletions.
5,055 changes: 2,615 additions & 2,440 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
},
"dependencies": {
"autobind-decorator": "2.1.0",
"fathom-web": "2.3.0",
"prop-types": "15.6.2",
"react": "16.4.1",
"react-dom": "16.4.1"
Expand Down
4 changes: 2 additions & 2 deletions src/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

browser.runtime.onConnect.addListener((port) => {
port.onMessage.addListener((message) => {
if (message.type === 'product-data') {
console.log(message.data); // eslint-disable-line no-console
if (message.from === 'content' && message.subject === 'ready') {
console.log(message.extractedProduct); // eslint-disable-line no-console
}
});
port.postMessage({
Expand Down
88 changes: 88 additions & 0 deletions src/fallback_extraction.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
* Uses CSS selectors, or failing that, Open Graph <meta> tags to extract
* a product from its product page, where a 'product' is defined by the bundle
* of features that makes it identifiable.
*
* Features: title, image, price
*/

import extractionData from 'commerce/product_extraction_data.json';

/**
 * Maps each product feature name to the value of the `property` attribute
 * of the Open Graph <meta> tag that is queried for it when no
 * vendor-specific extraction data is available for the page.
 */
const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
image: 'og:image',
price: 'og:price:amount',
};

/**
 * Looks up vendor-specific extraction data for the current page.
 *
 * The host of `window.location.href` is compared against every vendor key in
 * the extraction data; the first vendor whose key is a substring of the host
 * wins.
 *
 * @returns {?Object} The attribute info registered for the matching vendor,
 *   or null when no vendor key matches the page's host.
 */
function getProductAttributeInfo() {
  const {host} = new URL(window.location.href);
  const vendorEntry = Object.entries(extractionData)
    .find(([vendor]) => host.includes(vendor));
  return vendorEntry ? vendorEntry[1] : null;
}

/**
 * Reads a single value off an element, using the named property or attribute.
 *
 * Supported values for `extractionProperty`:
 *   - 'content': the element's `content` attribute
 *   - 'innerText': the element's `innerText` property
 *   - 'src': the element's `src` property
 *
 * @param {HTMLElement} element
 * @param {string} extractionProperty
 * @returns {?string} The value read from the element.
 * @throws {Error} If `extractionProperty` is not one of the supported names.
 */
function extractValueFromElement(element, extractionProperty) {
  if (extractionProperty === 'content') {
    return element.getAttribute('content');
  }
  if (extractionProperty === 'innerText') {
    return element.innerText;
  }
  if (extractionProperty === 'src') {
    return element.src;
  }
  throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}

/**
 * Collects product information from the current document.
 *
 * When vendor-specific extraction data exists for the page, each product
 * attribute is read from the first of its CSS selectors that matches an
 * element. Otherwise, the attributes are read from whichever Open Graph
 * <meta> tags are present.
 *
 * @returns {Object} Product attributes keyed by attribute name.
 * @throws {Error} On a vendor page, if the first matching element yields an
 *   empty value, or if the last selector for an attribute is reached without
 *   any element matching.
 */
export default function extractProduct() {
  const product = {};
  const vendorExtractors = getProductAttributeInfo();

  if (!vendorExtractors) {
    // Unknown vendor: fall back to Open Graph <meta> tags, skipping any that are absent.
    Object.entries(OPEN_GRAPH_PROPERTY_VALUES).forEach(([attribute, ogProperty]) => {
      const metaTag = document.querySelector(`meta[property='${ogProperty}']`);
      if (metaTag) {
        product[attribute] = metaTag.getAttribute('content');
      }
    });
    return product;
  }

  for (const [attribute, {selectors, extractUsing}] of Object.entries(vendorExtractors)) {
    const finalSelector = selectors[selectors.length - 1];
    for (const selector of selectors) {
      const match = document.querySelector(selector);
      if (match) {
        const value = extractValueFromElement(match, extractUsing);
        if (!value) {
          throw new Error(`Element found did not return a valid product ${attribute}.`);
        }
        product[attribute] = value;
        break;
      }
      if (selector === finalSelector) {
        // None of the selectors matched an element on the page
        throw new Error(`No elements found with vendor data for product ${attribute}.`);
      }
    }
  }
  return product;
}
3 changes: 3 additions & 0 deletions src/fathom_coefficients.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"hasPriceClass": 2
}
70 changes: 70 additions & 0 deletions src/fathom_extraction.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
* Uses Fathom to extract a product from its product page,
* where a 'product' is defined by the bundle of features that
* makes it identifiable.
*
* Features: title, image, price
*/

import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';

// Minimum 'priceish' score an fnode must reach to be accepted as the price
// element. It equals the score assigned to elements that have a "price" class.
const SCORE_THRESHOLD = fathomCoeffs.hasPriceClass;

/**
 * Scoring callback for 'priceish' fnodes.
 *
 * @param {Object} fnode Fathom node wrapping a DOM element in `fnode.element`.
 * @returns {number} The `hasPriceClass` coefficient when the element's class
 *   list contains "price"; 1 otherwise.
 */
function hasPriceClass(fnode) {
  const {classList} = fnode.element;
  return classList.contains('price') ? fathomCoeffs.hasPriceClass : 1;
}

/**
 * Ruleset for product features. Each feature has its own type; at present
 * only the price is covered, via the 'priceish' type and the 'product-price'
 * output key.
 */

// Every <div> is a candidate for containing the price.
const priceCandidateRule = rule(dom('div'), type('priceish'));

// Score candidates by whether their class names include 'price'.
const priceClassScoreRule = rule(type('priceish'), score(hasPriceClass));

// Expose the candidate(s) with the maximum score as 'product-price'.
const maxPriceOutputRule = rule(type('priceish').max(), out('product-price'));

const rules = ruleset(
  priceCandidateRule,
  priceClassScoreRule,
  maxPriceOutputRule,
);

/**
 * Runs the ruleset against a document and picks the price element.
 *
 * Of the fnodes emitted under 'product-price', only those whose 'priceish'
 * score is at least SCORE_THRESHOLD are kept.
 *
 * @param {Document} doc The HTML document to run the ruleset against.
 * @returns {?Element} The element of the first qualifying fnode, or null if
 *   none qualifies.
 */
function runRuleset(doc) {
  const meetsThreshold = fnode => fnode.scoreFor('priceish') >= SCORE_THRESHOLD;
  const candidates = rules.against(doc).get('product-price');
  // Several elements can tie for the highest score; the first one is used.
  const [winner] = candidates.filter(meetsThreshold);
  return winner ? winner.element : null;
}

/**
 * Runs the product feature ruleset against the given document and builds a
 * product from the result.
 *
 * The price text comes from the `content` attribute when the matched element
 * is a <meta> tag, and from `textContent` for any other element.
 *
 * @param {Document} doc The document to extract from.
 * @returns {?{price: string}} The extracted product, or null when no price
 *   element was found or its value was empty.
 */
export default function extractProduct(doc) {
  const priceEle = runRuleset(doc);
  if (!priceEle) {
    return null;
  }

  let price;
  if (priceEle.tagName === 'META') {
    price = priceEle.getAttribute('content');
  } else {
    price = priceEle.textContent;
  }

  return price ? {price} : null;
}
4 changes: 3 additions & 1 deletion src/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
"content_scripts": [
{
"matches": ["<all_urls>"],
"js": ["product_info.bundle.js"]
"js": ["product_info.bundle.js"],
"run_at": "document_idle",
"all_frames": false
}
],
"permissions": [
Expand Down
1 change: 1 addition & 0 deletions src/product_extraction_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"price": {
"selectors": [
"#priceblock_ourprice",
"#priceblock_dealprice",
".display-price",
".offer-price"
],
Expand Down
98 changes: 15 additions & 83 deletions src/product_info.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import {retry} from 'commerce/utils';
import extractionData from './product_extraction_data.json';
/**
* Note that this content script is configured in manifest.json to run at
* "document_idle", which is after all DOM content has been loaded.
*/

const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
image: 'og:image',
price: 'og:price:amount',
};
import extractProductWithFathom from 'commerce/fathom_extraction';
import extractProductWithFallback from 'commerce/fallback_extraction';
import {retry} from 'commerce/utils';

/**
* Open a Port to the background script and wait for the background script to
Expand Down Expand Up @@ -54,80 +54,12 @@ async function openBackgroundPort() {
* and if so, sends it to the background script via the port.
*/
async function getProductInfo(port) {
const productInfo = extractData();
if (productInfo) {
port.postMessage({
type: 'product-data',
data: productInfo,
});
}
}

/**
* Returns any extraction data found for the vendor based on the URL
* for the page.
*/
function getProductAttributeInfo() {
const hostname = new URL(window.location.href).host;
for (const [vendor, attributeInfo] of Object.entries(extractionData)) {
if (hostname.includes(vendor)) {
return attributeInfo;
}
}
return null;
}

/**
* Extracts and returns the string value for a given element property or attribute.
*
* @param {HTMLElement} element
* @param {string} extractionProperty
*/
function extractValueFromElement(element, extractionProperty) {
switch (extractionProperty) {
case 'content':
return element.getAttribute('content');
case 'innerText':
return element.innerText;
case 'src':
return element.src;
default:
throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}
}

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
*/
function extractData() {
const data = {};
const attributeInfo = getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
const {selectors, extractUsing} = extractor;
for (const selector of selectors) {
const element = document.querySelector(selector);
if (element) {
data[productAttribute] = extractValueFromElement(element, extractUsing);
if (data[productAttribute]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
}
} else if (selector === selectors[selectors.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}
}
}
} else {
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${value}']`);
if (metaEle) {
data[key] = metaEle.getAttribute('content');
}
}
}
return data;
const extractedProduct = (extractProductWithFathom(window.document)
|| extractProductWithFallback());
extractedProduct.url = window.document.URL;
port.postMessage({
from: 'content',
subject: 'ready',
extractedProduct,
});
}
10 changes: 10 additions & 0 deletions webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,18 @@ module.exports = {
{loader: 'css-loader'},
],
},
// jsdom is imported by fathom-web utils.js; it's only used for testing
{
test: /jsdom.*/,
use: {
loader: 'null-loader',
},
},
],
},
node: {
fs: 'empty',
},
plugins: [
new CopyWebpackPlugin([
// Static files
Expand Down

0 comments on commit 6391a7d

Please sign in to comment.