Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

#36: Integrate Fathom-based page extraction with a simple ruleset. #38

Merged
merged 5 commits into from
Aug 3, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5,055 changes: 2,615 additions & 2,440 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
},
"dependencies": {
"autobind-decorator": "2.1.0",
"fathom-web": "2.3.0",
"prop-types": "15.6.2",
"react": "16.4.1",
"react-dom": "16.4.1"
Expand Down
4 changes: 2 additions & 2 deletions src/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

browser.runtime.onConnect.addListener((port) => {
port.onMessage.addListener((message) => {
if (message.type === 'product-data') {
console.log(message.data); // eslint-disable-line no-console
if (message.from === 'content' && message.subject === 'ready') {
console.log(message.extractedProduct); // eslint-disable-line no-console
}
});
port.postMessage({
Expand Down
88 changes: 88 additions & 0 deletions src/fallback_extraction.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
* Uses CSS selectors, or failing that, Open Graph <meta> tags to extract
* a product from its product page, where a 'product' is defined by the bundle
* of features that makes it identifiable.
*
* Features: title, image, price
*/

import extractionData from 'commerce/product_extraction_data.json';

// Maps each product feature name to the value of the Open Graph <meta>
// `property` attribute that extractProduct() queries when the page's host has
// no vendor-specific extraction data.
const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
image: 'og:image',
price: 'og:price:amount',
};

/**
 * Looks up the vendor-specific extraction data for the current page.
 *
 * The host of window.location.href (which includes the port, if any) is
 * checked against each vendor key of extractionData in key order; the first
 * key that occurs anywhere inside the host wins.
 *
 * @returns {Object|null} The matching vendor's attribute info, or null when
 *   no vendor key is a substring of the host.
 */
function getProductAttributeInfo() {
  const {host} = new URL(window.location.href);
  const match = Object.entries(extractionData).find(([vendor]) => host.includes(vendor));
  return match ? match[1] : null;
}

/**
 * Reads one value off an element, selected by name.
 *
 * Supported names: 'content' (the content attribute), 'innerText' and 'src'
 * (the element properties of the same name).
 *
 * @param {HTMLElement} element
 * @param {string} extractionProperty
 * @returns {*} Whatever the attribute or property lookup yields.
 * @throws {Error} If extractionProperty is not one of the supported names.
 */
function extractValueFromElement(element, extractionProperty) {
  if (extractionProperty === 'content') {
    return element.getAttribute('content');
  }
  if (extractionProperty === 'innerText') {
    return element.innerText;
  }
  if (extractionProperty === 'src') {
    return element.src;
  }
  throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}

/**
 * Extracts a single product attribute using a vendor's ordered selector list.
 *
 * Selectors are tried in order; the first one that matches an element on the
 * page decides the outcome. The "nothing matched" error is raised only after
 * the whole list has been tried, so a selector string that appears more than
 * once in the list cannot cut the search short.
 *
 * @param {string} productAttribute Attribute name; used in error messages.
 * @param {{selectors: string[], extractUsing: string}} extractor
 * @returns {*} The truthy value read from the first matching element.
 * @throws {Error} If the first matching element yields an empty value, or if
 *   no selector (including an empty selector list) matches any element.
 */
function extractAttributeWithSelectors(productAttribute, {selectors, extractUsing}) {
  for (const selector of selectors) {
    const element = document.querySelector(selector);
    if (element) {
      const value = extractValueFromElement(element, extractUsing);
      if (!value) {
        // A matched element with no usable value is treated as an error
        // rather than falling through to the remaining selectors.
        throw new Error(`Element found did not return a valid product ${productAttribute}.`);
      }
      return value;
    }
  }
  // None of the selectors matched an element on the page
  throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}

/**
 * Returns any product information available on the page from CSS
 * selectors if vendor data exists for the page, otherwise from Open Graph
 * <meta> tags.
 *
 * @returns {Object} Map of product attribute name to extracted value. In the
 *   Open Graph case, attributes whose <meta> tag is absent are omitted.
 * @throws {Error} In the vendor case, if any attribute cannot be extracted.
 */
export default function extractProduct() {
  const data = {};
  const attributeInfo = getProductAttributeInfo();
  if (attributeInfo) {
    for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
      data[productAttribute] = extractAttributeWithSelectors(productAttribute, extractor);
    }
    return data;
  }

  for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
    const metaEle = document.querySelector(`meta[property='${value}']`);
    if (metaEle) {
      data[key] = metaEle.getAttribute('content');
    }
  }
  return data;
}
3 changes: 3 additions & 0 deletions src/fathom_coefficients.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"hasPriceClass": 2
}
70 changes: 70 additions & 0 deletions src/fathom_extraction.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
* Uses Fathom to extract a product from its product page,
* where a 'product' is defined by the bundle of features that
* makes it identifiable.
*
* Features: title, image, price
*/

import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';

// Minimum 'priceish' score an fnode must reach for its element to be accepted
// as the price; taken directly from the hasPriceClass coefficient.
const {hasPriceClass: SCORE_THRESHOLD} = fathomCoeffs;

/**
 * Scoring callback: yields the hasPriceClass coefficient for an fnode whose
 * element carries the exact class token "price", and 1 for any other fnode.
 *
 * @param {Object} fnode Only fnode.element.classList is read.
 * @returns {number}
 */
function hasPriceClass(fnode) {
  const {classList} = fnode.element;
  return classList.contains('price') ? fathomCoeffs.hasPriceClass : 1;
}

/**
 * Fathom ruleset for product features. Only the price feature is covered
 * here: candidates are typed 'priceish' and the single output key is
 * 'product-price'.
 */
const rules = ruleset(
// Tag every <div> element as a price candidate ('priceish').
rule(dom('div'), type('priceish')),

// Score each candidate with hasPriceClass, which tests for the exact class
// token 'price' (classList.contains), not for a substring of the class name.
rule(type('priceish'), score(hasPriceClass)),

// Expose the highest-scoring 'priceish' candidate(s) under 'product-price'.
rule(type('priceish').max(), out('product-price')),
);

/**
 * Runs the ruleset against a document and picks the price element.
 *
 * Of the fnodes returned for 'product-price', only those whose 'priceish'
 * score is at least SCORE_THRESHOLD are kept; several may tie for the top
 * score, in which case the first one is used.
 *
 * @param {Document} doc
 * @returns {Element|null} The chosen fnode's element, or null if none qualify.
 */
function runRuleset(doc) {
  const meetsThreshold = (fnode) => fnode.scoreFor('priceish') >= SCORE_THRESHOLD;
  const qualifying = rules.against(doc).get('product-price').filter(meetsThreshold);
  return qualifying.length > 0 ? qualifying[0].element : null;
}

/**
 * Runs the ruleset for the product features against the given document and
 * returns the extracted product, if any.
 *
 * The price text is read from the element's textContent, or from its
 * `content` attribute when the element is a <meta> tag. The text is trimmed,
 * and a value that is empty after trimming counts as "no price": callers that
 * chain a fallback extractor with `||` then still reach the fallback instead
 * of receiving a whitespace-only price.
 *
 * @param {Document} doc Document to run the ruleset against.
 * @returns {{price: string}|null} The trimmed price text, or null when no
 *   element qualifies or the element has no usable text.
 */
export default function extractProduct(doc) {
  const priceEle = runRuleset(doc);
  if (!priceEle) {
    return null;
  }

  const rawPrice = (priceEle.tagName !== 'META')
    ? priceEle.textContent
    : priceEle.getAttribute('content');
  // rawPrice may be null (missing attribute) or consist only of the
  // whitespace between nested tags; neither is a usable price.
  const price = rawPrice ? rawPrice.trim() : '';
  return price ? {price} : null;
}
4 changes: 3 additions & 1 deletion src/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
"content_scripts": [
{
"matches": ["<all_urls>"],
"js": ["product_info.bundle.js"]
"js": ["product_info.bundle.js"],
"run_at": "document_idle",
"all_frames": false
}
],
"permissions": [
Expand Down
1 change: 1 addition & 0 deletions src/product_extraction_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"price": {
"selectors": [
"#priceblock_ourprice",
"#priceblock_dealprice",
".display-price",
".offer-price"
],
Expand Down
98 changes: 15 additions & 83 deletions src/product_info.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import {retry} from 'commerce/utils';
import extractionData from './product_extraction_data.json';
/**
* Note that this content script is declared in manifest.json with
* "run_at": "document_idle", so it runs after all DOM content has been loaded.
*/

const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
image: 'og:image',
price: 'og:price:amount',
};
import extractProductWithFathom from 'commerce/fathom_extraction';
import extractProductWithFallback from 'commerce/fallback_extraction';
import {retry} from 'commerce/utils';

/**
* Open a Port to the background script and wait for the background script to
Expand Down Expand Up @@ -54,80 +54,12 @@ async function openBackgroundPort() {
* and if so, sends it to the background script via the port.
*/
async function getProductInfo(port) {
const productInfo = extractData();
if (productInfo) {
port.postMessage({
type: 'product-data',
data: productInfo,
});
}
}

/**
 * Looks up the vendor-specific extraction data for the current page.
 *
 * The host of window.location.href (which includes the port, if any) is
 * checked against each vendor key of extractionData in key order; the first
 * key that occurs anywhere inside the host wins.
 *
 * @returns {Object|null} The matching vendor's attribute info, or null when
 *   no vendor key is a substring of the host.
 */
function getProductAttributeInfo() {
  const {host} = new URL(window.location.href);
  const match = Object.entries(extractionData).find(([vendor]) => host.includes(vendor));
  return match ? match[1] : null;
}

/**
 * Reads one value off an element, selected by name.
 *
 * Supported names: 'content' (the content attribute), 'innerText' and 'src'
 * (the element properties of the same name).
 *
 * @param {HTMLElement} element
 * @param {string} extractionProperty
 * @returns {*} Whatever the attribute or property lookup yields.
 * @throws {Error} If extractionProperty is not one of the supported names.
 */
function extractValueFromElement(element, extractionProperty) {
  if (extractionProperty === 'content') {
    return element.getAttribute('content');
  }
  if (extractionProperty === 'innerText') {
    return element.innerText;
  }
  if (extractionProperty === 'src') {
    return element.src;
  }
  throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
*/
function extractData() {
const data = {};
const attributeInfo = getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
const {selectors, extractUsing} = extractor;
for (const selector of selectors) {
const element = document.querySelector(selector);
if (element) {
data[productAttribute] = extractValueFromElement(element, extractUsing);
if (data[productAttribute]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
}
} else if (selector === selectors[selectors.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}
}
}
} else {
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${value}']`);
if (metaEle) {
data[key] = metaEle.getAttribute('content');
}
}
}
return data;
const extractedProduct = (extractProductWithFathom(window.document)
|| extractProductWithFallback());
extractedProduct.url = window.document.URL;
port.postMessage({
from: 'content',
subject: 'ready',
extractedProduct,
});
}
10 changes: 10 additions & 0 deletions webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,18 @@ module.exports = {
{loader: 'css-loader'},
],
},
// jsdom is imported by fathom-web utils.js; it's only used for testing
{
test: /jsdom.*/,
use: {
loader: 'null-loader',
},
},
],
},
node: {
fs: 'empty',
},
plugins: [
new CopyWebpackPlugin([
// Static files
Expand Down