Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

#36: Integrate Fathom-based page extraction with a simple ruleset. #38

Merged
merged 5 commits into from
Aug 3, 2018
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5,055 changes: 2,615 additions & 2,440 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
},
"dependencies": {
"autobind-decorator": "2.1.0",
"fathom-web": "2.3.0",
"prop-types": "15.6.2",
"react": "16.4.1",
"react-dom": "16.4.1"
Expand Down
4 changes: 2 additions & 2 deletions src/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

browser.runtime.onConnect.addListener((port) => {
port.onMessage.addListener((message) => {
if (message.type === 'product-data') {
console.log(message.data); // eslint-disable-line no-console
if (message.from === 'content' && message.subject === 'ready') {
console.log(message.extractedProduct); // eslint-disable-line no-console
}
});
port.postMessage({
Expand Down
3 changes: 3 additions & 0 deletions src/fathom_coefficients.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"hasDivWithPriceClass": 1
}
54 changes: 54 additions & 0 deletions src/fathom_ruleset.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/** This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Double asterisk for the doc comment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doc comment is an excellent summary of the file!

* Using Fathom to extract a product from its product page,
* where a 'product' is defined by the bundle of features that
* makes it identifiable.
*
* Features: title, image, price
*/

import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';

/**
 * Scoring callback for 'priceish' candidates. Looks for the exact class
 * name "price" in the candidate element's class list. The tag name is not
 * inspected here; the ruleset only feeds this callback <div> elements.
 *
 * @param {Object} fnode Fathom node wrapping the candidate element.
 * @returns {number} The hasDivWithPriceClass coefficient loaded from
 *   fathom_coefficients.json when the class is present, otherwise 1.
 */
function hasDivWithPriceClass(fnode) {
  const {classList} = fnode.element;
  return classList.contains('price') ? fathomCoeffs.hasDivWithPriceClass : 1;
}

/**
 * Fathom ruleset for product features; every feature is tracked under its
 * own type. Only the price feature is defined at the moment.
 */
const priceRules = [
  // Every <div> in the document is a candidate price container.
  rule(dom('div'), type('priceish')),

  // Score each candidate according to whether it carries the "price" class.
  rule(type('priceish'), score(hasDivWithPriceClass)),

  // Emit the top-scoring candidate(s) under the 'product-price' key.
  rule(type('priceish').max(), out('product-price')),
];
const rules = ruleset(...priceRules);

/**
* Extracts the single highest-scoring price element (or null on a tie) contained
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "given feature" bit of this comment doesn't really apply anymore. There's no feature being passed in anymore, it just returns all the features we currently know how to extract.

* in a page's HTML document.
*/
export default function runTuningRoutine(doc) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ruleset isn't really a "tuning routine". Maybe rename this runRuleset?

Copy link
Collaborator Author

@biancadanforth biancadanforth Aug 2, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I'm not really sure what "tuningRoutine" means in this context, but we certainly aren't tuning anything here. This was a relic of Swathi's/Victor's code.

// Run the ruleset against the document and read back the fnodes emitted
// under the 'product-price' output key.
const fnodesList = rules.against(doc).get('product-price');
// It is possible for multiple elements to have the same highest score.
const elementsList = fnodesList.map(fnode => fnode.element);
// Only an unambiguous result (exactly one element) is returned to the caller;
// zero matches or a tie both yield null.
if (elementsList.length === 1) {
return elementsList[0];
}
return null;
}
4 changes: 3 additions & 1 deletion src/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
"content_scripts": [
{
"matches": ["<all_urls>"],
"js": ["product_info.bundle.js"]
"js": ["product_info.bundle.js"],
"run_at": "document_idle",
"all_frames": false
}
],
"permissions": [
Expand Down
173 changes: 100 additions & 73 deletions src/product_info.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,14 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Note that this content script is configured in manifest.json to run at
* "document_idle", which is after all DOM content has been loaded.
*/

import runTuningRoutine from 'commerce/fathom_ruleset';
import {retry} from 'commerce/utils';
import extractionData from './product_extraction_data.json';
import extractionData from 'commerce/product_extraction_data.json';

const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
Expand Down Expand Up @@ -49,85 +55,106 @@ async function openBackgroundPort() {
}
}());

/**
 * Runs extraction for the current page and, when it yields a result,
 * forwards that result to the background script over the given port.
 * Nothing is sent when extraction comes back empty.
 *
 * @param {Object} port Port whose postMessage() delivers to the background script.
 */
async function getProductInfo(port) {
  const extracted = extractData();
  if (!extracted) {
    return;
  }
  const message = {type: 'product-data', data: extracted};
  port.postMessage(message);
}

/**
* Returns any extraction data found for the vendor based on the URL
* for the page.
*/
function getProductAttributeInfo() {
const hostname = new URL(window.location.href).host;
for (const [vendor, attributeInfo] of Object.entries(extractionData)) {
if (hostname.includes(vendor)) {
return attributeInfo;
const fallbackExtraction = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we define each of these extraction objects in separate files and import them? That'd help focus this module a bit more.

For the Fathom one, you can probably just define that in the existing fathom file and export it instead of the "return DOM nodes" function.

/**
* Returns any extraction data found for the vendor based on the URL
* for the page.
*/
getProductAttributeInfo() {
const hostname = new URL(window.location.href).host;
for (const [vendor, attributeInfo] of Object.entries(extractionData)) {
if (hostname.includes(vendor)) {
return attributeInfo;
}
}
}
return null;
}
return null;
},

/**
 * Reads a string value off an element using the named extraction strategy.
 *
 * Supported strategies are 'content' (the "content" attribute) and the
 * 'innerText' and 'src' element properties.
 *
 * @param {HTMLElement} element
 * @param {string} extractionProperty
 * @throws {Error} When extractionProperty is not one of the supported strategies.
 */
function extractValueFromElement(element, extractionProperty) {
  if (extractionProperty === 'content') {
    return element.getAttribute('content');
  }
  if (extractionProperty === 'innerText') {
    return element.innerText;
  }
  if (extractionProperty === 'src') {
    return element.src;
  }
  throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}
/**
* Extracts and returns the string value for a given element property or attribute.
*
* @param {HTMLElement} element
* @param {string} extractionProperty
*/
extractValueFromElement(element, extractionProperty) {
switch (extractionProperty) {
case 'content':
return element.getAttribute('content');
case 'innerText':
return element.innerText;
case 'src':
return element.src;
default:
throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}
},

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
*/
function extractData() {
const data = {};
const attributeInfo = getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
const {selectors, extractUsing} = extractor;
for (const selector of selectors) {
const element = document.querySelector(selector);
if (element) {
data[productAttribute] = extractValueFromElement(element, extractUsing);
if (data[productAttribute]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
*/
extractProduct() {
const data = {};
const attributeInfo = this.getProductAttributeInfo();
if (attributeInfo) {
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
const {selectors, extractUsing} = extractor;
for (const selector of selectors) {
const element = document.querySelector(selector);
if (element) {
data[productAttribute] = this.extractValueFromElement(element, extractUsing);
if (data[productAttribute]) {
break;
} else {
throw new Error(`Element found did not return a valid product ${productAttribute}.`);
}
} else if (selector === selectors[selectors.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}
} else if (selector === selectors[selectors.length - 1]) {
// None of the selectors matched an element on the page
throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
}
}
} else {
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${value}']`);
if (metaEle) {
data[key] = metaEle.getAttribute('content');
}
}
}
} else {
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${value}']`);
if (metaEle) {
data[key] = metaEle.getAttribute('content');
data.url = window.document.URL;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This extraction doesn't change per-extraction-method, so we can just add it in in getProductInfo instead of duplicating it.

return data;
},
};

// Extraction strategy backed by the Fathom ruleset; exposes extractProduct().
const fathomExtraction = {
/**
* Run the ruleset for the product features against the current window document
* and build a product object from the price element it returns.
*
* Yields {price, url} when a price element with a non-empty value is found.
*/
extractProduct() {
const priceEle = runTuningRoutine(window.document);
if (priceEle) {
// <meta> elements are read via their "content" attribute; every other
// element is read via textContent.
const price = (priceEle.tagName !== 'META') ? priceEle.textContent : priceEle.getAttribute('content');
if (price) {
return {
price,
url: window.document.URL,
};
}
}
// NOTE(review): the next two lines (`}` / `return data;`) look like leftovers
// from the previous extractData() implementation in this diff view -- confirm.
}
return data;
return null;
},
};

/**
 * Extracts product information for the page and reports it to the
 * background script via the port. Fathom-based extraction is tried first;
 * the fallback extraction only runs when Fathom yields nothing. The 'ready'
 * message is always posted, carrying whatever the extraction produced.
 *
 * @param {Object} port Port whose postMessage() delivers to the background script.
 */
async function getProductInfo(port) {
  let extractedProduct = fathomExtraction.extractProduct();
  if (!extractedProduct) {
    extractedProduct = fallbackExtraction.extractProduct();
  }
  const readyMessage = {from: 'content', subject: 'ready', extractedProduct};
  port.postMessage(readyMessage);
}
10 changes: 10 additions & 0 deletions webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,18 @@ module.exports = {
{loader: 'css-loader'},
],
},
// jsdom is imported by fathom-web utils.js; it's only used for testing
{
test: /jsdom.*/,
use: {
loader: 'null-loader',
},
},
],
},
node: {
fs: 'empty',
},
plugins: [
new CopyWebpackPlugin([
// Static files
Expand Down