From 02c220d21a688ba18a22e630163dac63f6cd331f Mon Sep 17 00:00:00 2001 From: Michael Kelly Date: Mon, 1 Oct 2018 10:59:42 -0700 Subject: [PATCH 1/3] Mild refactor of the extraction module. --- src/background/index.js | 2 +- .../{fallback_extraction.js => fallback/index.js} | 2 +- .../selectors.js} | 0 .../coefficients.json} | 0 .../{fathom_extraction.js => fathom/index.js} | 4 ++-- src/extraction/{ => fathom}/ruleset_factory.js | 14 +++++--------- src/extraction/{ => fathom}/trainees.js | 11 +++++------ src/{product_info.js => extraction/index.js} | 15 ++++++++------- webpack.config.js | 2 +- 9 files changed, 23 insertions(+), 27 deletions(-) rename src/extraction/{fallback_extraction.js => fallback/index.js} (96%) rename src/extraction/{fallback_extraction_selectors.js => fallback/selectors.js} (100%) rename src/extraction/{fathom_default_coefficients.json => fathom/coefficients.json} (100%) rename src/extraction/{fathom_extraction.js => fathom/index.js} (94%) rename src/extraction/{ => fathom}/ruleset_factory.js (97%) rename src/extraction/{ => fathom}/trainees.js (86%) rename src/{product_info.js => extraction/index.js} (86%) diff --git a/src/background/index.js b/src/background/index.js index 448ec90..94391eb 100644 --- a/src/background/index.js +++ b/src/background/index.js @@ -53,7 +53,7 @@ import {loadStateFromStorage} from 'commerce/state/sync'; window.registeredContentScript = browser.contentScripts.register({ matches: ['<all_urls>'], js: [ - {file: 'product_info.bundle.js'}, + {file: 'extraction.bundle.js'}, ], runAt: 'document_idle', allFrames: true, diff --git a/src/extraction/fallback_extraction.js b/src/extraction/fallback/index.js similarity index 96% rename from src/extraction/fallback_extraction.js rename to src/extraction/fallback/index.js index 76706ca..6dee32d 100644 --- a/src/extraction/fallback_extraction.js +++ b/src/extraction/fallback/index.js @@ -10,7 +10,7 @@ * Features: title, image, price */ -import extractionData from
'commerce/extraction/fallback_extraction_selectors'; +import extractionData from 'commerce/extraction/fallback/selectors'; const OPEN_GRAPH_PROPERTY_VALUES = { diff --git a/src/extraction/fallback_extraction_selectors.js b/src/extraction/fallback/selectors.js similarity index 100% rename from src/extraction/fallback_extraction_selectors.js rename to src/extraction/fallback/selectors.js diff --git a/src/extraction/fathom_default_coefficients.json b/src/extraction/fathom/coefficients.json similarity index 100% rename from src/extraction/fathom_default_coefficients.json rename to src/extraction/fathom/coefficients.json diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom/index.js similarity index 94% rename from src/extraction/fathom_extraction.js rename to src/extraction/fathom/index.js index 66f70f4..b264ef8 100644 --- a/src/extraction/fathom_extraction.js +++ b/src/extraction/fathom/index.js @@ -10,8 +10,8 @@ * Features: title, image, price */ -import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json'; -import RulesetFactory from 'commerce/extraction/ruleset_factory'; +import defaultCoefficients from 'commerce/extraction/fathom/coefficients.json'; +import RulesetFactory from 'commerce/extraction/fathom/ruleset_factory'; import {parsePrice} from 'commerce/extraction/utils'; // Minimum score to be considered the "correct" feature element extracted by Fathom diff --git a/src/extraction/ruleset_factory.js b/src/extraction/fathom/ruleset_factory.js similarity index 97% rename from src/extraction/ruleset_factory.js rename to src/extraction/fathom/ruleset_factory.js index 8a9778e..c5989ba 100644 --- a/src/extraction/ruleset_factory.js +++ b/src/extraction/fathom/ruleset_factory.js @@ -2,12 +2,6 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ -/** - * Exports a RulesetFactory class, which when instantiated, binds Fathom - * coefficients to a ruleset. An instance of this class is used for product - * feature extraction (`fathom_extraction.js`) and for training (`trainees.js`). - */ - import {dom, out, rule, ruleset, score, type} from 'fathom-web'; // Since the fathom-trainees add-on currently uses a submodule of Fathom, for // training, replace 'utils' with 'utilsForFrontend' @@ -20,11 +14,13 @@ const TOP_BUFFER = 150; const ZEROISH = 0.08; const ONEISH = 0.9; +/** + * Creates Fathom ruleset instances, and holds individual rule methods for + * easier testing. + */ export default class RulesetFactory { /** - * Create a ruleset factory. - * - * @param {Array.number} coefficients The coefficients to apply for each rule + * @param {number[]} coefficients */ constructor(coefficients) { [ diff --git a/src/extraction/trainees.js b/src/extraction/fathom/trainees.js similarity index 86% rename from src/extraction/trainees.js rename to src/extraction/fathom/trainees.js index 2e82854..0f0a4f0 100644 --- a/src/extraction/trainees.js +++ b/src/extraction/fathom/trainees.js @@ -3,7 +3,7 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* eslint-disable import/no-unresolved */ -import defaultCoefficients from './fathom_default_coefficients.json'; +import defaultCoefficients from './coefficients.json'; import RulesetFactory from './ruleset_factory'; // Array of numbers corresponding to the coefficients in order @@ -21,11 +21,10 @@ const coeffs = RulesetFactory.getCoeffsInOrder(defaultCoefficients); * * How to train: * 1. Fork the `mozilla/fathom-trainees` repo, - * 2. In the `fathom-trainees` add-on, copy this file, - * `./extraction/fathom_default_coefficients.json` and - * `./extraction/ruleset_factory.js` to the `./src` folder. - * * Note: You will have to replace 'utils' with 'utilsForFrontend' on the - * import in `ruleset_factory.js`. See that file for more information. + * 2. 
In the `fathom-trainees` add-on, copy `src/extraction/fathom` to the + * `./src` folder. + * * Note: You will have to replace 'utils' with 'utilsForFrontend' on the + * import in `ruleset_factory.js`. See that file for more information. * 3. Follow instructions at: https://github.com/erikrose/fathom-fox#the-trainer. * * Notes: diff --git a/src/product_info.js b/src/extraction/index.js similarity index 86% rename from src/product_info.js rename to src/extraction/index.js index 624adff..c57e0a8 100644 --- a/src/product_info.js +++ b/src/extraction/index.js @@ -3,23 +3,24 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /** - * Note that this page is defined in the background script to run at - * "document_idle", which is after all DOM content has been loaded. + * Content script injected into tabs to attempt extracting information about a + * product from the webpage. */ import config from 'commerce/config/content'; -import extractProductWithFathom from 'commerce/extraction/fathom_extraction'; -import extractProductWithFallback from 'commerce/extraction/fallback_extraction'; +import extractProductWithFathom from 'commerce/extraction/fathom'; +import extractProductWithFallback from 'commerce/extraction/fallback'; /** * Checks to see if any product information for the page was found, * and if so, sends it to the background script. */ -async function getProductInfo() { +async function attemptExtraction() { const extractedProduct = ( extractProductWithFathom(window.document) || extractProductWithFallback() ); + await browser.runtime.sendMessage({ from: 'content', subject: 'ready', @@ -52,8 +53,8 @@ async function getProductInfo() { // Make sure the page has finished loading, as JS could alter the DOM. 
if (document.readyState === 'complete') { - getProductInfo(); + attemptExtraction(); } else { - window.addEventListener('load', getProductInfo); + window.addEventListener('load', attemptExtraction); } }()); diff --git a/webpack.config.js b/webpack.config.js index 2f64e20..aa0fe71 100644 --- a/webpack.config.js +++ b/webpack.config.js @@ -21,7 +21,7 @@ module.exports = { target: 'web', entry: { background: './src/background/index', - product_info: './src/product_info', + extraction: './src/extraction', browser_action: './src/browser_action/index', }, output: { From 3af22ed8285e08bf65bae8bc05e290ba5765fcc0 Mon Sep 17 00:00:00 2001 From: Michael Kelly Date: Mon, 1 Oct 2018 11:40:25 -0700 Subject: [PATCH 2/3] Fix #29: Attempt extraction after parsing is finished, before loading. --- src/background/index.js | 2 +- src/extraction/index.js | 32 +++++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/background/index.js b/src/background/index.js index 94391eb..8cfae69 100644 --- a/src/background/index.js +++ b/src/background/index.js @@ -55,7 +55,7 @@ import {loadStateFromStorage} from 'commerce/state/sync'; js: [ {file: 'extraction.bundle.js'}, ], - runAt: 'document_idle', + runAt: 'document_end', allFrames: true, }); diff --git a/src/extraction/index.js b/src/extraction/index.js index c57e0a8..1391a9a 100644 --- a/src/extraction/index.js +++ b/src/extraction/index.js @@ -4,7 +4,8 @@ /** * Content script injected into tabs to attempt extracting information about a - * product from the webpage. + * product from the webpage. Set to run at "document_end" after the page has + * been parsed but before all resources have been loaded. 
*/ import config from 'commerce/config/content'; @@ -21,15 +22,17 @@ async function attemptExtraction() { || extractProductWithFallback() ); - await browser.runtime.sendMessage({ - from: 'content', - subject: 'ready', - extractedProduct: { - ...extractedProduct, - url: document.location.href, - date: (new Date()).toISOString(), - }, - }); + if (extractedProduct) { + await browser.runtime.sendMessage({ + from: 'content', + subject: 'ready', + extractedProduct: { + ...extractedProduct, + url: document.location.href, + date: (new Date()).toISOString(), + }, + }); + } } (async function main() { @@ -51,10 +54,9 @@ async function attemptExtraction() { return; } - // Make sure the page has finished loading, as JS could alter the DOM. - if (document.readyState === 'complete') { + // Extract immediately, and again if the readyState changes. + attemptExtraction(); + document.addEventListener('readystatechange', () => { attemptExtraction(); - } else { - window.addEventListener('load', attemptExtraction); - } + }); }()); From 150cd9bcf538d79addd3f19ac598b6c5d4eee9aa Mon Sep 17 00:00:00 2001 From: Michael Kelly Date: Mon, 8 Oct 2018 11:02:20 -0700 Subject: [PATCH 3/3] Split extraction methods and ensure they return null when no match is found. Open Graph and selector-based extraction are now separate forms of extraction instead of a single, "fallback" extraction method. 
--- src/extraction/index.js | 36 +++++++++++++++---- src/extraction/open_graph.js | 33 +++++++++++++++++ .../{fallback => selector}/index.js | 30 +++++++--------- .../{fallback => selector}/selectors.js | 0 4 files changed, 75 insertions(+), 24 deletions(-) create mode 100644 src/extraction/open_graph.js rename src/extraction/{fallback => selector}/index.js (72%) rename src/extraction/{fallback => selector}/selectors.js (100%) diff --git a/src/extraction/index.js b/src/extraction/index.js index 1391a9a..046f383 100644 --- a/src/extraction/index.js +++ b/src/extraction/index.js @@ -10,18 +10,42 @@ import config from 'commerce/config/content'; import extractProductWithFathom from 'commerce/extraction/fathom'; -import extractProductWithFallback from 'commerce/extraction/fallback'; +import extractProductWithFallback from 'commerce/extraction/selector'; +import extractProductWithOpenGraph from 'commerce/extraction/open_graph'; + +/** + * Extraction methods are given the document object for the page, and must + * return either a valid ExtractedProduct, or null if a valid product could not + * be found. + */ +const EXTRACTION_METHODS = [ + extractProductWithFathom, + extractProductWithFallback, + extractProductWithOpenGraph, +]; + +/** + * Perform product extraction, trying each method from EXTRACTION_METHODS in + * order until one of them returns a truthy result. + * @return {ExtractedProduct|null} + */ +function extractProduct() { + for (const extract of EXTRACTION_METHODS) { + const extractedProduct = extract(window.document); + if (extractedProduct) { + return extractedProduct; + } + } + + return null; +} /** * Checks to see if any product information for the page was found, * and if so, sends it to the background script. 
*/ async function attemptExtraction() { - const extractedProduct = ( - extractProductWithFathom(window.document) - || extractProductWithFallback() - ); - + const extractedProduct = extractProduct(); if (extractedProduct) { await browser.runtime.sendMessage({ from: 'content', diff --git a/src/extraction/open_graph.js b/src/extraction/open_graph.js new file mode 100644 index 0000000..144f7fc --- /dev/null +++ b/src/extraction/open_graph.js @@ -0,0 +1,33 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Product extraction via Open Graph tags. + */ + +const OPEN_GRAPH_PROPERTY_VALUES = { + title: 'og:title', + image: 'og:image', + price: 'og:price:amount', +}; + +/** + * Returns any product information available on the page from Open Graph + * tags. + */ +export default function extractProduct() { + const extractedProduct = {}; + for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) { + const metaEle = document.querySelector(`meta[property='${propertyValue}']`); + + // Fail early if any required tags aren't found. 
+ if (!metaEle) { + return null; + } + + extractedProduct[feature] = metaEle.getAttribute('content'); + } + + return extractedProduct; +} diff --git a/src/extraction/fallback/index.js b/src/extraction/selector/index.js similarity index 72% rename from src/extraction/fallback/index.js rename to src/extraction/selector/index.js index 6dee32d..ccd87bb 100644 --- a/src/extraction/fallback/index.js +++ b/src/extraction/selector/index.js @@ -10,14 +10,7 @@ * Features: title, image, price */ -import extractionData from 'commerce/extraction/fallback/selectors'; - - -const OPEN_GRAPH_PROPERTY_VALUES = { - title: 'og:title', - image: 'og:image', - price: 'og:price:amount', -}; +import extractionData from 'commerce/extraction/selector/selectors'; /** * Returns any extraction data found for the vendor based on the URL @@ -54,22 +47,23 @@ function findValue(extractors) { /** * Returns any product information available on the page from CSS - * selectors if they exist, otherwise from Open Graph tags. + * selectors if they exist. */ export default function extractProduct() { - const extractedProduct = {}; const featureInfo = getFeatureInfo(); if (featureInfo) { + const extractedProduct = {}; for (const [feature, extractors] of Object.entries(featureInfo)) { - extractedProduct[feature] = findValue(extractors); - } - } else { - for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) { - const metaEle = document.querySelector(`meta[property='${propertyValue}']`); - if (metaEle) { - extractedProduct[feature] = metaEle.getAttribute('content'); + const featureValue = findValue(extractors); + if (!featureValue) { + return null; } + + extractedProduct[feature] = featureValue; } + + return extractedProduct; } - return extractedProduct; + + return null; } diff --git a/src/extraction/fallback/selectors.js b/src/extraction/selector/selectors.js similarity index 100% rename from src/extraction/fallback/selectors.js rename to src/extraction/selector/selectors.js