Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
Merge pull request #143 from mozilla/robust-extraction
Browse files Browse the repository at this point in the history
Fix #29: Attempt extraction after parsing is finished, before loading.
  • Loading branch information
Osmose authored Oct 10, 2018
2 parents b5a5689 + 150cd9b commit 644b70c
Show file tree
Hide file tree
Showing 11 changed files with 146 additions and 97 deletions.
4 changes: 2 additions & 2 deletions src/background/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ import {loadStateFromStorage} from 'commerce/state/sync';
window.registeredContentScript = browser.contentScripts.register({
matches: ['<all_urls>'],
js: [
{file: 'product_info.bundle.js'},
{file: 'extraction.bundle.js'},
],
runAt: 'document_idle',
runAt: 'document_end',
allFrames: true,
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
* Features: title, image, price
*/

import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json';
import RulesetFactory from 'commerce/extraction/ruleset_factory';
import defaultCoefficients from 'commerce/extraction/fathom/coefficients.json';
import RulesetFactory from 'commerce/extraction/fathom/ruleset_factory';
import {parsePrice} from 'commerce/extraction/utils';

// Minimum score to be considered the "correct" feature element extracted by Fathom
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Exports a RulesetFactory class, which when instantiated, binds Fathom
* coefficients to a ruleset. An instance of this class is used for product
* feature extraction (`fathom_extraction.js`) and for training (`trainees.js`).
*/

import {dom, out, rule, ruleset, score, type} from 'fathom-web';
// Since the fathom-trainees add-on currently uses a submodule of Fathom, for
// training, replace 'utils' with 'utilsForFrontend'
Expand All @@ -20,11 +14,13 @@ const TOP_BUFFER = 150;
const ZEROISH = 0.08;
const ONEISH = 0.9;

/**
* Creates Fathom ruleset instances, and holds individual rule methods for
* easier testing.
*/
export default class RulesetFactory {
/**
* Create a ruleset factory.
*
* @param {Array.number} coefficients The coefficients to apply for each rule
* @param {number[]} coefficients
*/
constructor(coefficients) {
[
Expand Down
11 changes: 5 additions & 6 deletions src/extraction/trainees.js → src/extraction/fathom/trainees.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* eslint-disable import/no-unresolved */
import defaultCoefficients from './fathom_default_coefficients.json';
import defaultCoefficients from './coefficients.json';
import RulesetFactory from './ruleset_factory';

// Array of numbers corresponding to the coefficients in order
Expand All @@ -21,11 +21,10 @@ const coeffs = RulesetFactory.getCoeffsInOrder(defaultCoefficients);
*
* How to train:
* 1. Fork the `mozilla/fathom-trainees` repo,
* 2. In the `fathom-trainees` add-on, copy this file,
* `./extraction/fathom_default_coefficients.json` and
* `./extraction/ruleset_factory.js` to the `./src` folder.
* * Note: You will have to replace 'utils' with 'utilsForFrontend' on the
* import in `ruleset_factory.js`. See that file for more information.
* 2. In the `fathom-trainees` add-on, copy `src/extraction/fathom` to the
* `./src` folder.
* * Note: You will have to replace 'utils' with 'utilsForFrontend' on the
* import in `ruleset_factory.js`. See that file for more information.
* 3. Follow instructions at: https://github.com/erikrose/fathom-fox#the-trainer.
*
* Notes:
Expand Down
86 changes: 86 additions & 0 deletions src/extraction/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Content script injected into tabs to attempt extracting information about a
* product from the webpage. Set to run at "document_end" after the page has
* been parsed but before all resources have been loaded.
*/

import config from 'commerce/config/content';
import extractProductWithFathom from 'commerce/extraction/fathom';
import extractProductWithFallback from 'commerce/extraction/selector';
import extractProductWithOpenGraph from 'commerce/extraction/open_graph';

/**
* Extraction methods are given the document object for the page, and must
* return either a valid ExtractedProduct, or null if a valid product could not
* be found.
*/
const EXTRACTION_METHODS = [
extractProductWithFathom,
extractProductWithFallback,
extractProductWithOpenGraph,
];

/**
* Perform product extraction, trying each method from EXTRACTION_METHODS in
* order until one of them returns a truthy result.
* @return {ExtractedProduct|null}
*/
function extractProduct() {
for (const extract of EXTRACTION_METHODS) {
const extractedProduct = extract(window.document);
if (extractedProduct) {
return extractedProduct;
}
}

return null;
}

/**
* Checks to see if any product information for the page was found,
* and if so, sends it to the background script.
*/
async function attemptExtraction() {
const extractedProduct = extractProduct();
if (extractedProduct) {
await browser.runtime.sendMessage({
from: 'content',
subject: 'ready',
extractedProduct: {
...extractedProduct,
url: document.location.href,
date: (new Date()).toISOString(),
},
});
}
}

(async function main() {
// If we're in an iframe, don't bother extracting a product EXCEPT if we were
// started by the background script for a price check.
const isInIframe = window !== window.top;
const isBackgroundUpdate = window.location.hash === '#moz-commerce-background';
if (isInIframe && !isBackgroundUpdate) {
return;
}

// Only perform extraction on allowlisted sites. Background updates get a
// pass; we don't want to accidentally freeze updates for products that are
// being tracked no matter what.
const url = new URL(document.location.href);
const allowList = await config.get('extractionAllowlist');
const allowAll = allowList.length === 1 && allowList[0] === '*';
if (!allowAll && !isBackgroundUpdate && !allowList.includes(url.host)) {
return;
}

// Extract immediately, and again if the readyState changes.
attemptExtraction();
document.addEventListener('readystatechange', () => {
attemptExtraction();
});
}());
33 changes: 33 additions & 0 deletions src/extraction/open_graph.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Product extraction via Open Graph tags.
*/

const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
image: 'og:image',
price: 'og:price:amount',
};

/**
* Returns any product information available on the page from Open Graph <meta>
* tags.
*/
export default function extractProduct() {
const extractedProduct = {};
for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${propertyValue}']`);

// Fail early if any required tags aren't found.
if (!metaEle) {
return null;
}

extractedProduct[feature] = metaEle.getAttribute('content');
}

return extractedProduct;
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,7 @@
* Features: title, image, price
*/

import extractionData from 'commerce/extraction/fallback_extraction_selectors';


const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
image: 'og:image',
price: 'og:price:amount',
};
import extractionData from 'commerce/extraction/selector/selectors';

/**
* Returns any extraction data found for the vendor based on the URL
Expand Down Expand Up @@ -54,22 +47,23 @@ function findValue(extractors) {

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
* selectors if they exist.
*/
export default function extractProduct() {
const extractedProduct = {};
const featureInfo = getFeatureInfo();
if (featureInfo) {
const extractedProduct = {};
for (const [feature, extractors] of Object.entries(featureInfo)) {
extractedProduct[feature] = findValue(extractors);
}
} else {
for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${propertyValue}']`);
if (metaEle) {
extractedProduct[feature] = metaEle.getAttribute('content');
const featureValue = findValue(extractors);
if (!featureValue) {
return null;
}

extractedProduct[feature] = featureValue;
}

return extractedProduct;
}
return extractedProduct;

return null;
}
59 changes: 0 additions & 59 deletions src/product_info.js

This file was deleted.

2 changes: 1 addition & 1 deletion webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ module.exports = {
target: 'web',
entry: {
background: './src/background/index',
product_info: './src/product_info',
extraction: './src/extraction',
browser_action: './src/browser_action/index',
},
output: {
Expand Down

0 comments on commit 644b70c

Please sign in to comment.