Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Fix #29: Attempt extraction after parsing is finished, before loading. #143

Merged
merged 3 commits into from
Oct 10, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/background/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ import {loadStateFromStorage} from 'commerce/state/sync';
window.registeredContentScript = browser.contentScripts.register({
matches: ['<all_urls>'],
js: [
{file: 'product_info.bundle.js'},
{file: 'extraction.bundle.js'},
],
runAt: 'document_idle',
runAt: 'document_end',
allFrames: true,
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
* Features: title, image, price
*/

import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json';
import RulesetFactory from 'commerce/extraction/ruleset_factory';
import defaultCoefficients from 'commerce/extraction/fathom/coefficients.json';
import RulesetFactory from 'commerce/extraction/fathom/ruleset_factory';
import {parsePrice} from 'commerce/extraction/utils';

// Minimum score to be considered the "correct" feature element extracted by Fathom
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Exports a RulesetFactory class, which when instantiated, binds Fathom
* coefficients to a ruleset. An instance of this class is used for product
* feature extraction (`fathom_extraction.js`) and for training (`trainees.js`).
*/

import {dom, out, rule, ruleset, score, type} from 'fathom-web';
// Since the fathom-trainees add-on currently uses a submodule of Fathom, for
// training, replace 'utils' with 'utilsForFrontend'
Expand All @@ -20,11 +14,13 @@ const TOP_BUFFER = 150;
const ZEROISH = 0.08;
const ONEISH = 0.9;

/**
* Creates Fathom ruleset instances, and holds individual rule methods for
* easier testing.
*/
export default class RulesetFactory {
/**
* Create a ruleset factory.
*
* @param {Array.number} coefficients The coefficients to apply for each rule
* @param {number[]} coefficients
*/
constructor(coefficients) {
[
Expand Down
11 changes: 5 additions & 6 deletions src/extraction/trainees.js → src/extraction/fathom/trainees.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* eslint-disable import/no-unresolved */
import defaultCoefficients from './fathom_default_coefficients.json';
import defaultCoefficients from './coefficients.json';
import RulesetFactory from './ruleset_factory';

// Array of numbers corresponding to the coefficients in order
Expand All @@ -21,11 +21,10 @@ const coeffs = RulesetFactory.getCoeffsInOrder(defaultCoefficients);
*
* How to train:
* 1. Fork the `mozilla/fathom-trainees` repo,
* 2. In the `fathom-trainees` add-on, copy this file,
* `./extraction/fathom_default_coefficients.json` and
* `./extraction/ruleset_factory.js` to the `./src` folder.
* * Note: You will have to replace 'utils' with 'utilsForFrontend' on the
* import in `ruleset_factory.js`. See that file for more information.
* 2. In the `fathom-trainees` add-on, copy `src/extraction/fathom` to the
* `./src` folder.
* * Note: You will have to replace 'utils' with 'utilsForFrontend' on the
* import in `ruleset_factory.js`. See that file for more information.
* 3. Follow instructions at: https://github.com/erikrose/fathom-fox#the-trainer.
*
* Notes:
Expand Down
86 changes: 86 additions & 0 deletions src/extraction/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Content script injected into tabs to attempt extracting information about a
* product from the webpage. Set to run at "document_end" after the page has
* been parsed but before all resources have been loaded.
*/

import config from 'commerce/config/content';
import extractProductWithFathom from 'commerce/extraction/fathom';
import extractProductWithFallback from 'commerce/extraction/selector';
import extractProductWithOpenGraph from 'commerce/extraction/open_graph';

/**
* Extraction methods are given the document object for the page, and must
* return either a valid ExtractedProduct, or null if a valid product could not
* be found.
*/
const EXTRACTION_METHODS = [
extractProductWithFathom,
extractProductWithFallback,
extractProductWithOpenGraph,
];

/**
* Perform product extraction, trying each method from EXTRACTION_METHODS in
* order until one of them returns a truthy result.
* @return {ExtractedProduct|null}
*/
function extractProduct() {
for (const extract of EXTRACTION_METHODS) {
const extractedProduct = extract(window.document);
if (extractedProduct) {
return extractedProduct;
}
}

return null;
}

/**
* Checks to see if any product information for the page was found,
* and if so, sends it to the background script.
*/
async function attemptExtraction() {
const extractedProduct = extractProduct();
if (extractedProduct) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fallback extraction does not currently return null if it doesn’t find anything, so this would be a good time to change that before adding this existence check. (My bad)

We do also check for correct data types via propTypes further downstream, but it'd be nice to be consistent between the two extraction methods.

await browser.runtime.sendMessage({
from: 'content',
subject: 'ready',
extractedProduct: {
...extractedProduct,
url: document.location.href,
date: (new Date()).toISOString(),
},
});
}
}

(async function main() {
// If we're in an iframe, don't bother extracting a product EXCEPT if we were
// started by the background script for a price check.
const isInIframe = window !== window.top;
const isBackgroundUpdate = window.location.hash === '#moz-commerce-background';
if (isInIframe && !isBackgroundUpdate) {
return;
}

// Only perform extraction on allowlisted sites. Background updates get a
// pass; we don't want to accidentally freeze updates for products that are
// being tracked no matter what.
const url = new URL(document.location.href);
const allowList = await config.get('extractionAllowlist');
const allowAll = allowList.length === 1 && allowList[0] === '*';
if (!allowAll && !isBackgroundUpdate && !allowList.includes(url.host)) {
return;
}

// Extract immediately, and again if the readyState changes.
attemptExtraction();
document.addEventListener('readystatechange', () => {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably only want to add this listener and attempt extraction again if the first attempt failed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea was to perform a second extraction in case JavaScript has modified the page and changed the product info, to ensure we get the correct product info every time. I'm not sure how common this is, though, or how performing two extractions affects the user's experience.

I think we should keep it, but can be convinced otherwise.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah hmm... so see if we can get something at all to put in the popup sooner, and then update with the second round of extraction...

Perhaps we could add a scalar probe that increments if the results of the first to the second extraction are different and compare that to the number of extraction attempts? Might be a wishlist probe.

What do you think about that idea @Osmose ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I wouldn't bother adding a probe like that initially.

attemptExtraction();
});
}());
33 changes: 33 additions & 0 deletions src/extraction/open_graph.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Product extraction via Open Graph tags.
*/

const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
image: 'og:image',
price: 'og:price:amount',
};

/**
* Returns any product information available on the page from Open Graph <meta>
* tags.
*/
export default function extractProduct() {
const extractedProduct = {};
for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${propertyValue}']`);

// Fail early if any required tags aren't found.
if (!metaEle) {
return null;
}

extractedProduct[feature] = metaEle.getAttribute('content');
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a bug here, since the propType for extractedProduct expects a type of Number for the price value downstream. I took care of this for fallback and Fathom extraction earlier, but I neglected to add it here. I filed a follow-up issue #154 . I'll take care of that right now.

}

return extractedProduct;
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,7 @@
* Features: title, image, price
*/

import extractionData from 'commerce/extraction/fallback_extraction_selectors';


const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
image: 'og:image',
price: 'og:price:amount',
};
import extractionData from 'commerce/extraction/selector/selectors';

/**
* Returns any extraction data found for the vendor based on the URL
Expand Down Expand Up @@ -54,22 +47,23 @@ function findValue(extractors) {

/**
* Returns any product information available on the page from CSS
* selectors if they exist, otherwise from Open Graph <meta> tags.
* selectors if they exist.
*/
export default function extractProduct() {
const extractedProduct = {};
const featureInfo = getFeatureInfo();
if (featureInfo) {
const extractedProduct = {};
for (const [feature, extractors] of Object.entries(featureInfo)) {
extractedProduct[feature] = findValue(extractors);
}
} else {
for (const [feature, propertyValue] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
const metaEle = document.querySelector(`meta[property='${propertyValue}']`);
if (metaEle) {
extractedProduct[feature] = metaEle.getAttribute('content');
const featureValue = findValue(extractors);
if (!featureValue) {
return null;
}

extractedProduct[feature] = featureValue;
}

return extractedProduct;
}
return extractedProduct;

return null;
}
59 changes: 0 additions & 59 deletions src/product_info.js

This file was deleted.

2 changes: 1 addition & 1 deletion webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ module.exports = {
target: 'web',
entry: {
background: './src/background/index',
product_info: './src/product_info',
extraction: './src/extraction',
browser_action: './src/browser_action/index',
},
output: {
Expand Down