diff --git a/.eslintrc.json b/.eslintrc.json index 02081a6..7ff4655 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -14,6 +14,7 @@ "no-restricted-syntax": ["off"], "no-use-before-define": ["error", {"functions": false}], "no-prototype-builtins": ["off"], + "class-methods-use-this": ["off"], "react/jsx-one-expression-per-line": ["off"], "react/prefer-stateless-function": ["off"], diff --git a/.gitignore b/.gitignore index a26de97..11393fe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ node_modules web-ext-artifacts build gecko.log +.DS_Store diff --git a/README.md b/README.md index 4fc09a6..7c802c8 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ After this, you can run `pipenv run test` to run the automated test suite. - `src/background` contains the background scripts that trigger UI elements (such as the page action) and periodically check for price updates. - `src/browser_action` contains the toolbar popup for managing the list of currently-tracked products. +- `src/extraction` contains the content scripts that extract product information from product web pages. - `src/page_action` contains the URL bar popup for viewing and tracking the product in the current tab. - `src/state` contains the Redux-based code for managing global extension state. - `src/tests` contains the automated test suite. diff --git a/src/fallback_extraction.js b/src/extraction/fallback_extraction.js similarity index 97% rename from src/fallback_extraction.js rename to src/extraction/fallback_extraction.js index db1f6ee..5b3535f 100644 --- a/src/fallback_extraction.js +++ b/src/extraction/fallback_extraction.js @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -/* +/** * Uses CSS selectors, or failing that, Open Graph tags to extract * a product from its product page, where a 'product' is defined by the bundle * of features that makes it identifiable. 
@@ -10,7 +10,7 @@ * Features: title, image, price */ -import extractionData from 'commerce/product_extraction_data.json'; +import extractionData from 'commerce/extraction/product_extraction_data.json'; const OPEN_GRAPH_PROPERTY_VALUES = { title: 'og:title', diff --git a/src/extraction/fathom_default_coefficients.json b/src/extraction/fathom_default_coefficients.json new file mode 100644 index 0000000..7a70565 --- /dev/null +++ b/src/extraction/fathom_default_coefficients.json @@ -0,0 +1,12 @@ +{ + "hasDollarSignCoeff": 8, + "hasPriceInClassNameCoeff": 2, + "hasPriceInIDCoeff": 17, + "hasPriceishPatternCoeff": 15, + "isAboveTheFoldImageCoeff": 13, + "isAboveTheFoldPriceCoeff": 33, + "isNearbyImageXAxisPriceCoeff": 5, + "isNearbyImageYAxisTitleCoeff": 5, + "largerFontSizeCoeff": 7, + "largerImageCoeff": 2 +} diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom_extraction.js new file mode 100644 index 0000000..00cebbc --- /dev/null +++ b/src/extraction/fathom_extraction.js @@ -0,0 +1,67 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Uses Fathom to extract a product from its product page, + * where a 'product' is defined by the bundle of features that + * makes it identifiable. 
// The product features that must all be found for an extraction to count.
const PRODUCT_FEATURES = ['title', 'price', 'image'];
// Lowest score at which an element is accepted as the extracted feature.
const SCORE_THRESHOLD = 4;
// Coefficient values as an array, in the order produced by getCoeffsInOrder.
const coefficients = RulesetFactory.getCoeffsInOrder(defaultCoefficients);
// The coefficients are static in production, so the factory and its ruleset
// are built once at module load and reused for every extraction.
const rulesetFactory = new RulesetFactory(coefficients);
const rules = rulesetFactory.makeRuleset();

/**
 * Runs the ruleset against a document and, for each product feature, picks
 * the first returned element whose score meets SCORE_THRESHOLD.
 *
 * @param {Document} doc The document to extract from
 * @returns {Object} Map of feature name to element; features without a
 *   qualifying element are left out
 */
function runRuleset(doc) {
  const winners = {};
  const facts = rules.against(doc);
  PRODUCT_FEATURES.forEach((feature) => {
    const qualifying = facts
      .get(feature)
      .filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD);
    // Several elements can tie for the highest score; take the first one.
    if (qualifying.length >= 1) {
      winners[feature] = qualifying[0].element;
    }
  });
  return winners;
}

/**
 * Tells whether obj holds a truthy value under every PRODUCT_FEATURES key.
 *
 * @param {Object} obj Map of feature name to value
 * @returns {boolean}
 */
function hasAllFeatures(obj) {
  return PRODUCT_FEATURES.every(key => Boolean(obj[key]));
}

/**
 * Extracts a product from the given document using the Fathom ruleset.
 *
 * @param {Document} doc The document to extract from
 * @returns {?Object} An object with a value for every product feature (the
 *   image element's `src`, the other elements' `innerText`), or null when any
 *   feature is missing or empty
 */
export default function extractProduct(doc) {
  const elements = runRuleset(doc);
  if (!hasAllFeatures(elements)) {
    return null;
  }
  const product = {};
  for (const feature of PRODUCT_FEATURES) {
    const element = elements[feature];
    product[feature] = (feature === 'image') ? element.src : element.innerText;
  }
  // An element can be present yet yield an empty value, so check again.
  return hasAllFeatures(product) ? product : null;
}
const DEFAULT_BODY_FONT_SIZE = 14;
const DEFAULT_SCORE = 1;
const TOP_BUFFER = 150;
// From: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js
const ZEROISH = 0.08;
const ONEISH = 0.9;
// Floor (in pixels) for distances used as divisors, so an element whose top
// edge is level with the image's top edge cannot produce an Infinity score.
const MIN_PIXEL_DISTANCE = 1;

/**
 * Returns true if value is a string containing 'price', ignoring case.
 * Non-string values (for example a missing attribute, or an SVG element's
 * object-valued className) never match.
 */
function mentionsPrice(value) {
  return typeof value === 'string' && value.toLowerCase().includes('price');
}

/**
 * Returns true if a computed CSS length is zero. Computed lengths carry a
 * unit ('0px'), so the numeric part is compared rather than the raw string;
 * keywords such as 'auto' parse to NaN and are not treated as zero.
 */
function isZeroLength(length) {
  return parseFloat(length) === 0;
}

export default class RulesetFactory {
  /**
   * Create a ruleset factory.
   *
   * @param {number[]} coefficients The coefficients to apply for each rule,
   *   in the order returned by RulesetFactory.getCoeffsInOrder
   */
  constructor(coefficients) {
    [
      this.hasDollarSignCoeff,
      this.hasPriceInClassNameCoeff,
      this.hasPriceInIDCoeff,
      this.hasPriceishPatternCoeff,
      this.isAboveTheFoldImageCoeff,
      this.isAboveTheFoldPriceCoeff,
      this.isNearbyImageXAxisPriceCoeff,
      this.isNearbyImageYAxisTitleCoeff,
      this.largerFontSizeCoeff,
      this.largerImageCoeff,
    ] = coefficients;
  }

  /**
   * Scores fnode in direct proportion to its size
   */
  largerImage(fnode) {
    const domRect = fnode.element.getBoundingClientRect();
    const area = domRect.width * domRect.height;
    if (area === 0) {
      return DEFAULT_SCORE;
    }
    return area * this.largerImageCoeff;
  }

  /**
   * Scores fnode in proportion to its font size
   */
  largerFontSize(fnode) {
    const size = window.getComputedStyle(fnode.element).fontSize;
    // Normalize the multiplier by the default font size. parseFloat takes no
    // radix argument; it reads the leading number out of e.g. '28px'.
    const sizeMultiplier = parseFloat(size) / DEFAULT_BODY_FONT_SIZE;
    return sizeMultiplier * this.largerFontSizeCoeff;
  }

  /**
   * Scores fnode with a '$' in its innerText
   */
  hasDollarSign(fnode) {
    if (fnode.element.innerText.includes('$')) {
      return this.hasDollarSignCoeff;
    }
    return DEFAULT_SCORE;
  }

  /**
   * Scores fnode with 'price' in its id or its parent's id. A match on the
   * parent only earns 75% of the coefficient.
   */
  hasPriceInID(fnode) {
    const {element} = fnode;
    if (mentionsPrice(element.id)) {
      return this.hasPriceInIDCoeff;
    }
    // parentElement is null for an element without an element parent
    const parent = element.parentElement;
    if (parent && mentionsPrice(parent.id)) {
      return 0.75 * this.hasPriceInIDCoeff;
    }
    return DEFAULT_SCORE;
  }

  /**
   * Scores fnode with 'price' in its class name or its parent's class name.
   * A match on the parent only earns 75% of the coefficient.
   */
  hasPriceInClassName(fnode) {
    const {element} = fnode;
    if (mentionsPrice(element.className)) {
      return this.hasPriceInClassNameCoeff;
    }
    // parentElement is null for an element without an element parent
    const parent = element.parentElement;
    if (parent && mentionsPrice(parent.className)) {
      return 0.75 * this.hasPriceInClassNameCoeff;
    }
    return DEFAULT_SCORE;
  }

  /**
   * Scores fnode by its vertical location relative to the fold
   *
   * @param {Object} fnode The fnode to score
   * @param {number} featureCoeff The coefficient for the feature being scored
   */
  isAboveTheFold(fnode, featureCoeff) {
    const viewportHeight = window.innerHeight;
    const top = fnode.element.getBoundingClientRect().top;
    const upperHeightLimit = viewportHeight * 2;

    // If the node is below the fold by more than a viewport's length,
    // return a low score.
    if (top >= upperHeightLimit) {
      return ZEROISH * featureCoeff;
    }

    // If the node is above the fold, return a high score.
    if (top <= viewportHeight) {
      return ONEISH * featureCoeff;
    }

    // Otherwise, scale the score linearly between the fold and a viewport's
    // length below it.
    const slope = (ONEISH - ZEROISH) / (viewportHeight - upperHeightLimit);
    return (slope * (top - upperHeightLimit) + ZEROISH) * featureCoeff;
  }

  /**
   * Scores fnode based on its x distance from the highest scoring image
   * element. Returns the default score when there is no image or when the
   * element overlaps the image horizontally.
   */
  isNearbyImageXAxisPrice(fnode) {
    const imageElement = this.getHighestScoringImage(fnode);
    if (imageElement === null) {
      return DEFAULT_SCORE;
    }
    const viewportWidth = window.innerWidth;
    const eleDOMRect = fnode.element.getBoundingClientRect();
    const imageDOMRect = imageElement.getBoundingClientRect();
    const deltaRight = eleDOMRect.left - imageDOMRect.right;
    const deltaLeft = imageDOMRect.left - eleDOMRect.right;
    // True if element is completely to the right or left of the image element
    const noOverlap = (deltaRight > 0 || deltaLeft > 0);
    if (!noOverlap) {
      return DEFAULT_SCORE;
    }
    const deltaX = (deltaRight > 0) ? deltaRight : deltaLeft;
    // Give a higher score the closer it is to the image, normalized by viewportWidth
    return (viewportWidth / deltaX) * this.isNearbyImageXAxisPriceCoeff;
  }

  /**
   * Scores fnode based on its y distance from the highest scoring image
   * element. Returns the default score when there is no image or when the
   * element lies outside the vertical band around the image.
   */
  isNearbyImageYAxisTitle(fnode) {
    const imageElement = this.getHighestScoringImage(fnode);
    if (imageElement === null) {
      return DEFAULT_SCORE;
    }
    const viewportHeight = window.innerHeight;
    const DOMRect = fnode.element.getBoundingClientRect();
    const imageDOMRect = imageElement.getBoundingClientRect();
    // Some titles (like on Ebay) are above the image, so include a top buffer
    const isEleTopNearby = DOMRect.top >= (imageDOMRect.top - TOP_BUFFER);
    const isEleBottomNearby = DOMRect.bottom <= imageDOMRect.bottom;
    // Give elements in a specific vertical band a higher score
    if (isEleTopNearby && isEleBottomNearby) {
      // Clamp the distance so that aligned tops do not divide by zero
      const deltaY = Math.max(
        Math.abs(imageDOMRect.top - DOMRect.top),
        MIN_PIXEL_DISTANCE,
      );
      // Give a higher score the closer it is to the image, normalized by viewportHeight
      return (viewportHeight / deltaY) * this.isNearbyImageYAxisTitleCoeff;
    }
    return DEFAULT_SCORE;
  }

  /**
   * Scores fnode whose innerText matches a priceish RegExp pattern
   */
  hasPriceishPattern(fnode) {
    const text = fnode.element.innerText;
    /**
     * With an optional '$' that doesn't necessarily have to be at the beginning
     * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before
     * a decimal point and exactly two after, where the two digits after the decimal point
     * are at the end of the string
     */
    const regExp = /\$?\d+\.\d{2}$/;
    if (regExp.test(text)) {
      return this.hasPriceishPatternCoeff;
    }
    return DEFAULT_SCORE;
  }

  /**
   * Checks to see if a 'priceish' fnode is eligible for scoring
   */
  isEligiblePrice(fnode) {
    return (
      this.isVisible(fnode)
      && this.hasDifferentInnerTextThanChildren(fnode)
      && this.isNearbyImageYAxisPrice(fnode)
    );
  }

  /**
   * Checks to see if a 'titleish' fnode is eligible for scoring
   */
  isEligibleTitle(fnode) {
    return (
      this.isVisible(fnode)
      // Don't use hasDifferentInnerTextThanChildren, because the <h1> title
      // elements on Amazon and Walmart each have a child element with the
      // same innerText.
      //
      // Don't use isNearbyImageYAxisTitle here, as unlike for price, there
      // is a strong correlation for vertical proximity to image, so we want
      // to score it proportionally rather than have a hard cut-off.
    );
  }

  /**
   * Checks if fnode has different innerText compared to any of its children
   *
   * @returns {boolean} false if any direct child has the same innerText as
   *   the element, true otherwise (including when there are no children)
   */
  hasDifferentInnerTextThanChildren(fnode) {
    const {element} = fnode;
    return Array.from(element.children)
      .every(child => child.innerText !== element.innerText);
  }

  /**
   * Checks if fnode is nearby the top scoring image element in the y-axis
   * Unlike for 'title', 'price' elements had worse accuracy when scored
   * in proportion to y-axis proximity to the image.
   *
   * @returns {boolean} false when there is no image to compare against
   */
  isNearbyImageYAxisPrice(fnode) {
    const imageElement = this.getHighestScoringImage(fnode);
    if (imageElement === null) {
      return false;
    }
    const DOMRect = fnode.element.getBoundingClientRect();
    const imageDOMRect = imageElement.getBoundingClientRect();
    return (
      DOMRect.top >= (imageDOMRect.top - TOP_BUFFER)
      && DOMRect.bottom <= imageDOMRect.bottom
    );
  }

  /**
   * Checks that no node yielded by Fathom's `ancestors` helper for the
   * element is hidden by its computed visibility, display, opacity, width or
   * height.
   */
  isVisible(fnode) {
    for (const ancestor of ancestors(fnode.element)) {
      const style = getComputedStyle(ancestor);
      const isElementHidden = (
        style.visibility === 'hidden'
        || style.display === 'none'
        || style.opacity === '0'
        || isZeroLength(style.width)
        || isZeroLength(style.height)
      );
      if (isElementHidden) {
        return false;
      }
    }
    return true;
  }

  /**
   * Using coefficients passed into the constructor method, returns a weighted
   * ruleset used to score elements in an HTML document.
   */
  makeRuleset() {
    return ruleset(
      /**
       * Image rules
       */
      // consider all visible img elements
      rule(dom('img').when(this.isVisible.bind(this)), type('imageish')),
      // better score the closer the element is to the top of the page
      rule(type('imageish'), score(fnode => this.isAboveTheFold(fnode, this.isAboveTheFoldImageCoeff))),
      // better score for larger images
      rule(type('imageish'), score(this.largerImage.bind(this))),
      // return image element(s) with max score
      rule(type('imageish').max(), out('image')),

      /**
       * Title rules
       */
      // consider all eligible h1 elements
      rule(dom('h1').when(this.isEligibleTitle.bind(this)), type('titleish')),
      // better score based on y-axis proximity to max scoring image element
      rule(type('titleish'), score(this.isNearbyImageYAxisTitle.bind(this))),
      // return title element(s) with max score
      rule(type('titleish').max(), out('title')),

      /**
       * Price rules
       */
      // consider all eligible span and h2 elements
      rule(dom('span, h2').when(this.isEligiblePrice.bind(this)), type('priceish')),
      // check if the element has a '$' in its innerText
      rule(type('priceish'), score(this.hasDollarSign.bind(this))),
      // better score the closer the element is to the top of the page
      rule(type('priceish'), score(fnode => this.isAboveTheFold(fnode, this.isAboveTheFoldPriceCoeff))),
      // check if the id has "price" in it
      rule(type('priceish'), score(this.hasPriceInID.bind(this))),
      // check if any class names have "price" in them
      rule(type('priceish'), score(this.hasPriceInClassName.bind(this))),
      // better score for larger font size
      rule(type('priceish'), score(this.largerFontSize.bind(this))),
      // better score based on x-axis proximity to max scoring image element
      rule(type('priceish'), score(this.isNearbyImageXAxisPrice.bind(this))),
      // check if innerText has a priceish pattern
      rule(type('priceish'), score(this.hasPriceishPattern.bind(this))),
      // return price element(s) with max score
      rule(type('priceish').max(), out('price')),
    );
  }

  /**
   * Takes in a coefficients object and returns a coefficients array whose
   * values are ordered by their keys, sorted in string Unicode order. This
   * is the order the constructor expects.
   *
   * @param {Object} coeffsObj Map of coefficient name to value
   * @returns {Array} The coefficient values
   */
  static getCoeffsInOrder(coeffsObj) {
    return Object.keys(coeffsObj)
      .sort() // sort keys in string Unicode order
      .map(key => coeffsObj[key]);
  }

  /**
   * Returns the element of the first fnode the ruleset emitted for 'image',
   * or null when the ruleset found no image.
   */
  getHighestScoringImage(fnode) {
    const [topImage] = fnode._ruleset.get('image'); // eslint-disable-line no-underscore-dangle
    return topImage ? topImage.element : null;
  }
}
/**
 * Rulesets to train using Fathom.
 *
 * `trainees` maps a ruleset name to a {coeffs, rulesetMaker} object.
 * rulesetMaker takes an Array of coefficients and returns a ruleset that uses
 * them. coeffs is usually the best coefficients found so far for a ruleset,
 * but it can also be a more widely flung set to start the trainer from. Every
 * ruleset listed here shows up in the Train UI, from which a training run can
 * be kicked off.
 *
 * How to train:
 *  1. Fork the `mozilla/fathom-trainees` repo,
 *  2. In the `fathom-trainees` add-on, copy this file,
 *     `./extraction/fathom_default_coefficients.json` and
 *     `./extraction/ruleset_factory.js` to the `./src` folder.
 *     * Note: You will have to replace 'utils' with 'utilsForFrontend' on the
 *       import in `ruleset_factory.js`. See that file for more information.
 *  3. Follow instructions at: https://github.com/erikrose/fathom-fox#the-trainer.
 *
 * Notes:
 * - The FathomFox Trainer assumes that the value of your corpus' `data-fathom`
 *   attribute is the same as the `out`-ed string. Example: An element tagged
 *   with `data-fathom="image"` will map to `rule(..., out("image"))`.
 * - The Trainer assumes that the name of the ruleset and the out-rule of
 *   interest are the same. Therefore, if a ruleset contains more than one
 *   out-rule, each `out`-ed feature must have its own key in the `trainees`
 *   map. You can select which feature to train from the dropdown menu on
 *   FathomFox's Trainer page.
 * - Using the Corpus Collector to build up a training set is not recommended:
 *   it can only batch freeze original pages, so tagged pages would be
 *   re-frozen, and there are non-obvious side effects in the diff (an issue
 *   with the freeze-dried library Fathom uses).
 */

/**
 * Builds a ruleset from the given coefficients.
 *
 * @param {Array} coefficients The coefficients to bind to the ruleset
 * @returns The ruleset produced by RulesetFactory#makeRuleset
 */
function rulesetMaker(coefficients) {
  // Training changes the coefficients on every iteration, so a factory cannot
  // be reused between calls.
  return new RulesetFactory(coefficients).makeRuleset();
}

// One entry per `out`-ed feature; each gets its own {coeffs, rulesetMaker} object.
const trainees = new Map(
  ['image', 'title', 'price'].map(feature => [feature, {coeffs, rulesetMaker}]),
);

export default trainees;
- */ -const rules = ruleset( - // get all elements that could contain the price - rule(dom('div'), type('priceish')), - - // check class names to see if they contain 'price' - rule(type('priceish'), score(hasPriceClass)), - - // return price element with max score - rule(type('priceish').max(), out('product-price')), -); - -/** - * Extracts the highest scoring element above a score threshold - * contained in a page's HTML document. - */ -function runRuleset(doc) { - let fnodesList = rules.against(doc).get('product-price'); - fnodesList = fnodesList.filter(fnode => fnode.scoreFor('priceish') >= SCORE_THRESHOLD); - // It is possible for multiple elements to have the same highest score. - if (fnodesList.length >= 1) { - return fnodesList[0].element; - } - return null; -} - -/* - * Run the ruleset for the product features against the current window document - */ -export default function extractProduct(doc) { - const priceEle = runRuleset(doc); - if (priceEle) { - const price = (priceEle.tagName !== 'META') ? priceEle.textContent : priceEle.getAttribute('content'); - if (price) { - return { - price, - }; - } - } - return null; -} diff --git a/src/product_info.js b/src/product_info.js index e5de47c..9aa01a2 100644 --- a/src/product_info.js +++ b/src/product_info.js @@ -7,8 +7,8 @@ * "document_idle", which is after all DOM content has been loaded. */ -import extractProductWithFathom from 'commerce/fathom_extraction'; -import extractProductWithFallback from 'commerce/fallback_extraction'; +import extractProductWithFathom from 'commerce/extraction/fathom_extraction'; +import extractProductWithFallback from 'commerce/extraction/fallback_extraction'; /** * Checks to see if any product information for the page was found,