diff --git a/.eslintrc.json b/.eslintrc.json index 02081a6..7ff4655 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -14,6 +14,7 @@ "no-restricted-syntax": ["off"], "no-use-before-define": ["error", {"functions": false}], "no-prototype-builtins": ["off"], + "class-methods-use-this": ["off"], "react/jsx-one-expression-per-line": ["off"], "react/prefer-stateless-function": ["off"], diff --git a/.gitignore b/.gitignore index a26de97..11393fe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ node_modules web-ext-artifacts build gecko.log +.DS_Store diff --git a/README.md b/README.md index 4fc09a6..7c802c8 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ After this, you can run `pipenv run test` to run the automated test suite. - `src/background` contains the background scripts that trigger UI elements (such as the page action) and periodically check for price updates. - `src/browser_action` contains the toolbar popup for managing the list of currently-tracked products. +- `src/extraction` contains the content scripts that extract product information from product web pages. - `src/page_action` contains the URL bar popup for viewing and tracking the product in the current tab. - `src/state` contains the Redux-based code for managing global extension state. - `src/tests` contains the automated test suite. diff --git a/src/fallback_extraction.js b/src/extraction/fallback_extraction.js similarity index 97% rename from src/fallback_extraction.js rename to src/extraction/fallback_extraction.js index db1f6ee..5b3535f 100644 --- a/src/fallback_extraction.js +++ b/src/extraction/fallback_extraction.js @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -/* +/** * Uses CSS selectors, or failing that, Open Graph tags to extract * a product from its product page, where a 'product' is defined by the bundle * of features that makes it identifiable. 
@@ -10,7 +10,7 @@
  * Features: title, image, price
  */
 
-import extractionData from 'commerce/product_extraction_data.json';
+import extractionData from 'commerce/extraction/product_extraction_data.json';
 
 const OPEN_GRAPH_PROPERTY_VALUES = {
   title: 'og:title',
diff --git a/src/extraction/fathom_default_coefficients.json b/src/extraction/fathom_default_coefficients.json
new file mode 100644
index 0000000..7a70565
--- /dev/null
+++ b/src/extraction/fathom_default_coefficients.json
@@ -0,0 +1,12 @@
+{
+  "hasDollarSignCoeff": 8,
+  "hasPriceInClassNameCoeff": 2,
+  "hasPriceInIDCoeff": 17,
+  "hasPriceishPatternCoeff": 15,
+  "isAboveTheFoldImageCoeff": 13,
+  "isAboveTheFoldPriceCoeff": 33,
+  "isNearbyImageXAxisPriceCoeff": 5,
+  "isNearbyImageYAxisTitleCoeff": 5,
+  "largerFontSizeCoeff": 7,
+  "largerImageCoeff": 2
+}
diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom_extraction.js
new file mode 100644
index 0000000..00cebbc
--- /dev/null
+++ b/src/extraction/fathom_extraction.js
@@ -0,0 +1,78 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Uses Fathom to extract a product from its product page,
+ * where a 'product' is defined by the bundle of features that
+ * makes it identifiable.
+ *
+ * Features: title, image, price
+ */
+
+import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json';
+import RulesetFactory from 'commerce/extraction/ruleset_factory';
+
+const PRODUCT_FEATURES = ['title', 'price', 'image'];
+// Minimum score to be considered the "correct" feature element extracted by Fathom
+const SCORE_THRESHOLD = 4;
+// Array of numbers corresponding to the coefficients in order
+const coefficients = RulesetFactory.getCoeffsInOrder(defaultCoefficients);
+// For production, we don't need to generate a new ruleset factory
+// and ruleset every time we run Fathom, since the coefficients are static.
+const rulesetFactory = new RulesetFactory(coefficients);
+const rules = rulesetFactory.makeRuleset();
+
+/**
+ * Runs the ruleset against a document and, for each product feature, keeps
+ * the first candidate element whose `${feature}ish` score meets SCORE_THRESHOLD.
+ *
+ * @param {Document} doc The page's HTML document to run the ruleset against
+ * @returns {Object} Map of feature name to DOM element; a feature with no
+ *   candidate at or above the threshold is left out of the map
+ */
+function runRuleset(doc) {
+  const extractedElements = {};
+  const results = rules.against(doc);
+  for (const feature of PRODUCT_FEATURES) {
+    const candidates = results
+      .get(feature)
+      .filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD);
+    // Several elements can tie for the highest score; take the first one returned.
+    if (candidates.length >= 1) {
+      extractedElements[feature] = candidates[0].element;
+    }
+  }
+  return extractedElements;
+}
+
+/**
+ * Returns true if every key in PRODUCT_FEATURES has a truthy value.
+ *
+ * @param {Object} obj Object keyed by product feature name
+ * @returns {boolean} False as soon as one feature is missing or falsy
+ */
+function hasAllFeatures(obj) {
+  return PRODUCT_FEATURES.every(key => obj[key]);
+}
+
+/**
+ * Runs the ruleset for the product features against the given document.
+ *
+ * @param {Document} doc The page's HTML document to extract a product from
+ * @returns {Object|null} Object holding a value for every product feature, or
+ *   null if any feature element is missing or yields a falsy value
+ */
+export default function extractProduct(doc) {
+  const extractedProduct = {};
+  const extractedElements = runRuleset(doc);
+  if (hasAllFeatures(extractedElements)) {
+    for (const feature of PRODUCT_FEATURES) {
+      const element = extractedElements[feature];
+      // The image feature is read from `src`; every other feature from `innerText`.
+      extractedProduct[feature] = feature === 'image' ? element.src : element.innerText;
+    }
+  }
+  // Check the values too: an element can exist yet have an empty src/innerText.
+  return hasAllFeatures(extractedProduct) ? extractedProduct : null;
+}
diff --git a/src/product_extraction_data.json b/src/extraction/product_extraction_data.json
similarity index 100%
rename from src/product_extraction_data.json
rename to src/extraction/product_extraction_data.json
diff --git a/src/extraction/ruleset_factory.js b/src/extraction/ruleset_factory.js
new file mode 100644
index 0000000..8a9778e
--- /dev/null
+++ b/src/extraction/ruleset_factory.js
@@ -0,0 +1,340 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Exports a RulesetFactory class, which when instantiated, binds Fathom
+ * coefficients to a ruleset. An instance of this class is used for product
+ * feature extraction (`fathom_extraction.js`) and for training (`trainees.js`).
+ */ + +import {dom, out, rule, ruleset, score, type} from 'fathom-web'; +// Since the fathom-trainees add-on currently uses a submodule of Fathom, for +// training, replace 'utils' with 'utilsForFrontend' +import {ancestors} from 'fathom-web/utils'; + +const DEFAULT_BODY_FONT_SIZE = 14; +const DEFAULT_SCORE = 1; +const TOP_BUFFER = 150; +// From: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js +const ZEROISH = 0.08; +const ONEISH = 0.9; + +export default class RulesetFactory { + /** + * Create a ruleset factory. + * + * @param {Array.number} coefficients The coefficients to apply for each rule + */ + constructor(coefficients) { + [ + this.hasDollarSignCoeff, + this.hasPriceInClassNameCoeff, + this.hasPriceInIDCoeff, + this.hasPriceishPatternCoeff, + this.isAboveTheFoldImageCoeff, + this.isAboveTheFoldPriceCoeff, + this.isNearbyImageXAxisPriceCoeff, + this.isNearbyImageYAxisTitleCoeff, + this.largerFontSizeCoeff, + this.largerImageCoeff, + ] = coefficients; + } + + /** + * Scores fnode in direct proportion to its size + */ + largerImage(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + const area = (domRect.width) * (domRect.height); + if (area === 0) { + return DEFAULT_SCORE; + } + return area * this.largerImageCoeff; + } + + /** + * Scores fnode in proportion to its font size + */ + largerFontSize(fnode) { + const size = window.getComputedStyle(fnode.element).fontSize; + // Normalize the multiplier by the default font size + const sizeMultiplier = parseFloat(size, 10) / DEFAULT_BODY_FONT_SIZE; + return sizeMultiplier * this.largerFontSizeCoeff; + } + + /** + * Scores fnode with a '$' in its innerText + */ + hasDollarSign(fnode) { + if (fnode.element.innerText.includes('$')) { + return this.hasDollarSignCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with 'price' in its id or its parent's id + */ + hasPriceInID(fnode) { + const id = fnode.element.id; + const parentID = fnode.element.parentElement.id; + if 
(id.toLowerCase().includes('price')) { + return this.hasPriceInIDCoeff; + } + if (parentID.toLowerCase().includes('price')) { + return 0.75 * this.hasPriceInIDCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with 'price' in its class name or its parent's class name + */ + hasPriceInClassName(fnode) { + const className = fnode.element.className; + const parentClassName = fnode.element.parentElement.className; + if (className.toLowerCase().includes('price')) { + return this.hasPriceInClassNameCoeff; + } + if (parentClassName.toLowerCase().includes('price')) { + return 0.75 * this.hasPriceInClassNameCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode by its vertical location relative to the fold + */ + isAboveTheFold(fnode, featureCoeff) { + const viewportHeight = window.innerHeight; + const top = fnode.element.getBoundingClientRect().top; + const upperHeightLimit = viewportHeight * 2; + + // If the node is below the fold by more than a viewport's length, + // return a low score. + if (top >= upperHeightLimit) { + return ZEROISH * featureCoeff; + } + + // If the node is above the fold, return a high score. + if (top <= viewportHeight) { + return ONEISH * featureCoeff; + } + + // Otherwise, scale the score linearly between the fold and a viewport's + // length below it. 
+ const slope = (ONEISH - ZEROISH) / (viewportHeight - upperHeightLimit); + return (slope * (top - upperHeightLimit) + ZEROISH) * featureCoeff; + } + + /** + * Scores fnode based on its x distance from the highest scoring image element + */ + isNearbyImageXAxisPrice(fnode) { + const viewportWidth = window.innerWidth; + const eleDOMRect = fnode.element.getBoundingClientRect(); + const imageElement = this.getHighestScoringImage(fnode); + const imageDOMRect = imageElement.getBoundingClientRect(); + const deltaRight = eleDOMRect.left - imageDOMRect.right; + const deltaLeft = imageDOMRect.left - eleDOMRect.right; + // True if element is completely to the right or left of the image element + const noOverlap = (deltaRight > 0 || deltaLeft > 0); + let deltaX; + if (noOverlap) { + if (deltaRight > 0) { + deltaX = deltaRight; + } else { + deltaX = deltaLeft; + } + // Give a higher score the closer it is to the image, normalized by viewportWidth + return (viewportWidth / deltaX) * this.isNearbyImageXAxisPriceCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode based on its y distance from the highest scoring image element + */ + isNearbyImageYAxisTitle(fnode) { + const viewportHeight = window.innerHeight; + const DOMRect = fnode.element.getBoundingClientRect(); + const imageElement = this.getHighestScoringImage(fnode); + const imageDOMRect = imageElement.getBoundingClientRect(); + // Some titles (like on Ebay) are above the image, so include a top buffer + const isEleTopNearby = DOMRect.top >= (imageDOMRect.top - TOP_BUFFER); + const isEleBottomNearby = DOMRect.bottom <= imageDOMRect.bottom; + // Give elements in a specific vertical band a higher score + if (isEleTopNearby && isEleBottomNearby) { + const deltaY = Math.abs(imageDOMRect.top - DOMRect.top); + // Give a higher score the closer it is to the image, normalized by viewportHeight + return (viewportHeight / deltaY) * this.isNearbyImageYAxisTitleCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode 
whose innerText matches a priceish RegExp pattern + */ + hasPriceishPattern(fnode) { + const text = fnode.element.innerText; + /** + * With an optional '$' that doesn't necessarily have to be at the beginning + * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before + * a decimal point and exactly two after, where the two digits after the decimal point + * are at the end of the string + */ + const regExp = /\${0,1}\d+\.\d{2}$/; + if (regExp.test(text)) { + return this.hasPriceishPatternCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Checks to see if a 'priceish' fnode is eligible for scoring + */ + isEligiblePrice(fnode) { + return ( + this.isVisible(fnode) + && this.hasDifferentInnerTextThanChildren(fnode) + && this.isNearbyImageYAxisPrice(fnode) + ); + } + + /** + * Checks to see if a 'titleish' fnode is eligible for scoring + */ + isEligibleTitle(fnode) { + return ( + this.isVisible(fnode) + // Don't use hasDifferentInnerTextThanChildren, because