From cb26fb8236bea8622cb36ab1bfdb893c6aec213a Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Tue, 7 Aug 2018 15:25:52 -0700 Subject: [PATCH] #36: Make fathom_ruleset.js mirror Fathom's trainer script. This enables the same script to be used for training and running in the commerce webextension. How to train a ruleset with Fathom: 1. Follow Fathom's [instructions](https://github.com/erikrose/fathom-fox). 2. Open the [Fathom Trainees](https://github.com/mozilla/fathom-trainees) add-on in a new profile. 3. Install FathomFox in that window from AMO. 4. Drag and drop the training corpus (HTML files in ./training-set) into that window. 5. Copy ./src/fathom_ruleset.js into fathom-trainees/src/trainees.js and save over it. 6. Choose a feature to train, 'price', 'title' or 'image', and edit `trainees.set()` so that one of those features is the first argument. 7. Comment out the rules pertaining to all but that feature. 8. Click the FathomFox browserAction and select "Train". 9. Select the feature from the dropdown list and click the "Train against the tabs in this window" button. 10. You will see the accuracy based on the initial coefficients passed in, and Fathom will start generating optimized coefficients. This could take a while. 11. When Fathom is done, those coefficients will be logged to the Fathom page. 
--- src/fathom_coefficients.json | 14 +-- src/fathom_extraction.js | 168 ++++------------------------ src/fathom_ruleset.js | 205 +++++++++++++++++++++++++++++++++++ 3 files changed, 232 insertions(+), 155 deletions(-) create mode 100644 src/fathom_ruleset.js diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index ceec556..d54851e 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,9 +1,9 @@ { - "largerImage": 3, - "largerFontSize": 1, - "hasDollarSign": 3, - "hasTitleInID": 10, - "hasTitleInClassName": 5, - "isHidden": -100, - "isHeaderElement": 10 + "largerImageCoeff": 3, + "largerFontSizeCoeff": 1, + "hasDollarSignCoeff": 3, + "hasTitleInIDCoeff": 10, + "hasTitleInClassNameCoeff": 5, + "isHiddenCoeff": -100, + "isHeaderElementCoeff": 10 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 19413d0..65790d2 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -10,156 +10,19 @@ * Features: title, image, price */ -import {dom, out, rule, ruleset, score, type} from 'fathom-web'; -import fathomCoeffs from 'commerce/fathom_coefficients.json'; +import productRuleset from 'commerce/fathom_ruleset'; +import { + largerImageCoeff, + largerFontSizeCoeff, + hasDollarSignCoeff, + hasTitleInIDCoeff, + hasTitleInClassNameCoeff, + isHiddenCoeff, + isHeaderElementCoeff, +} from 'commerce/fathom_coefficients.json'; const PRODUCT_FEATURES = ['title', 'price', 'image']; const SCORE_THRESHOLD = 4; -const DEFAULT_BODY_FONT_SIZE = 14; -const DEFAULT_SCORE = 1; -const VIEWPORT_HEIGHT = window.innerHeight; - -/** - * Returns true if the fnode is above the fold - */ -function isAboveTheFold(fnode) { - const domRect = fnode.element.getBoundingClientRect(); - if (domRect.top <= VIEWPORT_HEIGHT) { - return true; - } - return false; -} - -/** - * Scores fnode in direct proportion to its size - */ -function largerImage(fnode) { - const domRect = fnode.element.getBoundingClientRect(); - const area = 
(domRect.width) * (domRect.height); - if (area === 0) { - return DEFAULT_SCORE; - } - return area * fathomCoeffs.largerImage; -} - -/** - * Scores fnode with a '$' in its innerText - */ -function hasDollarSign(fnode) { - if (fnode.element.innerText.includes('$')) { - return fathomCoeffs.hasDollarSign; - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode in direct proportion to its font size - */ -function largerFontSize(fnode) { - const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; - const size = sizeWithUnits.replace('px', ''); - if (size) { - // normalize the multiplier by the default font size - const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; - return (sizeMultiplier * fathomCoeffs.largerFontSize); - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode with "title" in its id - */ -function hasTitleInID(fnode) { - const id = fnode.element.id; - if (id.includes('title') || id.includes('Title')) { - return fathomCoeffs.hasTitleInID; - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode with "title" in a class name - */ -function hasTitleInClassName(fnode) { - const className = fnode.element.className; - if (className.includes('title') || className.includes('Title')) { - return fathomCoeffs.hasTitleInClassName; - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode that is hidden - */ -function isHidden(fnode) { - const element = fnode.element; - const style = window.getComputedStyle(element); - if (!element.offsetParent // null if the offsetParent has a display set to "none" - || style.visibility === 'hidden' - || style.opacity === '0' - || style.width === '0' - || style.height === '0') { - return fathomCoeffs.isHidden; - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode that is an H1 element - */ -function isHeaderElement(fnode) { - if (fnode.element.tagName === 'H1') { - return fathomCoeffs.isHeaderElement; - } - return DEFAULT_SCORE; -} - -/** - * Ruleset for product features; each feature has its own type. 
- */ -const rules = ruleset( - /** - * Image rules - */ - // consider all img elements near the top of the page - rule(dom('img').when(isAboveTheFold), type('imageish')), - // better score for larger images - rule(type('imageish'), score(largerImage)), - // return image element with max score - rule(type('imageish').max(), out('image')), - - /** - * Title rules - */ - // consider all h1 and span elements near the top of the page - rule(dom('h1, span').when(isAboveTheFold), type('titleish')), - // score higher for h1 elements - rule(type('titleish'), score(isHeaderElement)), - // check if the id has "title" in it - rule(type('titleish'), score(hasTitleInID)), - // check if any class names have "title" in them - rule(type('titleish'), score(hasTitleInClassName)), - // better score for larger font size - rule(type('titleish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('titleish'), score(isHidden)), - // return title element with max score - rule(type('titleish').max(), out('title')), - - /** - * Price rules - */ - // consider all span and h2 elements near the top of the page - rule(dom('span, h2').when(isAboveTheFold), type('priceish')), - // check if the element has a '$' in its innerText - rule(type('priceish'), score(hasDollarSign)), - // better score for larger font size - rule(type('priceish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('priceish'), score(isHidden)), - // return price element with max score - rule(type('priceish').max(), out('price')), -); /** * Extracts the highest scoring element above a score threshold @@ -167,8 +30,17 @@ const rules = ruleset( */ function runRuleset(doc) { const extractedElements = {}; + const rules = productRuleset.get('product').rulesetMaker; for (const feature of PRODUCT_FEATURES) { - let fnodesList = rules.against(doc).get(`${feature}`); + let fnodesList = rules([ + largerImageCoeff, + largerFontSizeCoeff, + hasDollarSignCoeff, + hasTitleInIDCoeff, + 
hasTitleInClassNameCoeff, + isHiddenCoeff, + isHeaderElementCoeff, + ]).against(doc).get(`${feature}`); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. if (fnodesList.length >= 1) { diff --git a/src/fathom_ruleset.js b/src/fathom_ruleset.js new file mode 100644 index 0000000..39ef04e --- /dev/null +++ b/src/fathom_ruleset.js @@ -0,0 +1,205 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import {dom, out, rule, ruleset, score, type} from 'fathom-web'; + +const DEFAULT_BODY_FONT_SIZE = 14; +const DEFAULT_SCORE = 1; +const VIEWPORT_HEIGHT = window.innerHeight; + +/** + * Rulesets to train. + * + * Drop this file into the fathom-trainees/src folder (replacing the default file) + * to train Fathom against this ruleset. + * + * More mechanically, a map of names to {coeffs, rulesetMaker} objects. + * rulesetMaker is a function that takes an Array of coefficients and returns a + * ruleset that uses them. coeffs is typically the best-yet-found coefficients + * for a ruleset but can also be some more widely flung ones that you want to + * start the trainer from. The rulesets you specify here show up in the Train + * UI, from which you can kick off a training run. + * + * Fathom notes: + * - The FathomFox Trainer assumes that the value of your corpus' `data-fathom` + * attribute is the same as the `out`-ed string. Example: An element tagged with + * `data-fathom="image"` will map to `rule(..., out("image"))`. + * - I would not recommend using the Corpus Collector to build up a training set, + * because you can only batch freeze original pages, meaning tagged pages would be + * re-frozen, and there are non-obvious side effects in the diff (an issue with + * the freeze-dried library Fathom uses). 
+ */ + +const trainees = new Map(); + +trainees.set( + /** + * A ruleset that finds the main product title, image and price on a product page. + * IMPORTANT: Currently, the Trainer assumes that the name of the ruleset and the + * out-rule of interest are the same. A multi-out ruleset will not work without + * commenting out all but one `out` and setting the ruleset name to that `out`. + */ + 'product', // 'product' for production and 'title', 'image' or 'price' for training + { + coeffs: [3, 1, 3, 10, 5, -100, 10], // Input rule coefficients in order here + rulesetMaker([ + coeffLargerImage, + coeffLargerFontSize, + coeffHasDollarSign, + coeffHasTitleInID, + coeffHasTitleInClassName, + coeffIsHidden, + coeffIsHeaderElement, + ]) { + /** + * Scores fnode in direct proportion to its size + */ + function largerImage(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + const area = (domRect.width) * (domRect.height); + if (area === 0) { + return DEFAULT_SCORE; + } + return area * coeffLargerImage; + } + + /** + * Scores fnode in direct proportion to its font size + */ + function largerFontSize(fnode) { + const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; + const size = sizeWithUnits.replace('px', ''); + if (size) { + // normalize the multiplier by the default font size + const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; + return (sizeMultiplier * coeffLargerFontSize); + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with a '$' in its innerText + */ + function hasDollarSign(fnode) { + if (fnode.element.innerText.includes('$')) { + return coeffHasDollarSign; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with "title" in its id + */ + function hasTitleInID(fnode) { + const id = fnode.element.id; + if (id.includes('title') || id.includes('Title')) { + return coeffHasTitleInID; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with "title" in a class name + */ + function hasTitleInClassName(fnode) 
{ + const className = fnode.element.className; + if (className.includes('title') || className.includes('Title')) { + return coeffHasTitleInClassName; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode that is hidden + */ + function isHidden(fnode) { + const element = fnode.element; + const style = window.getComputedStyle(element); + if (!element.offsetParent // null if the offsetParent has a display set to "none" + || style.visibility === 'hidden' + || style.opacity === '0' + || style.width === '0' + || style.height === '0') { + return coeffIsHidden; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode that is an H1 element + */ + function isHeaderElement(fnode) { + if (fnode.element.tagName === 'H1') { + return coeffIsHeaderElement; + } + return DEFAULT_SCORE; + } + + /** + * Returns true if the fnode is above the fold + */ + function isAboveTheFold(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + if (domRect.top <= VIEWPORT_HEIGHT) { + return true; + } + return false; + } + + /* The actual ruleset */ + const rules = ruleset( + /** + * Image rules + * + * If training, comment out unless training 'image'. + */ + // consider all img elements near the top of the page + rule(dom('img').when(isAboveTheFold), type('imageish')), + // better score for larger images + rule(type('imageish'), score(largerImage)), + // return image element with max score + rule(type('imageish').max(), out('image')), + + /** + * Title rules + * + * If training, comment out unless training 'title'. 
+ */ + // consider all h1 and span elements near the top of the page + rule(dom('h1, span').when(isAboveTheFold), type('titleish')), + // score higher for h1 elements + rule(type('titleish'), score(isHeaderElement)), + // check if the id has "title" in it + rule(type('titleish'), score(hasTitleInID)), + // check if any class names have "title" in them + rule(type('titleish'), score(hasTitleInClassName)), + // better score for larger font size + rule(type('titleish'), score(largerFontSize)), + // reduce score if element is hidden + rule(type('titleish'), score(isHidden)), + // return title element with max score + rule(type('titleish').max(), out('title')), + + /** + * Price rules + * + * If training, comment out unless training 'price'. + */ + // consider all span and h2 elements near the top of the page + rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // check if the element has a '$' in its innerText + rule(type('priceish'), score(hasDollarSign)), + // better score for larger font size + rule(type('priceish'), score(largerFontSize)), + // reduce score if element is hidden + rule(type('priceish'), score(isHidden)), + // return price element with max score + rule(type('priceish').max(), out('price')), + ); + return rules; + }, + }, +); + +export default trainees;