From 6d345f82a9e0c2203f4b789535e5336efafdccd4 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Thu, 2 Aug 2018 17:16:23 -0700 Subject: [PATCH 1/9] #36: Add more sophisticated Fathom rules. These rules successfully pull out product title, price and image from the following product pages (one each from the 5 top sites): * [Amazon](https://www.amazon.com/KitchenAid-KL26M1XER-Professional-6-Qt-Bowl-Lift/dp/B01LYV1U30?smid=ATVPDKIKX0DER&pf_rd_p=0c7b792f-241a-4510-94f4-dd184a76f201&pf_rd_r=AZD7BGV3JZGTB23F30X3) * [Ebay](https://www.ebay.com/p/Best-Choice-Products-650W-6-speed-5-5QT-Kitchen-Food-Stand-Mixer-with-Stainless-Steels-Bowl-Black/3018375728?iid=253733404998) * [Walmart](https://www.walmart.com/ip/KitchenAid-Classic-Series-4-5-Quart-Tilt-Head-Stand-Mixer-Onyx-Black-K45SSOB/29474640) * [Best Buy](https://www.bestbuy.com/site/jbl-everest-elite-750nc-wireless-over-ear-noise-cancelling-headphones-gunmetal/5840136.p?skuId=5840136) * [Home Depot](https://www.homedepot.com/p/Husky-SAE-Combination-Wrench-Set-10-Piece-HCW10PCSAE/202934501) TODO: * Create a training set with FathomFox and run these rules against them to measure their accuracy for 50 product pages (10 from each top site). * Modify trimTitle method, so it doesn't cut off the color from the title for the product on Ebay. * Generalize formatPrice method. @Osmose, would you have any suggestions? 
--- src/fathom_coefficients.json | 4 +- src/fathom_extraction.js | 171 ++++++++++++++++++++++++++++++----- 2 files changed, 149 insertions(+), 26 deletions(-) diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index 83b8f89..2846242 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,3 +1,5 @@ { - "hasPriceClass": 2 + "largerImage": 3, + "largerFontSize": 2, + "hasDollarSign": 3 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 6efad7f..4e75202 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -13,30 +13,97 @@ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; import fathomCoeffs from 'commerce/fathom_coefficients.json'; -const SCORE_THRESHOLD = fathomCoeffs.hasPriceClass; +const PRODUCT_FEATURES = ['title', 'price', 'image']; +const SCORE_THRESHOLD = 3; +const DEFAULT_SCORE = 1; +const VIEWPORT_HEIGHT = window.innerHeight; /** - * Scores fnodes with a "price" class + * Each of these functions represents a rule check: if the fnode passes + * the rule, it gets a weighted score from 'fathom_coefficients.json'; + * otherwise, it gets the default score. 
*/ -function hasPriceClass(fnode) { - if (fnode.element.classList.contains('price')) { - return fathomCoeffs.hasPriceClass; + +/** + * Returns true if the fnode is above the fold + */ +function isAboveTheFold(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + if (domRect.top <= VIEWPORT_HEIGHT) { + return true; } - return 1; + return false; +} + +/** + * Scores fnode in direct proportion to its size + */ +function largerImage(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + const area = (domRect.width) * (domRect.height); + if (area === 0) { + return DEFAULT_SCORE; + } + return area * fathomCoeffs.largerImage; +} + +/** + * Scores fnode with a '$' in its innerText + */ +function hasDollarSign(fnode) { + if (fnode.element.innerText.includes('$')) { + return fathomCoeffs.hasDollarSign; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode in direct proportion to its font size + */ +function largerFontSize(fnode) { + const sizeWithUnits = window.getComputedStyle(fnode.element).getPropertyValue('font-size'); + const size = sizeWithUnits.replace('px', ''); + if (size) { + return (parseInt(size, 10) * fathomCoeffs.largerFontSize); + } + return DEFAULT_SCORE; } /** * Ruleset for product features. Each feature has its own type. 
*/ const rules = ruleset( - // get all elements that could contain the price - rule(dom('div'), type('priceish')), + /** + * Image rules + */ + // consider all img elements near the top of the page + rule(dom('img').when(isAboveTheFold), type('imageish')), + // better score for larger images + rule(type('imageish'), score(largerImage)), + // return image element with max score + rule(type('imageish').max(), out('image')), - // check class names to see if they contain 'price' - rule(type('priceish'), score(hasPriceClass)), + /** + * Title rules + */ + // consider only the title element + rule(dom('title'), type('titleish')), + // give the title element the minimum score + rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // return title element with max score + rule(type('titleish').max(), out('title')), + /** + * Price rules + */ + // consider all span and h2 elements near the top of the page + rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // check if the element has a '$' in its innerText + rule(type('priceish'), score(hasDollarSign)), + // better score for larger font size + rule(type('priceish'), score(largerFontSize)), // return price element with max score - rule(type('priceish').max(), out('product-price')), + rule(type('priceish').max(), out('price')), ); /** @@ -44,27 +111,81 @@ const rules = ruleset( * contained in a page's HTML document. */ function runRuleset(doc) { - let fnodesList = rules.against(doc).get('product-price'); - fnodesList = fnodesList.filter(fnode => fnode.scoreFor('priceish') >= SCORE_THRESHOLD); - // It is possible for multiple elements to have the same highest score. 
- if (fnodesList.length >= 1) { - return fnodesList[0].element; + const extractedElements = {}; + for (const feature of PRODUCT_FEATURES) { + let fnodesList = rules.against(doc).get(`${feature}`); + fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); + // It is possible for multiple elements to have the same highest score. + if (fnodesList.length >= 1) { + extractedElements[feature] = fnodesList[0].element; + } + } + return extractedElements; +} + +/** + * Returns true if every key in PRODUCT_FEATURES has a truthy value. + */ +function hasAllFeatures(obj) { + return PRODUCT_FEATURES.map(key => obj[key]).every(val => val); +} + +// Trim off the shorter substring between ' - ', ': ' or ' | ' +function trimTitle(title) { + let textArr = []; + // TODO: This currently cuts of the " - Black" substring on E-bay + if (title.includes(' - ')) { + textArr = title.split(' - '); + } + if (title.includes(': ')) { + textArr = title.split(': '); + } + if (textArr.length >= 1) { + return textArr.reduce((a, b) => ((a.length > b.length) ? a : b)); + } + return title; +} + +/** + * Takes a price string of the form "$1997 /each" and turns + * it into "$19.97". + * TODO: Can this be generalized/simplified? This is very specific + * to Home Depot's product pages. 
+ */ +function formatPrice(price) { + let formattedPrice = price; + if (price.includes('/')) { + const index = price.indexOf('/'); + formattedPrice = price.slice(0, index); + formattedPrice = formattedPrice.trim(); + const decimalIndex = formattedPrice.length - 2; + const rightSide = formattedPrice.substring(decimalIndex); + const leftSide = formattedPrice.replace(rightSide, ''); + formattedPrice = `${leftSide}.${rightSide}`; } - return null; + return formattedPrice; } /* * Run the ruleset for the product features against the current window document */ export default function extractProduct(doc) { - const priceEle = runRuleset(doc); - if (priceEle) { - const price = (priceEle.tagName !== 'META') ? priceEle.textContent : priceEle.getAttribute('content'); - if (price) { - return { - price, - }; + const extractedProduct = {}; + const extractedElements = runRuleset(doc); + if (hasAllFeatures(extractedElements)) { + for (const feature of PRODUCT_FEATURES) { + let text = extractedElements[feature].innerText; + if (feature === 'title') { + text = trimTitle(text); + } + if (feature === 'price') { + text = formatPrice(text); + } + extractedProduct[feature] = (feature === 'image' + ? extractedElements[feature].src + : text + ); } } - return null; + return hasAllFeatures(extractedProduct) ? extractedProduct : null; } From 9afa26632755d27c8f6f7687b868fed3d6d2609c Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Tue, 7 Aug 2018 11:33:39 -0700 Subject: [PATCH 2/9] #36: Update product title rules. Product title rules previously pulled the unique 'title' element from the 'head' element on the page (part of the page's metadata). While this ostensibly requires less processing (we don't have to search the DOM or score any other elements), the title string often requires site-specific cleaning such as to remove the vendor name, and the final, cleaned-up string cannot be verified as accurate by Fathom, which only tells us if our rules picked the right element. 
The alternative approach, implemented here, is to pull the title from the corresponding element in the content of the page. Since Fathom can verify that the right element was selected, and the string from this element would not require any cleaning, this approach is a much better proxy for extracting the correct product title. --- src/fathom_coefficients.json | 8 ++- src/fathom_extraction.js | 128 +++++++++++++++++++---------------- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index 2846242..ceec556 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,5 +1,9 @@ { "largerImage": 3, - "largerFontSize": 2, - "hasDollarSign": 3 + "largerFontSize": 1, + "hasDollarSign": 3, + "hasTitleInID": 10, + "hasTitleInClassName": 5, + "isHidden": -100, + "isHeaderElement": 10 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 4e75202..19413d0 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -14,16 +14,11 @@ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; import fathomCoeffs from 'commerce/fathom_coefficients.json'; const PRODUCT_FEATURES = ['title', 'price', 'image']; -const SCORE_THRESHOLD = 3; +const SCORE_THRESHOLD = 4; +const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; const VIEWPORT_HEIGHT = window.innerHeight; -/** - * Each of these functions represents a rule check: if the fnode passes - * the rule, it gets a weighted score from 'fathom_coefficients.json'; - * otherwise, it gets the default score. 
- */ - /** * Returns true if the fnode is above the fold */ @@ -61,16 +56,66 @@ function hasDollarSign(fnode) { * Scores fnode in direct proportion to its font size */ function largerFontSize(fnode) { - const sizeWithUnits = window.getComputedStyle(fnode.element).getPropertyValue('font-size'); + const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; const size = sizeWithUnits.replace('px', ''); if (size) { - return (parseInt(size, 10) * fathomCoeffs.largerFontSize); + // normalize the multiplier by the default font size + const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; + return (sizeMultiplier * fathomCoeffs.largerFontSize); + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode with "title" in its id + */ +function hasTitleInID(fnode) { + const id = fnode.element.id; + if (id.includes('title') || id.includes('Title')) { + return fathomCoeffs.hasTitleInID; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode with "title" in a class name + */ +function hasTitleInClassName(fnode) { + const className = fnode.element.className; + if (className.includes('title') || className.includes('Title')) { + return fathomCoeffs.hasTitleInClassName; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode that is hidden + */ +function isHidden(fnode) { + const element = fnode.element; + const style = window.getComputedStyle(element); + if (!element.offsetParent // null if the offsetParent has a display set to "none" + || style.visibility === 'hidden' + || style.opacity === '0' + || style.width === '0' + || style.height === '0') { + return fathomCoeffs.isHidden; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode that is an H1 element + */ +function isHeaderElement(fnode) { + if (fnode.element.tagName === 'H1') { + return fathomCoeffs.isHeaderElement; } return DEFAULT_SCORE; } /** - * Ruleset for product features. Each feature has its own type. + * Ruleset for product features; each feature has its own type. 
*/ const rules = ruleset( /** @@ -86,10 +131,18 @@ const rules = ruleset( /** * Title rules */ - // consider only the title element - rule(dom('title'), type('titleish')), - // give the title element the minimum score - rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // consider all h1 and span elements near the top of the page + rule(dom('h1, span').when(isAboveTheFold), type('titleish')), + // score higher for h1 elements + rule(type('titleish'), score(isHeaderElement)), + // check if the id has "title" in it + rule(type('titleish'), score(hasTitleInID)), + // check if any class names have "title" in them + rule(type('titleish'), score(hasTitleInClassName)), + // better score for larger font size + rule(type('titleish'), score(largerFontSize)), + // reduce score if element is hidden + rule(type('titleish'), score(isHidden)), // return title element with max score rule(type('titleish').max(), out('title')), @@ -102,6 +155,8 @@ const rules = ruleset( rule(type('priceish'), score(hasDollarSign)), // better score for larger font size rule(type('priceish'), score(largerFontSize)), + // reduce score if element is hidden + rule(type('priceish'), score(isHidden)), // return price element with max score rule(type('priceish').max(), out('price')), ); @@ -130,42 +185,6 @@ function hasAllFeatures(obj) { return PRODUCT_FEATURES.map(key => obj[key]).every(val => val); } -// Trim off the shorter substring between ' - ', ': ' or ' | ' -function trimTitle(title) { - let textArr = []; - // TODO: This currently cuts of the " - Black" substring on E-bay - if (title.includes(' - ')) { - textArr = title.split(' - '); - } - if (title.includes(': ')) { - textArr = title.split(': '); - } - if (textArr.length >= 1) { - return textArr.reduce((a, b) => ((a.length > b.length) ? a : b)); - } - return title; -} - -/** - * Takes a price string of the form "$1997 /each" and turns - * it into "$19.97". - * TODO: Can this be generalized/simplified? 
This is very specific - * to Home Depot's product pages. - */ -function formatPrice(price) { - let formattedPrice = price; - if (price.includes('/')) { - const index = price.indexOf('/'); - formattedPrice = price.slice(0, index); - formattedPrice = formattedPrice.trim(); - const decimalIndex = formattedPrice.length - 2; - const rightSide = formattedPrice.substring(decimalIndex); - const leftSide = formattedPrice.replace(rightSide, ''); - formattedPrice = `${leftSide}.${rightSide}`; - } - return formattedPrice; -} - /* * Run the ruleset for the product features against the current window document */ @@ -174,16 +193,9 @@ export default function extractProduct(doc) { const extractedElements = runRuleset(doc); if (hasAllFeatures(extractedElements)) { for (const feature of PRODUCT_FEATURES) { - let text = extractedElements[feature].innerText; - if (feature === 'title') { - text = trimTitle(text); - } - if (feature === 'price') { - text = formatPrice(text); - } extractedProduct[feature] = (feature === 'image' ? extractedElements[feature].src - : text + : extractedElements[feature].innerText ); } } From d94071d1079a4484366b1e01ed761b94b6bb7016 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Tue, 7 Aug 2018 15:25:52 -0700 Subject: [PATCH 3/9] #36: Make fathom_ruleset.js mirror Fathom's trainer script. This enables the same script to be used for training and running in the commerce webextension. How to train a ruleset with Fathom: 1. Follow Fathom's [Trainer instructions](https://github.com/erikrose/fathom-fox#the-trainer). 2. Open the [Fathom Trainees](https://github.com/mozilla/fathom-trainees) add-on in a new profile. 3. Install FathomFox in that window from AMO. 4. Drag and drop the training corpus into that window. - Note: The training corpus consists of HTML files frozen using [FathomFox's DevTools panel](https://github.com/erikrose/fathom-fox#the-developer-tools-panel); our training corpus is on the shared "commerce" Google drive. 
- Note: As of the date of this commit, the Corpus Collector is not a recommended option for building a training corpus due to a `freeze-dry` dependency bug that inserts a bunch of extra garbage when re-freezing a frozen page. 5. Copy ./src/fathom_ruleset.js into fathom-trainees/src/trainees.js and save over it. 6. Choose a feature to train, 'price', 'title' or 'image', and edit `trainees.set()` so that one of those features is the first argument. 7. Comment out the rules pertaining to all but that feature. - Currently, you can only train one ruleset at a time with Fathom, and only one `out` (e.g. 'title', 'image' or 'price') at a time for a given ruleset. - If you have multiple `out`s you'd like to train simultaneously, repeat this process for the remaining features so Fathom is running in a separate browser window for each feature and its corresponding rules. 8. Click the FathomFox browserAction and select "Train". 9. Select the feature from the dropdown list and click the "Train against the tabs in this window" button. 10. The array of coefficients displayed on the training page will update over time as Fathom optimizes them; this could take a while. 
--- src/fathom_coefficients.json | 14 +-- src/fathom_extraction.js | 168 ++++------------------------ src/fathom_ruleset.js | 205 +++++++++++++++++++++++++++++++++++ 3 files changed, 232 insertions(+), 155 deletions(-) create mode 100644 src/fathom_ruleset.js diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index ceec556..d54851e 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,9 +1,9 @@ { - "largerImage": 3, - "largerFontSize": 1, - "hasDollarSign": 3, - "hasTitleInID": 10, - "hasTitleInClassName": 5, - "isHidden": -100, - "isHeaderElement": 10 + "largerImageCoeff": 3, + "largerFontSizeCoeff": 1, + "hasDollarSignCoeff": 3, + "hasTitleInIDCoeff": 10, + "hasTitleInClassNameCoeff": 5, + "isHiddenCoeff": -100, + "isHeaderElementCoeff": 10 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 19413d0..65790d2 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -10,156 +10,19 @@ * Features: title, image, price */ -import {dom, out, rule, ruleset, score, type} from 'fathom-web'; -import fathomCoeffs from 'commerce/fathom_coefficients.json'; +import productRuleset from 'commerce/fathom_ruleset'; +import { + largerImageCoeff, + largerFontSizeCoeff, + hasDollarSignCoeff, + hasTitleInIDCoeff, + hasTitleInClassNameCoeff, + isHiddenCoeff, + isHeaderElementCoeff, +} from 'commerce/fathom_coefficients.json'; const PRODUCT_FEATURES = ['title', 'price', 'image']; const SCORE_THRESHOLD = 4; -const DEFAULT_BODY_FONT_SIZE = 14; -const DEFAULT_SCORE = 1; -const VIEWPORT_HEIGHT = window.innerHeight; - -/** - * Returns true if the fnode is above the fold - */ -function isAboveTheFold(fnode) { - const domRect = fnode.element.getBoundingClientRect(); - if (domRect.top <= VIEWPORT_HEIGHT) { - return true; - } - return false; -} - -/** - * Scores fnode in direct proportion to its size - */ -function largerImage(fnode) { - const domRect = fnode.element.getBoundingClientRect(); - const area = 
(domRect.width) * (domRect.height); - if (area === 0) { - return DEFAULT_SCORE; - } - return area * fathomCoeffs.largerImage; -} - -/** - * Scores fnode with a '$' in its innerText - */ -function hasDollarSign(fnode) { - if (fnode.element.innerText.includes('$')) { - return fathomCoeffs.hasDollarSign; - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode in direct proportion to its font size - */ -function largerFontSize(fnode) { - const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; - const size = sizeWithUnits.replace('px', ''); - if (size) { - // normalize the multiplier by the default font size - const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; - return (sizeMultiplier * fathomCoeffs.largerFontSize); - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode with "title" in its id - */ -function hasTitleInID(fnode) { - const id = fnode.element.id; - if (id.includes('title') || id.includes('Title')) { - return fathomCoeffs.hasTitleInID; - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode with "title" in a class name - */ -function hasTitleInClassName(fnode) { - const className = fnode.element.className; - if (className.includes('title') || className.includes('Title')) { - return fathomCoeffs.hasTitleInClassName; - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode that is hidden - */ -function isHidden(fnode) { - const element = fnode.element; - const style = window.getComputedStyle(element); - if (!element.offsetParent // null if the offsetParent has a display set to "none" - || style.visibility === 'hidden' - || style.opacity === '0' - || style.width === '0' - || style.height === '0') { - return fathomCoeffs.isHidden; - } - return DEFAULT_SCORE; -} - -/** - * Scores fnode that is an H1 element - */ -function isHeaderElement(fnode) { - if (fnode.element.tagName === 'H1') { - return fathomCoeffs.isHeaderElement; - } - return DEFAULT_SCORE; -} - -/** - * Ruleset for product features; each feature has its own type. 
- */ -const rules = ruleset( - /** - * Image rules - */ - // consider all img elements near the top of the page - rule(dom('img').when(isAboveTheFold), type('imageish')), - // better score for larger images - rule(type('imageish'), score(largerImage)), - // return image element with max score - rule(type('imageish').max(), out('image')), - - /** - * Title rules - */ - // consider all h1 and span elements near the top of the page - rule(dom('h1, span').when(isAboveTheFold), type('titleish')), - // score higher for h1 elements - rule(type('titleish'), score(isHeaderElement)), - // check if the id has "title" in it - rule(type('titleish'), score(hasTitleInID)), - // check if any class names have "title" in them - rule(type('titleish'), score(hasTitleInClassName)), - // better score for larger font size - rule(type('titleish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('titleish'), score(isHidden)), - // return title element with max score - rule(type('titleish').max(), out('title')), - - /** - * Price rules - */ - // consider all span and h2 elements near the top of the page - rule(dom('span, h2').when(isAboveTheFold), type('priceish')), - // check if the element has a '$' in its innerText - rule(type('priceish'), score(hasDollarSign)), - // better score for larger font size - rule(type('priceish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('priceish'), score(isHidden)), - // return price element with max score - rule(type('priceish').max(), out('price')), -); /** * Extracts the highest scoring element above a score threshold @@ -167,8 +30,17 @@ const rules = ruleset( */ function runRuleset(doc) { const extractedElements = {}; + const rules = productRuleset.get('product').rulesetMaker; for (const feature of PRODUCT_FEATURES) { - let fnodesList = rules.against(doc).get(`${feature}`); + let fnodesList = rules([ + largerImageCoeff, + largerFontSizeCoeff, + hasDollarSignCoeff, + hasTitleInIDCoeff, + 
hasTitleInClassNameCoeff, + isHiddenCoeff, + isHeaderElementCoeff, + ]).against(doc).get(`${feature}`); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. if (fnodesList.length >= 1) { diff --git a/src/fathom_ruleset.js b/src/fathom_ruleset.js new file mode 100644 index 0000000..39ef04e --- /dev/null +++ b/src/fathom_ruleset.js @@ -0,0 +1,205 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import {dom, out, rule, ruleset, score, type} from 'fathom-web'; + +const DEFAULT_BODY_FONT_SIZE = 14; +const DEFAULT_SCORE = 1; +const VIEWPORT_HEIGHT = window.innerHeight; + +/** + * Rulesets to train. + * + * Drop this file into the fathom-trainees/src folder (replacing the default file) + * to train Fathom against this ruleset. + * + * More mechanically, a map of names to {coeffs, rulesetMaker} objects. + * rulesetMaker is a function that takes an Array of coefficients and returns a + * ruleset that uses them. coeffs is typically the best-yet-found coefficients + * for a ruleset but can also be some more widely flung ones that you want to + * start the trainer from. The rulesets you specify here show up in the Train + * UI, from which you can kick off a training run. + * + * Fathom notes: + * - The FathomFox Trainer assumes that the value of your corpus' `data-fathom` + * attribute is the same as the `out`-ed string. Example: An element tagged with + * `data-fathom="image"` will map to `rule(..., out("image"))`. + * - I would not recommend using the Corpus Collector to build up a training set, + * because you can only batch freeze original pages, meaning tagged pages would be + * re-freezed, and there are non-obvious side effects in the diff (an issue with + * the freeze-dried library Fathom uses). 
+ */ + +const trainees = new Map(); + +trainees.set( + /** + * A ruleset that finds the main product title, image and price on a product page. + * IMPORTANT: Currently, the Trainer assumes that the name of the ruleset and the + * out-rule of interest are the same. A multi-out ruleset will not work without + * commenting out all but one `out` and setting the ruleset name to that `out`. + */ + 'product', // 'product' for production and 'title', 'image' or 'price' for training + { + coeffs: [3, 1, 3, 10, 5, -100, 10], // Input rule coefficients in order here + rulesetMaker([ + coeffLargerImage, + coeffLargerFontSize, + coeffHasDollarSign, + coeffHasTitleInID, + coeffHasTitleInClassName, + coeffIsHidden, + coeffIsHeaderElement, + ]) { + /** + * Scores fnode in direct proportion to its size + */ + function largerImage(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + const area = (domRect.width) * (domRect.height); + if (area === 0) { + return DEFAULT_SCORE; + } + return area * coeffLargerImage; + } + + /** + * Scores fnode in direct proportion to its font size + */ + function largerFontSize(fnode) { + const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; + const size = sizeWithUnits.replace('px', ''); + if (size) { + // normalize the multiplier by the default font size + const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; + return (sizeMultiplier * coeffLargerFontSize); + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with a '$' in its innerText + */ + function hasDollarSign(fnode) { + if (fnode.element.innerText.includes('$')) { + return coeffHasDollarSign; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with "title" in its id + */ + function hasTitleInID(fnode) { + const id = fnode.element.id; + if (id.includes('title') || id.includes('Title')) { + return coeffHasTitleInID; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with "title" in a class name + */ + function hasTitleInClassName(fnode) 
{ + const className = fnode.element.className; + if (className.includes('title') || className.includes('Title')) { + return coeffHasTitleInClassName; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode that is hidden + */ + function isHidden(fnode) { + const element = fnode.element; + const style = window.getComputedStyle(element); + if (!element.offsetParent // null if the offsetParent has a display set to "none" + || style.visibility === 'hidden' + || style.opacity === '0' + || style.width === '0' + || style.height === '0') { + return coeffIsHidden; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode that is an H1 element + */ + function isHeaderElement(fnode) { + if (fnode.element.tagName === 'H1') { + return coeffIsHeaderElement; + } + return DEFAULT_SCORE; + } + + /** + * Returns true if the fnode is above the fold + */ + function isAboveTheFold(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + if (domRect.top <= VIEWPORT_HEIGHT) { + return true; + } + return false; + } + + /* The actual ruleset */ + const rules = ruleset( + /** + * Image rules + * + * If training, comment out unless training 'image'. + */ + // consider all img elements near the top of the page + rule(dom('img').when(isAboveTheFold), type('imageish')), + // better score for larger images + rule(type('imageish'), score(largerImage)), + // return image element with max score + rule(type('imageish').max(), out('image')), + + /** + * Title rules + * + * If training, comment out unless training 'title'. 
+ */ + // consider all h1 and span elements near the top of the page + rule(dom('h1, span').when(isAboveTheFold), type('titleish')), + // score higher for h1 elements + rule(type('titleish'), score(isHeaderElement)), + // check if the id has "title" in it + rule(type('titleish'), score(hasTitleInID)), + // check if any class names have "title" in them + rule(type('titleish'), score(hasTitleInClassName)), + // better score for larger font size + rule(type('titleish'), score(largerFontSize)), + // reduce score if element is hidden + rule(type('titleish'), score(isHidden)), + // return title element with max score + rule(type('titleish').max(), out('title')), + + /** + * Price rules + * + * If training, comment out unless training 'price'. + */ + // consider all span and h2 elements near the top of the page + rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // check if the element has a '$' in its innerText + rule(type('priceish'), score(hasDollarSign)), + // better score for larger font size + rule(type('priceish'), score(largerFontSize)), + // reduce score if element is hidden + rule(type('priceish'), score(isHidden)), + // return price element with max score + rule(type('priceish').max(), out('price')), + ); + return rules; + }, + }, +); + +export default trainees; From 444c9baead62501a392f84ac726c3e85ce98e10c Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Thu, 9 Aug 2018 22:16:32 -0700 Subject: [PATCH 4/9] #36: Update rules for better accuracy. These rules and coefficients yield the following accuracy based on a training corpus of 50 product pages from our top 5 sites (Amazon, Ebay, Walmart, Best Buy and Home Depot): * 100% for product 'image' * 96% for product 'title' * 94% for product 'price' Product 'price' and 'title' features have proximity rules based on the highest scoring product 'image' element. 
For now, this is done by accessing the image fnode using an internal '_ruleset' object; @erikrose is working on better support for this use case in the very near future, so this implementation can be improved at that time. --- .gitignore | 1 + src/fathom_coefficients.json | 16 ++- src/fathom_extraction.js | 20 +-- src/fathom_ruleset.js | 268 +++++++++++++++++++++++++++-------- 4 files changed, 227 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index a26de97..11393fe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ node_modules web-ext-artifacts build gecko.log +.DS_Store diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index d54851e..a90f91a 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,9 +1,11 @@ { - "largerImageCoeff": 3, - "largerFontSizeCoeff": 1, - "hasDollarSignCoeff": 3, - "hasTitleInIDCoeff": 10, - "hasTitleInClassNameCoeff": 5, - "isHiddenCoeff": -100, - "isHeaderElementCoeff": 10 + "largerImageCoeff": 2, + "largerFontSizeCoeff": 7, + "hasDollarSignCoeff": 8, + "hasPriceInIDCoeff": 17, + "hasPriceInClassNameCoeff": 2, + "isAboveTheFoldPriceCoeff": 33, + "isAboveTheFoldImageCoeff": 13, + "isNearbyImageXAxisCoeff": 5, + "hasPriceishPatternCoeff": 15 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 65790d2..b804b44 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -15,10 +15,12 @@ import { largerImageCoeff, largerFontSizeCoeff, hasDollarSignCoeff, - hasTitleInIDCoeff, - hasTitleInClassNameCoeff, - isHiddenCoeff, - isHeaderElementCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, } from 'commerce/fathom_coefficients.json'; const PRODUCT_FEATURES = ['title', 'price', 'image']; @@ -36,10 +38,12 @@ function runRuleset(doc) { largerImageCoeff, largerFontSizeCoeff, hasDollarSignCoeff, - hasTitleInIDCoeff, - 
hasTitleInClassNameCoeff, - isHiddenCoeff, - isHeaderElementCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, ]).against(doc).get(`${feature}`); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. diff --git a/src/fathom_ruleset.js b/src/fathom_ruleset.js index 39ef04e..bfcd0f9 100644 --- a/src/fathom_ruleset.js +++ b/src/fathom_ruleset.js @@ -3,10 +3,17 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; +import {ancestors} from 'fathom-web/utils'; // for training: utilsForFrontend const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; +const SCORE_THRESHOLD = 4; +const TOP_BUFFER = 150; const VIEWPORT_HEIGHT = window.innerHeight; +const VIEWPORT_WIDTH = window.innerWidth; +// Taken from: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js +const ZEROISH = 0.08; +const ONEISH = 0.9; /** * Rulesets to train. @@ -40,17 +47,20 @@ trainees.set( * out-rule of interest are the same. A multi-out ruleset will not work without * commenting out all but one `out` and setting the ruleset name to that `out`. 
*/ - 'product', // 'product' for production and 'title', 'image' or 'price' for training + 'product', // Ruleset name: 'product' for production and 'title', 'image' or 'price' for training { - coeffs: [3, 1, 3, 10, 5, -100, 10], // Input rule coefficients in order here + // For training only: input rule coefficients in order here + coeffs: [2, 7, 8, 17, 2, 33, 13, 5, 15], rulesetMaker([ - coeffLargerImage, - coeffLargerFontSize, - coeffHasDollarSign, - coeffHasTitleInID, - coeffHasTitleInClassName, - coeffIsHidden, - coeffIsHeaderElement, + largerImageCoeff, + largerFontSizeCoeff, + hasDollarSignCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, ]) { /** * Scores fnode in direct proportion to its size @@ -61,11 +71,11 @@ trainees.set( if (area === 0) { return DEFAULT_SCORE; } - return area * coeffLargerImage; + return area * largerImageCoeff; } /** - * Scores fnode in direct proportion to its font size + * Scores fnode in proportion to its font size */ function largerFontSize(fnode) { const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; @@ -73,7 +83,7 @@ trainees.set( if (size) { // normalize the multiplier by the default font size const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; - return (sizeMultiplier * coeffLargerFontSize); + return (sizeMultiplier * largerFontSizeCoeff); } return DEFAULT_SCORE; } @@ -83,82 +93,214 @@ trainees.set( */ function hasDollarSign(fnode) { if (fnode.element.innerText.includes('$')) { - return coeffHasDollarSign; + return hasDollarSignCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode with "title" in its id + * Scores fnode with 'price' in its id or its parent's id */ - function hasTitleInID(fnode) { - const id = fnode.element.id; - if (id.includes('title') || id.includes('Title')) { - return coeffHasTitleInID; + function hasPriceInID(fnode) { + const element = fnode.element; + const 
parentElement = element.parentElement; + const ID = element.id; + const parentID = parentElement.id; + if (ID.includes('price') || ID.includes('Price')) { + return hasPriceInIDCoeff; + } + if (parentID.includes('price') || parentID.includes('Price')) { + return 0.75 * hasPriceInIDCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode with "title" in a class name + * Scores fnode with 'price' in its class name or its parent's class name */ - function hasTitleInClassName(fnode) { - const className = fnode.element.className; - if (className.includes('title') || className.includes('Title')) { - return coeffHasTitleInClassName; + function hasPriceInClassName(fnode) { + const element = fnode.element; + const parentElement = element.parentElement; + const className = element.className; + const parentClassName = parentElement.className; + if (className.includes('price') || className.includes('Price')) { + return hasPriceInClassNameCoeff; + } + if (parentClassName.includes('price') || parentClassName.includes('Price')) { + return 0.75 * hasPriceInClassNameCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode that is hidden + * Checks if fnode is visible + */ + function isVisible(fnode) { + const element = fnode.element; + for (const ancestor of ancestors(element)) { + const style = getComputedStyle(ancestor); + if (style.visibility === 'hidden' + || style.display === 'none' + || style.opacity === '0' + || style.width === '0' + || style.height === '0') { + return false; + } + } + return true; + } + + /** + * Scale a number to the range [ZEROISH, ONEISH]. + * + * Taken from: https://github.com/mozilla/fathom-trainees + * + * For a rising trapezoid, the result is ZEROISH until the input + * reaches zeroAt, then increases linearly until oneAt, at which it + * becomes ONEISH. To make a falling trapezoid, where the result is + * ONEISH to the left and ZEROISH to the right, use a zeroAt greater + * than oneAt. 
+ */ + function trapezoid(number, zeroAt, oneAt) { + const isRising = zeroAt < oneAt; + if (isRising) { + if (number <= zeroAt) { + return ZEROISH; + } + if (number >= oneAt) { + return ONEISH; + } + } else { + if (number >= zeroAt) { + return ZEROISH; + } + if (number <= oneAt) { + return ONEISH; + } + } + const slope = (ONEISH - ZEROISH) / (oneAt - zeroAt); + return slope * (number - zeroAt) + ZEROISH; + } + + /** + * Scores fnode by its vertical location relative to the fold + */ + function isAboveTheFold(fnode, featureCoeff) { + const domRect = fnode.element.getBoundingClientRect(); + // Use a falling trapezoid to score the element; + // result is ONEISH until the input reaches VIEWPORT_HEIGHT, then decreases + // linearly until VIEWPORT_HEIGHT * 2, where it becomes ZEROISH. + return trapezoid(domRect.top, VIEWPORT_HEIGHT * 2, VIEWPORT_HEIGHT) * featureCoeff; + } + + /** + * Checks to see if fnode is eligible for scoring + * Note: This is a compound method, because `.when` chaining these methods onto + * a `dom` rule does not currently work. + */ + function isEligible(fnode, featureType) { + if (featureType === 'priceish') { + return ( + isVisible(fnode) + && removeRedundantAncestors(fnode) + && isNearbyImageYAxis(fnode) + ); + } + if (featureType === 'titleish') { + return ( + isVisible(fnode) + /** + * Don't removeRedundantAncestors, because

tags for + * Amazon and Walmart have and
element children, + * respectively, with the same innerText. + */ + && isNearbyImageYAxis(fnode) + ); + } + return false; + } + + /** + * Checks if fnode has the same innerText as any of its children */ - function isHidden(fnode) { + function removeRedundantAncestors(fnode) { const element = fnode.element; - const style = window.getComputedStyle(element); - if (!element.offsetParent // null if the offsetParent has a display set to "none" - || style.visibility === 'hidden' - || style.opacity === '0' - || style.width === '0' - || style.height === '0') { - return coeffIsHidden; + const children = element.children; + if (children.length > 0) { + for (const descendant of children) { + if (descendant.innerText === element.innerText) { + return false; + } + } + } + return true; + } + + /** + * Scores fnode based on its x distance from the highest scoring image element + */ + function isNearbyImageXAxis(fnode) { + const element = fnode.element; + const eleDOMRect = element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + const deltaX = eleDOMRect.left - imageDOMRect.right; + // priceish element is always* to the right of the image + if (deltaX > 0) { + // give a higher score the closer it is to the image, normalized by VIEWPORT_WIDTH + return (VIEWPORT_WIDTH / deltaX) * isNearbyImageXAxisCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode that is an H1 element + * Scores fnode whose innerText matches a priceish RegExp pattern */ - function isHeaderElement(fnode) { - if (fnode.element.tagName === 'H1') { - return coeffIsHeaderElement; + function hasPriceishPattern(fnode) { + const text = fnode.element.innerText; + /** + * With an optional '$' that doesn't necessarily have to be at the beginning + * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before + * a decimal point and exactly two after, where the two 
digits after the decimal point + * are at the end of the string + */ + const regExp = /\${0,1}\d+\.\d{2}$/; + if (regExp.test(text)) { + return hasPriceishPatternCoeff; } return DEFAULT_SCORE; } /** - * Returns true if the fnode is above the fold + * Checks if fnode is nearby the top scoring image element in the y-axis */ - function isAboveTheFold(fnode) { - const domRect = fnode.element.getBoundingClientRect(); - if (domRect.top <= VIEWPORT_HEIGHT) { + function isNearbyImageYAxis(fnode) { + const element = fnode.element; + const DOMRect = element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + if (DOMRect.top >= (imageDOMRect.top - TOP_BUFFER) + && DOMRect.bottom <= imageDOMRect.bottom) { return true; } return false; } - /* The actual ruleset */ + /* The ruleset */ const rules = ruleset( /** * Image rules * - * If training, comment out unless training 'image'. + * If training, leave uncommented, as 'price' and 'title' rules depend + * on the `out` of these 'image' rules. */ - // consider all img elements near the top of the page - rule(dom('img').when(isAboveTheFold), type('imageish')), + // consider all visible img elements + rule(dom('img').when(isVisible), type('imageish')), + // better score the closer the element is to the top of the page + rule(type('imageish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldImageCoeff))), // better score for larger images rule(type('imageish'), score(largerImage)), - // return image element with max score + // return image element(s) with max score rule(type('imageish').max(), out('image')), /** @@ -166,19 +308,11 @@ trainees.set( * * If training, comment out unless training 'title'. 
*/ - // consider all h1 and span elements near the top of the page - rule(dom('h1, span').when(isAboveTheFold), type('titleish')), - // score higher for h1 elements - rule(type('titleish'), score(isHeaderElement)), - // check if the id has "title" in it - rule(type('titleish'), score(hasTitleInID)), - // check if any class names have "title" in them - rule(type('titleish'), score(hasTitleInClassName)), - // better score for larger font size - rule(type('titleish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('titleish'), score(isHidden)), - // return title element with max score + // consider all eligible h1 elements + rule(dom('h1').when(fnode => isEligible(fnode, 'titleish')), type('titleish')), + // since no further rules are needed for title, give all inputs the minimum score + rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // return title element(s) with max score rule(type('titleish').max(), out('title')), /** @@ -186,15 +320,23 @@ trainees.set( * * If training, comment out unless training 'price'. 
*/ - // consider all span and h2 elements near the top of the page - rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // consider all eligible span and h2 elements + rule(dom('span, h2').when(fnode => isEligible(fnode, 'priceish')), type('priceish')), // check if the element has a '$' in its innerText rule(type('priceish'), score(hasDollarSign)), + // better score the closer the element is to the top of the page + rule(type('priceish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldPriceCoeff))), + // check if the id has "price" in it + rule(type('priceish'), score(hasPriceInID)), + // check if any class names have "price" in them + rule(type('priceish'), score(hasPriceInClassName)), // better score for larger font size rule(type('priceish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('priceish'), score(isHidden)), - // return price element with max score + // check for x-axis proximity to max scoring image element + rule(type('priceish'), score(isNearbyImageXAxis)), + // check if innerText has a priceish pattern + rule(type('priceish'), score(hasPriceishPattern)), + // return price element(s) with max score rule(type('priceish').max(), out('price')), ); return rules; From 8e0de88bb8b0bd883e8228ed149a58931f1245b7 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Fri, 17 Aug 2018 11:31:16 -0700 Subject: [PATCH 5/9] #36: Incorporate feedback from Osmose and erikrose. A follow-up commit will address [this comment](https://github.com/mozilla/webext-commerce/pull/45/files#r210363008) and [this comment](https://github.com/mozilla/webext-commerce/pull/45/files#r210361004). 
--- src/config.js | 3 + src/fathom_coefficients.json | 3 +- src/fathom_extraction.js | 40 ++++--- src/fathom_ruleset.js | 226 ++++++++++++++++++----------------- 4 files changed, 141 insertions(+), 131 deletions(-) diff --git a/src/config.js b/src/config.js index 0baa43f..3f9c3be 100644 --- a/src/config.js +++ b/src/config.js @@ -16,3 +16,6 @@ export const PRICE_CHECK_TIMEOUT_INTERVAL = 1000 * 60 * 15; // 15 minutes /** Delay before removing iframes created during price checks */ export const IFRAME_TIMEOUT = 1000 * 60; // 1 minute + +// Minimum score to be considered the "correct" feature element extracted by Fathom +export const SCORE_THRESHOLD = 4; diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index a90f91a..53d5d07 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -6,6 +6,7 @@ "hasPriceInClassNameCoeff": 2, "isAboveTheFoldPriceCoeff": 33, "isAboveTheFoldImageCoeff": 13, - "isNearbyImageXAxisCoeff": 5, + "isNearbyImageXAxisPriceCoeff": 5, + "isNearbyImageYAxisTitleCoeff": 5, "hasPriceishPatternCoeff": 15 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index b804b44..1af29b5 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -19,32 +19,36 @@ import { hasPriceInClassNameCoeff, isAboveTheFoldPriceCoeff, isAboveTheFoldImageCoeff, - isNearbyImageXAxisCoeff, + isNearbyImageXAxisPriceCoeff, + isNearbyImageYAxisTitleCoeff, hasPriceishPatternCoeff, } from 'commerce/fathom_coefficients.json'; +import {SCORE_THRESHOLD} from 'commerce/config'; const PRODUCT_FEATURES = ['title', 'price', 'image']; -const SCORE_THRESHOLD = 4; +const {rulesetMaker} = productRuleset.get('product'); +const rulesetWithCoeffs = rulesetMaker([ + largerImageCoeff, + largerFontSizeCoeff, + hasDollarSignCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisPriceCoeff, + isNearbyImageYAxisTitleCoeff, + hasPriceishPatternCoeff, 
+]); /** * Extracts the highest scoring element above a score threshold * contained in a page's HTML document. */ function runRuleset(doc) { + const rulesetOutput = rulesetWithCoeffs.against(doc); const extractedElements = {}; - const rules = productRuleset.get('product').rulesetMaker; for (const feature of PRODUCT_FEATURES) { - let fnodesList = rules([ - largerImageCoeff, - largerFontSizeCoeff, - hasDollarSignCoeff, - hasPriceInIDCoeff, - hasPriceInClassNameCoeff, - isAboveTheFoldPriceCoeff, - isAboveTheFoldImageCoeff, - isNearbyImageXAxisCoeff, - hasPriceishPatternCoeff, - ]).against(doc).get(`${feature}`); + let fnodesList = rulesetOutput.get(feature); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. if (fnodesList.length >= 1) { @@ -69,10 +73,10 @@ export default function extractProduct(doc) { const extractedElements = runRuleset(doc); if (hasAllFeatures(extractedElements)) { for (const feature of PRODUCT_FEATURES) { - extractedProduct[feature] = (feature === 'image' - ? extractedElements[feature].src - : extractedElements[feature].innerText - ); + if (feature === 'image') { + extractedProduct[feature] = extractedElements[feature].src; + } + extractedProduct[feature] = extractedElements[feature].innerText; } } return hasAllFeatures(extractedProduct) ? extractedProduct : null; diff --git a/src/fathom_ruleset.js b/src/fathom_ruleset.js index bfcd0f9..f263abe 100644 --- a/src/fathom_ruleset.js +++ b/src/fathom_ruleset.js @@ -3,14 +3,14 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; -import {ancestors} from 'fathom-web/utils'; // for training: utilsForFrontend +// For training, replace 'utils' with 'utilsForFrontend'. 
The mozilla/fathom-trainees +// add-on currently imports Fathom as a submodule +import {ancestors} from 'fathom-web/utils'; +import {SCORE_THRESHOLD} from 'commerce/config'; const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; -const SCORE_THRESHOLD = 4; const TOP_BUFFER = 150; -const VIEWPORT_HEIGHT = window.innerHeight; -const VIEWPORT_WIDTH = window.innerWidth; // Taken from: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js const ZEROISH = 0.08; const ONEISH = 0.9; @@ -50,7 +50,7 @@ trainees.set( 'product', // Ruleset name: 'product' for production and 'title', 'image' or 'price' for training { // For training only: input rule coefficients in order here - coeffs: [2, 7, 8, 17, 2, 33, 13, 5, 15], + coeffs: [2, 7, 8, 17, 2, 33, 13, 5, 5, 15], rulesetMaker([ largerImageCoeff, largerFontSizeCoeff, @@ -59,7 +59,8 @@ trainees.set( hasPriceInClassNameCoeff, isAboveTheFoldPriceCoeff, isAboveTheFoldImageCoeff, - isNearbyImageXAxisCoeff, + isNearbyImageXAxisPriceCoeff, + isNearbyImageYAxisTitleCoeff, hasPriceishPatternCoeff, ]) { /** @@ -78,14 +79,10 @@ trainees.set( * Scores fnode in proportion to its font size */ function largerFontSize(fnode) { - const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; - const size = sizeWithUnits.replace('px', ''); - if (size) { - // normalize the multiplier by the default font size - const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; - return (sizeMultiplier * largerFontSizeCoeff); - } - return DEFAULT_SCORE; + const size = window.getComputedStyle(fnode.element).fontSize; + // normalize the multiplier by the default font size + const sizeMultiplier = parseFloat(size, 10) / DEFAULT_BODY_FONT_SIZE; + return sizeMultiplier * largerFontSizeCoeff; } /** @@ -102,14 +99,12 @@ trainees.set( * Scores fnode with 'price' in its id or its parent's id */ function hasPriceInID(fnode) { - const element = fnode.element; - const parentElement = element.parentElement; - const ID = element.id; 
- const parentID = parentElement.id; - if (ID.includes('price') || ID.includes('Price')) { + const id = fnode.element.id; + const parentID = fnode.element.parentElement.id; + if (id.toLowerCase().includes('price')) { return hasPriceInIDCoeff; } - if (parentID.includes('price') || parentID.includes('Price')) { + if (parentID.toLowerCase().includes('price')) { return 0.75 * hasPriceInIDCoeff; } return DEFAULT_SCORE; @@ -119,31 +114,28 @@ trainees.set( * Scores fnode with 'price' in its class name or its parent's class name */ function hasPriceInClassName(fnode) { - const element = fnode.element; - const parentElement = element.parentElement; - const className = element.className; - const parentClassName = parentElement.className; - if (className.includes('price') || className.includes('Price')) { + const className = fnode.element.className; + const parentClassName = fnode.element.parentElement.className; + if (className.toLowerCase().includes('price')) { return hasPriceInClassNameCoeff; } - if (parentClassName.includes('price') || parentClassName.includes('Price')) { + if (parentClassName.toLowerCase().includes('price')) { return 0.75 * hasPriceInClassNameCoeff; } return DEFAULT_SCORE; } - /** - * Checks if fnode is visible - */ function isVisible(fnode) { - const element = fnode.element; - for (const ancestor of ancestors(element)) { + for (const ancestor of ancestors(fnode.element)) { const style = getComputedStyle(ancestor); - if (style.visibility === 'hidden' + const isElementHidden = ( + style.visibility === 'hidden' || style.display === 'none' || style.opacity === '0' || style.width === '0' - || style.height === '0') { + || style.height === '0' + ); + if (isElementHidden) { return false; } } @@ -151,79 +143,66 @@ trainees.set( } /** - * Scale a number to the range [ZEROISH, ONEISH]. 
- * - * Taken from: https://github.com/mozilla/fathom-trainees - * - * For a rising trapezoid, the result is ZEROISH until the input - * reaches zeroAt, then increases linearly until oneAt, at which it - * becomes ONEISH. To make a falling trapezoid, where the result is - * ONEISH to the left and ZEROISH to the right, use a zeroAt greater - * than oneAt. + * Scores fnode by its vertical location relative to the fold */ - function trapezoid(number, zeroAt, oneAt) { - const isRising = zeroAt < oneAt; - if (isRising) { - if (number <= zeroAt) { - return ZEROISH; - } - if (number >= oneAt) { - return ONEISH; - } - } else { - if (number >= zeroAt) { - return ZEROISH; - } - if (number <= oneAt) { - return ONEISH; - } + function isAboveTheFold(fnode, featureCoeff) { + const viewportHeight = window.innerHeight; + const top = fnode.element.getBoundingClientRect().top; + const upperHeightLimit = viewportHeight * 2; + // Use a falling trapezoid function to score the element + // Taken from: https://github.com/mozilla/fathom-trainees + if (top >= upperHeightLimit) { + return ZEROISH * featureCoeff; + } + if (top <= viewportHeight) { + return ONEISH * featureCoeff; } - const slope = (ONEISH - ZEROISH) / (oneAt - zeroAt); - return slope * (number - zeroAt) + ZEROISH; + // slope = deltaY / deltaX + const slope = (ONEISH - ZEROISH) / (viewportHeight - upperHeightLimit); + // y = mx + b, where m = slope and b = y-intercept + return (slope * (top - upperHeightLimit) + ZEROISH) * featureCoeff; } /** - * Scores fnode by its vertical location relative to the fold + * Checks to see if a 'priceish' fnode is eligible for scoring + * Note: This is a compound method, because `.when` chaining these methods + * onto a `dom` rule does not currently work. i.e. 
+ * `rule(dom('span, h2') + * .when(isVisible) + * .when(hasDifferentInnerTextThanChildren) + * .when(isNearbyImageYAxisPrice)), + * type('priceish')),` + * ...is replaced with: + * `rule(dom('span, h2').when(isEligiblePrice), type('priceish')),` */ - function isAboveTheFold(fnode, featureCoeff) { - const domRect = fnode.element.getBoundingClientRect(); - // Use a falling trapezoid to score the element; - // result is ONEISH until the input reaches VIEWPORT_HEIGHT, then decreases - // linearly until VIEWPORT_HEIGHT * 2, where it becomes ZEROISH. - return trapezoid(domRect.top, VIEWPORT_HEIGHT * 2, VIEWPORT_HEIGHT) * featureCoeff; + function isEligiblePrice(fnode) { + return ( + isVisible(fnode) + && hasDifferentInnerTextThanChildren(fnode) + && isNearbyImageYAxisPrice(fnode) + ); } /** - * Checks to see if fnode is eligible for scoring - * Note: This is a compound method, because `.when` chaining these methods onto - * a `dom` rule does not currently work. + * Checks to see if a 'titleish' fnode is eligible for scoring */ - function isEligible(fnode, featureType) { - if (featureType === 'priceish') { - return ( - isVisible(fnode) - && removeRedundantAncestors(fnode) - && isNearbyImageYAxis(fnode) - ); - } - if (featureType === 'titleish') { - return ( - isVisible(fnode) - /** - * Don't removeRedundantAncestors, because

tags for - * Amazon and Walmart have and
element children, - * respectively, with the same innerText. - */ - && isNearbyImageYAxis(fnode) - ); - } - return false; + function isEligibleTitle(fnode) { + return ( + isVisible(fnode) + // Don't use hasDifferentInnerTextThanChildren, because

tags + // for Amazon and Walmart have and
element children, + // respectively, with the same innerText. + // + // Don't use isNearbyImageYAxisTitle here, as unlike for price, there + // is a strong correlation for vertical proximity to image, so we want + // to score it proportionally rather than have a hard cut-off. + ); } /** - * Checks if fnode has the same innerText as any of its children + * Checks if fnode has different innerText compared to any of its children */ - function removeRedundantAncestors(fnode) { + function hasDifferentInnerTextThanChildren(fnode) { const element = fnode.element; const children = element.children; if (children.length > 0) { @@ -239,16 +218,24 @@ trainees.set( /** * Scores fnode based on its x distance from the highest scoring image element */ - function isNearbyImageXAxis(fnode) { - const element = fnode.element; - const eleDOMRect = element.getBoundingClientRect(); + function isNearbyImageXAxisPrice(fnode) { + const viewportWidth = window.innerWidth; + const eleDOMRect = fnode.element.getBoundingClientRect(); const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle const imageDOMRect = imageElement.getBoundingClientRect(); - const deltaX = eleDOMRect.left - imageDOMRect.right; - // priceish element is always* to the right of the image - if (deltaX > 0) { - // give a higher score the closer it is to the image, normalized by VIEWPORT_WIDTH - return (VIEWPORT_WIDTH / deltaX) * isNearbyImageXAxisCoeff; + const deltaRight = eleDOMRect.left - imageDOMRect.right; + const deltaLeft = imageDOMRect.left - eleDOMRect.right; + // True if element is completely to the right or left of the image element + const noOverlap = (deltaRight > 0 || deltaLeft > 0); + let deltaX; + if (noOverlap) { + if (deltaRight > 0) { + deltaX = deltaRight; + } else { + deltaX = deltaLeft; + } + // give a higher score the closer it is to the image, normalized by viewportWidth + return (viewportWidth / deltaX) * isNearbyImageXAxisPriceCoeff; } return 
DEFAULT_SCORE; } @@ -271,10 +258,30 @@ trainees.set( return DEFAULT_SCORE; } + /** + * Scores fnode based on its y distance from the highest scoring image element + */ + function isNearbyImageYAxisTitle(fnode) { + const viewportHeight = window.innerHeight; + const DOMRect = fnode.element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + // Some titles (like on Ebay) are above the image, so include a top buffer + const isEleTopNearby = DOMRect.top >= (imageDOMRect.top - TOP_BUFFER); + const isEleBottomNearby = DOMRect.bottom <= imageDOMRect.bottom; + // Give elements in a specific vertical band a higher score + if (isEleTopNearby && isEleBottomNearby) { + const deltaY = Math.abs(imageDOMRect.top - DOMRect.top); + // give a higher score the closer it is to the image, normalized by viewportHeight + return (viewportHeight / deltaY) * isNearbyImageYAxisTitleCoeff; + } + return DEFAULT_SCORE; + } + /** * Checks if fnode is nearby the top scoring image element in the y-axis */ - function isNearbyImageYAxis(fnode) { + function isNearbyImageYAxisPrice(fnode) { const element = fnode.element; const DOMRect = element.getBoundingClientRect(); const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle @@ -290,9 +297,6 @@ trainees.set( const rules = ruleset( /** * Image rules - * - * If training, leave uncommented, as 'price' and 'title' rules depend - * on the `out` of these 'image' rules. */ // consider all visible img elements rule(dom('img').when(isVisible), type('imageish')), @@ -305,11 +309,11 @@ trainees.set( /** * Title rules - * - * If training, comment out unless training 'title'. 
*/ // consider all eligible h1 elements - rule(dom('h1').when(fnode => isEligible(fnode, 'titleish')), type('titleish')), + rule(dom('h1').when(isEligibleTitle), type('titleish')), + // better score based on y-axis proximity to max scoring image element + rule(type('titleish'), score(isNearbyImageYAxisTitle)), // since no further rules are needed for title, give all inputs the minimum score rule(type('titleish'), score(() => SCORE_THRESHOLD)), // return title element(s) with max score @@ -317,11 +321,9 @@ trainees.set( /** * Price rules - * - * If training, comment out unless training 'price'. */ // consider all eligible span and h2 elements - rule(dom('span, h2').when(fnode => isEligible(fnode, 'priceish')), type('priceish')), + rule(dom('span, h2').when(isEligiblePrice), type('priceish')), // check if the element has a '$' in its innerText rule(type('priceish'), score(hasDollarSign)), // better score the closer the element is to the top of the page @@ -332,8 +334,8 @@ trainees.set( rule(type('priceish'), score(hasPriceInClassName)), // better score for larger font size rule(type('priceish'), score(largerFontSize)), - // check for x-axis proximity to max scoring image element - rule(type('priceish'), score(isNearbyImageXAxis)), + // better score based on x-axis proximity to max scoring image element + rule(type('priceish'), score(isNearbyImageXAxisPrice)), // check if innerText has a priceish pattern rule(type('priceish'), score(hasPriceishPattern)), // return price element(s) with max score From 7567ca3222f575245b7c1f5b534b43eb75eb7500 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Sat, 18 Aug 2018 20:35:30 -0700 Subject: [PATCH 6/9] #36: Break out fathom_ruleset.js into two separate scripts. The first script, 'ruleset_factory.js', exports a class to create a ruleset based on a set of coefficients; instances of this class are used in production (via 'fathom_extraction.js') and for Fathom training (via 'trainees.js'). 2. 
The second script, 'trainees.js', is used exclusively for training using the FathomFox web extension and does not ship with the commerce web extension. Additional changes and notes: * I chose not to make use of the 'autobind' decorator in 'ruleset_factory.js', since that file is also used in the training add-on, where devDeps like 'babel-core' and 'babel-plugin-transform-decorators-legacy' do not exist. * I also turned off an eslint rule that requires class methods to use 'this', since some methods in RulesetFactory don't require it, and it would be tedious and confusing to call some methods on the class instance and others on the class itself. * The new training script ('trainees.js') has three elements in the map it exports, one for each product feature ('image', 'title', 'price'). This allows us to select which feature to train from a dropdown menu on FathomFox's trainer page. * Currently, for training, four files must be copied over into the 'fathom-trainees' add-on src directory: * config.js * fathom_default_coefficients.json * ruleset_factory.js * trainees.js (overwriting the existing file) * In a separate commit, I will put all the Fathom extraction files into an 'extraction' (or similar) subfolder. 
--- .eslintrc.json | 1 + ....json => fathom_default_coefficients.json} | 0 src/fathom_extraction.js | 37 +- src/fathom_ruleset.js | 349 ------------------ src/ruleset_factory.js | 316 ++++++++++++++++ src/trainees.js | 98 +++++ 6 files changed, 424 insertions(+), 377 deletions(-) rename src/{fathom_coefficients.json => fathom_default_coefficients.json} (100%) delete mode 100644 src/fathom_ruleset.js create mode 100644 src/ruleset_factory.js create mode 100644 src/trainees.js diff --git a/.eslintrc.json b/.eslintrc.json index 02081a6..7ff4655 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -14,6 +14,7 @@ "no-restricted-syntax": ["off"], "no-use-before-define": ["error", {"functions": false}], "no-prototype-builtins": ["off"], + "class-methods-use-this": ["off"], "react/jsx-one-expression-per-line": ["off"], "react/prefer-stateless-function": ["off"], diff --git a/src/fathom_coefficients.json b/src/fathom_default_coefficients.json similarity index 100% rename from src/fathom_coefficients.json rename to src/fathom_default_coefficients.json diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 1af29b5..263c366 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -10,45 +10,26 @@ * Features: title, image, price */ -import productRuleset from 'commerce/fathom_ruleset'; -import { - largerImageCoeff, - largerFontSizeCoeff, - hasDollarSignCoeff, - hasPriceInIDCoeff, - hasPriceInClassNameCoeff, - isAboveTheFoldPriceCoeff, - isAboveTheFoldImageCoeff, - isNearbyImageXAxisPriceCoeff, - isNearbyImageYAxisTitleCoeff, - hasPriceishPatternCoeff, -} from 'commerce/fathom_coefficients.json'; +import defaultCoefficients from 'commerce/fathom_default_coefficients.json'; +import RulesetFactory from 'commerce/ruleset_factory'; import {SCORE_THRESHOLD} from 'commerce/config'; const PRODUCT_FEATURES = ['title', 'price', 'image']; -const {rulesetMaker} = productRuleset.get('product'); -const rulesetWithCoeffs = rulesetMaker([ - largerImageCoeff, - 
largerFontSizeCoeff, - hasDollarSignCoeff, - hasPriceInIDCoeff, - hasPriceInClassNameCoeff, - isAboveTheFoldPriceCoeff, - isAboveTheFoldImageCoeff, - isNearbyImageXAxisPriceCoeff, - isNearbyImageYAxisTitleCoeff, - hasPriceishPatternCoeff, -]); +// Array of numbers corresponding to the coefficients +const coefficients = Object.values(defaultCoefficients); +// For production, we don't need to generate a new ruleset factory +// and ruleset every time we run Fathom, since the coefficients are static. +const rulesetFactory = new RulesetFactory(coefficients); +const rules = rulesetFactory.makeRuleset(); /** * Extracts the highest scoring element above a score threshold * contained in a page's HTML document. */ function runRuleset(doc) { - const rulesetOutput = rulesetWithCoeffs.against(doc); const extractedElements = {}; for (const feature of PRODUCT_FEATURES) { - let fnodesList = rulesetOutput.get(feature); + let fnodesList = rules.against(doc).get(feature); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. if (fnodesList.length >= 1) { diff --git a/src/fathom_ruleset.js b/src/fathom_ruleset.js deleted file mode 100644 index f263abe..0000000 --- a/src/fathom_ruleset.js +++ /dev/null @@ -1,349 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -import {dom, out, rule, ruleset, score, type} from 'fathom-web'; -// For training, replace 'utils' with 'utilsForFrontend'. 
The mozilla/fathom-trainees -// add-on currently imports Fathom as a submodule -import {ancestors} from 'fathom-web/utils'; -import {SCORE_THRESHOLD} from 'commerce/config'; - -const DEFAULT_BODY_FONT_SIZE = 14; -const DEFAULT_SCORE = 1; -const TOP_BUFFER = 150; -// Taken from: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js -const ZEROISH = 0.08; -const ONEISH = 0.9; - -/** - * Rulesets to train. - * - * Drop this file into the fathom-trainees/src folder (replacing the default file) - * to train Fathom against this ruleset. - * - * More mechanically, a map of names to {coeffs, rulesetMaker} objects. - * rulesetMaker is a function that takes an Array of coefficients and returns a - * ruleset that uses them. coeffs is typically the best-yet-found coefficients - * for a ruleset but can also be some more widely flung ones that you want to - * start the trainer from. The rulesets you specify here show up in the Train - * UI, from which you can kick off a training run. - * - * Fathom notes: - * - The FathomFox Trainer assumes that the value of your corpus' `data-fathom` - * attribute is the same as the `out`-ed string. Example: An element tagged with - * `data-fathom="image"` will map to `rule(..., out("image"))`. - * - I would not recommend using the Corpus Collector to build up a training set, - * because you can only batch freeze original pages, meaning tagged pages would be - * re-freezed, and there are non-obvious side effects in the diff (an issue with - * the freeze-dried library Fathom uses). - */ - -const trainees = new Map(); - -trainees.set( - /** - * A ruleset that finds the main product title, image and price on a product page. - * IMPORTANT: Currently, the Trainer assumes that the name of the ruleset and the - * out-rule of interest are the same. A multi-out ruleset will not work without - * commenting out all but one `out` and setting the ruleset name to that `out`. 
- */ - 'product', // Ruleset name: 'product' for production and 'title', 'image' or 'price' for training - { - // For training only: input rule coefficients in order here - coeffs: [2, 7, 8, 17, 2, 33, 13, 5, 5, 15], - rulesetMaker([ - largerImageCoeff, - largerFontSizeCoeff, - hasDollarSignCoeff, - hasPriceInIDCoeff, - hasPriceInClassNameCoeff, - isAboveTheFoldPriceCoeff, - isAboveTheFoldImageCoeff, - isNearbyImageXAxisPriceCoeff, - isNearbyImageYAxisTitleCoeff, - hasPriceishPatternCoeff, - ]) { - /** - * Scores fnode in direct proportion to its size - */ - function largerImage(fnode) { - const domRect = fnode.element.getBoundingClientRect(); - const area = (domRect.width) * (domRect.height); - if (area === 0) { - return DEFAULT_SCORE; - } - return area * largerImageCoeff; - } - - /** - * Scores fnode in proportion to its font size - */ - function largerFontSize(fnode) { - const size = window.getComputedStyle(fnode.element).fontSize; - // normalize the multiplier by the default font size - const sizeMultiplier = parseFloat(size, 10) / DEFAULT_BODY_FONT_SIZE; - return sizeMultiplier * largerFontSizeCoeff; - } - - /** - * Scores fnode with a '$' in its innerText - */ - function hasDollarSign(fnode) { - if (fnode.element.innerText.includes('$')) { - return hasDollarSignCoeff; - } - return DEFAULT_SCORE; - } - - /** - * Scores fnode with 'price' in its id or its parent's id - */ - function hasPriceInID(fnode) { - const id = fnode.element.id; - const parentID = fnode.element.parentElement.id; - if (id.toLowerCase().includes('price')) { - return hasPriceInIDCoeff; - } - if (parentID.toLowerCase().includes('price')) { - return 0.75 * hasPriceInIDCoeff; - } - return DEFAULT_SCORE; - } - - /** - * Scores fnode with 'price' in its class name or its parent's class name - */ - function hasPriceInClassName(fnode) { - const className = fnode.element.className; - const parentClassName = fnode.element.parentElement.className; - if (className.toLowerCase().includes('price')) { - 
return hasPriceInClassNameCoeff; - } - if (parentClassName.toLowerCase().includes('price')) { - return 0.75 * hasPriceInClassNameCoeff; - } - return DEFAULT_SCORE; - } - - function isVisible(fnode) { - for (const ancestor of ancestors(fnode.element)) { - const style = getComputedStyle(ancestor); - const isElementHidden = ( - style.visibility === 'hidden' - || style.display === 'none' - || style.opacity === '0' - || style.width === '0' - || style.height === '0' - ); - if (isElementHidden) { - return false; - } - } - return true; - } - - /** - * Scores fnode by its vertical location relative to the fold - */ - function isAboveTheFold(fnode, featureCoeff) { - const viewportHeight = window.innerHeight; - const top = fnode.element.getBoundingClientRect().top; - const upperHeightLimit = viewportHeight * 2; - // Use a falling trapezoid function to score the element - // Taken from: https://github.com/mozilla/fathom-trainees - if (top >= upperHeightLimit) { - return ZEROISH * featureCoeff; - } - if (top <= viewportHeight) { - return ONEISH * featureCoeff; - } - // slope = deltaY / deltaX - const slope = (ONEISH - ZEROISH) / (viewportHeight - upperHeightLimit); - // y = mx + b, where m = slope and b = y-intercept - return (slope * (top - upperHeightLimit) + ZEROISH) * featureCoeff; - } - - /** - * Checks to see if a 'priceish' fnode is eligible for scoring - * Note: This is a compound method, because `.when` chaining these methods - * onto a `dom` rule does not currently work. i.e. 
- * `rule(dom('span, h2') - * .when(isVisible) - * .when(hasDifferentInnerTextThanChildren) - * .when(isNearbyImageYAxisPrice)), - * type('priceish')),` - * ...is replaced with: - * `rule(dom('span, h2').when(isEligiblePrice), type('priceish')),` - */ - function isEligiblePrice(fnode) { - return ( - isVisible(fnode) - && hasDifferentInnerTextThanChildren(fnode) - && isNearbyImageYAxisPrice(fnode) - ); - } - - /** - * Checks to see if a 'titleish' fnode is eligible for scoring - */ - function isEligibleTitle(fnode) { - return ( - isVisible(fnode) - // Don't use hasDifferentInnerTextThanChildren, because

tags - // for Amazon and Walmart have and
element children, - // respectively, with the same innerText. - // - // Don't use isNearbyImageYAxisTitle here, as unlike for price, there - // is a strong correlation for vertical proximity to image, so we want - // to score it proportionally rather than have a hard cut-off. - ); - } - - /** - * Checks if fnode has different innerText compared to any of its children - */ - function hasDifferentInnerTextThanChildren(fnode) { - const element = fnode.element; - const children = element.children; - if (children.length > 0) { - for (const descendant of children) { - if (descendant.innerText === element.innerText) { - return false; - } - } - } - return true; - } - - /** - * Scores fnode based on its x distance from the highest scoring image element - */ - function isNearbyImageXAxisPrice(fnode) { - const viewportWidth = window.innerWidth; - const eleDOMRect = fnode.element.getBoundingClientRect(); - const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle - const imageDOMRect = imageElement.getBoundingClientRect(); - const deltaRight = eleDOMRect.left - imageDOMRect.right; - const deltaLeft = imageDOMRect.left - eleDOMRect.right; - // True if element is completely to the right or left of the image element - const noOverlap = (deltaRight > 0 || deltaLeft > 0); - let deltaX; - if (noOverlap) { - if (deltaRight > 0) { - deltaX = deltaRight; - } else { - deltaX = deltaLeft; - } - // give a higher score the closer it is to the image, normalized by viewportWidth - return (viewportWidth / deltaX) * isNearbyImageXAxisPriceCoeff; - } - return DEFAULT_SCORE; - } - - /** - * Scores fnode whose innerText matches a priceish RegExp pattern - */ - function hasPriceishPattern(fnode) { - const text = fnode.element.innerText; - /** - * With an optional '$' that doesn't necessarily have to be at the beginning - * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before - * a decimal point and exactly two after, where the 
two digits after the decimal point - * are at the end of the string - */ - const regExp = /\${0,1}\d+\.\d{2}$/; - if (regExp.test(text)) { - return hasPriceishPatternCoeff; - } - return DEFAULT_SCORE; - } - - /** - * Scores fnode based on its y distance from the highest scoring image element - */ - function isNearbyImageYAxisTitle(fnode) { - const viewportHeight = window.innerHeight; - const DOMRect = fnode.element.getBoundingClientRect(); - const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle - const imageDOMRect = imageElement.getBoundingClientRect(); - // Some titles (like on Ebay) are above the image, so include a top buffer - const isEleTopNearby = DOMRect.top >= (imageDOMRect.top - TOP_BUFFER); - const isEleBottomNearby = DOMRect.bottom <= imageDOMRect.bottom; - // Give elements in a specific vertical band a higher score - if (isEleTopNearby && isEleBottomNearby) { - const deltaY = Math.abs(imageDOMRect.top - DOMRect.top); - // give a higher score the closer it is to the image, normalized by viewportHeight - return (viewportHeight / deltaY) * isNearbyImageYAxisTitleCoeff; - } - return DEFAULT_SCORE; - } - - /** - * Checks if fnode is nearby the top scoring image element in the y-axis - */ - function isNearbyImageYAxisPrice(fnode) { - const element = fnode.element; - const DOMRect = element.getBoundingClientRect(); - const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle - const imageDOMRect = imageElement.getBoundingClientRect(); - if (DOMRect.top >= (imageDOMRect.top - TOP_BUFFER) - && DOMRect.bottom <= imageDOMRect.bottom) { - return true; - } - return false; - } - - /* The ruleset */ - const rules = ruleset( - /** - * Image rules - */ - // consider all visible img elements - rule(dom('img').when(isVisible), type('imageish')), - // better score the closer the element is to the top of the page - rule(type('imageish'), score(fnode => isAboveTheFold(fnode, 
isAboveTheFoldImageCoeff))), - // better score for larger images - rule(type('imageish'), score(largerImage)), - // return image element(s) with max score - rule(type('imageish').max(), out('image')), - - /** - * Title rules - */ - // consider all eligible h1 elements - rule(dom('h1').when(isEligibleTitle), type('titleish')), - // better score based on y-axis proximity to max scoring image element - rule(type('titleish'), score(isNearbyImageYAxisTitle)), - // since no further rules are needed for title, give all inputs the minimum score - rule(type('titleish'), score(() => SCORE_THRESHOLD)), - // return title element(s) with max score - rule(type('titleish').max(), out('title')), - - /** - * Price rules - */ - // consider all eligible span and h2 elements - rule(dom('span, h2').when(isEligiblePrice), type('priceish')), - // check if the element has a '$' in its innerText - rule(type('priceish'), score(hasDollarSign)), - // better score the closer the element is to the top of the page - rule(type('priceish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldPriceCoeff))), - // check if the id has "price" in it - rule(type('priceish'), score(hasPriceInID)), - // check if any class names have "price" in them - rule(type('priceish'), score(hasPriceInClassName)), - // better score for larger font size - rule(type('priceish'), score(largerFontSize)), - // better score based on x-axis proximity to max scoring image element - rule(type('priceish'), score(isNearbyImageXAxisPrice)), - // check if innerText has a priceish pattern - rule(type('priceish'), score(hasPriceishPattern)), - // return price element(s) with max score - rule(type('priceish').max(), out('price')), - ); - return rules; - }, - }, -); - -export default trainees; diff --git a/src/ruleset_factory.js b/src/ruleset_factory.js new file mode 100644 index 0000000..2053ec0 --- /dev/null +++ b/src/ruleset_factory.js @@ -0,0 +1,316 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * 
License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import {dom, out, rule, ruleset, score, type} from 'fathom-web'; +// Since the fathom-trainees add-on currently uses a submodule of Fathom, for +// training, replace 'utils' with 'utilsForFrontend' +import {ancestors} from 'fathom-web/utils'; +// relative URLs are needed for training, as the 'commerce' alias doesn't exist +// in that context +import {SCORE_THRESHOLD} from './config'; + +const DEFAULT_BODY_FONT_SIZE = 14; +const DEFAULT_SCORE = 1; +const TOP_BUFFER = 150; +// From: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js +const ZEROISH = 0.08; +const ONEISH = 0.9; + +export default class RulesetFactory { + /** + * Create a ruleset factory. + * + * @param {Array.number} coefficients The coefficients to apply for each rule + */ + constructor(coefficients) { + [ + this.largerImageCoeff, + this.largerFontSizeCoeff, + this.hasDollarSignCoeff, + this.hasPriceInIDCoeff, + this.hasPriceInClassNameCoeff, + this.isAboveTheFoldPriceCoeff, + this.isAboveTheFoldImageCoeff, + this.isNearbyImageXAxisPriceCoeff, + this.isNearbyImageYAxisTitleCoeff, + this.hasPriceishPatternCoeff, + ] = coefficients; + } + + /** + * Scores fnode in direct proportion to its size + */ + largerImage(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + const area = (domRect.width) * (domRect.height); + if (area === 0) { + return DEFAULT_SCORE; + } + return area * this.largerImageCoeff; + } + + /** + * Scores fnode in proportion to its font size + */ + largerFontSize(fnode) { + const size = window.getComputedStyle(fnode.element).fontSize; + // Normalize the multiplier by the default font size + const sizeMultiplier = parseFloat(size, 10) / DEFAULT_BODY_FONT_SIZE; + return sizeMultiplier * this.largerFontSizeCoeff; + } + + /** + * Scores fnode with a '$' in its innerText + */ + hasDollarSign(fnode) { + if 
(fnode.element.innerText.includes('$')) { + return this.hasDollarSignCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with 'price' in its id or its parent's id + */ + hasPriceInID(fnode) { + const id = fnode.element.id; + const parentID = fnode.element.parentElement.id; + if (id.toLowerCase().includes('price')) { + return this.hasPriceInIDCoeff; + } + if (parentID.toLowerCase().includes('price')) { + return 0.75 * this.hasPriceInIDCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode with 'price' in its class name or its parent's class name + */ + hasPriceInClassName(fnode) { + const className = fnode.element.className; + const parentClassName = fnode.element.parentElement.className; + if (className.toLowerCase().includes('price')) { + return this.hasPriceInClassNameCoeff; + } + if (parentClassName.toLowerCase().includes('price')) { + return 0.75 * this.hasPriceInClassNameCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode by its vertical location relative to the fold + */ + isAboveTheFold(fnode, featureCoeff) { + const viewportHeight = window.innerHeight; + const top = fnode.element.getBoundingClientRect().top; + const upperHeightLimit = viewportHeight * 2; + // Use a falling trapezoid function to score the element + // Taken from: https://github.com/mozilla/fathom-trainees + if (top >= upperHeightLimit) { + return ZEROISH * featureCoeff; + } + if (top <= viewportHeight) { + return ONEISH * featureCoeff; + } + // slope = deltaY / deltaX + const slope = (ONEISH - ZEROISH) / (viewportHeight - upperHeightLimit); + return (slope * (top - upperHeightLimit) + ZEROISH) * featureCoeff; + } + + /** + * Scores fnode based on its x distance from the highest scoring image element + */ + isNearbyImageXAxisPrice(fnode) { + const viewportWidth = window.innerWidth; + const eleDOMRect = fnode.element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const 
imageDOMRect = imageElement.getBoundingClientRect(); + const deltaRight = eleDOMRect.left - imageDOMRect.right; + const deltaLeft = imageDOMRect.left - eleDOMRect.right; + // True if element is completely to the right or left of the image element + const noOverlap = (deltaRight > 0 || deltaLeft > 0); + let deltaX; + if (noOverlap) { + if (deltaRight > 0) { + deltaX = deltaRight; + } else { + deltaX = deltaLeft; + } + // Give a higher score the closer it is to the image, normalized by viewportWidth + return (viewportWidth / deltaX) * this.isNearbyImageXAxisPriceCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode based on its y distance from the highest scoring image element + */ + isNearbyImageYAxisTitle(fnode) { + const viewportHeight = window.innerHeight; + const DOMRect = fnode.element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + // Some titles (like on Ebay) are above the image, so include a top buffer + const isEleTopNearby = DOMRect.top >= (imageDOMRect.top - TOP_BUFFER); + const isEleBottomNearby = DOMRect.bottom <= imageDOMRect.bottom; + // Give elements in a specific vertical band a higher score + if (isEleTopNearby && isEleBottomNearby) { + const deltaY = Math.abs(imageDOMRect.top - DOMRect.top); + // Give a higher score the closer it is to the image, normalized by viewportHeight + return (viewportHeight / deltaY) * this.isNearbyImageYAxisTitleCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Scores fnode whose innerText matches a priceish RegExp pattern + */ + hasPriceishPattern(fnode) { + const text = fnode.element.innerText; + /** + * With an optional '$' that doesn't necessarily have to be at the beginning + * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before + * a decimal point and exactly two after, where the two digits after the decimal point + * are at the end of 
the string + */ + const regExp = /\${0,1}\d+\.\d{2}$/; + if (regExp.test(text)) { + return this.hasPriceishPatternCoeff; + } + return DEFAULT_SCORE; + } + + /** + * Checks to see if a 'priceish' fnode is eligible for scoring + */ + isEligiblePrice(fnode) { + return ( + this.isVisible(fnode) + && this.hasDifferentInnerTextThanChildren(fnode) + && this.isNearbyImageYAxisPrice(fnode) + ); + } + + /** + * Checks to see if a 'titleish' fnode is eligible for scoring + */ + isEligibleTitle(fnode) { + return ( + this.isVisible(fnode) + // Don't use hasDifferentInnerTextThanChildren, because

tags + // for Amazon and Walmart have and
element children, + // respectively, with the same innerText. + // + // Don't use isNearbyImageYAxisTitle here, as unlike for price, there + // is a strong correlation for vertical proximity to image, so we want + // to score it proportionally rather than have a hard cut-off. + ); + } + + /** + * Checks if fnode has different innerText compared to any of its children + */ + hasDifferentInnerTextThanChildren(fnode) { + const element = fnode.element; + const children = element.children; + if (children.length > 0) { + for (const descendant of children) { + if (descendant.innerText === element.innerText) { + return false; + } + } + } + return true; + } + + /** + * Checks if fnode is nearby the top scoring image element in the y-axis + * Unlike for 'title', 'price' elements had worse accuracy when scored + * in proportion to y-axis proximity to the image. + */ + isNearbyImageYAxisPrice(fnode) { + const element = fnode.element; + const DOMRect = element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + if (DOMRect.top >= (imageDOMRect.top - TOP_BUFFER) + && DOMRect.bottom <= imageDOMRect.bottom) { + return true; + } + return false; + } + + isVisible(fnode) { + for (const ancestor of ancestors(fnode.element)) { + const style = getComputedStyle(ancestor); + const isElementHidden = ( + style.visibility === 'hidden' + || style.display === 'none' + || style.opacity === '0' + || style.width === '0' + || style.height === '0' + ); + if (isElementHidden) { + return false; + } + } + return true; + } + + /** + * Using coefficients passed into the constructor method, returns a weighted + * ruleset used to score elements in an HTML document. 
+ */ + makeRuleset() { + return ruleset( + /** + * Image rules + */ + // consider all visible img elements + rule(dom('img').when(this.isVisible.bind(this)), type('imageish')), + // better score the closer the element is to the top of the page + rule(type('imageish'), score(fnode => this.isAboveTheFold(fnode, this.isAboveTheFoldImageCoeff))), + // better score for larger images + rule(type('imageish'), score(this.largerImage.bind(this))), + // return image element(s) with max score + rule(type('imageish').max(), out('image')), + + /** + * Title rules + */ + // consider all eligible h1 elements + rule(dom('h1').when(this.isEligibleTitle.bind(this)), type('titleish')), + // better score based on y-axis proximity to max scoring image element + rule(type('titleish'), score(this.isNearbyImageYAxisTitle.bind(this))), + // since no further rules are needed for title, give all inputs the minimum score + rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // return title element(s) with max score + rule(type('titleish').max(), out('title')), + + /** + * Price rules + */ + // consider all eligible span and h2 elements + rule(dom('span, h2').when(this.isEligiblePrice.bind(this)), type('priceish')), + // check if the element has a '$' in its innerText + rule(type('priceish'), score(this.hasDollarSign.bind(this))), + // better score the closer the element is to the top of the page + rule(type('priceish'), score(fnode => this.isAboveTheFold(fnode, this.isAboveTheFoldPriceCoeff))), + // check if the id has "price" in it + rule(type('priceish'), score(this.hasPriceInID.bind(this))), + // check if any class names have "price" in them + rule(type('priceish'), score(this.hasPriceInClassName.bind(this))), + // better score for larger font size + rule(type('priceish'), score(this.largerFontSize.bind(this))), + // better score based on x-axis proximity to max scoring image element + rule(type('priceish'), score(this.isNearbyImageXAxisPrice.bind(this))), + // check if innerText has a 
priceish pattern + rule(type('priceish'), score(this.hasPriceishPattern.bind(this))), + // return price element(s) with max score + rule(type('priceish').max(), out('price')), + ); + } +} diff --git a/src/trainees.js b/src/trainees.js new file mode 100644 index 0000000..6863517 --- /dev/null +++ b/src/trainees.js @@ -0,0 +1,98 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import defaultCoefficients from './fathom_default_coefficients.json'; +import RulesetFactory from './ruleset_factory'; + +// Array of numbers corresponding to the coefficients +const coeffs = Object.values(defaultCoefficients); + +/** + * Rulesets to train using Fathom. + * + * More mechanically, a map of names to {coeffs, rulesetMaker} objects. + * rulesetMaker is a function that takes an Array of coefficients and returns a + * ruleset that uses them. coeffs is typically the best-yet-found coefficients + * for a ruleset but can also be some more widely flung ones that you want to + * start the trainer from. The rulesets you specify here show up in the Train + * UI, from which you can kick off a training run. + * + * How to train: + * 1. Fork the `mozilla/fathom-trainees` repo, + * 2. Copy this file, `fathom_default_coefficients.json`, `ruleset_factory.js` + * and `config.js` over to the `./src` folder in the `fathom-trainees` add-on. + * 3. Follow instructions at: https://github.com/erikrose/fathom-fox#the-trainer + * + * Notes: + * - The FathomFox Trainer assumes that the value of your corpus' `data-fathom` + * attribute is the same as the `out`-ed string. Example: An element tagged with + * `data-fathom="image"` will map to `rule(..., out("image"))`. + * - The Trainer assumes that the name of the ruleset and the out-rule of interest + * are the same. 
Therefore, if a ruleset contains more than one out-rule, + * each `out`-ed feature must have its own key in the `trainees` map. You can + * select which feature to train from the dropdown menu on FathomFox's Trainer page. + * - I would not recommend using the Corpus Collector to build up a training set, + * because you can only batch freeze original pages, meaning tagged pages would be + * re-freezed, and there are non-obvious side effects in the diff (an issue with + * the freeze-dried library Fathom uses). + */ + +const trainees = new Map([ + [ + /** + * A ruleset that finds the main product image on a product page. + */ + 'image', // Ruleset name + { + coeffs, + /** + * @param {Array.number} coefficients + */ + rulesetMaker(coefficients) { + // The coefficients are updated over time during training, so create a new factory for + // each iteration + const rulesetFactory = new RulesetFactory(coefficients); + return rulesetFactory.makeRuleset(); // The ruleset + }, + }, + ], + [ + /** + * A ruleset that finds the main product title on a product page. + */ + 'title', // Ruleset name + { + coeffs, + /** + * @param {Array.number} coefficients + */ + rulesetMaker(coefficients) { + // The coefficients are updated over time during training, so create a new factory for + // each iteration + const rulesetFactory = new RulesetFactory(coefficients); + return rulesetFactory.makeRuleset(); // The ruleset + }, + }, + ], + [ + /** + * A ruleset that finds the main product price on a product page. 
+ */ + 'price', // Ruleset name + { + coeffs, + /** + * @param {Array.number} coefficients + */ + rulesetMaker(coefficients) { + // The coefficients are updated over time during training, so create a new factory for + // each iteration + const rulesetFactory = new RulesetFactory(coefficients); + return rulesetFactory.makeRuleset(); // The ruleset + }, + }, + ], +]); + +export default trainees; From 4b1d448d4d12c16181f8c5531200cac646bb1448 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Mon, 20 Aug 2018 10:38:47 -0700 Subject: [PATCH 7/9] #36: Reorganize files into an 'extraction' subfolder. --- src/{ => extraction}/fallback_extraction.js | 4 ++-- .../fathom_default_coefficients.json | 0 src/{ => extraction}/fathom_extraction.js | 6 +++--- src/{ => extraction}/product_extraction_data.json | 0 src/{ => extraction}/ruleset_factory.js | 8 +++++++- src/product_info.js | 4 ++-- src/trainees.js | 13 ++++++++----- 7 files changed, 22 insertions(+), 13 deletions(-) rename src/{ => extraction}/fallback_extraction.js (97%) rename src/{ => extraction}/fathom_default_coefficients.json (100%) rename src/{ => extraction}/fathom_extraction.js (93%) rename src/{ => extraction}/product_extraction_data.json (100%) rename src/{ => extraction}/ruleset_factory.js (97%) diff --git a/src/fallback_extraction.js b/src/extraction/fallback_extraction.js similarity index 97% rename from src/fallback_extraction.js rename to src/extraction/fallback_extraction.js index db1f6ee..5b3535f 100644 --- a/src/fallback_extraction.js +++ b/src/extraction/fallback_extraction.js @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -/* +/** * Uses CSS selectors, or failing that, Open Graph tags to extract * a product from its product page, where a 'product' is defined by the bundle * of features that makes it identifiable. 
@@ -10,7 +10,7 @@ * Features: title, image, price */ -import extractionData from 'commerce/product_extraction_data.json'; +import extractionData from 'commerce/extraction/product_extraction_data.json'; const OPEN_GRAPH_PROPERTY_VALUES = { title: 'og:title', diff --git a/src/fathom_default_coefficients.json b/src/extraction/fathom_default_coefficients.json similarity index 100% rename from src/fathom_default_coefficients.json rename to src/extraction/fathom_default_coefficients.json diff --git a/src/fathom_extraction.js b/src/extraction/fathom_extraction.js similarity index 93% rename from src/fathom_extraction.js rename to src/extraction/fathom_extraction.js index 263c366..0b59e84 100644 --- a/src/fathom_extraction.js +++ b/src/extraction/fathom_extraction.js @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -/* +/** * Uses Fathom to extract a product from its product page, * where a 'product' is defined by the bundle of features that * makes it identifiable. 
@@ -10,8 +10,8 @@ * Features: title, image, price */ -import defaultCoefficients from 'commerce/fathom_default_coefficients.json'; -import RulesetFactory from 'commerce/ruleset_factory'; +import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json'; +import RulesetFactory from 'commerce/extraction/ruleset_factory'; import {SCORE_THRESHOLD} from 'commerce/config'; const PRODUCT_FEATURES = ['title', 'price', 'image']; diff --git a/src/product_extraction_data.json b/src/extraction/product_extraction_data.json similarity index 100% rename from src/product_extraction_data.json rename to src/extraction/product_extraction_data.json diff --git a/src/ruleset_factory.js b/src/extraction/ruleset_factory.js similarity index 97% rename from src/ruleset_factory.js rename to src/extraction/ruleset_factory.js index 2053ec0..7e4744f 100644 --- a/src/ruleset_factory.js +++ b/src/extraction/ruleset_factory.js @@ -2,13 +2,19 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +/** + * Exports a RulesetFactory class, which when instantiated, binds Fathom + * coefficients to a ruleset. An instance of this class is used for product + * feature extraction (`fathom_extraction.js`) and for training (`trainees.js`). 
+ */ + import {dom, out, rule, ruleset, score, type} from 'fathom-web'; // Since the fathom-trainees add-on currently uses a submodule of Fathom, for // training, replace 'utils' with 'utilsForFrontend' import {ancestors} from 'fathom-web/utils'; // relative URLs are needed for training, as the 'commerce' alias doesn't exist // in that context -import {SCORE_THRESHOLD} from './config'; +import {SCORE_THRESHOLD} from '../config'; const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; diff --git a/src/product_info.js b/src/product_info.js index e5de47c..9aa01a2 100644 --- a/src/product_info.js +++ b/src/product_info.js @@ -7,8 +7,8 @@ * "document_idle", which is after all DOM content has been loaded. */ -import extractProductWithFathom from 'commerce/fathom_extraction'; -import extractProductWithFallback from 'commerce/fallback_extraction'; +import extractProductWithFathom from 'commerce/extraction/fathom_extraction'; +import extractProductWithFallback from 'commerce/extraction/fallback_extraction'; /** * Checks to see if any product information for the page was found, diff --git a/src/trainees.js b/src/trainees.js index 6863517..a8ce8fb 100644 --- a/src/trainees.js +++ b/src/trainees.js @@ -2,8 +2,8 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -import defaultCoefficients from './fathom_default_coefficients.json'; -import RulesetFactory from './ruleset_factory'; +import defaultCoefficients from './extraction/fathom_default_coefficients.json'; +import RulesetFactory from './extraction/ruleset_factory'; // Array of numbers corresponding to the coefficients const coeffs = Object.values(defaultCoefficients); @@ -20,9 +20,12 @@ const coeffs = Object.values(defaultCoefficients); * * How to train: * 1. Fork the `mozilla/fathom-trainees` repo, - * 2. 
Copy this file, `fathom_default_coefficients.json`, `ruleset_factory.js` - * and `config.js` over to the `./src` folder in the `fathom-trainees` add-on. - * 3. Follow instructions at: https://github.com/erikrose/fathom-fox#the-trainer + * 2. In the `fathom-trainees` add-on, copy this file and `config.js` over to the + * `./src` folder, and copy `./extraction/fathom_default_coefficients.json` and + * `./extraction/ruleset_factory.js` to a new `./src/extraction` subfolder. + * * Note: You will have to replace 'utils' with 'utilsForFrontend' on the + * import in `ruleset_factory.js`. See that file for more information. + * 3. Follow instructions at: https://github.com/erikrose/fathom-fox#the-trainer. * * Notes: * - The FathomFox Trainer assumes that the value of your corpus' `data-fathom` From 486142b005a84583a85b8ad6702644798b2b2833 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Fri, 24 Aug 2018 14:57:53 -0700 Subject: [PATCH 8/9] #36: Incorporate more feedback from Osmose. Also updated the Code Organization section of the README to include the new 'extraction' subfolder. --- README.md | 1 + .../fathom_default_coefficients.json | 10 ++-- src/extraction/fathom_extraction.js | 10 ++-- src/extraction/ruleset_factory.js | 45 ++++++++++---- src/{ => extraction}/trainees.js | 58 ++++++------------- 5 files changed, 63 insertions(+), 61 deletions(-) rename src/{ => extraction}/trainees.js (61%) diff --git a/README.md b/README.md index 4fc09a6..7c802c8 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ After this, you can run `pipenv run test` to run the automated test suite. - `src/background` contains the background scripts that trigger UI elements (such as the page action) and periodically check for price updates. - `src/browser_action` contains the toolbar popup for managing the list of currently-tracked products. +- `src/extraction` contains the content scripts that extract product information from product web pages. 
- `src/page_action` contains the URL bar popup for viewing and tracking the product in the current tab. - `src/state` contains the Redux-based code for managing global extension state. - `src/tests` contains the automated test suite. diff --git a/src/extraction/fathom_default_coefficients.json b/src/extraction/fathom_default_coefficients.json index 53d5d07..7a70565 100644 --- a/src/extraction/fathom_default_coefficients.json +++ b/src/extraction/fathom_default_coefficients.json @@ -1,12 +1,12 @@ { - "largerImageCoeff": 2, - "largerFontSizeCoeff": 7, "hasDollarSignCoeff": 8, - "hasPriceInIDCoeff": 17, "hasPriceInClassNameCoeff": 2, - "isAboveTheFoldPriceCoeff": 33, + "hasPriceInIDCoeff": 17, + "hasPriceishPatternCoeff": 15, "isAboveTheFoldImageCoeff": 13, + "isAboveTheFoldPriceCoeff": 33, "isNearbyImageXAxisPriceCoeff": 5, "isNearbyImageYAxisTitleCoeff": 5, - "hasPriceishPatternCoeff": 15 + "largerFontSizeCoeff": 7, + "largerImageCoeff": 2 } diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom_extraction.js index 0b59e84..40328ae 100644 --- a/src/extraction/fathom_extraction.js +++ b/src/extraction/fathom_extraction.js @@ -15,8 +15,8 @@ import RulesetFactory from 'commerce/extraction/ruleset_factory'; import {SCORE_THRESHOLD} from 'commerce/config'; const PRODUCT_FEATURES = ['title', 'price', 'image']; -// Array of numbers corresponding to the coefficients -const coefficients = Object.values(defaultCoefficients); +// Array of numbers corresponding to the coefficients in order +const coefficients = RulesetFactory.getCoeffsInOrder(defaultCoefficients); // For production, we don't need to generate a new ruleset factory // and ruleset every time we run Fathom, since the coefficients are static. 
const rulesetFactory = new RulesetFactory(coefficients); @@ -28,8 +28,9 @@ const rules = rulesetFactory.makeRuleset(); */ function runRuleset(doc) { const extractedElements = {}; + const results = rules.against(doc); for (const feature of PRODUCT_FEATURES) { - let fnodesList = rules.against(doc).get(feature); + let fnodesList = results.get(feature); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. if (fnodesList.length >= 1) { @@ -56,8 +57,9 @@ export default function extractProduct(doc) { for (const feature of PRODUCT_FEATURES) { if (feature === 'image') { extractedProduct[feature] = extractedElements[feature].src; + } else { + extractedProduct[feature] = extractedElements[feature].innerText; } - extractedProduct[feature] = extractedElements[feature].innerText; } } return hasAllFeatures(extractedProduct) ? extractedProduct : null; diff --git a/src/extraction/ruleset_factory.js b/src/extraction/ruleset_factory.js index 7e4744f..5f6834d 100644 --- a/src/extraction/ruleset_factory.js +++ b/src/extraction/ruleset_factory.js @@ -31,16 +31,16 @@ export default class RulesetFactory { */ constructor(coefficients) { [ - this.largerImageCoeff, - this.largerFontSizeCoeff, this.hasDollarSignCoeff, - this.hasPriceInIDCoeff, this.hasPriceInClassNameCoeff, - this.isAboveTheFoldPriceCoeff, + this.hasPriceInIDCoeff, + this.hasPriceishPatternCoeff, this.isAboveTheFoldImageCoeff, + this.isAboveTheFoldPriceCoeff, this.isNearbyImageXAxisPriceCoeff, this.isNearbyImageYAxisTitleCoeff, - this.hasPriceishPatternCoeff, + this.largerFontSizeCoeff, + this.largerImageCoeff, ] = coefficients; } @@ -113,15 +113,20 @@ export default class RulesetFactory { const viewportHeight = window.innerHeight; const top = fnode.element.getBoundingClientRect().top; const upperHeightLimit = viewportHeight * 2; - // Use a falling trapezoid function to score the element - // Taken from: 
https://github.com/mozilla/fathom-trainees + + // If the node is below the fold by more than a viewport's length, + // return a low score. if (top >= upperHeightLimit) { return ZEROISH * featureCoeff; } + + // If the node is above the fold, return a high score. if (top <= viewportHeight) { return ONEISH * featureCoeff; } - // slope = deltaY / deltaX + + // Otherwise, scale the score linearly between the fold and a viewport's + // length below it. const slope = (ONEISH - ZEROISH) / (viewportHeight - upperHeightLimit); return (slope * (top - upperHeightLimit) + ZEROISH) * featureCoeff; } @@ -132,7 +137,7 @@ export default class RulesetFactory { isNearbyImageXAxisPrice(fnode) { const viewportWidth = window.innerWidth; const eleDOMRect = fnode.element.getBoundingClientRect(); - const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageElement = this.getHighestScoringImage(fnode); const imageDOMRect = imageElement.getBoundingClientRect(); const deltaRight = eleDOMRect.left - imageDOMRect.right; const deltaLeft = imageDOMRect.left - eleDOMRect.right; @@ -157,7 +162,7 @@ export default class RulesetFactory { isNearbyImageYAxisTitle(fnode) { const viewportHeight = window.innerHeight; const DOMRect = fnode.element.getBoundingClientRect(); - const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageElement = this.getHighestScoringImage(fnode); const imageDOMRect = imageElement.getBoundingClientRect(); // Some titles (like on Ebay) are above the image, so include a top buffer const isEleTopNearby = DOMRect.top >= (imageDOMRect.top - TOP_BUFFER); @@ -240,7 +245,7 @@ export default class RulesetFactory { isNearbyImageYAxisPrice(fnode) { const element = fnode.element; const DOMRect = element.getBoundingClientRect(); - const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageElement = 
this.getHighestScoringImage(fnode); const imageDOMRect = imageElement.getBoundingClientRect(); if (DOMRect.top >= (imageDOMRect.top - TOP_BUFFER) && DOMRect.bottom <= imageDOMRect.bottom) { @@ -319,4 +324,22 @@ export default class RulesetFactory { rule(type('priceish').max(), out('price')), ); } + + /** + * Takes in a coefficients object and returns an array of its values, + * ordered by sorted key name. + */ + static getCoeffsInOrder(coeffsObj) { + const coeffsKeys = Object.keys(coeffsObj); + coeffsKeys.sort(); // sort keys in string Unicode order + const coeffs = []; + for (const key of coeffsKeys) { + coeffs.push(coeffsObj[key]); + } + return coeffs; + } + + getHighestScoringImage(fnode) { + return fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + } } diff --git a/src/trainees.js b/src/extraction/trainees.js similarity index 61% rename from src/trainees.js rename to src/extraction/trainees.js index a8ce8fb..4254085 100644 --- a/src/trainees.js +++ b/src/extraction/trainees.js @@ -2,11 +2,13 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +/* eslint-disable import/no-unresolved */ +// This file is moved up a level to the ./src folder for training import defaultCoefficients from './extraction/fathom_default_coefficients.json'; import RulesetFactory from './extraction/ruleset_factory'; -// Array of numbers corresponding to the coefficients -const coeffs = Object.values(defaultCoefficients); +// Array of numbers corresponding to the coefficients in order +const coeffs = RulesetFactory.getCoeffsInOrder(defaultCoefficients); /** * Rulesets to train using Fathom. @@ -41,59 +43,33 @@ const coeffs = Object.values(defaultCoefficients); * the freeze-dried library Fathom uses). 
*/ +function rulesetMaker(coefficients) { + // The coefficients are updated over time during training, so create a new factory for + // each iteration + const rulesetFactory = new RulesetFactory(coefficients); + return rulesetFactory.makeRuleset(); +} + const trainees = new Map([ [ - /** - * A ruleset that finds the main product image on a product page. - */ - 'image', // Ruleset name + 'image', { coeffs, - /** - * @param {Array.number} coefficients - */ - rulesetMaker(coefficients) { - // The coefficients are updated over time during training, so create a new factory for - // each iteration - const rulesetFactory = new RulesetFactory(coefficients); - return rulesetFactory.makeRuleset(); // The ruleset - }, + rulesetMaker, }, ], [ - /** - * A ruleset that finds the main product title on a product page. - */ - 'title', // Ruleset name + 'title', { coeffs, - /** - * @param {Array.number} coefficients - */ - rulesetMaker(coefficients) { - // The coefficients are updated over time during training, so create a new factory for - // each iteration - const rulesetFactory = new RulesetFactory(coefficients); - return rulesetFactory.makeRuleset(); // The ruleset - }, + rulesetMaker, }, ], [ - /** - * A ruleset that finds the main product price on a product page. - */ - 'price', // Ruleset name + 'price', { coeffs, - /** - * @param {Array.number} coefficients - */ - rulesetMaker(coefficients) { - // The coefficients are updated over time during training, so create a new factory for - // each iteration - const rulesetFactory = new RulesetFactory(coefficients); - return rulesetFactory.makeRuleset(); // The ruleset - }, + rulesetMaker, }, ], ]); From e3e4f15be03569c4fcf9ee9b21d5186d3e921e95 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Mon, 27 Aug 2018 11:37:23 -0700 Subject: [PATCH 9/9] Fix #36: Remove 'SCORE_THRESHOLD' const from config.js. 
This const was only needed by one file ('fathom_extraction.js'), so it is now defined there. Dropping the 'config.js' dependency simplifies how 'trainees.js' and its imported scripts are used for training. --- src/config.js | 3 --- src/extraction/fathom_extraction.js | 3 ++- src/extraction/ruleset_factory.js | 5 ----- src/extraction/trainees.js | 11 +++++------ 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/config.js b/src/config.js index 3f9c3be..0baa43f 100644 --- a/src/config.js +++ b/src/config.js @@ -16,6 +16,3 @@ export const PRICE_CHECK_TIMEOUT_INTERVAL = 1000 * 60 * 15; // 15 minutes /** Delay before removing iframes created during price checks */ export const IFRAME_TIMEOUT = 1000 * 60; // 1 minute - -// Minimum score to be considered the "correct" feature element extracted by Fathom -export const SCORE_THRESHOLD = 4; diff --git a/src/extraction/fathom_extraction.js b/src/extraction/fathom_extraction.js index 40328ae..00cebbc 100644 --- a/src/extraction/fathom_extraction.js +++ b/src/extraction/fathom_extraction.js @@ -12,9 +12,10 @@ import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json'; import RulesetFactory from 'commerce/extraction/ruleset_factory'; -import {SCORE_THRESHOLD} from 'commerce/config'; const PRODUCT_FEATURES = ['title', 'price', 'image']; +// Minimum score to be considered the "correct" feature element extracted by Fathom +const SCORE_THRESHOLD = 4; // Array of numbers corresponding to the coefficients in order const coefficients = RulesetFactory.getCoeffsInOrder(defaultCoefficients); // For production, we don't need to generate a new ruleset factory diff --git a/src/extraction/ruleset_factory.js b/src/extraction/ruleset_factory.js index 5f6834d..8a9778e 100644 --- a/src/extraction/ruleset_factory.js +++ b/src/extraction/ruleset_factory.js @@ -12,9 +12,6 @@ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; // Since the fathom-trainees add-on currently uses a submodule of Fathom, for // training, replace 'utils' with 'utilsForFrontend' 
import {ancestors} from 'fathom-web/utils'; -// relative URLs are needed for training, as the 'commerce' alias doesn't exist -// in that context -import {SCORE_THRESHOLD} from '../config'; const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; @@ -296,8 +293,6 @@ export default class RulesetFactory { rule(dom('h1').when(this.isEligibleTitle.bind(this)), type('titleish')), // better score based on y-axis proximity to max scoring image element rule(type('titleish'), score(this.isNearbyImageYAxisTitle.bind(this))), - // since no further rules are needed for title, give all inputs the minimum score - rule(type('titleish'), score(() => SCORE_THRESHOLD)), // return title element(s) with max score rule(type('titleish').max(), out('title')), diff --git a/src/extraction/trainees.js b/src/extraction/trainees.js index 4254085..2e82854 100644 --- a/src/extraction/trainees.js +++ b/src/extraction/trainees.js @@ -3,9 +3,8 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* eslint-disable import/no-unresolved */ -// This file is moved up a level to the ./src folder for training -import defaultCoefficients from './extraction/fathom_default_coefficients.json'; -import RulesetFactory from './extraction/ruleset_factory'; +import defaultCoefficients from './fathom_default_coefficients.json'; +import RulesetFactory from './ruleset_factory'; // Array of numbers corresponding to the coefficients in order const coeffs = RulesetFactory.getCoeffsInOrder(defaultCoefficients); @@ -22,9 +21,9 @@ const coeffs = RulesetFactory.getCoeffsInOrder(defaultCoefficients); * * How to train: * 1. Fork the `mozilla/fathom-trainees` repo, - * 2. In the `fathom-trainees` add-on, copy this file and `config.js` over to the - * `./src` folder, and copy `./extraction/fathom_default_coefficients.json` and - * `./extraction/ruleset_factory.js` to a new `./src/extraction` subfolder. + * 2. 
In the `fathom-trainees` add-on, copy this file, + * `./extraction/fathom_default_coefficients.json` and + * `./extraction/ruleset_factory.js` to the `./src` folder. * * Note: You will have to replace 'utils' with 'utilsForFrontend' on the * import in `ruleset_factory.js`. See that file for more information. * 3. Follow instructions at: https://github.com/erikrose/fathom-fox#the-trainer.