From a8f138ba9cc4014d738582c00722b008830f7f65 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Thu, 9 Aug 2018 22:16:32 -0700 Subject: [PATCH] #36: Update rules for better accuracy --- .gitignore | 1 + src/fathom_coefficients.json | 17 ++- src/fathom_extraction.js | 30 +++-- src/fathom_ruleset.js | 219 +++++++++++++++++++++++++++-------- 4 files changed, 203 insertions(+), 64 deletions(-) diff --git a/.gitignore b/.gitignore index a26de97..11393fe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ node_modules web-ext-artifacts build gecko.log +.DS_Store diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index d54851e..26b41ab 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,9 +1,16 @@ { "largerImageCoeff": 3, - "largerFontSizeCoeff": 1, + "largerFontSizePriceCoeff": 1, + "largerFontSizeTitleCoeff": 1, "hasDollarSignCoeff": 3, - "hasTitleInIDCoeff": 10, - "hasTitleInClassNameCoeff": 5, - "isHiddenCoeff": -100, - "isHeaderElementCoeff": 10 + "hasFeatureNameInIDTitleCoeff": 10, + "hasFeatureNameInIDPriceCoeff": 10, + "hasFeatureNameInClassNameTitleCoeff": 5, + "hasFeatureNameInClassNamePriceCoeff": 5, + "isAboveTheFoldPriceCoeff": 30, + "isAboveTheFoldImageCoeff": 15, + "isAboveTheFoldTitleCoeff": 15, + "isHeaderElementCoeff": 10, + "optimalPriceStringLengthCoeff": 5, + "closestToImageCoeff": 1 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 65790d2..847f0c4 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -13,12 +13,19 @@ import productRuleset from 'commerce/fathom_ruleset'; import { largerImageCoeff, - largerFontSizeCoeff, + largerFontSizePriceCoeff, + largerFontSizeTitleCoeff, hasDollarSignCoeff, - hasTitleInIDCoeff, - hasTitleInClassNameCoeff, - isHiddenCoeff, + hasFeatureNameInIDTitleCoeff, + hasFeatureNameInIDPriceCoeff, + hasFeatureNameInClassNameTitleCoeff, + hasFeatureNameInClassNamePriceCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isAboveTheFoldTitleCoeff, isHeaderElementCoeff, + optimalPriceStringLengthCoeff, + closestToImageCoeff, } from 'commerce/fathom_coefficients.json'; const PRODUCT_FEATURES = ['title', 'price', 'image']; @@ -34,12 +41,19 @@ function runRuleset(doc) { for (const feature of PRODUCT_FEATURES) { let fnodesList = rules([ largerImageCoeff, - largerFontSizeCoeff, + largerFontSizePriceCoeff, + largerFontSizeTitleCoeff, hasDollarSignCoeff, - hasTitleInIDCoeff, - hasTitleInClassNameCoeff, - isHiddenCoeff, + hasFeatureNameInIDTitleCoeff, + hasFeatureNameInIDPriceCoeff, + hasFeatureNameInClassNameTitleCoeff, + hasFeatureNameInClassNamePriceCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isAboveTheFoldTitleCoeff, isHeaderElementCoeff, + optimalPriceStringLengthCoeff, + closestToImageCoeff, ]).against(doc).get(`${feature}`); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. diff --git a/src/fathom_ruleset.js b/src/fathom_ruleset.js index 39ef04e..3ee0a0a 100644 --- a/src/fathom_ruleset.js +++ b/src/fathom_ruleset.js @@ -3,10 +3,15 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; +import {ancestors} from 'fathom-web/utils'; const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; const VIEWPORT_HEIGHT = window.innerHeight; +const VIEWPORT_WIDTH = window.innerWidth; +// Taken from: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js +const ZEROISH = 0.08; +const ONEISH = 0.9; /** * Rulesets to train. @@ -42,15 +47,23 @@ trainees.set( */ 'product', // 'product' for production and 'title', 'image' or 'price' for training { - coeffs: [3, 1, 3, 10, 5, -100, 10], // Input rule coefficients in order here + // For training only: input rule coefficients in order here + coeffs: [3, 1, 1, 3, 10, 10, 5, 5, 30, 15, 15, 10, 5, 1], rulesetMaker([ - coeffLargerImage, - coeffLargerFontSize, - coeffHasDollarSign, - coeffHasTitleInID, - coeffHasTitleInClassName, - coeffIsHidden, - coeffIsHeaderElement, + largerImageCoeff, + largerFontSizePriceCoeff, + largerFontSizeTitleCoeff, + hasDollarSignCoeff, + hasFeatureNameInIDTitleCoeff, + hasFeatureNameInIDPriceCoeff, + hasFeatureNameInClassNameTitleCoeff, + hasFeatureNameInClassNamePriceCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isAboveTheFoldTitleCoeff, + isHeaderElementCoeff, + optimalPriceStringLengthCoeff, + closestToImageCoeff, ]) { /** * Scores fnode in direct proportion to its size @@ -61,19 +74,19 @@ trainees.set( if (area === 0) { return DEFAULT_SCORE; } - return area * coeffLargerImage; + return area * largerImageCoeff; } /** - * Scores fnode in direct proportion to its font size + * Scores fnode in proportion to its font size */ - function largerFontSize(fnode) { + function largerFontSize(fnode, featureCoeff) { const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; const size = sizeWithUnits.replace('px', ''); if (size) { // normalize the multiplier by the default font size const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; - return (sizeMultiplier * coeffLargerFontSize); + return (sizeMultiplier * featureCoeff); } return DEFAULT_SCORE; } @@ -83,47 +96,64 @@ trainees.set( */ function hasDollarSign(fnode) { if (fnode.element.innerText.includes('$')) { - return coeffHasDollarSign; + return hasDollarSignCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode with "title" in its id + * Scores fnode with the feature name in its id */ - function hasTitleInID(fnode) { + function hasFeatureNameInID(fnode, featureType, featureCoeff) { const id = fnode.element.id; - if (id.includes('title') || id.includes('Title')) { - return coeffHasTitleInID; + if (featureType === 'titleish') { + if (id.includes('title') || id.includes('Title')) { + return featureCoeff; + } + } + if (featureType === 'priceish') { + if (id.includes('price') || id.includes('Price')) { + return featureCoeff; + } } return DEFAULT_SCORE; } /** - * Scores fnode with "title" in a class name + * Scores fnode with feature name in a class name */ - function hasTitleInClassName(fnode) { + function hasFeatureNameInClassName(fnode, featureType, featureCoeff) { const className = fnode.element.className; - if (className.includes('title') || className.includes('Title')) { - return coeffHasTitleInClassName; + if (featureType === 'titleish') { + if (className.includes('title') || className.includes('Title')) { + return featureCoeff; + } + } + if (featureType === 'priceish') { + if (className.includes('price') || className.includes('Price')) { + return featureCoeff; + } } return DEFAULT_SCORE; } /** - * Scores fnode that is hidden + * Checks if an fnode is visible */ - function isHidden(fnode) { + function isVisible(fnode) { const element = fnode.element; - const style = window.getComputedStyle(element); - if (!element.offsetParent // null if the offsetParent has a display set to "none" - || style.visibility === 'hidden' - || style.opacity === '0' - || style.width === '0' - || style.height === '0') { - return coeffIsHidden; + // TODO: consider using element.closest() + for (const ancestor of ancestors(element)) { + const style = getComputedStyle(ancestor); + if (style.visibility === 'hidden' + || style.display === 'none' + || style.opacity === '0' + || style.width === '0' + || style.height === '0') { + return false; + } } - return DEFAULT_SCORE; + return true; } /** @@ -131,22 +161,99 @@ trainees.set( */ function isHeaderElement(fnode) { if (fnode.element.tagName === 'H1') { - return coeffIsHeaderElement; + return isHeaderElementCoeff; } return DEFAULT_SCORE; } + /** + * Scale a number to the range [ZEROISH, ONEISH]. + * + * Taken from: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js + * + * For a rising trapezoid, the result is ZEROISH until the input + * reaches zeroAt, then increases linearly until oneAt, at which it + * becomes ONEISH. To make a falling trapezoid, where the result is + * ONEISH to the left and ZEROISH to the right, use a zeroAt greater + * than oneAt. + */ + function trapezoid(number, zeroAt, oneAt) { + const isRising = zeroAt < oneAt; + if (isRising) { + if (number <= zeroAt) { + return ZEROISH; + } + if (number >= oneAt) { + return ONEISH; + } + } else { + if (number >= zeroAt) { + return ZEROISH; + } + if (number <= oneAt) { + return ONEISH; + } + } + const slope = (ONEISH - ZEROISH) / (oneAt - zeroAt); + return slope * (number - zeroAt) + ZEROISH; + } + /** * Returns true if the fnode is above the fold */ - function isAboveTheFold(fnode) { + function isAboveTheFold(fnode, featureCoeff) { const domRect = fnode.element.getBoundingClientRect(); - if (domRect.top <= VIEWPORT_HEIGHT) { - return true; + // Use a falling trapezoid to score the element; + // result is ONEISH until the input reaches VIEWPORT_HEIGHT, then decreases + // linearly until VIEWPORT_HEIGHT * 2, where it becomes ZEROISH. + return trapezoid(domRect.top, VIEWPORT_HEIGHT * 2, VIEWPORT_HEIGHT) * featureCoeff; + } + + /** + * Remove fnodes that have the same innerText as a descendant + */ + function removeRedundantAncestors(fnode) { + const element = fnode.element; + if (element.children.length > 0) { + for (const descendant of element.children) { + if (descendant.innerText === element.innerText) { + return false; + } + } } - return false; + return true; } + function optimalPriceStringLength(fnode) { + // Single price: $X.XX to $XXX.XX ranges from 5 to 7 characters + // Range: $X.XX - $X.XX to $XXX.XX - $XXX.XX ranges from 13 to 17 + let lowerLimit = 5; + let upperLimit = 7; + const innerText = fnode.element.innerText; + const isRange = innerText.includes('-'); + if (isRange) { + lowerLimit = 13; + upperLimit = 17; + } + if (innerText.length >= lowerLimit && innerText.length <= upperLimit) { + return optimalPriceStringLengthCoeff; + } + return DEFAULT_SCORE; + } + + function closestToImage(fnode) { + const element = fnode.element; + const priceDOMRect = element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + // score priceish element higher the closer it is to the image + // priceish element is always* to the right of the image + const deltaX = priceDOMRect.left - imageDOMRect.right; + if (deltaX > 0) { + return (VIEWPORT_WIDTH / deltaX) * closestToImageCoeff; + } + return DEFAULT_SCORE; + } /* The actual ruleset */ const rules = ruleset( /** @@ -154,8 +261,10 @@ trainees.set( * * If training, comment out unless training 'image'. */ - // consider all img elements near the top of the page - rule(dom('img').when(isAboveTheFold), type('imageish')), + // consider all visible img elements + rule(dom('img').when(isVisible), type('imageish')), + // better score the closer the element is to the top of the page + rule(type('imageish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldImageCoeff))), // better score for larger images rule(type('imageish'), score(largerImage)), // return image element with max score @@ -166,18 +275,18 @@ trainees.set( * * If training, comment out unless training 'title'. */ - // consider all h1 and span elements near the top of the page - rule(dom('h1, span').when(isAboveTheFold), type('titleish')), + // consider all visible h1 and span elements + rule(dom('h1, span').when(isVisible), type('titleish')), // score higher for h1 elements rule(type('titleish'), score(isHeaderElement)), + // better score the closer the element is to the top of the page + rule(type('titleish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldTitleCoeff))), // check if the id has "title" in it - rule(type('titleish'), score(hasTitleInID)), + rule(type('titleish'), score(fnode => hasFeatureNameInID(fnode, 'titleish', hasFeatureNameInIDTitleCoeff))), // check if any class names have "title" in them - rule(type('titleish'), score(hasTitleInClassName)), + rule(type('titleish'), score(fnode => hasFeatureNameInClassName(fnode, 'titleish', hasFeatureNameInClassNameTitleCoeff))), // better score for larger font size - rule(type('titleish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('titleish'), score(isHidden)), + rule(type('titleish').when(removeRedundantAncestors), score(fnode => largerFontSize(fnode, largerFontSizeTitleCoeff))), // return title element with max score rule(type('titleish').max(), out('title')), @@ -186,14 +295,22 @@ trainees.set( * * If training, comment out unless training 'price'. */ - // consider all span and h2 elements near the top of the page - rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // consider all visible span and h2 elements + rule(dom('span, h2').when(isVisible), type('priceish')), // check if the element has a '$' in its innerText - rule(type('priceish'), score(hasDollarSign)), + rule(type('priceish').when(removeRedundantAncestors), score(hasDollarSign)), + // better score the closer the element is to the top of the page + rule(type('priceish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldPriceCoeff))), + // check if the id has "price" in it + rule(type('priceish'), score(fnode => hasFeatureNameInID(fnode, 'priceish', hasFeatureNameInIDPriceCoeff))), + // check if any class names have "price" in them + rule(type('priceish'), score(fnode => hasFeatureNameInClassName(fnode, 'priceish', hasFeatureNameInClassNamePriceCoeff))), // better score for larger font size - rule(type('priceish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('priceish'), score(isHidden)), + rule(type('priceish'), score(fnode => largerFontSize(fnode, largerFontSizePriceCoeff))), + // check innerText length for optimal range + rule(type('priceish'), score(optimalPriceStringLength)), + // check for proximity to max scoring image element + rule(type('priceish'), score(closestToImage)), // return price element with max score rule(type('priceish').max(), out('price')), );