diff --git a/.gitignore b/.gitignore index a26de97..11393fe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ node_modules web-ext-artifacts build gecko.log +.DS_Store diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index d54851e..a90f91a 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,9 +1,11 @@ { - "largerImageCoeff": 3, - "largerFontSizeCoeff": 1, - "hasDollarSignCoeff": 3, - "hasTitleInIDCoeff": 10, - "hasTitleInClassNameCoeff": 5, - "isHiddenCoeff": -100, - "isHeaderElementCoeff": 10 + "largerImageCoeff": 2, + "largerFontSizeCoeff": 7, + "hasDollarSignCoeff": 8, + "hasPriceInIDCoeff": 17, + "hasPriceInClassNameCoeff": 2, + "isAboveTheFoldPriceCoeff": 33, + "isAboveTheFoldImageCoeff": 13, + "isNearbyImageXAxisCoeff": 5, + "hasPriceishPatternCoeff": 15 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 65790d2..b804b44 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -15,10 +15,12 @@ import { largerImageCoeff, largerFontSizeCoeff, hasDollarSignCoeff, - hasTitleInIDCoeff, - hasTitleInClassNameCoeff, - isHiddenCoeff, - isHeaderElementCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, } from 'commerce/fathom_coefficients.json'; const PRODUCT_FEATURES = ['title', 'price', 'image']; @@ -36,10 +38,12 @@ function runRuleset(doc) { largerImageCoeff, largerFontSizeCoeff, hasDollarSignCoeff, - hasTitleInIDCoeff, - hasTitleInClassNameCoeff, - isHiddenCoeff, - isHeaderElementCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, ]).against(doc).get(`${feature}`); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. 
diff --git a/src/fathom_ruleset.js b/src/fathom_ruleset.js index 39ef04e..bfcd0f9 100644 --- a/src/fathom_ruleset.js +++ b/src/fathom_ruleset.js @@ -3,10 +3,17 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; +import {ancestors} from 'fathom-web/utils'; // for training: utilsForFrontend const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; +const SCORE_THRESHOLD = 4; +const TOP_BUFFER = 150; const VIEWPORT_HEIGHT = window.innerHeight; +const VIEWPORT_WIDTH = window.innerWidth; +// Taken from: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js +const ZEROISH = 0.08; +const ONEISH = 0.9; /** * Rulesets to train. @@ -40,17 +47,20 @@ trainees.set( * out-rule of interest are the same. A multi-out ruleset will not work without * commenting out all but one `out` and setting the ruleset name to that `out`. */ - 'product', // 'product' for production and 'title', 'image' or 'price' for training + 'product', // Ruleset name: 'product' for production and 'title', 'image' or 'price' for training { - coeffs: [3, 1, 3, 10, 5, -100, 10], // Input rule coefficients in order here + // For training only: input rule coefficients in order here + coeffs: [2, 7, 8, 17, 2, 33, 13, 5, 15], rulesetMaker([ - coeffLargerImage, - coeffLargerFontSize, - coeffHasDollarSign, - coeffHasTitleInID, - coeffHasTitleInClassName, - coeffIsHidden, - coeffIsHeaderElement, + largerImageCoeff, + largerFontSizeCoeff, + hasDollarSignCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, ]) { /** * Scores fnode in direct proportion to its size @@ -61,11 +71,11 @@ trainees.set( if (area === 0) { return DEFAULT_SCORE; } - return area * coeffLargerImage; + return area * largerImageCoeff; } /** - * Scores fnode in direct proportion to its font size + * Scores fnode in proportion to its font size */ 
function largerFontSize(fnode) { const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; @@ -73,7 +83,7 @@ trainees.set( if (size) { // normalize the multiplier by the default font size const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; - return (sizeMultiplier * coeffLargerFontSize); + return (sizeMultiplier * largerFontSizeCoeff); } return DEFAULT_SCORE; } @@ -83,82 +93,214 @@ trainees.set( */ function hasDollarSign(fnode) { if (fnode.element.innerText.includes('$')) { - return coeffHasDollarSign; + return hasDollarSignCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode with "title" in its id + * Scores fnode with 'price' in its id or its parent's id */ - function hasTitleInID(fnode) { - const id = fnode.element.id; - if (id.includes('title') || id.includes('Title')) { - return coeffHasTitleInID; + function hasPriceInID(fnode) { + const element = fnode.element; + const parentElement = element.parentElement; + const ID = element.id; + const parentID = parentElement.id; + if (ID.includes('price') || ID.includes('Price')) { + return hasPriceInIDCoeff; + } + if (parentID.includes('price') || parentID.includes('Price')) { + return 0.75 * hasPriceInIDCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode with "title" in a class name + * Scores fnode with 'price' in its class name or its parent's class name */ - function hasTitleInClassName(fnode) { - const className = fnode.element.className; - if (className.includes('title') || className.includes('Title')) { - return coeffHasTitleInClassName; + function hasPriceInClassName(fnode) { + const element = fnode.element; + const parentElement = element.parentElement; + const className = element.className; + const parentClassName = parentElement.className; + if (className.includes('price') || className.includes('Price')) { + return hasPriceInClassNameCoeff; + } + if (parentClassName.includes('price') || parentClassName.includes('Price')) { + return 0.75 * hasPriceInClassNameCoeff; } return 
DEFAULT_SCORE; } /** - * Scores fnode that is hidden + * Checks if fnode is visible + */ + function isVisible(fnode) { + const element = fnode.element; + for (const ancestor of ancestors(element)) { + const style = getComputedStyle(ancestor); + if (style.visibility === 'hidden' + || style.display === 'none' + || style.opacity === '0' + || style.width === '0' + || style.height === '0') { + return false; + } + } + return true; + } + + /** + * Scale a number to the range [ZEROISH, ONEISH]. + * + * Taken from: https://github.com/mozilla/fathom-trainees + * + * For a rising trapezoid, the result is ZEROISH until the input + * reaches zeroAt, then increases linearly until oneAt, at which it + * becomes ONEISH. To make a falling trapezoid, where the result is + * ONEISH to the left and ZEROISH to the right, use a zeroAt greater + * than oneAt. + */ + function trapezoid(number, zeroAt, oneAt) { + const isRising = zeroAt < oneAt; + if (isRising) { + if (number <= zeroAt) { + return ZEROISH; + } + if (number >= oneAt) { + return ONEISH; + } + } else { + if (number >= zeroAt) { + return ZEROISH; + } + if (number <= oneAt) { + return ONEISH; + } + } + const slope = (ONEISH - ZEROISH) / (oneAt - zeroAt); + return slope * (number - zeroAt) + ZEROISH; + } + + /** + * Scores fnode by its vertical location relative to the fold + */ + function isAboveTheFold(fnode, featureCoeff) { + const domRect = fnode.element.getBoundingClientRect(); + // Use a falling trapezoid to score the element; + // result is ONEISH until the input reaches VIEWPORT_HEIGHT, then decreases + // linearly until VIEWPORT_HEIGHT * 2, where it becomes ZEROISH. + return trapezoid(domRect.top, VIEWPORT_HEIGHT * 2, VIEWPORT_HEIGHT) * featureCoeff; + } + + /** + * Checks to see if fnode is eligible for scoring + * Note: This is a compound method, because `.when` chaining these methods onto + * a `dom` rule does not currently work. 
+ */ + function isEligible(fnode, featureType) { + if (featureType === 'priceish') { + return ( + isVisible(fnode) + && removeRedundantAncestors(fnode) + && isNearbyImageYAxis(fnode) + ); + } + if (featureType === 'titleish') { + return ( + isVisible(fnode) + /** + * Don't removeRedundantAncestors, because

<h1> tags for + * Amazon and Walmart have <a> and <div>
element children, + * respectively, with the same innerText. + */ + && isNearbyImageYAxis(fnode) + ); + } + return false; + } + + /** + * Checks if fnode has the same innerText as any of its children */ - function isHidden(fnode) { + function removeRedundantAncestors(fnode) { const element = fnode.element; - const style = window.getComputedStyle(element); - if (!element.offsetParent // null if the offsetParent has a display set to "none" - || style.visibility === 'hidden' - || style.opacity === '0' - || style.width === '0' - || style.height === '0') { - return coeffIsHidden; + const children = element.children; + if (children.length > 0) { + for (const descendant of children) { + if (descendant.innerText === element.innerText) { + return false; + } + } + } + return true; + } + + /** + * Scores fnode based on its x distance from the highest scoring image element + */ + function isNearbyImageXAxis(fnode) { + const element = fnode.element; + const eleDOMRect = element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + const deltaX = eleDOMRect.left - imageDOMRect.right; + // priceish element is always* to the right of the image + if (deltaX > 0) { + // give a higher score the closer it is to the image, normalized by VIEWPORT_WIDTH + return (VIEWPORT_WIDTH / deltaX) * isNearbyImageXAxisCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode that is an H1 element + * Scores fnode whose innerText matches a priceish RegExp pattern */ - function isHeaderElement(fnode) { - if (fnode.element.tagName === 'H1') { - return coeffIsHeaderElement; + function hasPriceishPattern(fnode) { + const text = fnode.element.innerText; + /** + * With an optional '$' that doesn't necessarily have to be at the beginning + * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before + * a decimal point and exactly two after, where the two 
digits after the decimal point + * are at the end of the string + */ + const regExp = /\${0,1}\d+\.\d{2}$/; + if (regExp.test(text)) { + return hasPriceishPatternCoeff; } return DEFAULT_SCORE; } /** - * Returns true if the fnode is above the fold + * Checks if fnode is nearby the top scoring image element in the y-axis */ - function isAboveTheFold(fnode) { - const domRect = fnode.element.getBoundingClientRect(); - if (domRect.top <= VIEWPORT_HEIGHT) { + function isNearbyImageYAxis(fnode) { + const element = fnode.element; + const DOMRect = element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + if (DOMRect.top >= (imageDOMRect.top - TOP_BUFFER) + && DOMRect.bottom <= imageDOMRect.bottom) { return true; } return false; } - /* The actual ruleset */ + /* The ruleset */ const rules = ruleset( /** * Image rules * - * If training, comment out unless training 'image'. + * If training, leave uncommented, as 'price' and 'title' rules depend + * on the `out` of these 'image' rules. */ - // consider all img elements near the top of the page - rule(dom('img').when(isAboveTheFold), type('imageish')), + // consider all visible img elements + rule(dom('img').when(isVisible), type('imageish')), + // better score the closer the element is to the top of the page + rule(type('imageish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldImageCoeff))), // better score for larger images rule(type('imageish'), score(largerImage)), - // return image element with max score + // return image element(s) with max score rule(type('imageish').max(), out('image')), /** @@ -166,19 +308,11 @@ trainees.set( * * If training, comment out unless training 'title'. 
*/ - // consider all h1 and span elements near the top of the page - rule(dom('h1, span').when(isAboveTheFold), type('titleish')), - // score higher for h1 elements - rule(type('titleish'), score(isHeaderElement)), - // check if the id has "title" in it - rule(type('titleish'), score(hasTitleInID)), - // check if any class names have "title" in them - rule(type('titleish'), score(hasTitleInClassName)), - // better score for larger font size - rule(type('titleish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('titleish'), score(isHidden)), - // return title element with max score + // consider all eligible h1 elements + rule(dom('h1').when(fnode => isEligible(fnode, 'titleish')), type('titleish')), + // since no further rules are needed for title, give all inputs the minimum score + rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // return title element(s) with max score rule(type('titleish').max(), out('title')), /** @@ -186,15 +320,23 @@ trainees.set( * * If training, comment out unless training 'price'. 
*/ - // consider all span and h2 elements near the top of the page - rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // consider all eligible span and h2 elements + rule(dom('span, h2').when(fnode => isEligible(fnode, 'priceish')), type('priceish')), // check if the element has a '$' in its innerText rule(type('priceish'), score(hasDollarSign)), + // better score the closer the element is to the top of the page + rule(type('priceish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldPriceCoeff))), + // check if the id has "price" in it + rule(type('priceish'), score(hasPriceInID)), + // check if any class names have "price" in them + rule(type('priceish'), score(hasPriceInClassName)), // better score for larger font size rule(type('priceish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('priceish'), score(isHidden)), - // return price element with max score + // check for x-axis proximity to max scoring image element + rule(type('priceish'), score(isNearbyImageXAxis)), + // check if innerText has a priceish pattern + rule(type('priceish'), score(hasPriceishPattern)), + // return price element(s) with max score rule(type('priceish').max(), out('price')), ); return rules;