From 5b0aba87ad7ff8fa652f76a08aa08c640354cc97 Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Thu, 9 Aug 2018 22:16:32 -0700 Subject: [PATCH] #36: Update rules for better accuracy These rules and coefficients yield the following accuracy based on a training corpus of 50 product pages from our top 5 sites (Amazon, Ebay, Walmart, Best Buy and Home Depot): * 100% for product 'image' * 96% for product 'title' * 94% for product 'price' Product 'price' and 'title' features have proximity rules based on the highest scoring product 'image' element. For now, this is done by accessing the image fnode using an internal '_ruleset' object; @erikrose is working on better support for this use case in the very near future, so this implementation can be improved at that time. --- .gitignore | 1 + src/fathom_coefficients.json | 16 ++- src/fathom_extraction.js | 20 +-- src/fathom_ruleset.js | 268 +++++++++++++++++++++++++++-------- 4 files changed, 227 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index a26de97..11393fe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ node_modules web-ext-artifacts build gecko.log +.DS_Store diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index d54851e..a90f91a 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,9 +1,11 @@ { - "largerImageCoeff": 3, - "largerFontSizeCoeff": 1, - "hasDollarSignCoeff": 3, - "hasTitleInIDCoeff": 10, - "hasTitleInClassNameCoeff": 5, - "isHiddenCoeff": -100, - "isHeaderElementCoeff": 10 + "largerImageCoeff": 2, + "largerFontSizeCoeff": 7, + "hasDollarSignCoeff": 8, + "hasPriceInIDCoeff": 17, + "hasPriceInClassNameCoeff": 2, + "isAboveTheFoldPriceCoeff": 33, + "isAboveTheFoldImageCoeff": 13, + "isNearbyImageXAxisCoeff": 5, + "hasPriceishPatternCoeff": 15 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 65790d2..b804b44 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -15,10 +15,12 @@ 
import { largerImageCoeff, largerFontSizeCoeff, hasDollarSignCoeff, - hasTitleInIDCoeff, - hasTitleInClassNameCoeff, - isHiddenCoeff, - isHeaderElementCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, } from 'commerce/fathom_coefficients.json'; const PRODUCT_FEATURES = ['title', 'price', 'image']; @@ -36,10 +38,12 @@ function runRuleset(doc) { largerImageCoeff, largerFontSizeCoeff, hasDollarSignCoeff, - hasTitleInIDCoeff, - hasTitleInClassNameCoeff, - isHiddenCoeff, - isHeaderElementCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, ]).against(doc).get(`${feature}`); fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); // It is possible for multiple elements to have the same highest score. diff --git a/src/fathom_ruleset.js b/src/fathom_ruleset.js index 39ef04e..bfcd0f9 100644 --- a/src/fathom_ruleset.js +++ b/src/fathom_ruleset.js @@ -3,10 +3,17 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; +import {ancestors} from 'fathom-web/utils'; // for training: utilsForFrontend const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; +const SCORE_THRESHOLD = 4; +const TOP_BUFFER = 150; const VIEWPORT_HEIGHT = window.innerHeight; +const VIEWPORT_WIDTH = window.innerWidth; +// Taken from: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js +const ZEROISH = 0.08; +const ONEISH = 0.9; /** * Rulesets to train. @@ -40,17 +47,20 @@ trainees.set( * out-rule of interest are the same. A multi-out ruleset will not work without * commenting out all but one `out` and setting the ruleset name to that `out`. 
*/ - 'product', // 'product' for production and 'title', 'image' or 'price' for training + 'product', // Ruleset name: 'product' for production and 'title', 'image' or 'price' for training { - coeffs: [3, 1, 3, 10, 5, -100, 10], // Input rule coefficients in order here + // For training only: input rule coefficients in order here + coeffs: [2, 7, 8, 17, 2, 33, 13, 5, 15], rulesetMaker([ - coeffLargerImage, - coeffLargerFontSize, - coeffHasDollarSign, - coeffHasTitleInID, - coeffHasTitleInClassName, - coeffIsHidden, - coeffIsHeaderElement, + largerImageCoeff, + largerFontSizeCoeff, + hasDollarSignCoeff, + hasPriceInIDCoeff, + hasPriceInClassNameCoeff, + isAboveTheFoldPriceCoeff, + isAboveTheFoldImageCoeff, + isNearbyImageXAxisCoeff, + hasPriceishPatternCoeff, ]) { /** * Scores fnode in direct proportion to its size @@ -61,11 +71,11 @@ trainees.set( if (area === 0) { return DEFAULT_SCORE; } - return area * coeffLargerImage; + return area * largerImageCoeff; } /** - * Scores fnode in direct proportion to its font size + * Scores fnode in proportion to its font size */ function largerFontSize(fnode) { const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; @@ -73,7 +83,7 @@ trainees.set( if (size) { // normalize the multiplier by the default font size const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; - return (sizeMultiplier * coeffLargerFontSize); + return (sizeMultiplier * largerFontSizeCoeff); } return DEFAULT_SCORE; } @@ -83,82 +93,214 @@ trainees.set( */ function hasDollarSign(fnode) { if (fnode.element.innerText.includes('$')) { - return coeffHasDollarSign; + return hasDollarSignCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode with "title" in its id + * Scores fnode with 'price' in its id or its parent's id */ - function hasTitleInID(fnode) { - const id = fnode.element.id; - if (id.includes('title') || id.includes('Title')) { - return coeffHasTitleInID; + function hasPriceInID(fnode) { + const element = fnode.element; + const 
parentElement = element.parentElement; + const ID = element.id; + const parentID = parentElement.id; + if (ID.includes('price') || ID.includes('Price')) { + return hasPriceInIDCoeff; + } + if (parentID.includes('price') || parentID.includes('Price')) { + return 0.75 * hasPriceInIDCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode with "title" in a class name + * Scores fnode with 'price' in its class name or its parent's class name */ - function hasTitleInClassName(fnode) { - const className = fnode.element.className; - if (className.includes('title') || className.includes('Title')) { - return coeffHasTitleInClassName; + function hasPriceInClassName(fnode) { + const element = fnode.element; + const parentElement = element.parentElement; + const className = element.className; + const parentClassName = parentElement.className; + if (className.includes('price') || className.includes('Price')) { + return hasPriceInClassNameCoeff; + } + if (parentClassName.includes('price') || parentClassName.includes('Price')) { + return 0.75 * hasPriceInClassNameCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode that is hidden + * Checks if fnode is visible + */ + function isVisible(fnode) { + const element = fnode.element; + for (const ancestor of ancestors(element)) { + const style = getComputedStyle(ancestor); + if (style.visibility === 'hidden' + || style.display === 'none' + || style.opacity === '0' + || style.width === '0' + || style.height === '0') { + return false; + } + } + return true; + } + + /** + * Scale a number to the range [ZEROISH, ONEISH]. + * + * Taken from: https://github.com/mozilla/fathom-trainees + * + * For a rising trapezoid, the result is ZEROISH until the input + * reaches zeroAt, then increases linearly until oneAt, at which it + * becomes ONEISH. To make a falling trapezoid, where the result is + * ONEISH to the left and ZEROISH to the right, use a zeroAt greater + * than oneAt. 
+ */ + function trapezoid(number, zeroAt, oneAt) { + const isRising = zeroAt < oneAt; + if (isRising) { + if (number <= zeroAt) { + return ZEROISH; + } + if (number >= oneAt) { + return ONEISH; + } + } else { + if (number >= zeroAt) { + return ZEROISH; + } + if (number <= oneAt) { + return ONEISH; + } + } + const slope = (ONEISH - ZEROISH) / (oneAt - zeroAt); + return slope * (number - zeroAt) + ZEROISH; + } + + /** + * Scores fnode by its vertical location relative to the fold + */ + function isAboveTheFold(fnode, featureCoeff) { + const domRect = fnode.element.getBoundingClientRect(); + // Use a falling trapezoid to score the element; + // result is ONEISH until the input reaches VIEWPORT_HEIGHT, then decreases + // linearly until VIEWPORT_HEIGHT * 2, where it becomes ZEROISH. + return trapezoid(domRect.top, VIEWPORT_HEIGHT * 2, VIEWPORT_HEIGHT) * featureCoeff; + } + + /** + * Checks to see if fnode is eligible for scoring + * Note: This is a compound method, because `.when` chaining these methods onto + * a `dom` rule does not currently work. + */ + function isEligible(fnode, featureType) { + if (featureType === 'priceish') { + return ( + isVisible(fnode) + && removeRedundantAncestors(fnode) + && isNearbyImageYAxis(fnode) + ); + } + if (featureType === 'titleish') { + return ( + isVisible(fnode) + /** + * Don't removeRedundantAncestors, because

<h1> tags for + * Amazon and Walmart have <a> and <div>
element children, + * respectively, with the same innerText. + */ + && isNearbyImageYAxis(fnode) + ); + } + return false; + } + + /** + * Checks if fnode has the same innerText as any of its children */ - function isHidden(fnode) { + function removeRedundantAncestors(fnode) { const element = fnode.element; - const style = window.getComputedStyle(element); - if (!element.offsetParent // null if the offsetParent has a display set to "none" - || style.visibility === 'hidden' - || style.opacity === '0' - || style.width === '0' - || style.height === '0') { - return coeffIsHidden; + const children = element.children; + if (children.length > 0) { + for (const descendant of children) { + if (descendant.innerText === element.innerText) { + return false; + } + } + } + return true; + } + + /** + * Scores fnode based on its x distance from the highest scoring image element + */ + function isNearbyImageXAxis(fnode) { + const element = fnode.element; + const eleDOMRect = element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + const deltaX = eleDOMRect.left - imageDOMRect.right; + // priceish element is always* to the right of the image + if (deltaX > 0) { + // give a higher score the closer it is to the image, normalized by VIEWPORT_WIDTH + return (VIEWPORT_WIDTH / deltaX) * isNearbyImageXAxisCoeff; } return DEFAULT_SCORE; } /** - * Scores fnode that is an H1 element + * Scores fnode whose innerText matches a priceish RegExp pattern */ - function isHeaderElement(fnode) { - if (fnode.element.tagName === 'H1') { - return coeffIsHeaderElement; + function hasPriceishPattern(fnode) { + const text = fnode.element.innerText; + /** + * With an optional '$' that doesn't necessarily have to be at the beginning + * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before + * a decimal point and exactly two after, where the two 
digits after the decimal point + * are at the end of the string + */ + const regExp = /\${0,1}\d+\.\d{2}$/; + if (regExp.test(text)) { + return hasPriceishPatternCoeff; } return DEFAULT_SCORE; } /** - * Returns true if the fnode is above the fold + * Checks if fnode is nearby the top scoring image element in the y-axis */ - function isAboveTheFold(fnode) { - const domRect = fnode.element.getBoundingClientRect(); - if (domRect.top <= VIEWPORT_HEIGHT) { + function isNearbyImageYAxis(fnode) { + const element = fnode.element; + const DOMRect = element.getBoundingClientRect(); + const imageElement = fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle + const imageDOMRect = imageElement.getBoundingClientRect(); + if (DOMRect.top >= (imageDOMRect.top - TOP_BUFFER) + && DOMRect.bottom <= imageDOMRect.bottom) { return true; } return false; } - /* The actual ruleset */ + /* The ruleset */ const rules = ruleset( /** * Image rules * - * If training, comment out unless training 'image'. + * If training, leave uncommented, as 'price' and 'title' rules depend + * on the `out` of these 'image' rules. */ - // consider all img elements near the top of the page - rule(dom('img').when(isAboveTheFold), type('imageish')), + // consider all visible img elements + rule(dom('img').when(isVisible), type('imageish')), + // better score the closer the element is to the top of the page + rule(type('imageish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldImageCoeff))), // better score for larger images rule(type('imageish'), score(largerImage)), - // return image element with max score + // return image element(s) with max score rule(type('imageish').max(), out('image')), /** @@ -166,19 +308,11 @@ trainees.set( * * If training, comment out unless training 'title'. 
*/ - // consider all h1 and span elements near the top of the page - rule(dom('h1, span').when(isAboveTheFold), type('titleish')), - // score higher for h1 elements - rule(type('titleish'), score(isHeaderElement)), - // check if the id has "title" in it - rule(type('titleish'), score(hasTitleInID)), - // check if any class names have "title" in them - rule(type('titleish'), score(hasTitleInClassName)), - // better score for larger font size - rule(type('titleish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('titleish'), score(isHidden)), - // return title element with max score + // consider all eligible h1 elements + rule(dom('h1').when(fnode => isEligible(fnode, 'titleish')), type('titleish')), + // since no further rules are needed for title, give all inputs the minimum score + rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // return title element(s) with max score rule(type('titleish').max(), out('title')), /** @@ -186,15 +320,23 @@ trainees.set( * * If training, comment out unless training 'price'. 
*/ - // consider all span and h2 elements near the top of the page - rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // consider all eligible span and h2 elements + rule(dom('span, h2').when(fnode => isEligible(fnode, 'priceish')), type('priceish')), // check if the element has a '$' in its innerText rule(type('priceish'), score(hasDollarSign)), + // better score the closer the element is to the top of the page + rule(type('priceish'), score(fnode => isAboveTheFold(fnode, isAboveTheFoldPriceCoeff))), + // check if the id has "price" in it + rule(type('priceish'), score(hasPriceInID)), + // check if any class names have "price" in them + rule(type('priceish'), score(hasPriceInClassName)), // better score for larger font size rule(type('priceish'), score(largerFontSize)), - // reduce score if element is hidden - rule(type('priceish'), score(isHidden)), - // return price element with max score + // check for x-axis proximity to max scoring image element + rule(type('priceish'), score(isNearbyImageXAxis)), + // check if innerText has a priceish pattern + rule(type('priceish'), score(hasPriceishPattern)), + // return price element(s) with max score rule(type('priceish').max(), out('price')), ); return rules;