From 9afa26632755d27c8f6f7687b868fed3d6d2609c Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Tue, 7 Aug 2018 11:33:39 -0700 Subject: [PATCH] #36: Update product title rules. Product title rules previously pulled the unique 'title' element from the 'head' element on the page (part of the pages metadata). While this ostensibly requires less processing (we don't have to search the DOM or score any other elements), the title string often requires site-specific cleaning such as to remove the vendor name, and the final, cleaned up string cannot not be verified as accurate by Fathom, which only tells us if our rules picked the right element. The alternative approach, implemented here, is to pull the title from the corresponding element in the content of the page. Since Fathom can verify that the right element was selected, and the string from this element would not require any cleaning, this approach is a much better proxy for extracting the correct product title. --- src/fathom_coefficients.json | 8 ++- src/fathom_extraction.js | 128 +++++++++++++++++++---------------- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index 2846242..ceec556 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,5 +1,9 @@ { "largerImage": 3, - "largerFontSize": 2, - "hasDollarSign": 3 + "largerFontSize": 1, + "hasDollarSign": 3, + "hasTitleInID": 10, + "hasTitleInClassName": 5, + "isHidden": -100, + "isHeaderElement": 10 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 4e75202..19413d0 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -14,16 +14,11 @@ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; import fathomCoeffs from 'commerce/fathom_coefficients.json'; const PRODUCT_FEATURES = ['title', 'price', 'image']; -const SCORE_THRESHOLD = 3; +const SCORE_THRESHOLD = 4; +const DEFAULT_BODY_FONT_SIZE = 14; const DEFAULT_SCORE = 1; const VIEWPORT_HEIGHT = window.innerHeight; -/** - * Each of these functions represents a rule check: if the fnode passes - * the rule, it gets a weighted score from 'fathom_coefficients.json'; - * otherwise, it gets the default score. - */ - /** * Returns true if the fnode is above the fold */ @@ -61,16 +56,66 @@ function hasDollarSign(fnode) { * Scores fnode in direct proportion to its font size */ function largerFontSize(fnode) { - const sizeWithUnits = window.getComputedStyle(fnode.element).getPropertyValue('font-size'); + const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize; const size = sizeWithUnits.replace('px', ''); if (size) { - return (parseInt(size, 10) * fathomCoeffs.largerFontSize); + // normalize the multiplier by the default font size + const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE; + return (sizeMultiplier * fathomCoeffs.largerFontSize); + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode with "title" in its id + */ +function hasTitleInID(fnode) { + const id = fnode.element.id; + if (id.includes('title') || id.includes('Title')) { + return fathomCoeffs.hasTitleInID; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode with "title" in a class name + */ +function hasTitleInClassName(fnode) { + const className = fnode.element.className; + if (className.includes('title') || className.includes('Title')) { + return fathomCoeffs.hasTitleInClassName; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode that is hidden + */ +function isHidden(fnode) { + const element = fnode.element; + const style = window.getComputedStyle(element); + if (!element.offsetParent // null if the offsetParent has a display set to "none" + || style.visibility === 'hidden' + || style.opacity === '0' + || style.width === '0' + || style.height === '0') { + return fathomCoeffs.isHidden; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode that is an H1 element + */ +function isHeaderElement(fnode) { + if (fnode.element.tagName === 'H1') { + return fathomCoeffs.isHeaderElement; } return DEFAULT_SCORE; } /** - * Ruleset for product features. Each feature has its own type. + * Ruleset for product features; each feature has its own type. */ const rules = ruleset( /** @@ -86,10 +131,18 @@ const rules = ruleset( /** * Title rules */ - // consider only the title element - rule(dom('title'), type('titleish')), - // give the title element the minimum score - rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // consider all h1 and span elements near the top of the page + rule(dom('h1, span').when(isAboveTheFold), type('titleish')), + // score higher for h1 elements + rule(type('titleish'), score(isHeaderElement)), + // check if the id has "title" in it + rule(type('titleish'), score(hasTitleInID)), + // check if any class names have "title" in them + rule(type('titleish'), score(hasTitleInClassName)), + // better score for larger font size + rule(type('titleish'), score(largerFontSize)), + // reduce score if element is hidden + rule(type('titleish'), score(isHidden)), // return title element with max score rule(type('titleish').max(), out('title')), @@ -102,6 +155,8 @@ const rules = ruleset( rule(type('priceish'), score(hasDollarSign)), // better score for larger font size rule(type('priceish'), score(largerFontSize)), + // reduce score if element is hidden + rule(type('priceish'), score(isHidden)), // return price element with max score rule(type('priceish').max(), out('price')), ); @@ -130,42 +185,6 @@ function hasAllFeatures(obj) { return PRODUCT_FEATURES.map(key => obj[key]).every(val => val); } -// Trim off the shorter substring between ' - ', ': ' or ' | ' -function trimTitle(title) { - let textArr = []; - // TODO: This currently cuts of the " - Black" substring on E-bay - if (title.includes(' - ')) { - textArr = title.split(' - '); - } - if (title.includes(': ')) { - textArr = title.split(': '); - } - if (textArr.length >= 1) { - return textArr.reduce((a, b) => ((a.length > b.length) ? a : b)); - } - return title; -} - -/** - * Takes a price string of the form "$1997 /each" and turns - * it into "$19.97". - * TODO: Can this be generalized/simplified? This is very specific - * to Home Depot's product pages. - */ -function formatPrice(price) { - let formattedPrice = price; - if (price.includes('/')) { - const index = price.indexOf('/'); - formattedPrice = price.slice(0, index); - formattedPrice = formattedPrice.trim(); - const decimalIndex = formattedPrice.length - 2; - const rightSide = formattedPrice.substring(decimalIndex); - const leftSide = formattedPrice.replace(rightSide, ''); - formattedPrice = `${leftSide}.${rightSide}`; - } - return formattedPrice; -} - /* * Run the ruleset for the product features against the current window document */ @@ -174,16 +193,9 @@ export default function extractProduct(doc) { const extractedElements = runRuleset(doc); if (hasAllFeatures(extractedElements)) { for (const feature of PRODUCT_FEATURES) { - let text = extractedElements[feature].innerText; - if (feature === 'title') { - text = trimTitle(text); - } - if (feature === 'price') { - text = formatPrice(text); - } extractedProduct[feature] = (feature === 'image' ? extractedElements[feature].src - : text + : extractedElements[feature].innerText ); } }