diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index 83b8f89..2846242 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,3 +1,5 @@ { - "hasPriceClass": 2 + "largerImage": 3, + "largerFontSize": 2, + "hasDollarSign": 3 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 6efad7f..4e75202 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -13,30 +13,97 @@ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; import fathomCoeffs from 'commerce/fathom_coefficients.json'; -const SCORE_THRESHOLD = fathomCoeffs.hasPriceClass; +const PRODUCT_FEATURES = ['title', 'price', 'image']; +const SCORE_THRESHOLD = 3; +const DEFAULT_SCORE = 1; +const VIEWPORT_HEIGHT = window.innerHeight; /** - * Scores fnodes with a "price" class + * Each of these functions represents a rule check: if the fnode passes + * the rule, it gets a weighted score from 'fathom_coefficients.json'; + * otherwise, it gets the default score. */ -function hasPriceClass(fnode) { - if (fnode.element.classList.contains('price')) { - return fathomCoeffs.hasPriceClass; + +/** + * Returns true if the fnode is above the fold + */ +function isAboveTheFold(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + if (domRect.top <= VIEWPORT_HEIGHT) { + return true; } - return 1; + return false; +} + +/** + * Scores fnode in direct proportion to its size + */ +function largerImage(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + const area = (domRect.width) * (domRect.height); + if (area === 0) { + return DEFAULT_SCORE; + } + return area * fathomCoeffs.largerImage; +} + +/** + * Scores fnode with a '$' in its innerText + */ +function hasDollarSign(fnode) { + if (fnode.element.innerText.includes('$')) { + return fathomCoeffs.hasDollarSign; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode in direct proportion to its font size + */ +function largerFontSize(fnode) { + const sizeWithUnits = window.getComputedStyle(fnode.element).getPropertyValue('font-size'); + const size = sizeWithUnits.replace('px', ''); + if (size) { + return (parseInt(size, 10) * fathomCoeffs.largerFontSize); + } + return DEFAULT_SCORE; } /** * Ruleset for product features. Each feature has its own type. */ const rules = ruleset( - // get all elements that could contain the price - rule(dom('div'), type('priceish')), + /** + * Image rules + */ + // consider all img elements near the top of the page + rule(dom('img').when(isAboveTheFold), type('imageish')), + // better score for larger images + rule(type('imageish'), score(largerImage)), + // return image element with max score + rule(type('imageish').max(), out('image')), - // check class names to see if they contain 'price' - rule(type('priceish'), score(hasPriceClass)), + /** + * Title rules + */ + // consider only the title element + rule(dom('title'), type('titleish')), + // give the title element the minimum score + rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // return title element with max score + rule(type('titleish').max(), out('title')), + /** + * Price rules + */ + // consider all span and h2 elements near the top of the page + rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // check if the element has a '$' in its innerText + rule(type('priceish'), score(hasDollarSign)), + // better score for larger font size + rule(type('priceish'), score(largerFontSize)), // return price element with max score - rule(type('priceish').max(), out('product-price')), + rule(type('priceish').max(), out('price')), ); /** @@ -44,27 +111,81 @@ const rules = ruleset( * contained in a page's HTML document. */ function runRuleset(doc) { - let fnodesList = rules.against(doc).get('product-price'); - fnodesList = fnodesList.filter(fnode => fnode.scoreFor('priceish') >= SCORE_THRESHOLD); - // It is possible for multiple elements to have the same highest score. - if (fnodesList.length >= 1) { - return fnodesList[0].element; + const extractedElements = {}; + for (const feature of PRODUCT_FEATURES) { + let fnodesList = rules.against(doc).get(`${feature}`); + fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); + // It is possible for multiple elements to have the same highest score. + if (fnodesList.length >= 1) { + extractedElements[feature] = fnodesList[0].element; + } + } + return extractedElements; +} + +/** + * Returns true if every key in PRODUCT_FEATURES has a truthy value. + */ +function hasAllFeatures(obj) { + return PRODUCT_FEATURES.map(key => obj[key]).every(val => val); +} + +// Trim off the shorter substring between ' - ', ': ' or ' | ' +function trimTitle(title) { + let textArr = []; + // TODO: This currently cuts of the " - Black" substring on E-bay + if (title.includes(' - ')) { + textArr = title.split(' - '); + } + if (title.includes(': ')) { + textArr = title.split(': '); + } + if (textArr.length >= 1) { + return textArr.reduce((a, b) => ((a.length > b.length) ? a : b)); + } + return title; +} + +/** + * Takes a price string of the form "$1997 /each" and turns + * it into "$19.97". + * TODO: Can this be generalized/simplified? This is very specific + * to Home Depot's product pages. + */ +function formatPrice(price) { + let formattedPrice = price; + if (price.includes('/')) { + const index = price.indexOf('/'); + formattedPrice = price.slice(0, index); + formattedPrice = formattedPrice.trim(); + const decimalIndex = formattedPrice.length - 2; + const rightSide = formattedPrice.substring(decimalIndex); + const leftSide = formattedPrice.replace(rightSide, ''); + formattedPrice = `${leftSide}.${rightSide}`; } - return null; + return formattedPrice; } /* * Run the ruleset for the product features against the current window document */ export default function extractProduct(doc) { - const priceEle = runRuleset(doc); - if (priceEle) { - const price = (priceEle.tagName !== 'META') ? priceEle.textContent : priceEle.getAttribute('content'); - if (price) { - return { - price, - }; + const extractedProduct = {}; + const extractedElements = runRuleset(doc); + if (hasAllFeatures(extractedElements)) { + for (const feature of PRODUCT_FEATURES) { + let text = extractedElements[feature].innerText; + if (feature === 'title') { + text = trimTitle(text); + } + if (feature === 'price') { + text = formatPrice(text); + } + extractedProduct[feature] = (feature === 'image' + ? extractedElements[feature].src + : text + ); } } - return null; + return hasAllFeatures(extractedProduct) ? extractedProduct : null; }