From 934cc71c6066a86d9b27674c8a8e22510fc4bbae Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Thu, 2 Aug 2018 17:16:23 -0700 Subject: [PATCH] #36: Add more sophisticated Fathom rules. These rules successfully pull out the product title, price, and image from the following product pages (one each from the 5 top sites): * [Amazon](https://www.amazon.com/KitchenAid-KL26M1XER-Professional-6-Qt-Bowl-Lift/dp/B01LYV1U30?smid=ATVPDKIKX0DER&pf_rd_p=0c7b792f-241a-4510-94f4-dd184a76f201&pf_rd_r=AZD7BGV3JZGTB23F30X3) * [eBay](https://www.ebay.com/p/Best-Choice-Products-650W-6-speed-5-5QT-Kitchen-Food-Stand-Mixer-with-Stainless-Steels-Bowl-Black/3018375728?iid=253733404998) * [Walmart](https://www.walmart.com/ip/KitchenAid-Classic-Series-4-5-Quart-Tilt-Head-Stand-Mixer-Onyx-Black-K45SSOB/29474640) * [Best Buy](https://www.bestbuy.com/site/jbl-everest-elite-750nc-wireless-over-ear-noise-cancelling-headphones-gunmetal/5840136.p?skuId=5840136) * [Home Depot](https://www.homedepot.com/p/Husky-SAE-Combination-Wrench-Set-10-Piece-HCW10PCSAE/202934501) TODO: * Create a training set with FathomFox and run these rules against it to measure their accuracy on 50 product pages (10 from each top site). * Modify the trimTitle method so it doesn't cut off the color from the title of the eBay product. * Generalize the formatPrice method. @Osmose, would you have any suggestions? 
--- src/fathom_coefficients.json | 4 +- src/fathom_extraction.js | 171 ++++++++++++++++++++++++++++++----- 2 files changed, 149 insertions(+), 26 deletions(-) diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index 83b8f89..2846242 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,3 +1,5 @@ { - "hasPriceClass": 2 + "largerImage": 3, + "largerFontSize": 2, + "hasDollarSign": 3 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index 6efad7f..4e75202 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -13,30 +13,97 @@ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; import fathomCoeffs from 'commerce/fathom_coefficients.json'; -const SCORE_THRESHOLD = fathomCoeffs.hasPriceClass; +const PRODUCT_FEATURES = ['title', 'price', 'image']; +const SCORE_THRESHOLD = 3; +const DEFAULT_SCORE = 1; +const VIEWPORT_HEIGHT = window.innerHeight; /** - * Scores fnodes with a "price" class + * Each of these functions represents a rule check: if the fnode passes + * the rule, it gets a weighted score from 'fathom_coefficients.json'; + * otherwise, it gets the default score. 
*/ -function hasPriceClass(fnode) { - if (fnode.element.classList.contains('price')) { - return fathomCoeffs.hasPriceClass; + +/** + * Returns true if the fnode is above the fold + */ +function isAboveTheFold(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + if (domRect.top <= VIEWPORT_HEIGHT) { + return true; } - return 1; + return false; +} + +/** + * Scores fnode in direct proportion to its size + */ +function largerImage(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + const area = (domRect.width) * (domRect.height); + if (area === 0) { + return DEFAULT_SCORE; + } + return area * fathomCoeffs.largerImage; +} + +/** + * Scores fnode with a '$' in its innerText + */ +function hasDollarSign(fnode) { + if (fnode.element.innerText.includes('$')) { + return fathomCoeffs.hasDollarSign; + } + return DEFAULT_SCORE; +} + +/** + * Scores fnode in direct proportion to its font size + */ +function largerFontSize(fnode) { + const sizeWithUnits = window.getComputedStyle(fnode.element).getPropertyValue('font-size'); + const size = sizeWithUnits.replace('px', ''); + if (size) { + return (parseInt(size, 10) * fathomCoeffs.largerFontSize); + } + return DEFAULT_SCORE; } /** * Ruleset for product features. Each feature has its own type. 
*/ const rules = ruleset( - // get all elements that could contain the price - rule(dom('div'), type('priceish')), + /** + * Image rules + */ + // consider all img elements near the top of the page + rule(dom('img').when(isAboveTheFold), type('imageish')), + // better score for larger images + rule(type('imageish'), score(largerImage)), + // return image element with max score + rule(type('imageish').max(), out('image')), - // check class names to see if they contain 'price' - rule(type('priceish'), score(hasPriceClass)), + /** + * Title rules + */ + // consider only the title element + rule(dom('title'), type('titleish')), + // give the title element the minimum score + rule(type('titleish'), score(() => SCORE_THRESHOLD)), + // return title element with max score + rule(type('titleish').max(), out('title')), + /** + * Price rules + */ + // consider all span and h2 elements near the top of the page + rule(dom('span, h2').when(isAboveTheFold), type('priceish')), + // check if the element has a '$' in its innerText + rule(type('priceish'), score(hasDollarSign)), + // better score for larger font size + rule(type('priceish'), score(largerFontSize)), // return price element with max score - rule(type('priceish').max(), out('product-price')), + rule(type('priceish').max(), out('price')), ); /** @@ -44,27 +111,81 @@ const rules = ruleset( * contained in a page's HTML document. */ function runRuleset(doc) { - let fnodesList = rules.against(doc).get('product-price'); - fnodesList = fnodesList.filter(fnode => fnode.scoreFor('priceish') >= SCORE_THRESHOLD); - // It is possible for multiple elements to have the same highest score. 
- if (fnodesList.length >= 1) { - return fnodesList[0].element; + const extractedElements = {}; + for (const feature of PRODUCT_FEATURES) { + let fnodesList = rules.against(doc).get(`${feature}`); + fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); + // It is possible for multiple elements to have the same highest score. + if (fnodesList.length >= 1) { + extractedElements[feature] = fnodesList[0].element; + } + } + return extractedElements; +} + +/** + * Returns true if every key in PRODUCT_FEATURES has a truthy value. + */ +function hasAllFeatures(obj) { + return PRODUCT_FEATURES.map(key => obj[key]).every(val => val); +} + +// Trim off the shorter substring between ' - ', ': ' or ' | ' +function trimTitle(title) { + let textArr = []; + // TODO: This currently cuts of the " - Black" substring on E-bay + if (title.includes(' - ')) { + textArr = title.split(' - '); + } + if (title.includes(': ')) { + textArr = title.split(': '); + } + if (textArr.length >= 1) { + return textArr.reduce((a, b) => ((a.length > b.length) ? a : b)); + } + return title; +} + +/** + * Takes a price string of the form "$1997 /each" and turns + * it into "$19.97". + * TODO: Can this be generalized/simplified? This is very specific + * to Home Depot's product pages. 
+ */ +function formatPrice(price) { + let formattedPrice = price; + if (price.includes('/')) { + const index = price.indexOf('/'); + formattedPrice = price.slice(0, index); + formattedPrice = formattedPrice.trim(); + const decimalIndex = formattedPrice.length - 2; + const rightSide = formattedPrice.substring(decimalIndex); + const leftSide = formattedPrice.replace(rightSide, ''); + formattedPrice = `${leftSide}.${rightSide}`; } - return null; + return formattedPrice; } /* * Run the ruleset for the product features against the current window document */ export default function extractProduct(doc) { - const priceEle = runRuleset(doc); - if (priceEle) { - const price = (priceEle.tagName !== 'META') ? priceEle.textContent : priceEle.getAttribute('content'); - if (price) { - return { - price, - }; + const extractedProduct = {}; + const extractedElements = runRuleset(doc); + if (hasAllFeatures(extractedElements)) { + for (const feature of PRODUCT_FEATURES) { + let text = extractedElements[feature].innerText; + if (feature === 'title') { + text = trimTitle(text); + } + if (feature === 'price') { + text = formatPrice(text); + } + extractedProduct[feature] = (feature === 'image' + ? extractedElements[feature].src + : text + ); } } - return null; + return hasAllFeatures(extractedProduct) ? extractedProduct : null; }