From b9a56d733214c7c502a40e54708b018d35ac036c Mon Sep 17 00:00:00 2001 From: Bianca Danforth Date: Thu, 2 Aug 2018 17:16:23 -0700 Subject: [PATCH] #36: Add more sophisticated Fathom rules. This commit builds off of PR #38, so that PR should merge before this. Open questions: * How to test interdependent rules, such as 'isNearProductImage' for the product title and product price candidate elements? * The only feature that is pulled out accurately on my test page (an Amazon product page) is the image. What rules can I add/modify to get title and price correct? TODO: * Add rule to remove ancestor elements that have the same 'innerText' value. * Consider adding image rule to see if an image element is the largest image on the page (above the fold). * Add price rule to see if innerText starts with '$'. --- src/fathom_coefficients.json | 4 +- src/fathom_extraction.js | 125 +++++++++++++++++++++++++++-------- 2 files changed, 101 insertions(+), 28 deletions(-) diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json index 46f28f0..86836c2 100644 --- a/src/fathom_coefficients.json +++ b/src/fathom_coefficients.json @@ -1,3 +1,5 @@ { - "hasDivWithPriceClass": 2 + "isNearTopOfPage": 3, + "isSufficientlyLarge": 3, + "hasIdOrClassWithTitleSubstring": 2 } diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js index ba4846e..cdfc768 100644 --- a/src/fathom_extraction.js +++ b/src/fathom_extraction.js @@ -13,32 +13,90 @@ import {dom, out, rule, ruleset, score, type} from 'fathom-web'; import fathomCoeffs from 'commerce/fathom_coefficients.json'; -const SCORE_THRESHOLD = fathomCoeffs.hasDivWithPriceClass; +const PRODUCT_FEATURES = ['title', 'price', 'image']; +const SCORE_THRESHOLD = 3; +const DEFAULT_SCORE = 1; /** - * Checks to see if an element is a <div>
with a class of "price". - * Returns an integer corresponding to the coefficient to use for - * scoring an element with this rule. + * Each of these functions represents a rule check: if the fnode passes + * the rule, it gets a weighted score from 'fathom_coefficients.json'; + * otherwise, it gets the default score. */ -function hasDivWithPriceClass(fnode) { - if (fnode.element.classList.contains('price')) { - return fathomCoeffs.hasDivWithPriceClass; + +/** + * TODO bdanforth: add comment + */ +function isNearTopOfPage(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + if (domRect.top <= 800) { + return fathomCoeffs.isNearTopOfPage; + } + return DEFAULT_SCORE; +} + +/** + * TODO bdanforth: add comment + */ +function isSufficientlyLarge(fnode) { + const domRect = fnode.element.getBoundingClientRect(); + if (domRect.width >= 300 && domRect.height >= 300) { + return fathomCoeffs.isSufficientlyLarge; } - return 1; + return DEFAULT_SCORE; +} + +/** + * TODO bdanforth: add comment + */ +function hasIdOrClassWithTitleSubstring(fnode) { + // An element's class list is an array-like object + const classListStr = Array.prototype.join.call(fnode.element.classList); + if (fnode.element.id.includes('title') || classListStr.includes('title')) { + return fathomCoeffs.hasIdOrClassWithTitleSubstring; + } + return DEFAULT_SCORE; } /** * Ruleset for product features. Each feature has its own type. */ const rules = ruleset( - // get all elements that could contain the price - rule(dom('div'), type('priceish')), + // TODO: write rule(s) that ignore ancestors who have the + // same innerText value - // check class names to see if they contain 'price' - rule(type('priceish'), score(hasDivWithPriceClass)), + /** + * Image rules + */ + // TODO: also add rule for "largestImageOnPage"? 
+ // consider all img elements in the DOM + rule(dom('img'), type('imageish')), + // check if these elements are near the top of the page + rule(type('imageish'), score(isNearTopOfPage)), + rule(type('imageish'), score(isSufficientlyLarge)), + // return image element with max score + rule(type('imageish').max(), out('image')), + /** + * Title rules + */ + // consider all h1 and span elements in the DOM + rule(dom('h1, span'), type('titleish')), + // check if these elements are near the top of the page + rule(type('titleish'), score(isNearTopOfPage)), + // check if 'title' is a substring in the element's id or classes + rule(type('titleish'), score(hasIdOrClassWithTitleSubstring)), + // return title element with max score + rule(type('titleish').max(), out('title')), + + /** + * Price rules + */ + // consider all span elements in the DOM + rule(dom('span'), type('priceish')), + // check if these elements are near the top of the page + rule(type('priceish'), score(isNearTopOfPage)), // return price element with max score - rule(type('priceish').max(), out('product-price')), + rule(type('priceish').max(), out('price')), ); /** @@ -46,27 +104,40 @@ const rules = ruleset( * contained in a page's HTML document. */ function runRuleset(doc) { - let fnodesList = rules.against(doc).get('product-price'); - fnodesList = fnodesList.filter(fnode => fnode.scoreFor('priceish') >= SCORE_THRESHOLD); - // It is possible for multiple elements to have the same highest score. - if (fnodesList.length >= 1) { - return fnodesList[0].element; + const extractedElements = {}; + for (const feature of PRODUCT_FEATURES) { + let fnodesList = rules.against(doc).get(`${feature}`); + fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD); + // It is possible for multiple elements to have the same highest score. 
+ if (fnodesList.length >= 1) { + extractedElements[feature] = fnodesList[0].element; + } } - return null; + return extractedElements; +} + +/** + * Returns true if every key in PRODUCT_FEATURES has a truthy value. + * TODO: Generalize and put in utils? Maybe make an array of all keys in + * the object and replace PRODUCT_FEATURES with that array? + */ +function hasAllFeatures(obj) { + return PRODUCT_FEATURES.map(key => obj[key]).every(val => val); } /* * Run the ruleset for the product features against the current window document */ export default function extractProduct(doc) { - const priceEle = runRuleset(doc); - if (priceEle) { - const price = (priceEle.tagName !== 'META') ? priceEle.textContent : priceEle.getAttribute('content'); - if (price) { - return { - price, - }; + const extractedProduct = {}; + const extractedElements = runRuleset(doc); + if (hasAllFeatures(extractedElements)) { + for (const feature of PRODUCT_FEATURES) { + extractedProduct[feature] = (feature === 'image' + ? extractedElements[feature].src + : extractedElements[feature].innerText + ); } } - return null; + return hasAllFeatures(extractedProduct) ? extractedProduct : null; }