Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
#36: Add more sophisticated Fathom rules.
Browse files Browse the repository at this point in the history
This commit builds off of PR #38, so that PR should merge before this.

Open questions:
* How to test interdependent rules, such as 'isNearProductImage' for the product title and product price candidate elements?
* The only feature that is pulled out accurately on my test page (an Amazon product page) is the image. What rules can I add/modify to get title and price correct?

TODO:
* Add rule to remove ancestor elements that have the same 'innerText' value.
* Consider adding image rule to see if an image element is the largest image on the page (above the fold).
* Add price rule to see if innerText starts with '$'.
  • Loading branch information
biancadanforth committed Aug 3, 2018
1 parent a99d3ba commit b9a56d7
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 28 deletions.
4 changes: 3 additions & 1 deletion src/fathom_coefficients.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
{
"hasDivWithPriceClass": 2
"isNearTopOfPage": 3,
"isSufficientlyLarge": 3,
"hasIdOrClassWithTitleSubstring": 2
}
125 changes: 98 additions & 27 deletions src/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,60 +13,131 @@
import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';

// Product features that are extracted; each is also used as a Fathom
// `out` key, and `${feature}ish` as the matching Fathom type.
const PRODUCT_FEATURES = ['title', 'price', 'image'];
// Minimum score an fnode must reach for its element to be extracted.
// Declared exactly once: a second `const SCORE_THRESHOLD` in the same
// scope is a SyntaxError.
const SCORE_THRESHOLD = 3;
// Score a rule function returns when the fnode fails its check.
const DEFAULT_SCORE = 1;

/**
* Checks to see if an element is a <div> with a class of "price".
* Returns an integer corresponding to the coefficient to use for
* scoring an element with this rule.
* Each of these functions represents a rule check: if the fnode passes
* the rule, it gets a weighted score from 'fathom_coefficients.json';
* otherwise, it gets the default score.
*/
function hasDivWithPriceClass(fnode) {
if (fnode.element.classList.contains('price')) {
return fathomCoeffs.hasDivWithPriceClass;

/**
 * Rule check: does the element's top edge, as reported by
 * getBoundingClientRect(), lie at or above the 800 pixel mark?
 *
 * NOTE(review): getBoundingClientRect() is measured from the viewport,
 * so this presumably assumes the page has not been scrolled -- confirm.
 *
 * @param {Object} fnode Fathom node wrapping the candidate element
 * @returns {number} fathomCoeffs.isNearTopOfPage when the check passes,
 *   DEFAULT_SCORE otherwise
 */
function isNearTopOfPage(fnode) {
  const {top} = fnode.element.getBoundingClientRect();
  return top <= 800 ? fathomCoeffs.isNearTopOfPage : DEFAULT_SCORE;
}

/**
 * Rule check: is the element's bounding box at least 300 pixels wide
 * and at least 300 pixels tall?
 *
 * @param {Object} fnode Fathom node wrapping the candidate element
 * @returns {number} fathomCoeffs.isSufficientlyLarge when the check
 *   passes, DEFAULT_SCORE otherwise
 */
function isSufficientlyLarge(fnode) {
  const {width, height} = fnode.element.getBoundingClientRect();
  if (width >= 300 && height >= 300) {
    return fathomCoeffs.isSufficientlyLarge;
  }
  // Use the shared default like the other rule checks instead of a
  // hard-coded 1 followed by an unreachable second return.
  return DEFAULT_SCORE;
}

/**
 * Rule check: does the element's id, or any of its class names, contain
 * the substring 'title'?
 *
 * The comparison is case-insensitive so that camelCase or capitalised
 * names such as 'productTitle' or 'Product-Title' also match.
 *
 * @param {Object} fnode Fathom node wrapping the candidate element
 * @returns {number} fathomCoeffs.hasIdOrClassWithTitleSubstring when the
 *   check passes, DEFAULT_SCORE otherwise
 */
function hasIdOrClassWithTitleSubstring(fnode) {
  const {id, classList} = fnode.element;
  // An element's class list is array-like, so copy it into a real array
  // before joining it into one searchable string.
  const classNames = Array.from(classList).join(' ').toLowerCase();
  if (id.toLowerCase().includes('title') || classNames.includes('title')) {
    return fathomCoeffs.hasIdOrClassWithTitleSubstring;
  }
  return DEFAULT_SCORE;
}

/**
 * Ruleset for product features. Each feature has its own type
 * ('imageish', 'titleish', 'priceish') and its own `out` key ('image',
 * 'title', 'price'); the `out` keys match the entries of
 * PRODUCT_FEATURES, which is what the extraction code asks for.
 */
const rules = ruleset(
  // TODO: write rule(s) that ignore ancestors that have the
  // same innerText value

  /**
   * Image rules
   */
  // TODO: also add rule for "largestImageOnPage"?
  // consider all img elements in the DOM
  rule(dom('img'), type('imageish')),
  // check if these elements are near the top of the page
  rule(type('imageish'), score(isNearTopOfPage)),
  // check if these elements are at least 300px by 300px
  rule(type('imageish'), score(isSufficientlyLarge)),
  // return image element with max score
  rule(type('imageish').max(), out('image')),

  /**
   * Title rules
   */
  // consider all h1 and span elements in the DOM
  rule(dom('h1, span'), type('titleish')),
  // check if these elements are near the top of the page
  rule(type('titleish'), score(isNearTopOfPage)),
  // check if 'title' is a substring in the element's id or classes
  rule(type('titleish'), score(hasIdOrClassWithTitleSubstring)),
  // return title element with max score
  rule(type('titleish').max(), out('title')),

  /**
   * Price rules
   */
  // consider all span elements in the DOM
  rule(dom('span'), type('priceish')),
  // check if these elements are near the top of the page
  rule(type('priceish'), score(isNearTopOfPage)),
  // return price element with max score
  rule(type('priceish').max(), out('price')),
);

/**
 * Runs the ruleset against a document and, for each product feature,
 * picks the first returned fnode whose `${feature}ish` score is at
 * least SCORE_THRESHOLD.
 *
 * @param {Document} doc The page's HTML document
 * @returns {Object} Map from feature name to the extracted element;
 *   a feature with no qualifying fnode has no entry
 */
function runRuleset(doc) {
  // Bind the ruleset to the document once and reuse it for every
  // feature, rather than re-binding on each loop iteration.
  const boundRules = rules.against(doc);
  const extractedElements = {};
  for (const feature of PRODUCT_FEATURES) {
    // It is possible for multiple elements to have the same highest
    // score; the first one that clears the threshold wins.
    const bestFnode = boundRules
      .get(feature)
      .find(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD);
    if (bestFnode) {
      extractedElements[feature] = bestFnode.element;
    }
  }
  return extractedElements;
}

/**
 * Reports whether the given object holds a truthy value under every
 * key listed in PRODUCT_FEATURES.
 * TODO: Generalize and put in utils? Maybe make an array of all keys in
 * the object and replace PRODUCT_FEATURES with that array?
 *
 * @param {Object} obj Object keyed by product feature name
 * @returns {boolean} True only if no feature is missing or falsy
 */
function hasAllFeatures(obj) {
  return PRODUCT_FEATURES.every(feature => Boolean(obj[feature]));
}

/**
 * Runs the ruleset for the product features against the given document
 * and converts the extracted elements into plain values: `src` for the
 * image, `innerText` for every other feature.
 *
 * @param {Document} doc The page's HTML document
 * @returns {?Object} Object with one truthy value per entry of
 *   PRODUCT_FEATURES, or null if any feature could not be extracted or
 *   produced an empty value
 */
export default function extractProduct(doc) {
  const extractedElements = runRuleset(doc);
  if (!hasAllFeatures(extractedElements)) {
    return null;
  }
  const extractedProduct = {};
  for (const feature of PRODUCT_FEATURES) {
    const element = extractedElements[feature];
    extractedProduct[feature] = (feature === 'image'
      ? element.src
      : element.innerText
    );
  }
  // An element can be found yet yield an empty string, so the extracted
  // values are checked as well.
  return hasAllFeatures(extractedProduct) ? extractedProduct : null;
}

0 comments on commit b9a56d7

Please sign in to comment.