Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
#36: Add more sophisticated Fathom rules.
Browse files Browse the repository at this point in the history
These rules successfully pull out product title, price and image from the following product pages (one each from the 5 top sites):
* [Amazon](https://www.amazon.com/KitchenAid-KL26M1XER-Professional-6-Qt-Bowl-Lift/dp/B01LYV1U30?smid=ATVPDKIKX0DER&pf_rd_p=0c7b792f-241a-4510-94f4-dd184a76f201&pf_rd_r=AZD7BGV3JZGTB23F30X3)
* [eBay](https://www.ebay.com/p/Best-Choice-Products-650W-6-speed-5-5QT-Kitchen-Food-Stand-Mixer-with-Stainless-Steels-Bowl-Black/3018375728?iid=253733404998)
* [Walmart](https://www.walmart.com/ip/KitchenAid-Classic-Series-4-5-Quart-Tilt-Head-Stand-Mixer-Onyx-Black-K45SSOB/29474640)
* [Best Buy](https://www.bestbuy.com/site/jbl-everest-elite-750nc-wireless-over-ear-noise-cancelling-headphones-gunmetal/5840136.p?skuId=5840136)
* [Home Depot](https://www.homedepot.com/p/Husky-SAE-Combination-Wrench-Set-10-Piece-HCW10PCSAE/202934501)

TODO:
* Create a training set of 50 product pages (10 from each top site) with FathomFox and run these rules against it to measure their accuracy.
* Modify the trimTitle method so it doesn't cut off the color from the title of the eBay product.
* Generalize formatPrice method. @Osmose, would you have any suggestions?
  • Loading branch information
biancadanforth committed Aug 24, 2018
1 parent 35197f8 commit 6d345f8
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 26 deletions.
4 changes: 3 additions & 1 deletion src/fathom_coefficients.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
{
"hasPriceClass": 2
"largerImage": 3,
"largerFontSize": 2,
"hasDollarSign": 3
}
171 changes: 146 additions & 25 deletions src/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,58 +13,179 @@
import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';

// Names of the product features to extract. Each name is used as the ruleset
// output key, and `${name}ish` is used as the fnode type for scoring.
const PRODUCT_FEATURES = ['title', 'price', 'image'];
// Minimum score an fnode must reach for its element to be extracted.
const SCORE_THRESHOLD = 3;
// Score returned by a scoring function when its rule check does not pass.
const DEFAULT_SCORE = 1;
// Viewport height, read once when this module is evaluated.
const VIEWPORT_HEIGHT = window.innerHeight;

/**
 * Each of these functions represents a rule check: if the fnode passes
 * the rule, it gets a weighted score from 'fathom_coefficients.json';
 * otherwise, it gets the default score.
 */

/**
 * Returns true if the fnode is above the fold, i.e. the top edge of its
 * element's bounding rectangle is no lower than VIEWPORT_HEIGHT.
 *
 * @param {Object} fnode - Fathom node wrapping a DOM element in `fnode.element`
 * @returns {boolean} true when the element starts within the viewport height
 */
function isAboveTheFold(fnode) {
  const {top} = fnode.element.getBoundingClientRect();
  return top <= VIEWPORT_HEIGHT;
}

/**
 * Scores an fnode by the rendered area of its element: the area multiplied
 * by the 'largerImage' coefficient, or the default score when the area is 0.
 *
 * @param {Object} fnode - Fathom node wrapping a DOM element in `fnode.element`
 * @returns {number} weighted area, or DEFAULT_SCORE for a zero-area element
 */
function largerImage(fnode) {
  const {width, height} = fnode.element.getBoundingClientRect();
  const pixelArea = width * height;
  return pixelArea === 0 ? DEFAULT_SCORE : pixelArea * fathomCoeffs.largerImage;
}

/**
 * Scores an fnode by whether its element's innerText contains a '$':
 * the 'hasDollarSign' coefficient if it does, the default score otherwise.
 *
 * @param {Object} fnode - Fathom node wrapping a DOM element in `fnode.element`
 * @returns {number} the weighted score or DEFAULT_SCORE
 */
function hasDollarSign(fnode) {
  const visibleText = fnode.element.innerText;
  return visibleText.includes('$') ? fathomCoeffs.hasDollarSign : DEFAULT_SCORE;
}

/**
 * Scores an fnode by its element's computed font size: the integer part of
 * the size multiplied by the 'largerFontSize' coefficient, or the default
 * score when no font size is reported.
 *
 * @param {Object} fnode - Fathom node wrapping a DOM element in `fnode.element`
 * @returns {number} weighted font size, or DEFAULT_SCORE when the size is empty
 */
function largerFontSize(fnode) {
  const computedStyle = window.getComputedStyle(fnode.element);
  // Strip the 'px' unit so that an empty value can be told apart from a size.
  const pixels = computedStyle.getPropertyValue('font-size').replace('px', '');
  if (!pixels) {
    return DEFAULT_SCORE;
  }
  return parseInt(pixels, 10) * fathomCoeffs.largerFontSize;
}

/**
 * Ruleset for product features. Each feature has its own type
 * ('imageish', 'titleish', 'priceish') and its own output key
 * ('image', 'title', 'price').
 */
const rules = ruleset(
  /**
   * Image rules
   */
  // consider all img elements near the top of the page
  rule(dom('img').when(isAboveTheFold), type('imageish')),
  // better score for larger images
  rule(type('imageish'), score(largerImage)),
  // return image element with max score
  rule(type('imageish').max(), out('image')),

  /**
   * Title rules
   */
  // consider only the title element
  rule(dom('title'), type('titleish')),
  // give the title element the minimum score
  rule(type('titleish'), score(() => SCORE_THRESHOLD)),
  // return title element with max score
  rule(type('titleish').max(), out('title')),

  /**
   * Price rules
   */
  // consider all span and h2 elements near the top of the page
  rule(dom('span, h2').when(isAboveTheFold), type('priceish')),
  // check if the element has a '$' in its innerText
  rule(type('priceish'), score(hasDollarSign)),
  // better score for larger font size
  rule(type('priceish'), score(largerFontSize)),
  // return price element with max score
  rule(type('priceish').max(), out('price')),
);

/**
 * Runs the ruleset against a document and, for each product feature,
 * extracts the highest scoring element whose score is at least
 * SCORE_THRESHOLD.
 *
 * @param {Document} doc - the page's HTML document
 * @returns {Object} map from feature name to DOM element; a feature with no
 *   fnode at or above the threshold is left out of the map
 */
function runRuleset(doc) {
  // The binding does not depend on the feature, so create it once.
  const boundRules = rules.against(doc);
  const extractedElements = {};
  for (const feature of PRODUCT_FEATURES) {
    const candidates = boundRules
      .get(feature)
      .filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD);
    // It is possible for multiple elements to have the same highest score;
    // the first one is used.
    if (candidates.length >= 1) {
      extractedElements[feature] = candidates[0].element;
    }
  }
  return extractedElements;
}

/**
 * Checks that an object carries a truthy value for each product feature.
 *
 * @param {Object} obj - object keyed by feature name
 * @returns {boolean} true if every key in PRODUCT_FEATURES maps to a truthy value
 */
function hasAllFeatures(obj) {
  return PRODUCT_FEATURES.every(feature => Boolean(obj[feature]));
}

/**
 * Trims a page title down to its longest part, splitting on one separator:
 * ' | ', ': ' or ' - '.
 *
 * Only one separator is used per title. ' | ' is preferred, then ': ',
 * then ' - ', so a title such as "Mixer - Black | eBay" keeps its
 * " - Black" suffix instead of being cut at the ' - '.
 *
 * @param {string} title - the raw page title
 * @returns {string} the longest part of the title (the later part wins a
 *   tie), or the title unchanged when it contains no separator
 */
function trimTitle(title) {
  const separator = [' | ', ': ', ' - '].find(candidate => title.includes(candidate));
  if (separator === undefined) {
    return title;
  }
  return title
    .split(separator)
    .reduce((longest, part) => ((longest.length > part.length) ? longest : part));
}

/**
 * Takes a price string of the form "$1997 /each" and turns
 * it into "$19.97": everything from the first '/' onwards is dropped and a
 * decimal point is inserted before the last two characters.
 *
 * A price without a '/' is returned unchanged. After dropping the '/' part,
 * a price that already contains a decimal point, or is too short to hold
 * both a whole part and two cent digits, is returned without inserting one.
 *
 * TODO: Can this be generalized/simplified? This is very specific
 * to Home Depot's product pages.
 *
 * @param {string} price - the raw price text
 * @returns {string} the formatted price
 */
function formatPrice(price) {
  const slashIndex = price.indexOf('/');
  if (slashIndex === -1) {
    return price;
  }
  const amount = price.slice(0, slashIndex).trim();
  if (amount.includes('.') || amount.length <= 2) {
    return amount;
  }
  // Slice by position; replacing the cents substring would remove its first
  // occurrence, which is wrong for prices like "$19119".
  const decimalIndex = amount.length - 2;
  return `${amount.slice(0, decimalIndex)}.${amount.slice(decimalIndex)}`;
}

/**
 * Runs the ruleset for the product features against the given document and
 * builds a product description from the extracted elements.
 *
 * The title and price are taken from the elements' innerText (passed through
 * trimTitle and formatPrice respectively); the image is the element's src.
 *
 * @param {Document} doc - the document to extract product information from
 * @returns {?Object} an object with a truthy 'title', 'price' and 'image',
 *   or null if any of the three could not be extracted
 */
export default function extractProduct(doc) {
  const extractedElements = runRuleset(doc);
  if (!hasAllFeatures(extractedElements)) {
    return null;
  }

  const extractedProduct = {};
  for (const feature of PRODUCT_FEATURES) {
    const element = extractedElements[feature];
    if (feature === 'image') {
      extractedProduct[feature] = element.src;
    } else if (feature === 'title') {
      extractedProduct[feature] = trimTitle(element.innerText);
    } else if (feature === 'price') {
      extractedProduct[feature] = formatPrice(element.innerText);
    } else {
      extractedProduct[feature] = element.innerText;
    }
  }
  // An element can be present but yield an empty string, so check again.
  return hasAllFeatures(extractedProduct) ? extractedProduct : null;
}

0 comments on commit 6d345f8

Please sign in to comment.