Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
#36: Update product title rules.
Browse files Browse the repository at this point in the history
Product title rules previously pulled the unique 'title' element from the 'head' element on the page (part of the pages metadata). While this ostensibly requires less processing (we don't have to search the DOM or score any other elements), the title string often requires site-specific cleaning such as to remove the vendor name, and the final, cleaned up string cannot not be verified as accurate by Fathom, which only tells us if our rules picked the right element.

The alternative approach, implemented here, is to pull the title from the corresponding element in the content of the page. Since Fathom can verify that the right element was selected, and the string from this element would not require any cleaning, this approach is a much better proxy for extracting the correct product title.
  • Loading branch information
biancadanforth committed Aug 8, 2018
1 parent 934cc71 commit ba4ead8
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 60 deletions.
8 changes: 6 additions & 2 deletions src/fathom_coefficients.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
{
"largerImage": 3,
"largerFontSize": 2,
"hasDollarSign": 3
"largerFontSize": 1,
"hasDollarSign": 3,
"hasTitleInID": 10,
"hasTitleInClassName": 5,
"isHidden": -100,
"isHeaderElement": 10
}
128 changes: 70 additions & 58 deletions src/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,11 @@ import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';

const PRODUCT_FEATURES = ['title', 'price', 'image'];
const SCORE_THRESHOLD = 3;
const SCORE_THRESHOLD = 4;
const DEFAULT_BODY_FONT_SIZE = 14;
const DEFAULT_SCORE = 1;
const VIEWPORT_HEIGHT = window.innerHeight;

/**
* Each of these functions represents a rule check: if the fnode passes
* the rule, it gets a weighted score from 'fathom_coefficients.json';
* otherwise, it gets the default score.
*/

/**
* Returns true if the fnode is above the fold
*/
Expand Down Expand Up @@ -61,16 +56,66 @@ function hasDollarSign(fnode) {
* Scores fnode in direct proportion to its font size
*/
function largerFontSize(fnode) {
const sizeWithUnits = window.getComputedStyle(fnode.element).getPropertyValue('font-size');
const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize;
const size = sizeWithUnits.replace('px', '');
if (size) {
return (parseInt(size, 10) * fathomCoeffs.largerFontSize);
// normalize the multiplier by the default font size
const sizeMultiplier = parseInt(size, 10) / DEFAULT_BODY_FONT_SIZE;
return (sizeMultiplier * fathomCoeffs.largerFontSize);
}
return DEFAULT_SCORE;
}

/**
* Scores fnode with "title" in its id
*/
function hasTitleInID(fnode) {
const id = fnode.element.id;
if (id.includes('title') || id.includes('Title')) {
return fathomCoeffs.hasTitleInID;
}
return DEFAULT_SCORE;
}

/**
* Scores fnode with "title" in a class name
*/
function hasTitleInClassName(fnode) {
const className = fnode.element.className;
if (className.includes('title') || className.includes('Title')) {
return fathomCoeffs.hasTitleInClassName;
}
return DEFAULT_SCORE;
}

/**
* Scores fnode that is hidden
*/
function isHidden(fnode) {
const element = fnode.element;
const style = window.getComputedStyle(element);
if (!element.offsetParent // null if the offsetParent has a display set to "none"
|| style.visibility === 'hidden'
|| style.opacity === '0'
|| style.width === '0'
|| style.height === '0') {
return fathomCoeffs.isHidden;
}
return DEFAULT_SCORE;
}

/**
* Scores fnode that is an H1 element
*/
function isHeaderElement(fnode) {
if (fnode.element.tagName === 'H1') {
return fathomCoeffs.isHeaderElement;
}
return DEFAULT_SCORE;
}

/**
* Ruleset for product features. Each feature has its own type.
* Ruleset for product features; each feature has its own type.
*/
const rules = ruleset(
/**
Expand All @@ -86,10 +131,18 @@ const rules = ruleset(
/**
* Title rules
*/
// consider only the title element
rule(dom('title'), type('titleish')),
// give the title element the minimum score
rule(type('titleish'), score(() => SCORE_THRESHOLD)),
// consider all h1 and span elements near the top of the page
rule(dom('h1, span').when(isAboveTheFold), type('titleish')),
// score higher for h1 elements
rule(type('titleish'), score(isHeaderElement)),
// check if the id has "title" in it
rule(type('titleish'), score(hasTitleInID)),
// check if any class names have "title" in them
rule(type('titleish'), score(hasTitleInClassName)),
// better score for larger font size
rule(type('titleish'), score(largerFontSize)),
// reduce score if element is hidden
rule(type('titleish'), score(isHidden)),
// return title element with max score
rule(type('titleish').max(), out('title')),

Expand All @@ -102,6 +155,8 @@ const rules = ruleset(
rule(type('priceish'), score(hasDollarSign)),
// better score for larger font size
rule(type('priceish'), score(largerFontSize)),
// reduce score if element is hidden
rule(type('priceish'), score(isHidden)),
// return price element with max score
rule(type('priceish').max(), out('price')),
);
Expand Down Expand Up @@ -130,42 +185,6 @@ function hasAllFeatures(obj) {
return PRODUCT_FEATURES.map(key => obj[key]).every(val => val);
}

// Trim off the shorter substring between ' - ', ': ' or ' | '
function trimTitle(title) {
let textArr = [];
// TODO: This currently cuts of the " - Black" substring on E-bay
if (title.includes(' - ')) {
textArr = title.split(' - ');
}
if (title.includes(': ')) {
textArr = title.split(': ');
}
if (textArr.length >= 1) {
return textArr.reduce((a, b) => ((a.length > b.length) ? a : b));
}
return title;
}

/**
* Takes a price string of the form "$1997 /each" and turns
* it into "$19.97".
* TODO: Can this be generalized/simplified? This is very specific
* to Home Depot's product pages.
*/
function formatPrice(price) {
let formattedPrice = price;
if (price.includes('/')) {
const index = price.indexOf('/');
formattedPrice = price.slice(0, index);
formattedPrice = formattedPrice.trim();
const decimalIndex = formattedPrice.length - 2;
const rightSide = formattedPrice.substring(decimalIndex);
const leftSide = formattedPrice.replace(rightSide, '');
formattedPrice = `${leftSide}.${rightSide}`;
}
return formattedPrice;
}

/*
* Run the ruleset for the product features against the current window document
*/
Expand All @@ -174,16 +193,9 @@ export default function extractProduct(doc) {
const extractedElements = runRuleset(doc);
if (hasAllFeatures(extractedElements)) {
for (const feature of PRODUCT_FEATURES) {
let text = extractedElements[feature].innerText;
if (feature === 'title') {
text = trimTitle(text);
}
if (feature === 'price') {
text = formatPrice(text);
}
extractedProduct[feature] = (feature === 'image'
? extractedElements[feature].src
: text
: extractedElements[feature].innerText
);
}
}
Expand Down

0 comments on commit ba4ead8

Please sign in to comment.