Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
#36: Make fathom_ruleset.js mirror Fathom's trainer script.
Browse files Browse the repository at this point in the history
This enables the same script to be used for training and running in the commerce webextension.

How to train a ruleset with Fathom:
1. Follow Fathom's [instructions](https://github.com/erikrose/fathom-fox).
2. Open the [Fathom Trainees](https://github.com/mozilla/fathom-trainees) add-on in a new profile.
3. Install FathomFox in that window from AMO.
4. Drag and drop the training corpus (HTML files in ./training-set) into that window.
5. Copy ./src/fathom_ruleset.js into fathom-trainees/src/trainees.js and save over it.
6. Choose a feature to train ('price', 'title', or 'image') and edit `trainees.set()` so that the chosen feature is its first argument.
7. Comment out the rules pertaining to all but that feature.
8. Click the FathomFox browserAction and select "Train".
9. Select the feature from the dropdown list and click the "Train against the tabs in this window" button.
10. You will see the accuracy based on the initial coefficients passed in, and Fathom will start generating optimized coefficients. This could take a while.
11. When Fathom is done, those coefficients will be logged to the Fathom page.
  • Loading branch information
biancadanforth committed Aug 8, 2018
1 parent 289df7d commit 42cd870
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 155 deletions.
14 changes: 7 additions & 7 deletions src/fathom_coefficients.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"largerImage": 3,
"largerFontSize": 1,
"hasDollarSign": 3,
"hasTitleInID": 10,
"hasTitleInClassName": 5,
"isHidden": -100,
"isHeaderElement": 10
"largerImageCoeff": 3,
"largerFontSizeCoeff": 1,
"hasDollarSignCoeff": 3,
"hasTitleInIDCoeff": 10,
"hasTitleInClassNameCoeff": 5,
"isHiddenCoeff": -100,
"isHeaderElementCoeff": 10
}
168 changes: 20 additions & 148 deletions src/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,165 +10,37 @@
* Features: title, image, price
*/

import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';
import productRuleset from 'commerce/fathom_ruleset';
import {
largerImageCoeff,
largerFontSizeCoeff,
hasDollarSignCoeff,
hasTitleInIDCoeff,
hasTitleInClassNameCoeff,
isHiddenCoeff,
isHeaderElementCoeff,
} from 'commerce/fathom_coefficients.json';

// Feature names that runRuleset() requests from the ruleset, one at a time.
const PRODUCT_FEATURES = ['title', 'price', 'image'];
// Minimum `${feature}ish` score an fnode needs to pass the filter in runRuleset().
const SCORE_THRESHOLD = 4;
// Divisor largerFontSize() uses to normalize a computed font size (px value).
const DEFAULT_BODY_FONT_SIZE = 14;
// Score the scoring functions return when the signal they look for is absent.
const DEFAULT_SCORE = 1;
// Viewport height, read once when this module is evaluated; isAboveTheFold()
// compares element positions against it.
const VIEWPORT_HEIGHT = window.innerHeight;

/**
 * Tells whether the top edge of an fnode's element lies at or above the
 * viewport height captured in VIEWPORT_HEIGHT.
 *
 * @param {Object} fnode Fathom node exposing its DOM element as `fnode.element`.
 * @returns {boolean} True when the element's bounding-rect `top` is
 *   less than or equal to VIEWPORT_HEIGHT.
 */
function isAboveTheFold(fnode) {
  const {top} = fnode.element.getBoundingClientRect();
  return top <= VIEWPORT_HEIGHT;
}

/**
 * Scores an fnode proportionally to the area of its element's bounding rect.
 *
 * @param {Object} fnode Fathom node exposing its DOM element as `fnode.element`.
 * @returns {number} DEFAULT_SCORE when the area is zero; otherwise the area
 *   multiplied by the `largerImage` coefficient.
 */
function largerImage(fnode) {
  const {width, height} = fnode.element.getBoundingClientRect();
  const pixelArea = width * height;
  return pixelArea === 0
    ? DEFAULT_SCORE
    : pixelArea * fathomCoeffs.largerImage;
}

/**
 * Rewards an fnode whose element text contains a dollar sign.
 *
 * @param {Object} fnode Fathom node exposing its DOM element as `fnode.element`.
 * @returns {number} The `hasDollarSign` coefficient when `innerText` includes
 *   '$'; DEFAULT_SCORE otherwise.
 */
function hasDollarSign(fnode) {
  const text = fnode.element.innerText;
  return text.includes('$') ? fathomCoeffs.hasDollarSign : DEFAULT_SCORE;
}

/**
 * Scores an fnode in direct proportion to its computed font size.
 *
 * The computed size is normalized by DEFAULT_BODY_FONT_SIZE and then
 * multiplied by the `largerFontSize` coefficient.
 *
 * @param {Object} fnode Fathom node exposing its DOM element as `fnode.element`.
 * @returns {number} The scaled score, or DEFAULT_SCORE when the computed
 *   font size is empty or cannot be parsed as a number.
 */
function largerFontSize(fnode) {
  const sizeWithUnits = window.getComputedStyle(fnode.element).fontSize;
  const size = sizeWithUnits.replace('px', '');
  const pixels = parseInt(size, 10);
  // An empty or non-numeric size parses to NaN; returning NaN as a score
  // would contaminate every later score calculation, so fall back instead.
  if (Number.isNaN(pixels)) {
    return DEFAULT_SCORE;
  }
  // Normalize the multiplier by the default font size.
  const sizeMultiplier = pixels / DEFAULT_BODY_FONT_SIZE;
  return sizeMultiplier * fathomCoeffs.largerFontSize;
}

/**
 * Rewards an fnode whose element id mentions "title" or "Title".
 *
 * @param {Object} fnode Fathom node exposing its DOM element as `fnode.element`.
 * @returns {number} The `hasTitleInID` coefficient on a match;
 *   DEFAULT_SCORE otherwise.
 */
function hasTitleInID(fnode) {
  const {id} = fnode.element;
  const mentionsTitle = ['title', 'Title'].some(needle => id.includes(needle));
  return mentionsTitle ? fathomCoeffs.hasTitleInID : DEFAULT_SCORE;
}

/**
 * Rewards an fnode whose element className mentions "title" or "Title".
 *
 * @param {Object} fnode Fathom node exposing its DOM element as `fnode.element`.
 * @returns {number} The `hasTitleInClassName` coefficient on a match;
 *   DEFAULT_SCORE otherwise.
 */
function hasTitleInClassName(fnode) {
  const classes = fnode.element.className;
  // Bail out early when neither spelling is present.
  if (!classes.includes('title') && !classes.includes('Title')) {
    return DEFAULT_SCORE;
  }
  return fathomCoeffs.hasTitleInClassName;
}

/**
 * Penalizes an fnode whose element appears to be hidden.
 *
 * An element is treated as hidden when it has no offsetParent, when its
 * computed visibility is 'hidden', when its computed opacity is '0', or when
 * its computed width or height is a zero length.
 *
 * @param {Object} fnode Fathom node exposing its DOM element as `fnode.element`.
 * @returns {number} The `isHidden` coefficient when the element looks hidden;
 *   DEFAULT_SCORE otherwise.
 */
function isHidden(fnode) {
  const element = fnode.element;
  const style = window.getComputedStyle(element);
  // Computed lengths are reported with units (e.g. '0px'), so comparing
  // against the literal '0' never matches; parse the numeric part instead.
  // Non-length values such as 'auto' parse to NaN and are not treated as zero.
  const isZeroLength = length => parseFloat(length) === 0;
  if (!element.offsetParent // null if the element or an ancestor has display set to "none"
    || style.visibility === 'hidden'
    || style.opacity === '0'
    || isZeroLength(style.width)
    || isZeroLength(style.height)) {
    return fathomCoeffs.isHidden;
  }
  return DEFAULT_SCORE;
}

/**
 * Rewards an fnode whose element is an H1.
 *
 * @param {Object} fnode Fathom node exposing its DOM element as `fnode.element`.
 * @returns {number} The `isHeaderElement` coefficient when the tag name is
 *   'H1'; DEFAULT_SCORE otherwise.
 */
function isHeaderElement(fnode) {
  const {tagName} = fnode.element;
  return tagName === 'H1' ? fathomCoeffs.isHeaderElement : DEFAULT_SCORE;
}

/**
 * Rules that pick the product image: every <img> that passes
 * isAboveTheFold is typed 'imageish', scored by largerImage, and the
 * max-scoring one is emitted as 'image'.
 */
const imageRules = [
  rule(dom('img').when(isAboveTheFold), type('imageish')),
  rule(type('imageish'), score(largerImage)),
  rule(type('imageish').max(), out('image')),
];

/**
 * Rules that pick the product title: <h1> and <span> elements that pass
 * isAboveTheFold are typed 'titleish', scored by tag, id, class name,
 * font size and hiddenness, and the max-scoring one is emitted as 'title'.
 */
const titleRules = [
  rule(dom('h1, span').when(isAboveTheFold), type('titleish')),
  rule(type('titleish'), score(isHeaderElement)),
  rule(type('titleish'), score(hasTitleInID)),
  rule(type('titleish'), score(hasTitleInClassName)),
  rule(type('titleish'), score(largerFontSize)),
  rule(type('titleish'), score(isHidden)),
  rule(type('titleish').max(), out('title')),
];

/**
 * Rules that pick the product price: <span> and <h2> elements that pass
 * isAboveTheFold are typed 'priceish', scored by dollar sign, font size and
 * hiddenness, and the max-scoring one is emitted as 'price'.
 */
const priceRules = [
  rule(dom('span, h2').when(isAboveTheFold), type('priceish')),
  rule(type('priceish'), score(hasDollarSign)),
  rule(type('priceish'), score(largerFontSize)),
  rule(type('priceish'), score(isHidden)),
  rule(type('priceish').max(), out('price')),
];

/**
 * Ruleset for product features; each feature has its own type.
 * Rules are passed in the order image, title, price.
 */
const rules = ruleset(...imageRules, ...titleRules, ...priceRules);

/**
* Extracts the highest scoring element above a score threshold
* contained in a page's HTML document.
*/
function runRuleset(doc) {
const extractedElements = {};
const rules = productRuleset.get('product').rulesetMaker;
for (const feature of PRODUCT_FEATURES) {
let fnodesList = rules.against(doc).get(`${feature}`);
let fnodesList = rules([
largerImageCoeff,
largerFontSizeCoeff,
hasDollarSignCoeff,
hasTitleInIDCoeff,
hasTitleInClassNameCoeff,
isHiddenCoeff,
isHeaderElementCoeff,
]).against(doc).get(`${feature}`);
fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD);
// It is possible for multiple elements to have the same highest score.
if (fnodesList.length >= 1) {
Expand Down
Loading

0 comments on commit 42cd870

Please sign in to comment.