Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
#36: Make fathom_ruleset.js mirror Fathom's trainer script.
Browse files Browse the repository at this point in the history
This enables the same script to be used for training and running in the commerce webextension.

How to train a ruleset with Fathom:
1. Follow Fathom's [Trainer instructions](https://github.com/erikrose/fathom-fox#the-trainer).
2. Open the [Fathom Trainees](https://github.com/mozilla/fathom-trainees) add-on in a new profile.
3. Install FathomFox in that window from AMO.
4. Drag and drop the training corpus into that window.
  - Note: The training corpus consists of HTML files frozen using [FathomFox's DevTools panel](https://github.com/erikrose/fathom-fox#the-developer-tools-panel); our training corpus is on the shared "commerce" Google drive.
  - Note: As of the date of this commit, the Corpus Collector is not a recommended option for building a training corpus due to a `freeze-dry` dependency bug that inserts a bunch of extra garbage when re-freezing a frozen page.
5. Copy ./src/fathom_ruleset.js into fathom-trainees/src/trainees.js and save over it.
6. Choose a feature to train, 'price', 'title' or 'image', and edit `trainees.set()` so that one of those features is the first argument.
7. Comment out the rules pertaining to all but that feature.
  - Currently, you can only train one ruleset at a time with Fathom, and only one `out` (e.g. 'title', 'image' or 'product') at a time for a given ruleset.
  - If you have multiple `out`s you'd like to train simultaneously, repeat this process for the remaining features so Fathom is running in a separate browser window for each feature and its corresponding rules.
8. Click the FathomFox browserAction and select "Train"
9. Select the feature from the dropdown list and click the "Train against the tabs in this window" button.
10. The array of coefficients displayed on the training page will update over time as Fathom optimizes them; this could take a while.
  • Loading branch information
biancadanforth committed Aug 24, 2018
1 parent 9afa266 commit d94071d
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 155 deletions.
14 changes: 7 additions & 7 deletions src/fathom_coefficients.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"largerImage": 3,
"largerFontSize": 1,
"hasDollarSign": 3,
"hasTitleInID": 10,
"hasTitleInClassName": 5,
"isHidden": -100,
"isHeaderElement": 10
"largerImageCoeff": 3,
"largerFontSizeCoeff": 1,
"hasDollarSignCoeff": 3,
"hasTitleInIDCoeff": 10,
"hasTitleInClassNameCoeff": 5,
"isHiddenCoeff": -100,
"isHeaderElementCoeff": 10
}
168 changes: 20 additions & 148 deletions src/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,165 +10,37 @@
* Features: title, image, price
*/

import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';
import productRuleset from 'commerce/fathom_ruleset';
import {
largerImageCoeff,
largerFontSizeCoeff,
hasDollarSignCoeff,
hasTitleInIDCoeff,
hasTitleInClassNameCoeff,
isHiddenCoeff,
isHeaderElementCoeff,
} from 'commerce/fathom_coefficients.json';

// Feature names that runRuleset() iterates over; each one is looked up as a
// ruleset output key, and `${feature}ish` is the type it is scored under.
const PRODUCT_FEATURES = ['title', 'price', 'image'];
// Minimum `${feature}ish` score an fnode needs for runRuleset() to keep it.
const SCORE_THRESHOLD = 4;
// Divisor used by largerFontSize() to normalize a computed font size (px).
const DEFAULT_BODY_FONT_SIZE = 14;
// Score returned by the scoring callbacks when their condition does not apply.
const DEFAULT_SCORE = 1;
// Read once when this module is evaluated; isAboveTheFold() compares against
// this value, so later window resizes are not reflected.
const VIEWPORT_HEIGHT = window.innerHeight;

/**
 * Reports whether the fnode's element begins at or above the bottom edge of
 * the viewport ("above the fold").
 *
 * @param {Object} fnode - Fathom node; its `element` is measured.
 * @returns {boolean} true when the element's top offset is <= VIEWPORT_HEIGHT.
 */
function isAboveTheFold(fnode) {
  const {top} = fnode.element.getBoundingClientRect();
  return top <= VIEWPORT_HEIGHT;
}

/**
 * Scores an fnode in direct proportion to the on-screen area of its element.
 *
 * @param {Object} fnode - Fathom node; its `element` is measured.
 * @returns {number} DEFAULT_SCORE for a zero-area element, otherwise the
 *   area multiplied by the `largerImage` coefficient.
 */
function largerImage(fnode) {
  const {width, height} = fnode.element.getBoundingClientRect();
  const area = width * height;
  return area === 0 ? DEFAULT_SCORE : area * fathomCoeffs.largerImage;
}

/**
 * Scores an fnode whose element has a '$' somewhere in its innerText.
 *
 * @param {Object} fnode - Fathom node; its `element.innerText` is searched.
 * @returns {number} the `hasDollarSign` coefficient on a match, otherwise
 *   DEFAULT_SCORE.
 */
function hasDollarSign(fnode) {
  const text = fnode.element.innerText;
  return text.includes('$') ? fathomCoeffs.hasDollarSign : DEFAULT_SCORE;
}

/**
 * Scores an fnode in direct proportion to its computed font size.
 *
 * The size is normalized by DEFAULT_BODY_FONT_SIZE and then multiplied by the
 * `largerFontSize` coefficient.
 *
 * @param {Object} fnode - Fathom node; its `element` is passed to
 *   window.getComputedStyle().
 * @returns {number} the scaled score, or DEFAULT_SCORE when the computed
 *   font size is empty or cannot be parsed as a number.
 */
function largerFontSize(fnode) {
  const {fontSize} = window.getComputedStyle(fnode.element);
  // parseFloat keeps fractional sizes (e.g. "17.5px") that parseInt would
  // truncate, and it ignores the trailing unit on its own.
  const size = Number.parseFloat(fontSize);
  if (!Number.isFinite(size)) {
    // Empty or non-numeric value: fall back rather than returning NaN.
    return DEFAULT_SCORE;
  }
  // Normalize the multiplier by the default font size.
  const sizeMultiplier = size / DEFAULT_BODY_FONT_SIZE;
  return sizeMultiplier * fathomCoeffs.largerFontSize;
}

/**
 * Scores an fnode whose element id contains "title" or "Title".
 *
 * @param {Object} fnode - Fathom node; its `element.id` is searched.
 * @returns {number} the `hasTitleInID` coefficient on a match, otherwise
 *   DEFAULT_SCORE.
 */
function hasTitleInID(fnode) {
  const {id} = fnode.element;
  const matches = ['title', 'Title'].some(needle => id.includes(needle));
  return matches ? fathomCoeffs.hasTitleInID : DEFAULT_SCORE;
}

/**
 * Scores an fnode whose element className contains "title" or "Title".
 *
 * @param {Object} fnode - Fathom node; its `element.className` is searched.
 * @returns {number} the `hasTitleInClassName` coefficient on a match,
 *   otherwise DEFAULT_SCORE.
 */
function hasTitleInClassName(fnode) {
  const {className} = fnode.element;
  const matches = ['title', 'Title'].some(needle => className.includes(needle));
  return matches ? fathomCoeffs.hasTitleInClassName : DEFAULT_SCORE;
}

/**
 * Scores an fnode whose element appears to be hidden.
 *
 * An element counts as hidden when it has no offsetParent, when its computed
 * visibility is 'hidden', when its computed opacity is '0', or when its
 * computed width or height parses to zero.
 *
 * @param {Object} fnode - Fathom node; its `element` is inspected.
 * @returns {number} the `isHidden` coefficient when hidden, otherwise
 *   DEFAULT_SCORE.
 */
function isHidden(fnode) {
  const element = fnode.element;
  const style = window.getComputedStyle(element);
  // Computed lengths are reported with a unit ('0px'), so compare the parsed
  // number instead of the literal string '0'. Non-numeric values such as
  // 'auto' parse to NaN and are therefore not treated as zero.
  const isZeroLength = value => Number.parseFloat(value) === 0;
  if (!element.offsetParent // null if the offsetParent has a display set to "none"
    || style.visibility === 'hidden'
    || style.opacity === '0'
    || isZeroLength(style.width)
    || isZeroLength(style.height)) {
    return fathomCoeffs.isHidden;
  }
  return DEFAULT_SCORE;
}

/**
 * Scores an fnode whose element is an H1.
 *
 * @param {Object} fnode - Fathom node; its `element.tagName` is checked.
 * @returns {number} the `isHeaderElement` coefficient for an H1, otherwise
 *   DEFAULT_SCORE.
 */
function isHeaderElement(fnode) {
  const {tagName} = fnode.element;
  return tagName === 'H1' ? fathomCoeffs.isHeaderElement : DEFAULT_SCORE;
}

/**
 * Ruleset for product features; each feature has its own type.
 *
 * Every feature follows the same three steps:
 *   1. a dom() rule selects candidate elements that pass isAboveTheFold and
 *      tags them with a feature-specific type ('imageish', 'titleish',
 *      'priceish');
 *   2. one or more score() rules apply the scoring callbacks defined in this
 *      file to candidates of that type;
 *   3. a max() rule exposes the top-scoring candidate under the feature's
 *      output key ('image', 'title' or 'price').
 */
const rules = ruleset(
/**
* Image rules
*/
// consider all img elements near the top of the page
rule(dom('img').when(isAboveTheFold), type('imageish')),
// better score for larger images
rule(type('imageish'), score(largerImage)),
// return image element with max score
rule(type('imageish').max(), out('image')),

/**
* Title rules
*/
// consider all h1 and span elements near the top of the page
rule(dom('h1, span').when(isAboveTheFold), type('titleish')),
// score higher for h1 elements
rule(type('titleish'), score(isHeaderElement)),
// check if the id has "title" in it
rule(type('titleish'), score(hasTitleInID)),
// check if any class names have "title" in them
rule(type('titleish'), score(hasTitleInClassName)),
// better score for larger font size
rule(type('titleish'), score(largerFontSize)),
// reduce score if element is hidden
rule(type('titleish'), score(isHidden)),
// return title element with max score
rule(type('titleish').max(), out('title')),

/**
* Price rules
*/
// consider all span and h2 elements near the top of the page
rule(dom('span, h2').when(isAboveTheFold), type('priceish')),
// check if the element has a '$' in its innerText
rule(type('priceish'), score(hasDollarSign)),
// better score for larger font size (same callback as the title rules)
rule(type('priceish'), score(largerFontSize)),
// reduce score if element is hidden (same callback as the title rules)
rule(type('priceish'), score(isHidden)),
// return price element with max score
rule(type('priceish').max(), out('price')),
);

/**
* Extracts the highest scoring element above a score threshold
* contained in a page's HTML document.
*/
function runRuleset(doc) {
const extractedElements = {};
const rules = productRuleset.get('product').rulesetMaker;
for (const feature of PRODUCT_FEATURES) {
let fnodesList = rules.against(doc).get(`${feature}`);
let fnodesList = rules([
largerImageCoeff,
largerFontSizeCoeff,
hasDollarSignCoeff,
hasTitleInIDCoeff,
hasTitleInClassNameCoeff,
isHiddenCoeff,
isHeaderElementCoeff,
]).against(doc).get(`${feature}`);
fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD);
// It is possible for multiple elements to have the same highest score.
if (fnodesList.length >= 1) {
Expand Down
Loading

0 comments on commit d94071d

Please sign in to comment.