Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
Merge pull request #45 from mozilla/36-fathom-rules
Browse files Browse the repository at this point in the history
Fix #36: Add initial Fathom rules with 98.7% average training accuracy
  • Loading branch information
biancadanforth authored Aug 27, 2018
2 parents 35197f8 + e3e4f15 commit 5c50172
Show file tree
Hide file tree
Showing 12 changed files with 502 additions and 77 deletions.
1 change: 1 addition & 0 deletions .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"no-restricted-syntax": ["off"],
"no-use-before-define": ["error", {"functions": false}],
"no-prototype-builtins": ["off"],
"class-methods-use-this": ["off"],

"react/jsx-one-expression-per-line": ["off"],
"react/prefer-stateless-function": ["off"],
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ node_modules
web-ext-artifacts
build
gecko.log
.DS_Store
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ After this, you can run `pipenv run test` to run the automated test suite.

- `src/background` contains the background scripts that trigger UI elements (such as the page action) and periodically check for price updates.
- `src/browser_action` contains the toolbar popup for managing the list of currently-tracked products.
- `src/extraction` contains the content scripts that extract product information from product web pages.
- `src/page_action` contains the URL bar popup for viewing and tracking the product in the current tab.
- `src/state` contains the Redux-based code for managing global extension state.
- `src/tests` contains the automated test suite.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
/**
* Uses CSS selectors, or failing that, Open Graph <meta> tags to extract
* a product from its product page, where a 'product' is defined by the bundle
* of features that makes it identifiable.
*
* Features: title, image, price
*/

import extractionData from 'commerce/product_extraction_data.json';
import extractionData from 'commerce/extraction/product_extraction_data.json';

const OPEN_GRAPH_PROPERTY_VALUES = {
title: 'og:title',
Expand Down
12 changes: 12 additions & 0 deletions src/extraction/fathom_default_coefficients.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"hasDollarSignCoeff": 8,
"hasPriceInClassNameCoeff": 2,
"hasPriceInIDCoeff": 17,
"hasPriceishPatternCoeff": 15,
"isAboveTheFoldImageCoeff": 13,
"isAboveTheFoldPriceCoeff": 33,
"isNearbyImageXAxisPriceCoeff": 5,
"isNearbyImageYAxisTitleCoeff": 5,
"largerFontSizeCoeff": 7,
"largerImageCoeff": 2
}
67 changes: 67 additions & 0 deletions src/extraction/fathom_extraction.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
* Uses Fathom to extract a product from its product page,
* where a 'product' is defined by the bundle of features that
* makes it identifiable.
*
* Features: title, image, price
*/

import defaultCoefficients from 'commerce/extraction/fathom_default_coefficients.json';
import RulesetFactory from 'commerce/extraction/ruleset_factory';

// Names of the features that together make up a product. In this module each
// name is used as the key passed to the ruleset results (`results.get(feature)`),
// as the key of the extracted objects, and, with an "ish" suffix, as the score
// type compared against SCORE_THRESHOLD.
const PRODUCT_FEATURES = ['title', 'price', 'image'];
// Minimum score (inclusive) to be considered the "correct" feature element
// extracted by Fathom; candidates scoring below it are discarded.
const SCORE_THRESHOLD = 4;
// Array of numbers corresponding to the coefficients in order. The ordering is
// whatever RulesetFactory.getCoeffsInOrder produces from the default
// coefficients JSON (that helper is defined in another module).
const coefficients = RulesetFactory.getCoeffsInOrder(defaultCoefficients);
// For production, we don't need to generate a new ruleset factory
// and ruleset every time we run Fathom, since the coefficients are static.
// Both are therefore built once, when this module is first evaluated.
const rulesetFactory = new RulesetFactory(coefficients);
// Shared ruleset that runRuleset applies to each document it is given.
const rules = rulesetFactory.makeRuleset();

/**
 * Applies the shared ruleset to a document and selects, for each product
 * feature, one element whose score meets the threshold.
 *
 * @param {Document} doc - The page's HTML document to run the ruleset against.
 * @returns {Object} Map of feature name to the selected element. A feature is
 *   left out entirely when none of its candidates scores at least
 *   SCORE_THRESHOLD.
 */
function runRuleset(doc) {
  const facts = rules.against(doc);
  const extractedElements = {};
  PRODUCT_FEATURES.forEach((feature) => {
    const scoreType = `${feature}ish`;
    const candidates = facts
      .get(feature)
      .filter(fnode => fnode.scoreFor(scoreType) >= SCORE_THRESHOLD);
    // Several elements can share the same highest score; when that happens
    // the first one in the returned list is used.
    if (candidates.length > 0) {
      extractedElements[feature] = candidates[0].element;
    }
  });
  return extractedElements;
}

/**
 * Checks whether an object holds a usable value for every product feature.
 *
 * @param {Object} obj - Object keyed by feature name.
 * @returns {boolean} True only when each name listed in PRODUCT_FEATURES maps
 *   to a truthy value on obj; missing keys and falsy values (such as an empty
 *   string) make the result false.
 */
function hasAllFeatures(obj) {
  return PRODUCT_FEATURES.every(featureName => Boolean(obj[featureName]));
}

/**
 * Runs the ruleset for the product features against the given document and
 * turns the extracted elements into plain product data.
 *
 * @param {Document} doc - The document to extract a product from.
 * @returns {Object|null} Object with one entry per name in PRODUCT_FEATURES:
 *   the element's `src` for 'image' and its `innerText` for every other
 *   feature. Null when any feature has no extracted element or when any
 *   extracted value is falsy (for example an empty string).
 */
export default function extractProduct(doc) {
  const elements = runRuleset(doc);
  // Without an element for every feature there is no product to report.
  if (!hasAllFeatures(elements)) {
    return null;
  }

  const product = {};
  PRODUCT_FEATURES.forEach((feature) => {
    const element = elements[feature];
    product[feature] = feature === 'image' ? element.src : element.innerText;
  });

  // An element can exist yet yield an empty value, so check the values too.
  return hasAllFeatures(product) ? product : null;
}
Loading

0 comments on commit 5c50172

Please sign in to comment.