diff --git a/src/fathom_coefficients.json b/src/fathom_coefficients.json
index 46f28f0..86836c2 100644
--- a/src/fathom_coefficients.json
+++ b/src/fathom_coefficients.json
@@ -1,3 +1,5 @@
{
- "hasDivWithPriceClass": 2
+ "isNearTopOfPage": 3,
+ "isSufficientlyLarge": 3,
+ "hasIdOrClassWithTitleSubstring": 2
}
diff --git a/src/fathom_extraction.js b/src/fathom_extraction.js
index ba4846e..cdfc768 100644
--- a/src/fathom_extraction.js
+++ b/src/fathom_extraction.js
@@ -13,32 +13,90 @@
import {dom, out, rule, ruleset, score, type} from 'fathom-web';
import fathomCoeffs from 'commerce/fathom_coefficients.json';
-const SCORE_THRESHOLD = fathomCoeffs.hasDivWithPriceClass;
+const PRODUCT_FEATURES = ['title', 'price', 'image']; // feature names; each is also the ruleset `out` key for that feature
+const SCORE_THRESHOLD = 3; // minimum `${feature}ish` score an fnode needs to be accepted by runRuleset
+const DEFAULT_SCORE = 1; // score a rule check returns when the fnode does not pass it
/**
- * Checks to see if an element is a <div> with a class of "price".
- * Returns an integer corresponding to the coefficient to use for
- * scoring an element with this rule.
+ * Each of these functions represents a rule check: if the fnode passes
+ * the rule, it gets a weighted score from 'fathom_coefficients.json';
+ * otherwise, it gets the default score.
*/
-function hasDivWithPriceClass(fnode) {
- if (fnode.element.classList.contains('price')) {
- return fathomCoeffs.hasDivWithPriceClass;
+
+/**
+ * Returns the isNearTopOfPage coefficient when the top of the element's bounding client rect is at most 800, else DEFAULT_SCORE.
+ */
+function isNearTopOfPage(fnode) {
+ const domRect = fnode.element.getBoundingClientRect(); // NOTE(review): this rect is viewport-relative, so the result depends on scroll position — confirm that is intended
+ if (domRect.top <= 800) {
+ return fathomCoeffs.isNearTopOfPage;
+ }
+ return DEFAULT_SCORE;
+}
+
+/**
+ * Returns the isSufficientlyLarge coefficient when the element's bounding client rect is at least 300 wide and 300 tall, else DEFAULT_SCORE.
+ */
+function isSufficientlyLarge(fnode) {
+ const domRect = fnode.element.getBoundingClientRect();
+ if (domRect.width >= 300 && domRect.height >= 300) {
+ return fathomCoeffs.isSufficientlyLarge;
 }
- return 1;
+ return DEFAULT_SCORE;
+}
+
+/**
+ * Returns the hasIdOrClassWithTitleSubstring coefficient when the element's id or one of its class names contains 'title' (case-sensitive), else DEFAULT_SCORE.
+ */
+function hasIdOrClassWithTitleSubstring(fnode) {
+ // An element's class list is array-like, so borrow Array's join to get one comma-separated string
+ const classListStr = Array.prototype.join.call(fnode.element.classList);
+ if (fnode.element.id.includes('title') || classListStr.includes('title')) {
+ return fathomCoeffs.hasIdOrClassWithTitleSubstring;
+ }
+ return DEFAULT_SCORE;
}
/**
* Ruleset for product features. Each feature has its own type.
*/
const rules = ruleset(
- // get all elements that could contain the price
- rule(dom('div'), type('priceish')),
+ // TODO: write rule(s) that ignore ancestors that have the
+ // same innerText value as a descendant
- // check class names to see if they contain 'price'
- rule(type('priceish'), score(hasDivWithPriceClass)),
+ /**
+ * Image rules
+ */
+ // TODO: also add rule for "largestImageOnPage"?
+ // consider all img elements in the DOM
+ rule(dom('img'), type('imageish')),
+ // check if these elements are near the top of the page
+ rule(type('imageish'), score(isNearTopOfPage)),
+ rule(type('imageish'), score(isSufficientlyLarge)), // check if these elements are sufficiently large
+ // emit the highest-scoring image element(s) under the 'image' key
+ rule(type('imageish').max(), out('image')),
+ /**
+ * Title rules
+ */
+ // consider all h1 and span elements in the DOM
+ rule(dom('h1, span'), type('titleish')),
+ // check if these elements are near the top of the page
+ rule(type('titleish'), score(isNearTopOfPage)),
+ // check if 'title' is a substring of the element's id or of one of its classes
+ rule(type('titleish'), score(hasIdOrClassWithTitleSubstring)),
+ // emit the highest-scoring title element(s) under the 'title' key
+ rule(type('titleish').max(), out('title')),
+
+ /**
+ * Price rules
+ */
+ // consider all span elements in the DOM
+ rule(dom('span'), type('priceish')),
+ // check if these elements are near the top of the page
+ rule(type('priceish'), score(isNearTopOfPage)),
// return price element with max score
- rule(type('priceish').max(), out('product-price')),
+ rule(type('priceish').max(), out('price')),
);
/**
@@ -46,27 +104,40 @@ const rules = ruleset(
* contained in a page's HTML document.
*/
function runRuleset(doc) {
- let fnodesList = rules.against(doc).get('product-price');
- fnodesList = fnodesList.filter(fnode => fnode.scoreFor('priceish') >= SCORE_THRESHOLD);
- // It is possible for multiple elements to have the same highest score.
- if (fnodesList.length >= 1) {
- return fnodesList[0].element;
+ const extractedElements = {}; // feature name -> DOM element; a feature is absent when nothing met SCORE_THRESHOLD
+ const facts = rules.against(doc); // bind the ruleset to the document once; every feature lookup below reuses it
+ for (const feature of PRODUCT_FEATURES) {
+ const fnodesList = facts.get(feature).filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD);
+ // Several elements can tie for the highest score; keep the first one.
+ if (fnodesList.length >= 1) {
+ extractedElements[feature] = fnodesList[0].element;
+ }
 }
- return null;
+ return extractedElements;
+}
+
+/**
+ * Returns true if `obj` has a truthy value for every key in PRODUCT_FEATURES, and false otherwise.
+ * TODO: Generalize and put in utils? Maybe make an array of all keys in
+ * the object and replace PRODUCT_FEATURES with that array?
+ */
+function hasAllFeatures(obj) {
+ return PRODUCT_FEATURES.every(key => obj[key]); // `every` tests truthiness directly; no intermediate array needed
}
/*
* Run the ruleset for the product features against the current window document
*/
export default function extractProduct(doc) {
- const priceEle = runRuleset(doc);
- if (priceEle) {
- const price = (priceEle.tagName !== 'META') ? priceEle.textContent : priceEle.getAttribute('content');
- if (price) {
- return {
- price,
- };
+ const extractedProduct = {};
+ const extractedElements = runRuleset(doc); // feature name -> element, only for features that met the score threshold
+ if (hasAllFeatures(extractedElements)) {
+ for (const feature of PRODUCT_FEATURES) {
+ extractedProduct[feature] = (feature === 'image' // the image feature yields the element's src; the others yield innerText
+ ? extractedElements[feature].src
+ : extractedElements[feature].innerText
+ );
 }
 }
- return null;
+ return hasAllFeatures(extractedProduct) ? extractedProduct : null; // null when a feature is missing or its extracted value is falsy (e.g. empty text)
}