mozilla · Osmose · Nov 20, 2018 · Oct 5, 2018 · Oct 19, 2018 · Oct 25, 2018
diff --git a/src/extraction/fathom/ruleset_factory.js b/src/extraction/fathom/ruleset_factory.js
@@ -3,12 +3,9 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 import {dom, out, rule, ruleset, score, type} from 'fathom-web';
-// Since the fathom-trainees add-on currently uses a submodule of Fathom, for
-// training, replace 'utils' with 'utilsForFrontend'
-import {ancestors} from 'fathom-web/utils';
+import {ancestors} from 'fathom-web/utilsForFrontend';
+import {euclidean} from 'fathom-web/clusters';
 
-const DEFAULT_BODY_FONT_SIZE = 14;
-const DEFAULT_SCORE = 1;
 const TOP_BUFFER = 150;
 // From: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js
 const ZEROISH = 0.08;
@@ -26,165 +23,130 @@ export default class RulesetFactory {
     [
       this.hasDollarSignCoeff,
       this.hasPriceInClassNameCoeff,
+      this.hasPriceInParentClassNameCoeff,
       this.hasPriceInIDCoeff,
+      this.hasPriceInParentIDCoeff,
       this.hasPriceishPatternCoeff,
       this.isAboveTheFoldImageCoeff,
       this.isAboveTheFoldPriceCoeff,
-      this.isNearbyImageXAxisPriceCoeff,
+      this.isNearImageCoeff,
       this.isNearbyImageYAxisTitleCoeff,
-      this.largerFontSizeCoeff,
-      this.largerImageCoeff,
+      this.bigFontCoeff,
+      this.bigImageCoeff,
     ] = coefficients;
   }
 
   /**
    * Scores fnode in direct proportion to its size
    */
-  largerImage(fnode) {
+  imageIsBig(fnode) {
     const domRect = fnode.element.getBoundingClientRect();
-    const area = (domRect.width) * (domRect.height);
-    if (area === 0) {
-      return DEFAULT_SCORE;
-    }
-    return area * this.largerImageCoeff;
+    const area = domRect.width * domRect.height;
+
+    // Assume no product images as small as 80px^2. No further bonus over
+    // 1000^2. For one thing, that's getting into background image territory
+    // (though we should have distinct penalties for that sort of thing if we
+    // care). More importantly, clamp the upper bound of the score so we don't
+    // overcome other bonuses and penalties.
+    return trapezoid(area, 80 ** 2, 1000 ** 2) ** this.bigImageCoeff;
   }
 
-  /**
-   * Scores fnode in proportion to its font size
-   */
-  largerFontSize(fnode) {
-    const size = window.getComputedStyle(fnode.element).fontSize;
-    // Normalize the multiplier by the default font size
-    const sizeMultiplier = parseFloat(size, 10) / DEFAULT_BODY_FONT_SIZE;
-    return sizeMultiplier * this.largerFontSizeCoeff;
+  /** Return whether a  */
+  fontIsBig(fnode) {
+    const size = parseInt(getComputedStyle(fnode.element).fontSize, 10);
+    return trapezoid(size, 14, 50) ** this.bigFontCoeff;
   }
 
   /**
    * Scores fnode with a '$' in its innerText
    */
   hasDollarSign(fnode) {
-    if (fnode.element.innerText.includes('$')) {
-      return this.hasDollarSignCoeff;
-    }
-    return DEFAULT_SCORE;
+    return (fnode.element.innerText.includes('$') ? ONEISH : ZEROISH) ** this.hasDollarSignCoeff;
+  }
+
+  /** Return a confidence of whether "price" is a word within a given string. */
+  contains(haystack, needle, coeff) {
+    return (haystack.toLowerCase().includes(needle) ? ONEISH : ZEROISH) ** coeff;
   }
 
   /**
-   * Scores fnode with 'price' in its id or its parent's id
+   * Scores fnode with 'price' in its id
    */
   hasPriceInID(fnode) {
-    const id = fnode.element.id;
-    const parentID = fnode.element.parentElement.id;
-    if (id.toLowerCase().includes('price')) {
-      return this.hasPriceInIDCoeff;
-    }
-    if (parentID.toLowerCase().includes('price')) {
-      return 0.75 * this.hasPriceInIDCoeff;
-    }
-    return DEFAULT_SCORE;
+    return this.contains(fnode.element.id, 'price', this.hasPriceInIDCoeff);
   }
 
-  /**
-   * Scores fnode with 'price' in its class name or its parent's class name
-   */
+  hasPriceInParentID(fnode) {
+    return this.contains(fnode.element.parentElement.id, 'price', this.hasPriceInParentIDCoeff);
+  }
+
+  /** Scores fnode with 'price' in its class name */
   hasPriceInClassName(fnode) {
-    const className = fnode.element.className;
-    const parentClassName = fnode.element.parentElement.className;
-    if (className.toLowerCase().includes('price')) {
-      return this.hasPriceInClassNameCoeff;
-    }
-    if (parentClassName.toLowerCase().includes('price')) {
-      return 0.75 * this.hasPriceInClassNameCoeff;
-    }
-    return DEFAULT_SCORE;
+    return this.contains(fnode.element.className, 'price', this.hasPriceInClassNameCoeff);
+  }
+
+  /** Scores fnode with 'price' in its class name */
+  hasPriceInParentClassName(fnode) {
+    return this.contains(fnode.element.parentElement.className, 'price', this.hasPriceInParentClassNameCoeff);
   }
 
   /**
    * Scores fnode by its vertical location relative to the fold
    */
   isAboveTheFold(fnode, featureCoeff) {
     const viewportHeight = window.innerHeight;
-    const top = fnode.element.getBoundingClientRect().top;
-    const upperHeightLimit = viewportHeight * 2;
+    const imageTop = fnode.element.getBoundingClientRect().top;
 
-    // If the node is below the fold by more than a viewport's length,
-    // return a low score.
-    if (top >= upperHeightLimit) {
-      return ZEROISH * featureCoeff;
-    }
-
-    // If the node is above the fold, return a high score.
-    if (top <= viewportHeight) {
-      return ONEISH * featureCoeff;
-    }
-
-    // Otherwise, scale the score linearly between the fold and a viewport's
-    // length below it.
-    const slope = (ONEISH - ZEROISH) / (viewportHeight - upperHeightLimit);
-    return (slope * (top - upperHeightLimit) + ZEROISH) * featureCoeff;
+    // Stop giving additional bonus for anything closer than 200px to the top
+    // of the viewport. Those are probably usually headers.
+    return trapezoid(imageTop, viewportHeight * 2, 200) ** featureCoeff;
   }
 
   /**
-   * Scores fnode based on its x distance from the highest scoring image element
+   * Return whether the centerpoint of the element is near that of the highest-
+   * scoring image.
    */
-  isNearbyImageXAxisPrice(fnode) {
-    const viewportWidth = window.innerWidth;
-    const eleDOMRect = fnode.element.getBoundingClientRect();
-    const imageElement = this.getHighestScoringImage(fnode);
-    const imageDOMRect = imageElement.getBoundingClientRect();
-    const deltaRight = eleDOMRect.left - imageDOMRect.right;
-    const deltaLeft = imageDOMRect.left - eleDOMRect.right;
-    // True if element is completely to the right or left of the image element
-    const noOverlap = (deltaRight > 0 || deltaLeft > 0);
-    let deltaX;
-    if (noOverlap) {
-      if (deltaRight > 0) {
-        deltaX = deltaRight;
-      } else {
-        deltaX = deltaLeft;
-      }
-      // Give a higher score the closer it is to the image, normalized by viewportWidth
-      return (viewportWidth / deltaX) * this.isNearbyImageXAxisPriceCoeff;
-    }
-    return DEFAULT_SCORE;
+  isNearImage(fnode) {
+    const image = this.getHighestScoringImage(fnode);
+    return trapezoid(euclidean(fnode, image), 1000, 0) ** this.isNearImageCoeff;
   }
 
   /**
-   * Scores fnode based on its y distance from the highest scoring image element
+   * Return whether the potential title is near the top or bottom of the
+   * highest-scoring image.
+   *
+   * This is a makeshift ORing 2 signals: a "near the top" and a "near the
+   * bottom" one.
    */
-  isNearbyImageYAxisTitle(fnode) {
-    const viewportHeight = window.innerHeight;
-    const DOMRect = fnode.element.getBoundingClientRect();
-    const imageElement = this.getHighestScoringImage(fnode);
-    const imageDOMRect = imageElement.getBoundingClientRect();
-    // Some titles (like on Ebay) are above the image, so include a top buffer
-    const isEleTopNearby = DOMRect.top >= (imageDOMRect.top - TOP_BUFFER);
-    const isEleBottomNearby = DOMRect.bottom <= imageDOMRect.bottom;
-    // Give elements in a specific vertical band a higher score
-    if (isEleTopNearby && isEleBottomNearby) {
-      const deltaY = Math.abs(imageDOMRect.top - DOMRect.top);
-      // Give a higher score the closer it is to the image, normalized by viewportHeight
-      return (viewportHeight / deltaY) * this.isNearbyImageYAxisTitleCoeff;
-    }
-    return DEFAULT_SCORE;
+  isNearImageTopOrBottom(fnode) {
+    const image = this.getHighestScoringImage(fnode).element;
+    const imageRect = image.getBoundingClientRect();
+    const nodeRect = fnode.element.getBoundingClientRect();
+
+    // Should cover title above image and title in a column next to image.
+    // Could also consider using the y-axis midpoint of title.
+    const topDistance = Math.abs(imageRect.top - nodeRect.top);
+
+    // Test nodeRect.top. They're probably not side by side with the title at
+    // the bottom. Rather, title will be below image.
+    const bottomDistance = Math.abs(imageRect.bottom - nodeRect.top);
+
+    const shortestDistance = Math.min(topDistance, bottomDistance);
+    return trapezoid(shortestDistance, 200, 0) ** this.isNearbyImageYAxisTitleCoeff;
   }
 
   /**
-   * Scores fnode whose innerText matches a priceish RegExp pattern
+   * Return whether the fnode's innertext contains a dollars-and-cents number.
    */
   hasPriceishPattern(fnode) {
     const text = fnode.element.innerText;
     /**
      * With an optional '$' that doesn't necessarily have to be at the beginning
      * of the string (ex: 'US $5.00' on Ebay), matches any number of digits before
-     * a decimal point and exactly two after, where the two digits after the decimal point
-     * are at the end of the string
+     * a decimal point and exactly two after.
      */
-    const regExp = /\${0,1}\d+\.\d{2}$/;
-    if (regExp.test(text)) {
-      return this.hasPriceishPatternCoeff;
-    }
-    return DEFAULT_SCORE;
+    const regExp = /\$?\d+\.\d{2}(?![0-9])/;
+    return (regExp.test(text) ? ONEISH : ZEROISH) ** this.hasPriceishPatternCoeff;
   }
 
   /**
@@ -238,7 +200,7 @@ export default class RulesetFactory {
   isNearbyImageYAxisPrice(fnode) {
     const element = fnode.element;
     const DOMRect = element.getBoundingClientRect();
-    const imageElement = this.getHighestScoringImage(fnode);
+    const imageElement = this.getHighestScoringImage(fnode).element;
     const imageDOMRect = imageElement.getBoundingClientRect();
     if (DOMRect.top >= (imageDOMRect.top - TOP_BUFFER)
       && DOMRect.bottom <= imageDOMRect.bottom) {
@@ -278,7 +240,7 @@ export default class RulesetFactory {
       // better score the closer the element is to the top of the page
       rule(type('imageish'), score(fnode => this.isAboveTheFold(fnode, this.isAboveTheFoldImageCoeff))),
       // better score for larger images
-      rule(type('imageish'), score(this.largerImage.bind(this))),
+      rule(type('imageish'), score(this.imageIsBig.bind(this))),
       // return image element(s) with max score
       rule(type('imageish').max(), out('image')),
 
@@ -288,27 +250,31 @@ export default class RulesetFactory {
       // consider all eligible h1 elements
       rule(dom('h1').when(this.isEligibleTitle.bind(this)), type('titleish')),
       // better score based on y-axis proximity to max scoring image element
-      rule(type('titleish'), score(this.isNearbyImageYAxisTitle.bind(this))),
+      rule(type('titleish'), score(this.isNearImageTopOrBottom.bind(this))),
       // return title element(s) with max score
       rule(type('titleish').max(), out('title')),
 
       /**
       * Price rules
       */
+      // 72% by itself, at [4, 4, 4, 4...]!:
       // consider all eligible span and h2 elements
       rule(dom('span, h2').when(this.isEligiblePrice.bind(this)), type('priceish')),
       // check if the element has a '$' in its innerText
       rule(type('priceish'), score(this.hasDollarSign.bind(this))),
       // better score the closer the element is to the top of the page
       rule(type('priceish'), score(fnode => this.isAboveTheFold(fnode, this.isAboveTheFoldPriceCoeff))),
+
       // check if the id has "price" in it
       rule(type('priceish'), score(this.hasPriceInID.bind(this))),
+      rule(type('priceish'), score(this.hasPriceInParentID.bind(this))),
       // check if any class names have "price" in them
       rule(type('priceish'), score(this.hasPriceInClassName.bind(this))),
+      rule(type('priceish'), score(this.hasPriceInParentClassName.bind(this))),
       // better score for larger font size
-      rule(type('priceish'), score(this.largerFontSize.bind(this))),
+      rule(type('priceish'), score(this.fontIsBig.bind(this))),
       // better score based on x-axis proximity to max scoring image element
-      rule(type('priceish'), score(this.isNearbyImageXAxisPrice.bind(this))),
+      rule(type('priceish'), score(this.isNearImage.bind(this))),
       // check if innerText has a priceish pattern
       rule(type('priceish'), score(this.hasPriceishPattern.bind(this))),
       // return price element(s) with max score
@@ -331,6 +297,33 @@ export default class RulesetFactory {
   }
 
   getHighestScoringImage(fnode) {
-    return fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle
+    return fnode._ruleset.get('image')[0]; // eslint-disable-line no-underscore-dangle
   }
 }
+
+/**
+ * Scale a number to the range [ZEROISH, ONEISH].
+ *
+ * For a rising trapezoid, the result is ZEROISH until the input reaches
+ * zeroAt, then increases linearly until oneAt, at which it becomes ONEISH. To
+ * make a falling trapezoid, where the result is ONEISH to the left and ZEROISH
+ * to the right, use a zeroAt greater than oneAt.
+ */
+function trapezoid(number, zeroAt, oneAt) {
+    const isRising = zeroAt < oneAt;
+    if (isRising) {
+        if (number <= zeroAt) {
+            return ZEROISH;
+        } else if (number >= oneAt) {
+            return ONEISH;
+        }
+    } else {
+        if (number >= zeroAt) {
+            return ZEROISH;
+        } else if (number <= oneAt) {
+            return ONEISH;
+        }
+    }
+    const slope = (ONEISH - ZEROISH) / (oneAt - zeroAt);
+    return slope * (number - zeroAt) + ZEROISH;
+}