-
Notifications
You must be signed in to change notification settings - Fork 15
92% price accuracy #275
92% price accuracy #275
Changes from 16 commits
2558ea5
ca8d4c8
ce27b0d
d8e6f43
46b61d9
2eb12b8
51e6b30
6909520
8d38da1
525bd54
b567dd9
90423da
74658e9
c52af18
737ca67
47bbc7a
307fa3d
e1f4479
f0eba0d
3fc5856
dd16bf0
418a9cf
3486879
9217d83
89da723
0c888cb
1735fe3
82709cf
ac1d304
ba89ed4
fd6b043
6ce3ea9
5d55d10
45db392
e9edf39
c0570d3
a57656a
4cca17a
ea275aa
57ca295
aa4bd88
3c6649a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,12 +3,9 @@ | |
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | ||
|
||
import {dom, out, rule, ruleset, score, type} from 'fathom-web'; | ||
// Since the fathom-trainees add-on currently uses a submodule of Fathom, for | ||
// training, replace 'utils' with 'utilsForFrontend' | ||
import {ancestors} from 'fathom-web/utils'; | ||
import {ancestors} from 'fathom-web/utilsForFrontend'; | ||
import {euclidean} from 'fathom-web/clusters'; | ||
|
||
const DEFAULT_BODY_FONT_SIZE = 14; | ||
const DEFAULT_SCORE = 1; | ||
const TOP_BUFFER = 150; | ||
// From: https://github.com/mozilla/fathom-trainees/blob/master/src/trainees.js | ||
const ZEROISH = 0.08; | ||
|
@@ -26,165 +23,130 @@ export default class RulesetFactory { | |
[ | ||
this.hasDollarSignCoeff, | ||
this.hasPriceInClassNameCoeff, | ||
this.hasPriceInParentClassNameCoeff, | ||
this.hasPriceInIDCoeff, | ||
this.hasPriceInParentIDCoeff, | ||
this.hasPriceishPatternCoeff, | ||
this.isAboveTheFoldImageCoeff, | ||
this.isAboveTheFoldPriceCoeff, | ||
this.isNearbyImageXAxisPriceCoeff, | ||
this.isNearImageCoeff, | ||
this.isNearbyImageYAxisTitleCoeff, | ||
this.largerFontSizeCoeff, | ||
this.largerImageCoeff, | ||
this.bigFontCoeff, | ||
this.bigImageCoeff, | ||
] = coefficients; | ||
} | ||
|
||
/** | ||
* Scores fnode in direct proportion to its size | ||
*/ | ||
largerImage(fnode) { | ||
imageIsBig(fnode) { | ||
const domRect = fnode.element.getBoundingClientRect(); | ||
const area = (domRect.width) * (domRect.height); | ||
if (area === 0) { | ||
return DEFAULT_SCORE; | ||
} | ||
return area * this.largerImageCoeff; | ||
const area = domRect.width * domRect.height; | ||
|
||
// Assume no product images as small as 80px^2. No further bonus over | ||
// 1000^2. For one thing, that's getting into background image territory | ||
// (though we should have distinct penalties for that sort of thing if we | ||
// care). More importantly, clamp the upper bound of the score so we don't | ||
// overcome other bonuses and penalties. | ||
return trapezoid(area, 80 ** 2, 1000 ** 2) ** this.bigImageCoeff; | ||
} | ||
|
||
/** | ||
* Scores fnode in proportion to its font size | ||
*/ | ||
largerFontSize(fnode) { | ||
const size = window.getComputedStyle(fnode.element).fontSize; | ||
// Normalize the multiplier by the default font size | ||
const sizeMultiplier = parseFloat(size, 10) / DEFAULT_BODY_FONT_SIZE; | ||
return sizeMultiplier * this.largerFontSizeCoeff; | ||
/** Return whether a */ | ||
fontIsBig(fnode) { | ||
const size = parseInt(getComputedStyle(fnode.element).fontSize, 10); | ||
return trapezoid(size, 14, 50) ** this.bigFontCoeff; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For as much as I don't like the trapezoid name, it is making some of these rules a lot easier to understand. 👍 |
||
} | ||
|
||
/** | ||
* Scores fnode with a '$' in its innerText | ||
*/ | ||
hasDollarSign(fnode) { | ||
if (fnode.element.innerText.includes('$')) { | ||
return this.hasDollarSignCoeff; | ||
} | ||
return DEFAULT_SCORE; | ||
return (fnode.element.innerText.includes('$') ? ONEISH : ZEROISH) ** this.hasDollarSignCoeff; | ||
} | ||
|
||
/** Return a confidence of whether "price" is a word within a given string. */ | ||
contains(haystack, needle, coeff) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a tricky naming problem, huh. Maybe rename There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a good call. I named those super-fast and never made a second pass to clean them up. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. case-insensitive. But I ended up spelling it out. |
||
return (haystack.toLowerCase().includes(needle) ? ONEISH : ZEROISH) ** coeff; | ||
} | ||
|
||
/** | ||
* Scores fnode with 'price' in its id or its parent's id | ||
* Scores fnode with 'price' in its id | ||
*/ | ||
hasPriceInID(fnode) { | ||
const id = fnode.element.id; | ||
const parentID = fnode.element.parentElement.id; | ||
if (id.toLowerCase().includes('price')) { | ||
return this.hasPriceInIDCoeff; | ||
} | ||
if (parentID.toLowerCase().includes('price')) { | ||
return 0.75 * this.hasPriceInIDCoeff; | ||
} | ||
return DEFAULT_SCORE; | ||
return this.contains(fnode.element.id, 'price', this.hasPriceInIDCoeff); | ||
} | ||
|
||
/** | ||
* Scores fnode with 'price' in its class name or its parent's class name | ||
*/ | ||
hasPriceInParentID(fnode) { | ||
return this.contains(fnode.element.parentElement.id, 'price', this.hasPriceInParentIDCoeff); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It actually comes out as "", so it's fine. :-D |
||
} | ||
|
||
/** Scores fnode with 'price' in its class name */ | ||
hasPriceInClassName(fnode) { | ||
const className = fnode.element.className; | ||
const parentClassName = fnode.element.parentElement.className; | ||
if (className.toLowerCase().includes('price')) { | ||
return this.hasPriceInClassNameCoeff; | ||
} | ||
if (parentClassName.toLowerCase().includes('price')) { | ||
return 0.75 * this.hasPriceInClassNameCoeff; | ||
} | ||
return DEFAULT_SCORE; | ||
return this.contains(fnode.element.className, 'price', this.hasPriceInClassNameCoeff); | ||
} | ||
|
||
/** Scores fnode with 'price' in its class name */ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In its parent's class name |
||
hasPriceInParentClassName(fnode) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea to break these each out into their own rules with their own coefficients for node versus parent node. |
||
return this.contains(fnode.element.parentElement.className, 'price', this.hasPriceInParentClassNameCoeff); | ||
} | ||
|
||
/** | ||
* Scores fnode by its vertical location relative to the fold | ||
*/ | ||
isAboveTheFold(fnode, featureCoeff) { | ||
const viewportHeight = window.innerHeight; | ||
const top = fnode.element.getBoundingClientRect().top; | ||
const upperHeightLimit = viewportHeight * 2; | ||
const imageTop = fnode.element.getBoundingClientRect().top; | ||
|
||
// If the node is below the fold by more than a viewport's length, | ||
// return a low score. | ||
if (top >= upperHeightLimit) { | ||
return ZEROISH * featureCoeff; | ||
} | ||
|
||
// If the node is above the fold, return a high score. | ||
if (top <= viewportHeight) { | ||
return ONEISH * featureCoeff; | ||
} | ||
|
||
// Otherwise, scale the score linearly between the fold and a viewport's | ||
// length below it. | ||
const slope = (ONEISH - ZEROISH) / (viewportHeight - upperHeightLimit); | ||
return (slope * (top - upperHeightLimit) + ZEROISH) * featureCoeff; | ||
// Stop giving additional bonus for anything closer than 200px to the top | ||
// of the viewport. Those are probably usually headers. | ||
return trapezoid(imageTop, viewportHeight * 2, 200) ** featureCoeff; | ||
} | ||
|
||
/** | ||
* Scores fnode based on its x distance from the highest scoring image element | ||
* Return whether the centerpoint of the element is near that of the highest- | ||
* scoring image. | ||
*/ | ||
isNearbyImageXAxisPrice(fnode) { | ||
const viewportWidth = window.innerWidth; | ||
const eleDOMRect = fnode.element.getBoundingClientRect(); | ||
const imageElement = this.getHighestScoringImage(fnode); | ||
const imageDOMRect = imageElement.getBoundingClientRect(); | ||
const deltaRight = eleDOMRect.left - imageDOMRect.right; | ||
const deltaLeft = imageDOMRect.left - eleDOMRect.right; | ||
// True if element is completely to the right or left of the image element | ||
const noOverlap = (deltaRight > 0 || deltaLeft > 0); | ||
let deltaX; | ||
if (noOverlap) { | ||
if (deltaRight > 0) { | ||
deltaX = deltaRight; | ||
} else { | ||
deltaX = deltaLeft; | ||
} | ||
// Give a higher score the closer it is to the image, normalized by viewportWidth | ||
return (viewportWidth / deltaX) * this.isNearbyImageXAxisPriceCoeff; | ||
} | ||
return DEFAULT_SCORE; | ||
isNearImage(fnode) { | ||
const image = this.getHighestScoringImage(fnode); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we should be explicit about types? const imageFnode = this.getHighestScoringImageFnode(fnode); |
||
return trapezoid(euclidean(fnode, image), 1000, 0) ** this.isNearImageCoeff; | ||
} | ||
|
||
/** | ||
* Scores fnode based on its y distance from the highest scoring image element | ||
* Return whether the potential title is near the top or bottom of the | ||
* highest-scoring image. | ||
* | ||
* This is a makeshift ORing 2 signals: a "near the top" and a "near the | ||
* bottom" one. | ||
*/ | ||
isNearbyImageYAxisTitle(fnode) { | ||
const viewportHeight = window.innerHeight; | ||
const DOMRect = fnode.element.getBoundingClientRect(); | ||
const imageElement = this.getHighestScoringImage(fnode); | ||
const imageDOMRect = imageElement.getBoundingClientRect(); | ||
// Some titles (like on Ebay) are above the image, so include a top buffer | ||
const isEleTopNearby = DOMRect.top >= (imageDOMRect.top - TOP_BUFFER); | ||
const isEleBottomNearby = DOMRect.bottom <= imageDOMRect.bottom; | ||
// Give elements in a specific vertical band a higher score | ||
if (isEleTopNearby && isEleBottomNearby) { | ||
const deltaY = Math.abs(imageDOMRect.top - DOMRect.top); | ||
// Give a higher score the closer it is to the image, normalized by viewportHeight | ||
return (viewportHeight / deltaY) * this.isNearbyImageYAxisTitleCoeff; | ||
} | ||
return DEFAULT_SCORE; | ||
isNearImageTopOrBottom(fnode) { | ||
const image = this.getHighestScoringImage(fnode).element; | ||
const imageRect = image.getBoundingClientRect(); | ||
const nodeRect = fnode.element.getBoundingClientRect(); | ||
|
||
// Should cover title above image and title in a column next to image. | ||
// Could also consider using the y-axis midpoint of title. | ||
const topDistance = Math.abs(imageRect.top - nodeRect.top); | ||
|
||
// Test nodeRect.top. They're probably not side by side with the title at | ||
// the bottom. Rather, title will be below image. | ||
const bottomDistance = Math.abs(imageRect.bottom - nodeRect.top); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't think of a product page I have seen where the title is below the image -- did you find some from developing your test corpus? |
||
|
||
const shortestDistance = Math.min(topDistance, bottomDistance); | ||
return trapezoid(shortestDistance, 200, 0) ** this.isNearbyImageYAxisTitleCoeff; | ||
} | ||
|
||
/** | ||
* Scores fnode whose innerText matches a priceish RegExp pattern | ||
* Return whether the fnode's innertext contains a dollars-and-cents number. | ||
*/ | ||
hasPriceishPattern(fnode) { | ||
const text = fnode.element.innerText; | ||
/** | ||
* With an optional '$' that doesn't necessarily have to be at the beginning | ||
* of the string (ex: 'US $5.00' on Ebay), matches any number of digits before | ||
* a decimal point and exactly two after, where the two digits after the decimal point | ||
* are at the end of the string | ||
* a decimal point and exactly two after. | ||
*/ | ||
const regExp = /\${0,1}\d+\.\d{2}$/; | ||
if (regExp.test(text)) { | ||
return this.hasPriceishPatternCoeff; | ||
} | ||
return DEFAULT_SCORE; | ||
const regExp = /\$?\d+\.\d{2}(?![0-9])/; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should the negative lookahead be optional, for price strings at the end of the element text? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, because the negative lookahead succeeds when the string ends. (After all, there is indeed no digit there.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this addition, // Trying it out in a REPL...
const regExpBefore = /\${0,1}\d+\.\d{2}$/;
const regExpAfter = /\$?\d+\.\d{2}(?![0-9])/;
const sample = 'US $4.99e';
regExpBefore.test(sample); // false
regExpAfter.test(sample); // true |
||
return (regExp.test(text) ? ONEISH : ZEROISH) ** this.hasPriceishPatternCoeff; | ||
} | ||
|
||
/** | ||
|
@@ -238,7 +200,7 @@ export default class RulesetFactory { | |
isNearbyImageYAxisPrice(fnode) { | ||
const element = fnode.element; | ||
const DOMRect = element.getBoundingClientRect(); | ||
const imageElement = this.getHighestScoringImage(fnode); | ||
const imageElement = this.getHighestScoringImage(fnode).element; | ||
const imageDOMRect = imageElement.getBoundingClientRect(); | ||
if (DOMRect.top >= (imageDOMRect.top - TOP_BUFFER) | ||
&& DOMRect.bottom <= imageDOMRect.bottom) { | ||
|
@@ -278,7 +240,7 @@ export default class RulesetFactory { | |
// better score the closer the element is to the top of the page | ||
rule(type('imageish'), score(fnode => this.isAboveTheFold(fnode, this.isAboveTheFoldImageCoeff))), | ||
// better score for larger images | ||
rule(type('imageish'), score(this.largerImage.bind(this))), | ||
rule(type('imageish'), score(this.imageIsBig.bind(this))), | ||
// return image element(s) with max score | ||
rule(type('imageish').max(), out('image')), | ||
|
||
|
@@ -288,27 +250,31 @@ export default class RulesetFactory { | |
// consider all eligible h1 elements | ||
rule(dom('h1').when(this.isEligibleTitle.bind(this)), type('titleish')), | ||
// better score based on y-axis proximity to max scoring image element | ||
rule(type('titleish'), score(this.isNearbyImageYAxisTitle.bind(this))), | ||
rule(type('titleish'), score(this.isNearImageTopOrBottom.bind(this))), | ||
// return title element(s) with max score | ||
rule(type('titleish').max(), out('title')), | ||
|
||
/** | ||
* Price rules | ||
*/ | ||
// 72% by itself, at [4, 4, 4, 4...]!: | ||
// consider all eligible span and h2 elements | ||
rule(dom('span, h2').when(this.isEligiblePrice.bind(this)), type('priceish')), | ||
// check if the element has a '$' in its innerText | ||
rule(type('priceish'), score(this.hasDollarSign.bind(this))), | ||
// better score the closer the element is to the top of the page | ||
rule(type('priceish'), score(fnode => this.isAboveTheFold(fnode, this.isAboveTheFoldPriceCoeff))), | ||
|
||
// check if the id has "price" in it | ||
rule(type('priceish'), score(this.hasPriceInID.bind(this))), | ||
rule(type('priceish'), score(this.hasPriceInParentID.bind(this))), | ||
// check if any class names have "price" in them | ||
rule(type('priceish'), score(this.hasPriceInClassName.bind(this))), | ||
rule(type('priceish'), score(this.hasPriceInParentClassName.bind(this))), | ||
// better score for larger font size | ||
rule(type('priceish'), score(this.largerFontSize.bind(this))), | ||
rule(type('priceish'), score(this.fontIsBig.bind(this))), | ||
// better score based on x-axis proximity to max scoring image element | ||
rule(type('priceish'), score(this.isNearbyImageXAxisPrice.bind(this))), | ||
rule(type('priceish'), score(this.isNearImage.bind(this))), | ||
// check if innerText has a priceish pattern | ||
rule(type('priceish'), score(this.hasPriceishPattern.bind(this))), | ||
// return price element(s) with max score | ||
|
@@ -331,6 +297,33 @@ export default class RulesetFactory { | |
} | ||
|
||
getHighestScoringImage(fnode) { | ||
return fnode._ruleset.get('image')[0].element; // eslint-disable-line no-underscore-dangle | ||
return fnode._ruleset.get('image')[0]; // eslint-disable-line no-underscore-dangle | ||
} | ||
} | ||
|
||
/** | ||
* Scale a number to the range [ZEROISH, ONEISH]. | ||
* | ||
* For a rising trapezoid, the result is ZEROISH until the input reaches | ||
* zeroAt, then increases linearly until oneAt, at which it becomes ONEISH. To | ||
* make a falling trapezoid, where the result is ONEISH to the left and ZEROISH | ||
* to the right, use a zeroAt greater than oneAt. | ||
*/ | ||
function trapezoid(number, zeroAt, oneAt) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alternatively, we could go with the more semantic meaning of this, which is something like: return STRONG_MATCH; // rename ONEISH to STRONG_MATCH
return WEAK_MATCH; // rename ZEROISH to WEAK_MATCH
return scaledMatch(number, {weak: 15, strong: 7}); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like your linearScale spelling. Changing. I'm not sure what you have in mind with defaulting the zeroAt param to ZEROISH and oneAt to ONEISH. ZEROISH and ONEISH are the output values of linearScale, not typically the input ones. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't mean to default them, rather was just putting them in as rando arguments. |
||
const isRising = zeroAt < oneAt; | ||
if (isRising) { | ||
if (number <= zeroAt) { | ||
return ZEROISH; | ||
} else if (number >= oneAt) { | ||
return ONEISH; | ||
} | ||
} else { | ||
if (number >= zeroAt) { | ||
return ZEROISH; | ||
} else if (number <= oneAt) { | ||
return ONEISH; | ||
} | ||
} | ||
const slope = (ONEISH - ZEROISH) / (oneAt - zeroAt); | ||
return slope * (number - zeroAt) + ZEROISH; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: It would be easier here if these threshold values (14, 50) were stored in
const
variables at the top for future tweaking. That way they could also be described in terms of what units they're in, and potentially used by other rules.