-
Notifications
You must be signed in to change notification settings - Fork 15
#36: Integrate Fathom-based page extraction with a simple ruleset. #38
Changes from 2 commits
e3f5f9a
d364866
0980f10
a99d3ba
505c16d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{ | ||
"hasDivWithPriceClass": 1 | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
/** This Source Code Form is subject to the terms of the Mozilla Public | ||
* License, v. 2.0. If a copy of the MPL was not distributed with this | ||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | ||
|
||
/* | ||
* Using Fathom to extract a product from its product page, | ||
* where a 'product' is defined by the bundle of features that | ||
* makes it identifiable. | ||
* | ||
* Features: title, image, price | ||
*/ | ||
|
||
import {dom, out, rule, ruleset, score, type} from 'fathom-web'; | ||
import fathomCoeffs from 'commerce/fathom_coefficients.json'; | ||
|
||
/**
 * Scoring callback for 'priceish' candidate elements.
 *
 * When the element's class list contains "price", the score is the
 * hasDivWithPriceClass value read from the Fathom coefficients file;
 * otherwise the score is 1.
 *
 * @param {Object} fnode Fathom node whose `element` is the candidate
 * @returns {number} The coefficient to score this element with
 */
function hasDivWithPriceClass(fnode) {
  const {classList} = fnode.element;
  return classList.contains('price') ? fathomCoeffs.hasDivWithPriceClass : 1;
}
|
||
/**
 * Ruleset for product features. Each feature has its own type.
 *
 * Price extraction runs in three steps:
 *   1. every <div> in the document is tagged as a 'priceish' candidate;
 *   2. each candidate is scored by hasDivWithPriceClass, which looks for
 *      'price' in the element's class names;
 *   3. the candidate(s) with the maximum score are emitted under the
 *      'product-price' output key.
 */
const rules = ruleset(
  rule(dom('div'), type('priceish')),
  rule(type('priceish'), score(hasDivWithPriceClass)),
  rule(type('priceish').max(), out('product-price'))
);
|
||
/**
 * Runs the product ruleset against a page's HTML document and returns the
 * element emitted for the 'product-price' output.
 *
 * NOTE(review): nothing is tuned here despite the name; the function only
 * runs the ruleset. Reviewers asked for a rename, but the exported name is
 * kept so existing importers keep working.
 *
 * @param {Document} doc The document to run the ruleset against
 * @returns {?HTMLElement} The price element when exactly one element has the
 *   highest score; null when there are none or when several elements tie
 */
export default function runTuningRoutine(doc) {
  const fnodesList = rules.against(doc).get('product-price');
  // It is possible for multiple elements to have the same highest score,
  // in which case no element is returned.
  if (fnodesList.length !== 1) {
    return null;
  }
  return fnodesList[0].element;
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,14 @@ | |
* License, v. 2.0. If a copy of the MPL was not distributed with this | ||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | ||
|
||
/** | ||
* Note that this page is defined in manifest.json to run at "document_idle" | ||
* which is after all DOM content has been loaded. | ||
*/ | ||
|
||
import runTuningRoutine from 'commerce/fathom_ruleset'; | ||
import {retry} from 'commerce/utils'; | ||
import extractionData from './product_extraction_data.json'; | ||
import extractionData from 'commerce/product_extraction_data.json'; | ||
|
||
const OPEN_GRAPH_PROPERTY_VALUES = { | ||
title: 'og:title', | ||
|
@@ -49,85 +55,106 @@ async function openBackgroundPort() { | |
} | ||
}()); | ||
|
||
/**
 * Runs extraction for the current page and, only when it yields product
 * information, forwards that information to the background script as a
 * 'product-data' message on the given port.
 *
 * @param {Object} port Port whose postMessage method delivers the message
 */
async function getProductInfo(port) {
  const data = extractData();
  if (!data) {
    return;
  }
  port.postMessage({type: 'product-data', data});
}
|
||
/** | ||
* Returns any extraction data found for the vendor based on the URL | ||
* for the page. | ||
*/ | ||
function getProductAttributeInfo() { | ||
const hostname = new URL(window.location.href).host; | ||
for (const [vendor, attributeInfo] of Object.entries(extractionData)) { | ||
if (hostname.includes(vendor)) { | ||
return attributeInfo; | ||
const fallbackExtraction = { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we define each of these extraction objects in separate files and import them? That'd help focus this module a bit more. For the Fathom one, you can probably just define that in the existing fathom file and export it instead of the "return DOM nodes" function. |
||
/** | ||
* Returns any extraction data found for the vendor based on the URL | ||
* for the page. | ||
*/ | ||
getProductAttributeInfo() { | ||
const hostname = new URL(window.location.href).host; | ||
for (const [vendor, attributeInfo] of Object.entries(extractionData)) { | ||
if (hostname.includes(vendor)) { | ||
return attributeInfo; | ||
} | ||
} | ||
} | ||
return null; | ||
} | ||
return null; | ||
}, | ||
|
||
/** | ||
* Extracts and returns the string value for a given element property or attribute. | ||
* | ||
* @param {HTMLElement} element | ||
* @param {string} extractionProperty | ||
*/ | ||
function extractValueFromElement(element, extractionProperty) { | ||
switch (extractionProperty) { | ||
case 'content': | ||
return element.getAttribute('content'); | ||
case 'innerText': | ||
return element.innerText; | ||
case 'src': | ||
return element.src; | ||
default: | ||
throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`); | ||
} | ||
} | ||
/** | ||
* Extracts and returns the string value for a given element property or attribute. | ||
* | ||
* @param {HTMLElement} element | ||
* @param {string} extractionProperty | ||
*/ | ||
extractValueFromElement(element, extractionProperty) { | ||
switch (extractionProperty) { | ||
case 'content': | ||
return element.getAttribute('content'); | ||
case 'innerText': | ||
return element.innerText; | ||
case 'src': | ||
return element.src; | ||
default: | ||
throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`); | ||
} | ||
}, | ||
|
||
/** | ||
* Returns any product information available on the page from CSS | ||
* selectors if they exist, otherwise from Open Graph <meta> tags. | ||
*/ | ||
function extractData() { | ||
const data = {}; | ||
const attributeInfo = getProductAttributeInfo(); | ||
if (attributeInfo) { | ||
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) { | ||
const {selectors, extractUsing} = extractor; | ||
for (const selector of selectors) { | ||
const element = document.querySelector(selector); | ||
if (element) { | ||
data[productAttribute] = extractValueFromElement(element, extractUsing); | ||
if (data[productAttribute]) { | ||
break; | ||
} else { | ||
throw new Error(`Element found did not return a valid product ${productAttribute}.`); | ||
/** | ||
* Returns any product information available on the page from CSS | ||
* selectors if they exist, otherwise from Open Graph <meta> tags. | ||
*/ | ||
extractProduct() { | ||
const data = {}; | ||
const attributeInfo = this.getProductAttributeInfo(); | ||
if (attributeInfo) { | ||
for (const [productAttribute, extractor] of Object.entries(attributeInfo)) { | ||
const {selectors, extractUsing} = extractor; | ||
for (const selector of selectors) { | ||
const element = document.querySelector(selector); | ||
if (element) { | ||
data[productAttribute] = this.extractValueFromElement(element, extractUsing); | ||
if (data[productAttribute]) { | ||
break; | ||
} else { | ||
throw new Error(`Element found did not return a valid product ${productAttribute}.`); | ||
} | ||
} else if (selector === selectors[selectors.length - 1]) { | ||
// None of the selectors matched an element on the page | ||
throw new Error(`No elements found with vendor data for product ${productAttribute}.`); | ||
} | ||
} else if (selector === selectors[selectors.length - 1]) { | ||
// None of the selectors matched an element on the page | ||
throw new Error(`No elements found with vendor data for product ${productAttribute}.`); | ||
} | ||
} | ||
} else { | ||
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) { | ||
const metaEle = document.querySelector(`meta[property='${value}']`); | ||
if (metaEle) { | ||
data[key] = metaEle.getAttribute('content'); | ||
} | ||
} | ||
} | ||
} else { | ||
for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) { | ||
const metaEle = document.querySelector(`meta[property='${value}']`); | ||
if (metaEle) { | ||
data[key] = metaEle.getAttribute('content'); | ||
data.url = window.document.URL; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This extraction doesn't change per-extraction-method, so we can just add it in in |
||
return data; | ||
}, | ||
}; | ||
|
||
const fathomExtraction = {
  /**
   * Run the ruleset for the product features against the current window
   * document.
   *
   * @returns {?Object} An object holding the extracted `price` and the page
   *   `url`, or null when no price element was returned or its value is empty
   */
  extractProduct() {
    const priceEle = runTuningRoutine(window.document);
    if (!priceEle) {
      return null;
    }
    // <meta> elements carry their value in the content attribute; any other
    // element is read through its text content.
    const price = (priceEle.tagName === 'META')
      ? priceEle.getAttribute('content')
      : priceEle.textContent;
    if (!price) {
      return null;
    }
    return {
      price,
      url: window.document.URL,
    };
  },
};
|
||
/**
 * Extracts product information for the page and sends it to the background
 * script via the port. The Fathom-based extraction is tried first; the
 * fallback extraction is used only when Fathom returns nothing.
 *
 * @param {Object} port Port whose postMessage method delivers the message
 */
async function getProductInfo(port) {
  let extractedProduct = fathomExtraction.extractProduct();
  if (!extractedProduct) {
    extractedProduct = fallbackExtraction.extractProduct();
  }
  const message = {
    from: 'content',
    subject: 'ready',
    extractedProduct,
  };
  port.postMessage(message);
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Double asterisk for the doc comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doc comment is an excellent summary of the file!