This repository has been archived by the owner on Dec 3, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #38 from mozilla/36-fathom
#36: Integrate Fathom-based page extraction with a simple ruleset.
- Loading branch information
Showing
10 changed files
with
2,808 additions
and
2,526 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/* This Source Code Form is subject to the terms of the Mozilla Public | ||
* License, v. 2.0. If a copy of the MPL was not distributed with this | ||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | ||
|
||
/* | ||
* Uses CSS selectors, or failing that, Open Graph <meta> tags to extract | ||
* a product from its product page, where a 'product' is defined by the bundle | ||
* of features that makes it identifiable. | ||
* | ||
* Features: title, image, price | ||
*/ | ||
|
||
import extractionData from 'commerce/product_extraction_data.json'; | ||
|
||
/**
 * Maps each product feature name to the Open Graph `property` value of the
 * <meta> tag that carries it. Frozen so this shared lookup table cannot be
 * modified by accident at runtime.
 */
const OPEN_GRAPH_PROPERTY_VALUES = Object.freeze({
  title: 'og:title',
  image: 'og:image',
  price: 'og:price:amount',
});
|
||
/**
 * Looks up vendor-specific extraction data for the current page.
 *
 * The page's host (taken from window.location.href, so it includes any
 * port) is checked against every vendor key in the extraction data; the
 * first key that occurs anywhere inside the host wins.
 *
 * @returns {?Object} The extraction data stored under the first matching
 *   vendor key, or null when no vendor key matches.
 */
function getProductAttributeInfo() {
  const {host} = new URL(window.location.href);
  const matchingEntry = Object.entries(extractionData)
    .find(([vendor]) => host.includes(vendor));
  return matchingEntry ? matchingEntry[1] : null;
}
|
||
/**
 * Reads a single value off an element, using the named property or
 * attribute as the source.
 *
 * Supported sources: 'content' (the content attribute), 'innerText' and
 * 'src' (element properties).
 *
 * @param {HTMLElement} element
 * @param {string} extractionProperty
 * @returns {*} Whatever the chosen attribute or property holds.
 * @throws {Error} If extractionProperty is not one of the supported sources.
 */
function extractValueFromElement(element, extractionProperty) {
  if (extractionProperty === 'content') {
    return element.getAttribute('content');
  }
  if (extractionProperty === 'innerText') {
    return element.innerText;
  }
  if (extractionProperty === 'src') {
    return element.src;
  }
  throw new Error(`Unrecognized extraction property or attribute '${extractionProperty}'.`);
}
|
||
/**
 * Returns any product information available on the page from CSS
 * selectors if vendor data exists for the page, otherwise from Open Graph
 * <meta> tags.
 *
 * With vendor data, each product attribute is read from the first of its
 * selectors that matches an element on the page. Without vendor data, each
 * Open Graph <meta> tag that is present contributes its content attribute;
 * missing tags are simply skipped.
 *
 * @returns {Object} Extracted values keyed by product attribute name.
 * @throws {Error} In the vendor-data path, if the first matched element
 *   yields an empty value, or if none of an attribute's selectors match
 *   any element.
 */
export default function extractProduct() {
  const data = {};
  const attributeInfo = getProductAttributeInfo();

  if (!attributeInfo) {
    // No vendor-specific selectors for this page: fall back to Open Graph.
    for (const [key, value] of Object.entries(OPEN_GRAPH_PROPERTY_VALUES)) {
      const metaEle = document.querySelector(`meta[property='${value}']`);
      if (metaEle) {
        data[key] = metaEle.getAttribute('content');
      }
    }
    return data;
  }

  for (const [productAttribute, extractor] of Object.entries(attributeInfo)) {
    const {selectors, extractUsing} = extractor;
    // Track the position rather than comparing selector strings, so a
    // selector that is repeated at the end of the list cannot be mistaken
    // for the last one while later selectors are still untried.
    for (const [index, selector] of selectors.entries()) {
      const element = document.querySelector(selector);
      if (element) {
        const value = extractValueFromElement(element, extractUsing);
        if (!value) {
          throw new Error(`Element found did not return a valid product ${productAttribute}.`);
        }
        data[productAttribute] = value;
        break;
      }
      if (index === selectors.length - 1) {
        // None of the selectors matched an element on the page
        throw new Error(`No elements found with vendor data for product ${productAttribute}.`);
      }
    }
  }
  return data;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{ | ||
"hasPriceClass": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/* This Source Code Form is subject to the terms of the Mozilla Public | ||
* License, v. 2.0. If a copy of the MPL was not distributed with this | ||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */ | ||
|
||
/* | ||
* Uses Fathom to extract a product from its product page, | ||
* where a 'product' is defined by the bundle of features that | ||
* makes it identifiable. | ||
* | ||
* Features: title, image, price | ||
*/ | ||
|
||
import {dom, out, rule, ruleset, score, type} from 'fathom-web'; | ||
import fathomCoeffs from 'commerce/fathom_coefficients.json'; | ||
|
||
// Minimum 'priceish' score an fnode needs to be accepted by runRuleset;
// taken directly from the hasPriceClass coefficient.
const {hasPriceClass: SCORE_THRESHOLD} = fathomCoeffs;
|
||
/**
 * Scoring callback: gives an fnode the hasPriceClass coefficient when its
 * element carries the exact class "price", and a neutral score of 1
 * otherwise.
 *
 * @param {Object} fnode - Node wrapper exposing the DOM element as `element`.
 * @returns {number} The score for this fnode.
 */
function hasPriceClass(fnode) {
  const isPriceElement = fnode.element.classList.contains('price');
  return isPriceElement ? fathomCoeffs.hasPriceClass : 1;
}
|
||
/**
 * Ruleset for product features. Each feature has its own type; only the
 * price ('priceish') is covered so far.
 */

// Every <div> is a candidate for containing the price.
const priceCandidateRule = rule(dom('div'), type('priceish'));

// Score each candidate according to whether it has the 'price' class.
const priceClassScoreRule = rule(type('priceish'), score(hasPriceClass));

// Emit the top-scoring candidate(s) under the 'product-price' key.
const bestPriceRule = rule(type('priceish').max(), out('product-price'));

const rules = ruleset(
  priceCandidateRule,
  priceClassScoreRule,
  bestPriceRule,
);
|
||
/**
 * Runs the ruleset against a document and picks the price element.
 *
 * @param {Document} doc - The document to run the ruleset against.
 * @returns {?Element} The element of the first 'product-price' fnode whose
 *   'priceish' score reaches SCORE_THRESHOLD, or null when none qualifies.
 */
function runRuleset(doc) {
  const qualifyingFnodes = rules
    .against(doc)
    .get('product-price')
    .filter(fnode => fnode.scoreFor('priceish') >= SCORE_THRESHOLD);

  // Several elements can tie for the highest score; the first one is used.
  return qualifyingFnodes.length === 0 ? null : qualifyingFnodes[0].element;
}
|
||
/**
 * Runs the product-feature ruleset against the given document and returns
 * the extracted price.
 *
 * @param {Document} doc - The document to extract from.
 * @returns {?{price: string}} The price text (the content attribute for a
 *   META element, textContent for anything else), or null when no element
 *   qualifies or the extracted text is empty.
 */
export default function extractProduct(doc) {
  const priceEle = runRuleset(doc);
  if (!priceEle) {
    return null;
  }

  let price;
  if (priceEle.tagName === 'META') {
    price = priceEle.getAttribute('content');
  } else {
    price = priceEle.textContent;
  }

  return price ? {price} : null;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters