Skip to content
This repository has been archived by the owner on Dec 3, 2020. It is now read-only.

Commit

Permalink
Fix #42: Improve price string parsing.
Browse files Browse the repository at this point in the history
* Update how we pull the price string from extracted Fathom price elements so that it is built from the main unit and subunit (e.g. dollars and cents) when both are available.
* Add price string cleaning methods to remove extra characters (like commas) that were causing price parsing to fail.
* Handle case when price string parsing still fails after cleaning by checking in the background script that the price string is formatted correctly before rendering the browserAction popup.
  * This will guarantee we never see the “blank panel” reported in #79 and #88.

Price element innerText strings now supported as a result of these changes:
* "$1327 /each" ([Home Depot example page](https://www.homedepot.com/p/KitchenAid-Classic-4-5-Qt-Tilt-Head-White-Stand-Mixer-K45SSWH/202546032))
* "$1,049.00" ([Amazon example page](https://www.amazon.com/Fujifilm-X-T2-Mirrorless-F2-8-4-0-Lens/dp/B01I3LNQ6M/ref=sr_1_2?ie=UTF8&qid=1535594119&sr=8-2&keywords=fuji+xt2+camera))
* "US $789.99" ([eBay example page](https://www.ebay.com/itm/Dell-Inspiron-7570-15-6-Touch-Laptop-i7-8550U-1-8GHz-8GB-1TB-NVIDIA-940MX-W10/263827294291))
* "$4.99+" ([Etsy example page](https://www.etsy.com/listing/555504975/frankenstein-2-custom-stencil?ga_order=most_relevant&ga_search_type=all&ga_view_type=gallery&ga_search_query=&ref=sr_gallery-1-13))

Note: This does not handle the case where there is more than one price for the product page (e.g. if we see a range of prices such as "$19.92 - $38.00" or if the price changes based on size/color, etc.); that’s handled by Issue #86.
  • Loading branch information
biancadanforth committed Sep 19, 2018
1 parent c7f7265 commit f63323c
Showing 1 changed file with 72 additions and 3 deletions.
75 changes: 72 additions & 3 deletions src/extraction/fathom_extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,76 @@ function runRuleset(doc) {
fnodesList = fnodesList.filter(fnode => fnode.scoreFor(`${feature}ish`) >= SCORE_THRESHOLD);
// It is possible for multiple elements to have the same highest score.
if (fnodesList.length >= 1) {
extractedElements[feature] = fnodesList[0].element;
const element = fnodesList[0].element;
// Check for price units and subunits
if (feature === 'price' && element.children.length > 0) {
extractedElements[feature] = getPriceUnitElements(element);
continue;
}
extractedElements[feature] = element;
}
}
return extractedElements;
}

/**
 * Reports whether the given value, once coerced to a string, contains at
 * least one ASCII digit (0-9).
 *
 * @param {string} string The text to inspect.
 * @returns {boolean} True when a digit is present, false otherwise.
 */
function hasNumber(string) {
  const anyDigit = /[0-9]/;
  const containsDigit = anyDigit.test(string);
  return containsDigit;
}

/**
 * Get the main and sub unit elements for the product price.
 *
 * Walks the direct children of the price element in document order. The first
 * child whose innerText contains a digit is taken as the main unit and the
 * second such child as the subunit; any later digit-bearing children are
 * ignored.
 *
 * @param {Object} element The price element; its `children` must be iterable.
 * @returns {Object} A string:element object with 'mainUnit' and 'subUnit'
 *   keys. 'subUnit' is absent when only one child contains a digit, and the
 *   object is empty when no child contains a digit.
 */
function getPriceUnitElements(element) {
  const priceElements = {};
  for (const priceSubEle of element.children) {
    if (!hasNumber(priceSubEle.innerText)) {
      continue;
    }
    if (!('mainUnit' in priceElements)) {
      priceElements.mainUnit = priceSubEle;
    } else {
      priceElements.subUnit = priceSubEle;
      // Stop at the second match so that later digit-bearing children
      // cannot overwrite the subunit.
      break;
    }
  }
  return priceElements;
}

/**
 * Builds a cleaned price string from an extracted price.
 *
 * @param {Object} priceObj Either an object with a 'mainUnit' key and an
 *   optional 'subUnit' key (each holding an element), or a single element
 *   whose innerText holds the whole price.
 * @returns {string} The result of cleanPriceString for the assembled price
 *   text, or an empty string when there is no price text to read.
 */
function getPriceString(priceObj) {
  // Check for subunits e.g. dollars and cents.
  if ('mainUnit' in priceObj) {
    const mainUnitStr = priceObj.mainUnit.innerText.trim();
    if (!('subUnit' in priceObj)) {
      // Only one child carried digits, so it is the only price text available;
      // clean it as-is rather than dereferencing a missing subunit.
      return cleanPriceString(mainUnitStr);
    }
    const subUnitStr = priceObj.subUnit.innerText.trim();
    // Avoid producing "$$..." when the main unit text already has the symbol.
    const currencyPrefix = mainUnitStr.includes('$') ? '' : '$';
    return cleanPriceString(`${currencyPrefix}${mainUnitStr}.${subUnitStr}`);
  }
  // An object with neither 'mainUnit' nor innerText has no price text at all.
  if (typeof priceObj.innerText !== 'string') {
    return '';
  }
  return cleanPriceString(priceObj.innerText);
}


/**
 * Reformats price string to be of form "$NX.XX".
 *
 * Commas are removed, anything before the first '$' is dropped, and anything
 * after the second character following the first '.' is dropped. When the
 * text has no '.', the leading whole-unit amount (e.g. "$20") is kept and no
 * cents are invented.
 *
 * @param {string} priceStr Raw price text, e.g. "US $1,049.00".
 * @returns {string} The cleaned price string, or an empty string when
 *   priceStr is not a string.
 */
function cleanPriceString(priceStr) {
  if (typeof priceStr !== 'string') {
    return '';
  }
  // Remove any commas
  const withoutCommas = priceStr.replace(/,/g, '');
  // Remove any characters preceding the '$'
  const dollarIndex = withoutCommas.indexOf('$');
  const fromDollar = dollarIndex === -1
    ? withoutCommas
    : withoutCommas.substring(dollarIndex);
  const decimalIndex = fromDollar.indexOf('.');
  if (decimalIndex === -1) {
    // Without a decimal point there is no ".XX" to cut at; slicing at
    // indexOf('.') + 3 would keep only two characters ("$20" -> "$2").
    // Keep the first run of digits and whatever precedes it instead.
    const wholeUnits = fromDollar.match(/^\D*\d+/);
    return wholeUnits === null ? fromDollar : wholeUnits[0];
  }
  // Remove any characters following the '.XX'
  return fromDollar.substring(0, decimalIndex + 3);
}

/**
* Returns true if every key in PRODUCT_FEATURES has a truthy value.
*/
Expand All @@ -58,9 +122,14 @@ export default function extractProduct(doc) {
for (const feature of PRODUCT_FEATURES) {
if (feature === 'image') {
extractedProduct[feature] = extractedElements[feature].src;
} else {
extractedProduct[feature] = extractedElements[feature].innerText;
continue;
// Clean up price string and check for subunits
} else if (feature === 'price') {
const priceStr = getPriceString(extractedElements[feature]);
extractedProduct[feature] = priceStr;
continue;
}
extractedProduct[feature] = extractedElements[feature].innerText;
}
}
return hasAllFeatures(extractedProduct) ? extractedProduct : null;
Expand Down

0 comments on commit f63323c

Please sign in to comment.