-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathwebscraper.js
69 lines (61 loc) · 2.41 KB
/
webscraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
const {readFileSync, writeFile, readdirSync, statSync} = require('fs');
const {dirname, join} = require('path');
var webdriver = require('selenium-webdriver');
// Shared WebDriver session, created as soon as this module is loaded and
// configured with Safari capabilities. Every scraping step below drives this
// single browser instance.
const driver = new webdriver.Builder()
  .withCapabilities(webdriver.Capabilities.safari())
  .build();
/**
 * Asynchronously write `thing` to the file at `path` using UTF-8 encoding.
 *
 * The call returns immediately; completion is reported later from the
 * `writeFile` callback. On success the path followed by "DONE" is logged.
 * On failure the error is logged and then rethrown from inside the callback,
 * so it cannot be caught by the caller of this function.
 *
 * @param {string} path - Destination file path.
 * @param {string} thing - Text content to write.
 * @returns {void}
 */
function writeToFile(path, thing) {
  const onWriteFinished = (err) => {
    if (!err) {
      console.log(path, "DONE");
      return;
    }
    console.log("ERROR: ", err);
    // Rethrown from the async callback: surfaces as an uncaught exception.
    throw err;
  };

  writeFile(path, thing, 'utf-8', onWriteFinished);
}
/**
 * Snapshot the current page's markup plus per-element layout/style data.
 *
 * This function is handed to `driver.executeAsyncScript`, so it relies on the
 * page's `document` and `window` globals, and the completion callback is
 * taken from the LAST entry of `arguments` rather than from a named
 * parameter.
 *
 * The callback receives `{ html, css, folder }` where:
 *   - `html` is the document element's `outerHTML`,
 *   - `css` is an object keyed by element index (document order of
 *     `getElementsByTagName("*")`), each value holding the bounding-rect
 *     edges, the element's inline `display` / `visibility` style values and
 *     its computed `text-decoration` (stored under "strikethrough"),
 *   - `folder` is `dir`, echoed back unchanged.
 *
 * @param {string} dir - Identifier passed through to the result as `folder`.
 * @returns {void} The result is delivered through the trailing callback.
 */
function collectCSS(dir) {
  const done = arguments[arguments.length - 1];
  const pageSource = document.documentElement.outerHTML;
  const elements = document.getElementsByTagName("*");
  const nodeProps = {};

  // Build the record for one element. Note that display/visibility come from
  // the inline style attribute only, while text-decoration is the computed value.
  function describeElement(element) {
    const box = element.getBoundingClientRect();
    const inlineStyle = element.style;
    const computed = window.getComputedStyle(element);
    return {
      "top": box.top,
      "bottom": box.bottom,
      "left": box.left,
      "right": box.right,
      "display": inlineStyle.display,
      "visibility": inlineStyle.visibility,
      "strikethrough": computed.getPropertyValue('text-decoration')
    };
  }

  Array.prototype.forEach.call(elements, function (element, index) {
    nodeProps[index] = describeElement(element);
  });

  // Report back on a later task instead of synchronously.
  setTimeout(function () {
    done({
      html: pageSource,
      css: nodeProps,
      folder: dir
    });
  });
}
// Script entry point: only runs when this file is executed directly.
//
// For every sub-directory of the test-data folder, load its
// `source.webarchive` in the browser, run `collectCSS` in the page, and write
// the returned markup to `source.html` and the per-node data to `nodes.txt`
// in that same sub-directory.
//
// The steps are chained explicitly so that pages are processed one at a time
// and the browser session is shut down only after the last page, regardless
// of whether Selenium's implicit promise manager is active.
if (require.main === module) {
  // Root of the test data, a sibling of this file's parent directory.
  const dataDir = join(dirname(__dirname), 'fathom-products', 'product_classification_test_data');

  // Names of the immediate sub-directories of `p`.
  const listSubdirs = (p) => readdirSync(p).filter((f) => statSync(join(p, f)).isDirectory());

  // Load one folder's archive, collect its data, and start writing the outputs.
  const scrapeFolder = (item) => {
    // Built by concatenation: path.join would collapse the "//" of the scheme.
    const currUrl = `file://${join(dataDir, item, 'source.webarchive')}`;
    return driver.get(currUrl)
      .then(() => driver.executeAsyncScript(collectCSS, item))
      .then((docInfo) => {
        console.log(docInfo.folder);
        const base = join(dataDir, docInfo.folder);
        writeToFile(join(base, 'source.html'), String(docInfo.html));
        writeToFile(join(base, 'nodes.txt'), JSON.stringify(docInfo.css));
      });
  };

  Promise.resolve()
    .then(() => driver.manage().window().setSize(1680, 960))
    .then(() => driver.manage().timeouts().setScriptTimeout(100000))
    .then(() => listSubdirs(dataDir).reduce(
      // Sequential on purpose: a single browser window is shared by all pages.
      (chain, item) => chain.then(() => scrapeFolder(item)),
      Promise.resolve()
    ))
    .then(
      // quit() ends the whole session; close() would only close the window.
      () => driver.quit(),
      (err) => {
        console.error("ERROR: ", err);
        process.exitCode = 1;
        return driver.quit();
      }
    )
    .catch((err) => {
      // Reached only if shutting the session down fails.
      console.error("ERROR: ", err);
      process.exitCode = 1;
    });
}