
Commit

Also use tempfile for html scraper, for better reliability
NielsSteensma committed Apr 22, 2024
1 parent 0b51aa8 commit 6e834cc
Showing 3 changed files with 16 additions and 29 deletions.
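
For context, a minimal usage sketch of the scraper after this commit. The method name follows the def self.html(url, options = {}) introduced in lib/Scraper.rb below; the example URL and the require path are illustrative assumptions, not part of the commit:

    require 'Dhalang' # assumed entry point of the gem

    # Scraper.html returns the page's HTML as a String; after this commit it is
    # read back from a temp file rather than from the Node process's stdout.
    html = Dhalang::Scraper.html('https://example.com')
    puts html.bytesize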
20 changes: 0 additions & 20 deletions lib/Dhalang/node_script_invoker.rb
@@ -17,26 +17,6 @@ def self.execute_script(script_path, configuration)
       end
     end
 
-    # Executes JS script under given script_path by launching a new Node process. Once script is finished returns
-    # received stdout.
-    #
-    # @param [String] script_path The absolute path of the JS script to execute.
-    # @param [Object] configuration Set of options to use, configurable by the user.
-    #
-    # @return [String] stdout received from script.
-    def self.execute_script_and_read_stdout(script_path, configuration)
-      command = create_node_command(script_path, configuration)
-      Open3.popen3(command) do |_stdin, stdout, stderr, wait|
-        if wait.value.success?
-          return stdout.read.strip
-        end
-        output = stderr.read.strip
-        output = nil if output == ''
-        message = output || "Exited with status #{wait.value.exitstatus}"
-        raise DhalangError, message
-      end
-    end
-
 
     # Returns a [String] with node command that invokes the provided script with the configuration.
     #
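
The stdout-reading path above is removed; the retained execute_script is not shown in this diff. A rough sketch of its assumed shape, keeping the Open3 and DhalangError handling of the removed method but never touching stdout, since results now travel through a temp file:

    # Sketch only: assumed shape of the retained method, not part of this diff.
    def self.execute_script(script_path, configuration)
      command = create_node_command(script_path, configuration)
      Open3.popen3(command) do |_stdin, _stdout, stderr, wait|
        return if wait.value.success?

        output = stderr.read.strip
        output = nil if output == ''
        raise DhalangError, output || "Exited with status #{wait.value.exitstatus}"
      end
    end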
18 changes: 12 additions & 6 deletions lib/Scraper.rb
@@ -1,19 +1,25 @@
 module Dhalang
   # Provides functionality for scraping webpages.
   class Scraper
-    SCRIPT_PATH = File.expand_path('../js/scraper.js', __FILE__).freeze
+    SCRIPT_PATH = File.expand_path('../js/html-scraper.js', __FILE__).freeze
     private_constant :SCRIPT_PATH
 
-    # Scrapes content under the given url.
+    # Scrapes full HTML content under given url.
     #
     # @param [String] url Url to scrape.
     # @param [Hash] options User configurable options.
     #
     # @return [String] Scraped HTML content.
-    def self.scrape(url, options = {})
-      UrlUtils.validate(url)
-      configuration = Configuration.new(url, options)
-      return NodeScriptInvoker.execute_script_and_read_stdout(SCRIPT_PATH, configuration.json)
+    def self.html(url, options = {})
+      temp_file = FileUtils.create_temp_file("html")
+      begin
+        configuration = Configuration.new(options, url, temp_file.path, "html")
+        NodeScriptInvoker.execute_script(SCRIPT_PATH, configuration)
+        html = IO.read(temp_file.path)
+      ensure
+        FileUtils.delete(temp_file)
+      end
+      return html
     end
   end
 end
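
The new Scraper.html relies on FileUtils.create_temp_file and FileUtils.delete, which are not part of this diff. A hypothetical sketch of those helpers, assuming they wrap Ruby's standard Tempfile:

    require 'tempfile'

    module Dhalang
      # Assumed helpers: the names come from the calls in Scraper.html above,
      # the implementation is a guess based on Ruby's Tempfile.
      module FileUtils
        # Creates a temp file whose name ends in the given extension, e.g. "html".
        def self.create_temp_file(extension)
          Tempfile.new(['dhalang', ".#{extension}"])
        end

        # Closes the temp file and removes it from disk.
        def self.delete(temp_file)
          temp_file.close
          temp_file.unlink
        end
      end
    end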
7 changes: 4 additions & 3 deletions lib/js/scraper.js → lib/js/html-scraper.js
@@ -1,7 +1,8 @@
 'use strict';
 const dhalang = require('./dhalang');
+const fs = require('node:fs');
 
-const scrapePage = async () => {
+const scrapeHtml = async () => {
     const configuration = dhalang.getConfiguration();
 
     let browser;
@@ -11,7 +12,7 @@ const scrapePage = async () => {
         await dhalang.configure(page, configuration.userOptions);
         await dhalang.navigate(page, configuration);
         const html = await page.content();
-        console.log(html);
+        fs.writeFileSync(configuration.tempFilePath, html);
     } catch (error) {
         console.error(error.message);
         process.exit(1);
@@ -22,4 +23,4 @@ const scrapePage = async () => {
         process.exit(0);
     }
 };
-scrapePage();
+scrapeHtml();

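Writing the result to a temp file instead of stdout is presumably the reliability gain the commit message refers to: the Ruby side already knows the path it handed to the script, so the HTML no longer has to survive a pipe where stray console output, encoding quirks, or very large payloads could get in the way. The contract between the two sides is the tempFilePath field read by fs.writeFileSync above. A hypothetical Ruby sketch of that payload; only tempFilePath and userOptions are visible in this diff, the rest of the class is assumed:

    require 'json'

    module Dhalang
      # Hypothetical sketch of the Configuration payload handed to html-scraper.js.
      # The real class is not part of this diff.
      class Configuration
        attr_reader :json

        def initialize(user_options, url, temp_file_path, file_type)
          @json = {
            url: url,
            tempFilePath: temp_file_path, # html-scraper.js writes page.content() here
            fileType: file_type,
            userOptions: user_options     # consumed by dhalang.configure on the JS side
          }.to_json
        end
      end
    end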