
Commit

Also use tempfile for html scraper, for better reliability
NielsSteensma committed Apr 22, 2024
1 parent 0b51aa8 commit 6e834cc
Showing 3 changed files with 16 additions and 29 deletions.
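
For context, a minimal usage sketch of the scraper after this commit. The method name follows the def self.html(url, options = {}) introduced in lib/Scraper.rb below; the example URL and the require path are illustrative assumptions, not part of the commit:

    require 'Dhalang' # assumed entry point of the gem

    # Scraper.html returns the page's HTML as a String; after this commit it is
    # read back from a temp file rather than from the Node process's stdout.
    html = Dhalang::Scraper.html('https://example.com')
    puts html.bytesize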
20 changes: 0 additions & 20 deletions lib/Dhalang/node_script_invoker.rb
@@ -17,26 +17,6 @@ def self.execute_script(script_path, configuration)
       end
     end
 
-    # Executes JS script under given script_path by launching a new Node process. Once script is finished returns
-    # received stdout.
-    #
-    # @param [String] script_path The absolute path of the JS script to execute.
-    # @param [Object] configuration Set of options to use, configurable by the user.
-    #
-    # @return [String] stdout received from script.
-    def self.execute_script_and_read_stdout(script_path, configuration)
-      command = create_node_command(script_path, configuration)
-      Open3.popen3(command) do |_stdin, stdout, stderr, wait|
-        if wait.value.success?
-          return stdout.read.strip
-        end
-        output = stderr.read.strip
-        output = nil if output == ''
-        message = output || "Exited with status #{wait.value.exitstatus}"
-        raise DhalangError, message
-      end
-    end
-
 
     # Returns a [String] with node command that invokes the provided script with the configuration.
     #
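
The stdout-reading path above is removed; the retained execute_script is not shown in this diff. A rough sketch of its assumed shape, keeping the Open3 and DhalangError handling of the removed method but never touching stdout, since results now travel through a temp file:

    # Sketch only: assumed shape of the retained method, not part of this diff.
    def self.execute_script(script_path, configuration)
      command = create_node_command(script_path, configuration)
      Open3.popen3(command) do |_stdin, _stdout, stderr, wait|
        return if wait.value.success?

        output = stderr.read.strip
        output = nil if output == ''
        raise DhalangError, output || "Exited with status #{wait.value.exitstatus}"
      end
    end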
18 changes: 12 additions & 6 deletions lib/Scraper.rb
@@ -1,19 +1,25 @@
 module Dhalang
   # Provides functionality for scraping webpages.
   class Scraper
-    SCRIPT_PATH = File.expand_path('../js/scraper.js', __FILE__).freeze
+    SCRIPT_PATH = File.expand_path('../js/html-scraper.js', __FILE__).freeze
     private_constant :SCRIPT_PATH
 
-    # Scrapes content under the given url.
+    # Scrapes full HTML content under given url.
     #
     # @param [String] url Url to scrape.
     # @param [Hash] options User configurable options.
     #
     # @return [String] Scraped HTML content.
-    def self.scrape(url, options = {})
-      UrlUtils.validate(url)
-      configuration = Configuration.new(url, options)
-      return NodeScriptInvoker.execute_script_and_read_stdout(SCRIPT_PATH, configuration.json)
+    def self.html(url, options = {})
+      temp_file = FileUtils.create_temp_file("html")
+      begin
+        configuration = Configuration.new(options, url, temp_file.path, "html")
+        NodeScriptInvoker.execute_script(SCRIPT_PATH, configuration)
+        html = IO.read(temp_file.path)
+      ensure
+        FileUtils.delete(temp_file)
+      end
+      return html
     end
   end
 end
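
The new Scraper.html relies on FileUtils.create_temp_file and FileUtils.delete, which are not part of this diff. A hypothetical sketch of those helpers, assuming they wrap Ruby's standard Tempfile:

    require 'tempfile'

    module Dhalang
      # Assumed helpers: the names come from the calls in Scraper.html above,
      # the implementation is a guess based on Ruby's Tempfile.
      module FileUtils
        # Creates a temp file whose name ends in the given extension, e.g. "html".
        def self.create_temp_file(extension)
          Tempfile.new(['dhalang', ".#{extension}"])
        end

        # Closes the temp file and removes it from disk.
        def self.delete(temp_file)
          temp_file.close
          temp_file.unlink
        end
      end
    end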
7 changes: 4 additions & 3 deletions lib/js/scraper.js → lib/js/html-scraper.js
@@ -1,7 +1,8 @@
 'use strict';
 const dhalang = require('./dhalang');
+const fs = require('node:fs');
 
-const scrapePage = async () => {
+const scrapeHtml = async () => {
     const configuration = dhalang.getConfiguration();
 
     let browser;
@@ -11,7 +12,7 @@ const scrapePage = async () => {
         await dhalang.configure(page, configuration.userOptions);
         await dhalang.navigate(page, configuration);
         const html = await page.content();
-        console.log(html);
+        fs.writeFileSync(configuration.tempFilePath, html);
     } catch (error) {
         console.error(error.message);
         process.exit(1);
@@ -22,4 +23,4 @@ const scrapePage = async () => {
         process.exit(0);
     }
 };
-scrapePage();
+scrapeHtml();

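Writing the result to a temp file instead of stdout is presumably the reliability gain the commit message refers to: the Ruby side already knows the path it handed to the script, so the HTML no longer has to survive a pipe where stray console output, encoding quirks, or very large payloads could get in the way. The contract between the two sides is the tempFilePath field read by fs.writeFileSync above. A hypothetical Ruby sketch of that payload; only tempFilePath and userOptions are visible in this diff, the rest of the class is assumed:

    require 'json'

    module Dhalang
      # Hypothetical sketch of the Configuration payload handed to html-scraper.js.
      # The real class is not part of this diff.
      class Configuration
        attr_reader :json

        def initialize(user_options, url, temp_file_path, file_type)
          @json = {
            url: url,
            tempFilePath: temp_file_path, # html-scraper.js writes page.content() here
            fileType: file_type,
            userOptions: user_options     # consumed by dhalang.configure on the JS side
          }.to_json
        end
      end
    end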