Feature: Page scraping (#52)
NielsSteensma authored Apr 24, 2024
1 parent e85312d commit 2ee0be7
Showing 6 changed files with 87 additions and 10 deletions.
26 changes: 16 additions & 10 deletions README.md
@@ -5,9 +5,10 @@


## Features
* Generate PDFs from pages
* Generate PDFs from html ( external images/stylesheets supported )
* Capture a screenshot of a webpage
* Generate PDFs from webpages
* Generate PDFs from HTML (external images/stylesheets supported)
* Capture screenshots from webpages
* Scrape HTML from webpages



@@ -26,37 +27,42 @@ Install puppeteer in your application's root directory:

<sub>Dhalang requires Node ≥ 18 and Puppeteer ≥ 22</sub>
## Usage
__Get a PDF of a website url__
__PDF of a website url__
```ruby
Dhalang::PDF.get_from_url("https://www.google.com")
```
It is important to pass the complete URL; leaving out https://, http:// or www. will result in an error.
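For instance (a minimal sketch; the error class is taken from the gem's URL validation and is assumed to apply to all generators):
```ruby
Dhalang::PDF.get_from_url("google.com")             # missing protocol, raises URI::InvalidURIError
Dhalang::PDF.get_from_url("https://www.google.com") # complete URL, returns the PDF
```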

__Get a PDF of a HTML string__
__PDF of an HTML string__
```ruby
Dhalang::PDF.get_from_html("<html><head></head><body><h1>examplestring</h1></body></html>")
```

__Get a PNG screenshot of a website__
__PNG screenshot of a website__
```ruby
Dhalang::Screenshot.get_from_url("https://www.google.com", :png)
```

__Get a JPEG screenshot of a website__
__JPEG screenshot of a website__
```ruby
Dhalang::Screenshot.get_from_url("https://www.google.com", :jpeg)
```

__Get a WEBP screenshot of a website__
__WEBP screenshot of a website__
```ruby
Dhalang::Screenshot.get_from_url("https://www.google.com", :webp)
```

All methods return a string containing the PDF or JPEG/PNG/WEBP in binary.
__HTML of a website__
```ruby
Dhalang::Scraper.html("https://www.google.com")
```

The above methods return either a string containing the PDF/JPEG/PNG/WEBP in binary or the scraped HTML.
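As a usage sketch (the output file names are hypothetical; `File.binwrite` is plain Ruby for writing binary data):
```ruby
# Hypothetical example: persist the returned binary strings to disk.
pdf = Dhalang::PDF.get_from_url("https://www.google.com")
File.binwrite("google.pdf", pdf)

screenshot = Dhalang::Screenshot.get_from_url("https://www.google.com", :png)
File.binwrite("google.png", screenshot)
```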



## Custom PDF/screenshot options
## Custom options
To override the default options set by Dhalang, pass a hash with the custom options you want to set as the last argument.

For example, to set custom margins for PDFs:
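A minimal sketch of such a call, assuming the option keys are passed through to Puppeteer's `page.pdf` options (the exact keys and values shown here are illustrative):
```ruby
# Illustrative only: a margin hash passed as the last argument.
Dhalang::PDF.get_from_url(
  "https://www.google.com",
  { margin: { top: "100px", bottom: "100px", left: "50px", right: "50px" } }
)
```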
1 change: 1 addition & 0 deletions lib/Dhalang.rb
@@ -1,6 +1,7 @@
module Dhalang
  require_relative 'PDF'
  require_relative 'Screenshot'
  require_relative 'Scraper'
  require_relative 'Dhalang/version'
  require_relative 'Dhalang/url_utils'
  require_relative 'Dhalang/file_utils'
1 change: 1 addition & 0 deletions lib/Dhalang/node_script_invoker.rb
@@ -17,6 +17,7 @@ def self.execute_script(script_path, configuration)
end
end


# Returns a [String] with the node command that invokes the provided script with the configuration.
#
# @param [String] script_path Absolute path of JS script to invoke.
26 changes: 26 additions & 0 deletions lib/Scraper.rb
@@ -0,0 +1,26 @@
module Dhalang
  # Provides functionality for scraping webpages.
  class Scraper
    SCRIPT_PATH = File.expand_path('../js/html-scraper.js', __FILE__).freeze
    private_constant :SCRIPT_PATH

    # Scrapes the full HTML content of the given url.
    #
    # @param [String] url Url to scrape.
    # @param [Hash] options User configurable options.
    #
    # @return [String] Scraped HTML content.
    def self.html(url, options = {})
      UrlUtils.validate(url)
      temp_file = FileUtils.create_temp_file("html")
      begin
        configuration = Configuration.new(options, url, temp_file.path, "html")
        NodeScriptInvoker.execute_script(SCRIPT_PATH, configuration)
        html = IO.read(temp_file.path)
      ensure
        FileUtils.delete(temp_file)
      end
      return html
    end
  end
end
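A short usage sketch of the new class (the output file name is hypothetical):
```ruby
# Hypothetical usage: scrape the fully rendered HTML and save it locally.
html = Dhalang::Scraper.html("https://www.google.com")
File.write("google.html", html)
```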
26 changes: 26 additions & 0 deletions lib/js/html-scraper.js
@@ -0,0 +1,26 @@
'use strict';
const dhalang = require('./dhalang');
const fs = require('node:fs');

const scrapeHtml = async () => {
    const configuration = dhalang.getConfiguration();

    let browser;
    try {
        browser = await dhalang.launchPuppeteer(configuration);
        const page = await browser.newPage();
        await dhalang.configure(page, configuration.userOptions);
        await dhalang.navigate(page, configuration);
        const html = await page.content();
        fs.writeFileSync(configuration.tempFilePath, html);
    } catch (error) {
        console.error(error.message);
        process.exit(1);
    } finally {
        if (browser) {
            await browser.close();
        }
        process.exit(0);
    }
};
scrapeHtml();
17 changes: 17 additions & 0 deletions spec/Scraper_spec.rb
@@ -0,0 +1,17 @@
require 'rspec'
require 'Dhalang'

describe '#html' do
  context 'url without specified protocol' do
    it 'raises InvalidURIError' do
      expect { Dhalang::Scraper.html("google.com") }.to raise_error(URI::InvalidURIError)
    end
  end

  context 'valid url' do
    it 'returns scraped html' do
      html = Dhalang::Scraper.html("https://www.google.com")
      expect(html.empty?).to be false
    end
  end
end
