Refactored fuzz tests to iterate all files in directory; run timeout tests
jhy · Aug 15, 2021 · 530c5b0 · 530c5b0
1 parent d2c455c
commit 530c5b0
Show file tree

Hide file tree

Showing 6 changed files with 132 additions and 203 deletions.
diff --git a/src/main/java/org/jsoup/Jsoup.java b/src/main/java/org/jsoup/Jsoup.java
@@ -108,31 +108,48 @@ public static Connection newSession() {
     /**
      Parse the contents of a file as HTML.
 
-     @param in          file to load HTML from
+     @param file        file to load HTML from. Supports gzipped files (ending in .z or .gz).
      @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
      present, or fall back to {@code UTF-8} (which is often safe to do).
      @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
      @return sane HTML
 
      @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
      */
-    public static Document parse(File in, @Nullable String charsetName, String baseUri) throws IOException {
-        return DataUtil.load(in, charsetName, baseUri);
+    public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException {
+        return DataUtil.load(file, charsetName, baseUri);
     }
 
     /**
      Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
 
-     @param in          file to load HTML from
+     @param file        file to load HTML from. Supports gzipped files (ending in .z or .gz).
      @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
      present, or fall back to {@code UTF-8} (which is often safe to do).
      @return sane HTML
 
      @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
      @see #parse(File, String, String)
      */
-    public static Document parse(File in, @Nullable String charsetName) throws IOException {
-        return DataUtil.load(in, charsetName, in.getAbsolutePath());
+    public static Document parse(File file, @Nullable String charsetName) throws IOException {
+        return DataUtil.load(file, charsetName, file.getAbsolutePath());
+    }
+
+    /**
+     Parse the contents of a file as HTML.
+
+     @param file        file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
+     @param parser      parser to use instead of the default HTML parser, e.g. {@link Parser#xmlParser()}.
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     @since 1.14.2
+     */
+    public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+        return DataUtil.load(file, charsetName, baseUri, parser);
     }
 
      /**

diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -49,20 +49,38 @@ public final class DataUtil {
 
     private DataUtil() {}
 
+    /**
+     * Loads and parses a file to a Document, using the {@link Parser#htmlParser() HTML parser}. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
+     * are supported in addition to uncompressed files.
+     *
+     * @param file file to load
+     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
+     *     the file will always override this setting.
+     * @param baseUri base URI of document, to resolve relative links against
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
+        return load(file, charsetName, baseUri, Parser.htmlParser());
+    }
+
     /**
      * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
      * are supported in addition to uncompressed files.
      *
-     * @param in file to load
+     * @param file file to load
      * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
      *     the file will always override this setting.
      * @param baseUri base URI of document, to resolve relative links against
+     * @param parser parser to parse the file with, e.g. {@link Parser#xmlParser()} as an alternative to the HTML parser.
+     *
      * @return Document
      * @throws IOException on IO error
+     * @since 1.14.2
      */
-    public static Document load(File in, @Nullable String charsetName, String baseUri) throws IOException {
-        InputStream stream = new FileInputStream(in);
-        String name = Normalizer.lowerCase(in.getName());
+    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+        InputStream stream = new FileInputStream(file);
+        String name = Normalizer.lowerCase(file.getName());
         if (name.endsWith(".gz") || name.endsWith(".z")) {
             // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
             boolean zipped;
@@ -72,9 +90,9 @@ public static Document load(File in, @Nullable String charsetName, String baseUr
                 stream.close();
 
             }
-            stream = zipped ? new GZIPInputStream(new FileInputStream(in)) : new FileInputStream(in);
+            stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file);
         }
-        return parseInputStream(stream, charsetName, baseUri, Parser.htmlParser());
+        return parseInputStream(stream, charsetName, baseUri, parser);
     }
 
     /**

diff --git a/src/test/java/org/jsoup/integration/FuzzFixesIT.java b/src/test/java/org/jsoup/integration/FuzzFixesIT.java
@@ -0,0 +1,61 @@
+package org.jsoup.integration;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ Tests fixes for issues raised by the OSS Fuzz project (https://oss-fuzz.com/testcases?project=jsoup). As some of these
+ are timeout tests, each file is parsed {@code numIters} times and must complete within {@code timeout} seconds.
+ */
+public class FuzzFixesIT {
+    static int numIters = 50; // number of times each file is parsed per test
+    static int timeout = 20; // seconds allowed per file; external fuzzer is set to 60 for 100 runs
+    static File testDir = ParseTest.getFile("/fuzztests/");
+
+    private static Stream<File> testFiles() {
+        File[] files = testDir.listFiles();
+        assertNotNull(files);
+        assertTrue(files.length > 10); // guards against running against an empty or near-empty directory
+
+        return Stream.of(files);
+    }
+
+    @ParameterizedTest
+    @MethodSource("testFiles")
+    void testHtmlParse(File file) throws IOException {
+        long startTime = System.currentTimeMillis();
+        long completeBy = startTime + timeout * 1000L; // deadline, in epoch milliseconds
+
+        for (int i = 0; i < numIters; i++) {
+            Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/");
+            assertNotNull(doc);
+            if (System.currentTimeMillis() > completeBy) // checked only between parses; an in-progress parse is not interrupted
+                Assertions.fail(String.format("Timeout: only completed %d iters of [%s] in %d seconds", i, file.getName(), timeout));
+        }
+    }
+
+    @ParameterizedTest
+    @MethodSource("testFiles")
+    void testXmlParse(File file) throws IOException {
+        long startTime = System.currentTimeMillis();
+        long completeBy = startTime + timeout * 1000L; // deadline, in epoch milliseconds
+
+        for (int i = 0; i < numIters; i++) {
+            Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/", Parser.xmlParser());
+            assertNotNull(doc);
+            if (System.currentTimeMillis() > completeBy) // checked only between parses; an in-progress parse is not interrupted
+                Assertions.fail(String.format("Timeout: only completed %d iters of [%s] in %d seconds", i, file.getName(), timeout));
+        }
+    }
+}