Skip to content

Commit

Permalink
Refactored fuzz tests to iterate all files in directory; run timeout …
Browse files Browse the repository at this point in the history
…tests
  • Loading branch information
jhy committed Aug 15, 2021
1 parent d2c455c commit 530c5b0
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 203 deletions.
29 changes: 23 additions & 6 deletions src/main/java/org/jsoup/Jsoup.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,31 +108,48 @@ public static Connection newSession() {
/**
Parse the contents of a file as HTML.
@param in file to load HTML from
@param file file to load HTML from. Supports gzipped files (ending in .z or .gz).
@param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
present, or fall back to {@code UTF-8} (which is often safe to do).
@param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
@return sane HTML
@throws IOException if the file could not be found, or read, or if the charsetName is invalid.
*/
public static Document parse(File in, @Nullable String charsetName, String baseUri) throws IOException {
return DataUtil.load(in, charsetName, baseUri);
public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException {
return DataUtil.load(file, charsetName, baseUri);
}

/**
Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
@param in file to load HTML from
@param file file to load HTML from. Supports gzipped files (ending in .z or .gz).
@param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
present, or fall back to {@code UTF-8} (which is often safe to do).
@return sane HTML
@throws IOException if the file could not be found, or read, or if the charsetName is invalid.
@see #parse(File, String, String)
*/
public static Document parse(File in, @Nullable String charsetName) throws IOException {
return DataUtil.load(in, charsetName, in.getAbsolutePath());
public static Document parse(File file, @Nullable String charsetName) throws IOException {
return DataUtil.load(file, charsetName, file.getAbsolutePath());
}

/**
 Parse the contents of a file using the supplied parser.

 @param file file to load the markup from. Supports gzipped files (ending in .z or .gz).
 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
 present, or fall back to {@code UTF-8} (which is often safe to do).
 @param baseUri The URL where the content was retrieved from, to resolve relative links against.
 @param parser alternate {@link Parser#xmlParser() parser} to use.
 @return the parsed document
 @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
 @since 1.14.2
 */
public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
    // All file handling (including gzip detection) lives in DataUtil; this is a thin public entry point.
    final Document parsed = DataUtil.load(file, charsetName, baseUri, parser);
    return parsed;
}

/**
Expand Down
30 changes: 24 additions & 6 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,38 @@ public final class DataUtil {

private DataUtil() {}

/**
 * Reads a file from disk and parses it into a Document using the HTML parser. Both plain files and gzip-compressed
 * files (names ending in {@code .gz} or {@code .z}) are accepted.
 *
 * @param file file to load
 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
 * the file will always override this setting.
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
    // Convenience overload: defer to the parser-aware variant with the default HTML parser.
    final Parser htmlParser = Parser.htmlParser();
    return load(file, charsetName, baseUri, htmlParser);
}

/**
* Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
* are supported in addition to uncompressed files.
*
* @param in file to load
* @param file file to load
* @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
* the file will always override this setting.
* @param baseUri base URI of document, to resolve relative links against
* @param parser alternate {@link Parser#xmlParser() parser} to use.
* @return Document
* @throws IOException on IO error
* @since 1.14.2
*/
public static Document load(File in, @Nullable String charsetName, String baseUri) throws IOException {
InputStream stream = new FileInputStream(in);
String name = Normalizer.lowerCase(in.getName());
public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
InputStream stream = new FileInputStream(file);
String name = Normalizer.lowerCase(file.getName());
if (name.endsWith(".gz") || name.endsWith(".z")) {
// unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
boolean zipped;
Expand All @@ -72,9 +90,9 @@ public static Document load(File in, @Nullable String charsetName, String baseUr
stream.close();

}
stream = zipped ? new GZIPInputStream(new FileInputStream(in)) : new FileInputStream(in);
stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file);
}
return parseInputStream(stream, charsetName, baseUri, Parser.htmlParser());
return parseInputStream(stream, charsetName, baseUri, parser);
}

/**
Expand Down
61 changes: 61 additions & 0 deletions src/test/java/org/jsoup/integration/FuzzFixesIT.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package org.jsoup.integration;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

import java.io.File;
import java.io.IOException;
import java.util.stream.Stream;

import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 Tests fixes for issues raised by the OSS Fuzz project @ https://oss-fuzz.com/testcases?project=jsoup As some of these
 are timeout tests - run each file {@code numIters} times and ensure the whole run finishes within {@code timeout}
 seconds.
 */
public class FuzzFixesIT {
    /** Number of times each fuzz file is parsed per test invocation. */
    static final int numIters = 50;
    /** Time budget, in seconds, for all iterations over a single file. */
    static final int timeout = 20; // external fuzzer is set to 60 for 100 runs
    /** Directory holding the fuzz test case files. */
    static final File testDir = ParseTest.getFile("/fuzztests/");

    /** A parse operation over a file that is allowed to throw {@link IOException}. */
    @FunctionalInterface
    private interface FileParser {
        Document parse(File file) throws IOException;
    }

    /**
     Supplies every file in the fuzz test directory as a test argument.
     @return stream of the files found in {@code testDir}
     */
    private static Stream<File> testFiles() {
        File[] files = testDir.listFiles();
        assertNotNull(files);
        // sanity check that the test resources were actually found
        assertTrue(files.length > 10);

        return Stream.of(files);
    }

    /**
     Parses the file repeatedly with the default HTML parser and fails if the time budget is exceeded.
     @param file fuzz test case to parse
     @throws IOException if the file could not be read
     */
    @ParameterizedTest
    @MethodSource("testFiles")
    void testHtmlParse(File file) throws IOException {
        assertParsesInTime(file, f -> Jsoup.parse(f, "UTF-8", "https://example.com/"));
    }

    /**
     Parses the file repeatedly with the XML parser and fails if the time budget is exceeded.
     @param file fuzz test case to parse
     @throws IOException if the file could not be read
     */
    @ParameterizedTest
    @MethodSource("testFiles")
    void testXmlParse(File file) throws IOException {
        assertParsesInTime(file, f -> Jsoup.parse(f, "UTF-8", "https://example.com/", Parser.xmlParser()));
    }

    /**
     Runs the parse {@code numIters} times, asserting each result is non-null, and fails as soon as the elapsed time
     passes {@code timeout} seconds.
     @param file fuzz test case to parse
     @param parser parse operation to apply to the file on each iteration
     @throws IOException if the file could not be read
     */
    private static void assertParsesInTime(File file, FileParser parser) throws IOException {
        // nanoTime is used for elapsed-time measurement as it is not affected by wall-clock adjustments
        final long startNanos = System.nanoTime();
        final long budgetNanos = timeout * 1_000_000_000L;

        for (int i = 0; i < numIters; i++) {
            Document doc = parser.parse(file);
            assertNotNull(doc);
            // the check runs after the parse, so i is the number of iterations that finished inside the budget
            if (System.nanoTime() - startNanos > budgetNanos) {
                Assertions.fail(String.format("Timeout: only completed %d iters of [%s] in %d seconds", i, file.getName(), timeout));
            }
        }
    }
}
Loading

0 comments on commit 530c5b0

Please sign in to comment.