From 163456d3e36e77efead465f6ba4d09fa4e96abc1 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Tue, 7 Jan 2025 18:07:50 +1100 Subject: [PATCH 1/2] Support http/2 requests via optional HttpClient This adds support for the Java 11+ HttpClient on systems that support it, enabling http/2 requests. On Java 8, and on Android, requests will still go via the existing HttpURLConnection. This is currently disabled by default -- set the system property `jsoup.useHttpClient` to enable it. --- pom.xml | 44 +++++- .../jsoup/helper/AuthenticationHandler.java | 9 +- .../java/org/jsoup/helper/CookieUtil.java | 8 +- .../java/org/jsoup/helper/HttpConnection.java | 137 +++++------------- .../jsoup/helper/RequestAuthenticator.java | 2 +- .../org/jsoup/helper/RequestDispatch.java | 44 ++++++ .../org/jsoup/helper/RequestExecutor.java | 28 ++++ .../jsoup/helper/UrlConnectionExecutor.java | 125 ++++++++++++++++ .../org/jsoup/internal/SharedConstants.java | 2 + src/main/java11/module-info.java | 12 ++ .../org/jsoup/helper/HttpClientExecutor.java | 136 +++++++++++++++++ .../org/jsoup/helper/RequestAuthHandler.java | 35 +++++ src/main/java9/module-info.java | 11 -- .../org/jsoup/helper/RequestAuthHandler.java | 24 --- .../org/jsoup/integration/ConnectTest.java | 34 +++-- .../org/jsoup/integration/TestServer.java | 3 +- .../jsoup/helper/HttpClientExecutorTest.java | 33 +++++ .../integration/HttpClientConnectIT.java | 31 ++++ .../integration/HttpClientConnectTest.java | 18 +++ .../integration/HttpClientSessionIT.java | 17 +++ .../integration/HttpClientSessionTest.java | 17 +++ 21 files changed, 606 insertions(+), 164 deletions(-) create mode 100644 src/main/java/org/jsoup/helper/RequestDispatch.java create mode 100644 src/main/java/org/jsoup/helper/RequestExecutor.java create mode 100644 src/main/java/org/jsoup/helper/UrlConnectionExecutor.java create mode 100644 src/main/java11/module-info.java create mode 100644 src/main/java11/org/jsoup/helper/HttpClientExecutor.java create mode 100644 src/main/java11/org/jsoup/helper/RequestAuthHandler.java delete mode 100644 src/main/java9/module-info.java delete mode 100644 src/main/java9/org/jsoup/helper/RequestAuthHandler.java create mode 100644 src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java create mode 100644 src/test/java11/org/jsoup/integration/HttpClientConnectIT.java create mode 100644 src/test/java11/org/jsoup/integration/HttpClientConnectTest.java create mode 100644 src/test/java11/org/jsoup/integration/HttpClientSessionIT.java create mode 100644 src/test/java11/org/jsoup/integration/HttpClientSessionTest.java diff --git a/pom.xml b/pom.xml index 10c5ad907a..bfc4b8e718 100644 --- a/pom.xml +++ b/pom.xml @@ -82,6 +82,7 @@ java.net.HttpURLConnection + java.net.http.* @@ -119,6 +120,10 @@ java.util.Spliterators java.nio.ByteBuffer java.net.HttpURLConnection + + java.net.http.* + java.time.Duration + java.util.OptionalLong @@ -372,11 +377,11 @@ - + compile-multi-release - [9,2000) + [11,2000) @@ -398,22 +403,51 @@ - compile-java-9 + compile-java-11 compile compile - 9 + 11 - ${project.basedir}/src/main/java9 + ${project.basedir}/src/main/java11 true + + + testcompile-java-11 + test-compile + + testCompile + + + 11 + + + ${project.basedir}/src/test/java11 + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + ${project.build.outputDirectory}/META-INF/versions/11 + + false + + + diff --git a/src/main/java/org/jsoup/helper/AuthenticationHandler.java b/src/main/java/org/jsoup/helper/AuthenticationHandler.java index bd3e6e1ba7..383163f1a9 100644 --- a/src/main/java/org/jsoup/helper/AuthenticationHandler.java +++ b/src/main/java/org/jsoup/helper/AuthenticationHandler.java @@ -4,7 +4,6 @@ import java.lang.reflect.Constructor; import java.net.Authenticator; -import java.net.HttpURLConnection; import java.net.PasswordAuthentication; /** @@ -13,7 +12,7 @@ ThreadLocal. */ class AuthenticationHandler extends Authenticator { - static final int MaxAttempts = 5; // max authentication attempts per request. allows for multiple auths (e.g. proxy and server) in one request, but saves otherwise 20 requests if credentials are incorrect. + static final int MaxAttempts = 3; // max authentication attempts per request. allows for multiple auths (e.g. proxy and server) in one request, but saves otherwise 20 requests if credentials are incorrect. static AuthShim handler; static { @@ -50,7 +49,7 @@ class AuthenticationHandler extends Authenticator { // it may be an interactive prompt, and the user could eventually get it right). But in Jsoup's context, the // auth will either be correct or not, so just abandon if (delegate.attemptCount > MaxAttempts) - return null; + return null; // When using HttpClient, this will manifest as "No credentials provided" IOException; not ideal; would be clearer if we could then detach the authenticator which would bubble the 401, but there's no path for that if (delegate.auth == null) return null; // detached - would have been the Global Authenticator (not a delegate) @@ -60,7 +59,7 @@ class AuthenticationHandler extends Authenticator { } interface AuthShim { - void enable(RequestAuthenticator auth, HttpURLConnection con); + void enable(RequestAuthenticator auth, Object connOrHttp); void remove(); @@ -76,7 +75,7 @@ static class GlobalHandler implements AuthShim { Authenticator.setDefault(new AuthenticationHandler()); } - @Override public void enable(RequestAuthenticator auth, HttpURLConnection con) { + @Override public void enable(RequestAuthenticator auth, Object ignored) { authenticators.set(new AuthenticationHandler(auth)); } diff --git a/src/main/java/org/jsoup/helper/CookieUtil.java b/src/main/java/org/jsoup/helper/CookieUtil.java index 4267f57439..84dc801087 100644 --- a/src/main/java/org/jsoup/helper/CookieUtil.java +++ b/src/main/java/org/jsoup/helper/CookieUtil.java @@ -7,7 +7,6 @@ import java.io.IOException; import java.net.CookieManager; -import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; @@ -19,6 +18,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.BiConsumer; /** Helper functions to support the Cookie Manager / Cookie Storage in HttpConnection. @@ -35,7 +35,7 @@ class CookieUtil { Pre-request, get any applicable headers out of the Request cookies and the Cookie Store, and add them to the request headers. If the Cookie Store duplicates any Request cookies (same name and value), they will be discarded. */ - static void applyCookiesToRequest(HttpConnection.Request req, HttpURLConnection con) throws IOException { + static void applyCookiesToRequest(HttpConnection.Request req, BiConsumer setter) throws IOException { // Request key/val cookies. LinkedHashSet used to preserve order, as cookie store will return most specific path first Set cookieSet = requestCookieSet(req); Set cookies2 = null; @@ -62,9 +62,9 @@ else if (Cookie2Name.equals(key)) { } if (cookieSet.size() > 0) - con.addRequestProperty(CookieName, StringUtil.join(cookieSet, Sep)); + setter.accept(CookieName, StringUtil.join(cookieSet, Sep)); if (cookies2 != null && cookies2.size() > 0) - con.addRequestProperty(Cookie2Name, StringUtil.join(cookies2, Sep)); + setter.accept(Cookie2Name, StringUtil.join(cookies2, Sep)); } private static LinkedHashSet requestCookieSet(Connection.Request req) { diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index de55c0e5b2..05f2c58c6d 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -5,14 +5,12 @@ import org.jsoup.Progress; import org.jsoup.UnsupportedMimeTypeException; import org.jsoup.internal.ControllableInputStream; -import org.jsoup.internal.Functions; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.jsoup.parser.StreamParser; import org.jspecify.annotations.Nullable; -import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLSocketFactory; import java.io.BufferedInputStream; import java.io.BufferedReader; @@ -26,7 +24,6 @@ import java.io.UncheckedIOException; import java.net.CookieManager; import java.net.CookieStore; -import java.net.HttpURLConnection; import java.net.InetSocketAddress; import java.net.MalformedURLException; import java.net.Proxy; @@ -70,7 +67,7 @@ public class HttpConnection implements Connection { public static final String MULTIPART_FORM_DATA = "multipart/form-data"; public static final String FORM_URL_ENCODED = "application/x-www-form-urlencoded"; private static final int HTTP_TEMP_REDIR = 307; // http/1.1 temporary redirect, not in Java's set. - private static final String DefaultUploadType = "application/octet-stream"; + static final String DefaultUploadType = "application/octet-stream"; private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1"); /** @@ -111,7 +108,7 @@ copied. All other settings (proxy, parser, cookies, etc) are copied. req = new Request(copy); } - private static String encodeMimeName(String val) { + static String encodeMimeName(String val) { return val.replace("\"", "%22"); } @@ -602,6 +599,7 @@ public static class Request extends HttpConnection.Base impl private boolean followRedirects; private final Collection data; private @Nullable String body = null; + @Nullable String mimeBoundary; private boolean ignoreHttpErrors = false; private boolean ignoreContentType = false; private Parser parser; @@ -609,7 +607,7 @@ public static class Request extends HttpConnection.Base impl private String postDataCharset = DataUtil.defaultCharsetName; private @Nullable SSLSocketFactory sslSocketFactory; private CookieManager cookieManager; - private @Nullable RequestAuthenticator authenticator; + @Nullable RequestAuthenticator authenticator; private @Nullable Progress responseProgress; private volatile boolean executing = false; @@ -796,13 +794,14 @@ CookieManager cookieManager() { public static class Response extends HttpConnection.Base implements Connection.Response { private static final int MAX_REDIRECTS = 20; private static final String LOCATION = "Location"; - private final int statusCode; - private final String statusMessage; + int statusCode; + @Nullable String statusMessage; private @Nullable ByteBuffer byteData; private @Nullable ControllableInputStream bodyStream; - private @Nullable HttpURLConnection conn; + @Nullable RequestExecutor executor; private @Nullable String charset; - private @Nullable final String contentType; + @Nullable String contentType; + int contentLength; private boolean executed = false; private boolean inputStreamRead = false; private int numRedirects = 0; @@ -829,7 +828,7 @@ static Response execute(HttpConnection.Request req) throws IOException { return execute(req, null); } - static Response execute(HttpConnection.Request req, @Nullable Response previousResponse) throws IOException { + static Response execute(HttpConnection.Request req, @Nullable Response prevRes) throws IOException { synchronized (req) { Validate.isFalse(req.executing, "Multiple threads were detected trying to execute the same request concurrently. Make sure to use Connection#newRequest() and do not share an executing request between threads."); req.executing = true; @@ -840,38 +839,26 @@ static Response execute(HttpConnection.Request req, @Nullable Response previousR String protocol = url.getProtocol(); if (!protocol.equals("http") && !protocol.equals("https")) throw new MalformedURLException("Only http & https protocols supported"); - final boolean methodHasBody = req.method().hasBody(); - final boolean hasRequestBody = req.requestBody() != null; - if (!methodHasBody) - Validate.isFalse(hasRequestBody, "Cannot set a request body for HTTP method " + req.method()); + final boolean supportsBody = req.method().hasBody(); + final boolean hasBody = req.requestBody() != null; + if (!supportsBody) + Validate.isFalse(hasBody, "Cannot set a request body for HTTP method " + req.method()); // set up the request for execution - String mimeBoundary = null; - if (!req.data().isEmpty() && (!methodHasBody || hasRequestBody)) + if (!req.data().isEmpty() && (!supportsBody || hasBody)) serialiseRequestUrl(req); - else if (methodHasBody) - mimeBoundary = setOutputContentType(req); + else if (supportsBody) + setOutputContentType(req); long startTime = System.nanoTime(); - HttpURLConnection conn = createConnection(req); + RequestExecutor executor = RequestDispatch.get(req, prevRes); Response res = null; try { - conn.connect(); - if (conn.getDoOutput()) { - try (OutputStream out = conn.getOutputStream()) { - writePost(req, out, mimeBoundary); - } catch (IOException e) { - conn.disconnect(); - throw e; - } - } - - int status = conn.getResponseCode(); - res = new Response(conn, req, previousResponse); + res = executor.execute(); // redirect if there's a location header (from 3xx, or 201 etc) if (res.hasHeader(LOCATION) && req.followRedirects()) { - if (status != HTTP_TEMP_REDIR) { + if (res.statusCode != HTTP_TEMP_REDIR) { req.method(Method.GET); // always redirect with a get. any data param from original req are dropped. req.data().clear(); req.requestBody(null); @@ -888,8 +875,8 @@ else if (methodHasBody) req.executing = false; return execute(req, res); } - if ((status < 200 || status >= 400) && !req.ignoreHttpErrors()) - throw new HttpStatusException("HTTP error fetching URL", status, req.url().toString()); + if ((res.statusCode < 200 || res.statusCode >= 400) && !req.ignoreHttpErrors()) + throw new HttpStatusException("HTTP error fetching URL", res.statusCode, req.url().toString()); // check that we can handle the returned content type; if not, abort before fetching it String contentType = res.contentType(); @@ -907,8 +894,8 @@ else if (methodHasBody) } res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it - if (conn.getContentLength() != 0 && req.method() != HEAD) { // -1 means unknown, chunked. sun throws an IO exception on 500 response with no content when trying to read body - InputStream stream = conn.getErrorStream() != null ? conn.getErrorStream() : conn.getInputStream(); + if (res.contentLength != 0 && req.method() != HEAD) { // -1 means unknown, chunked. sun throws an IO exception on 500 response with no content when trying to read body + InputStream stream = executor.responseBody(); if (res.hasHeaderWithValue(CONTENT_ENCODING, "gzip")) stream = new GZIPInputStream(stream); else if (res.hasHeaderWithValue(CONTENT_ENCODING, "deflate")) @@ -919,7 +906,7 @@ else if (res.hasHeaderWithValue(CONTENT_ENCODING, "deflate")) .timeout(startTime, req.timeout()); if (req.responseProgress != null) // set response progress listener - res.bodyStream.onProgress(conn.getContentLength(), req.responseProgress, res); + res.bodyStream.onProgress(res.contentLength, req.responseProgress, res); } else { res.byteData = DataUtil.emptyByteBuffer(); } @@ -1073,35 +1060,6 @@ public BufferedInputStream bodyStream() { return bodyStream.inputStream(); } - // set up connection defaults, and details from request - private static HttpURLConnection createConnection(HttpConnection.Request req) throws IOException { - Proxy proxy = req.proxy(); - final HttpURLConnection conn = (HttpURLConnection) ( - proxy == null ? - req.url().openConnection() : - req.url().openConnection(proxy) - ); - - conn.setRequestMethod(req.method().name()); - conn.setInstanceFollowRedirects(false); // don't rely on native redirection support - conn.setConnectTimeout(req.timeout()); - conn.setReadTimeout(req.timeout() / 2); // gets reduced after connection is made and status is read - - if (req.sslSocketFactory() != null && conn instanceof HttpsURLConnection) - ((HttpsURLConnection) conn).setSSLSocketFactory(req.sslSocketFactory()); - if (req.authenticator != null) - AuthenticationHandler.handler.enable(req.authenticator, conn); // removed in finally - if (req.method().hasBody()) - conn.setDoOutput(true); - CookieUtil.applyCookiesToRequest(req, conn); // from the Request key/val cookies and the Cookie Store - for (Map.Entry> header : req.multiHeaders().entrySet()) { - for (String value : header.getValue()) { - conn.addRequestProperty(header.getKey(), value); - } - } - return conn; - } - /** * Call on completion of stream read, to close the body (or error) stream. The connection.disconnect allows * keep-alives to work (as the underlying connection is actually held open, despite the name). @@ -1116,23 +1074,16 @@ private void safeClose() { bodyStream = null; } } - if (conn != null) { - conn.disconnect(); - conn = null; - } + + if (executor != null) executor.safeClose(); // disconnect } - // set up url, method, header, cookies - private Response(HttpURLConnection conn, HttpConnection.Request request, HttpConnection.@Nullable Response previousResponse) throws IOException { - this.conn = conn; + Response(HttpConnection.Request request) { this.req = request; - method = Method.valueOf(conn.getRequestMethod()); - url = conn.getURL(); - statusCode = conn.getResponseCode(); - statusMessage = conn.getResponseMessage(); - contentType = conn.getContentType(); + } - Map> resHeaders = createHeaderMap(conn); + // set up url, method, header, cookies + void prepareResponse(Map> resHeaders, HttpConnection.@Nullable Response previousResponse) throws IOException { processResponseHeaders(resHeaders); // includes cookie key/val read during header scan CookieUtil.storeCookies(req, this, url, resHeaders); // add set cookies to cookie store @@ -1151,25 +1102,6 @@ private Response(HttpURLConnection conn, HttpConnection.Request request, HttpCon } } - private static LinkedHashMap> createHeaderMap(HttpURLConnection conn) { - // the default sun impl of conn.getHeaderFields() returns header values out of order - final LinkedHashMap> headers = new LinkedHashMap<>(); - int i = 0; - while (true) { - final String key = conn.getHeaderFieldKey(i); - final String val = conn.getHeaderField(i); - if (key == null && val == null) - break; - i++; - if (key == null || val == null) - continue; // skip http1.1 line - - final List vals = headers.computeIfAbsent(key, Functions.listFunction()); - vals.add(val); - } - return headers; - } - void processResponseHeaders(Map> resHeaders) { for (Map.Entry> entry : resHeaders.entrySet()) { String name = entry.getKey(); @@ -1243,7 +1175,7 @@ private static boolean looksLikeUtf8(byte[] input) { return foundNonAscii; } - private @Nullable static String setOutputContentType(final Connection.Request req) { + private static void setOutputContentType(final HttpConnection.Request req) { final String contentType = req.header(CONTENT_TYPE); String bound = null; if (contentType != null) { @@ -1263,12 +1195,13 @@ else if (needsMultipart(req)) { } else { req.header(CONTENT_TYPE, FORM_URL_ENCODED + "; charset=" + req.postDataCharset()); } - return bound; + req.mimeBoundary = bound; } - private static void writePost(final Connection.Request req, final OutputStream outputStream, @Nullable final String boundary) throws IOException { + static void writePost(final HttpConnection.Request req, final OutputStream outputStream) throws IOException { final Collection data = req.data(); final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outputStream, Charset.forName(req.postDataCharset()))); + final String boundary = req.mimeBoundary; if (boundary != null) { // boundary will be set if we're in multipart mode diff --git a/src/main/java/org/jsoup/helper/RequestAuthenticator.java b/src/main/java/org/jsoup/helper/RequestAuthenticator.java index 7b4adc40ec..2ea44c4e5e 100644 --- a/src/main/java/org/jsoup/helper/RequestAuthenticator.java +++ b/src/main/java/org/jsoup/helper/RequestAuthenticator.java @@ -38,7 +38,7 @@ class Context { } /** - Get he URL that is being requested. + Get the URL that is being requested. * @return URL */ public URL url() { diff --git a/src/main/java/org/jsoup/helper/RequestDispatch.java b/src/main/java/org/jsoup/helper/RequestDispatch.java new file mode 100644 index 0000000000..a1696cf409 --- /dev/null +++ b/src/main/java/org/jsoup/helper/RequestDispatch.java @@ -0,0 +1,44 @@ +package org.jsoup.helper; + +import org.jsoup.internal.SharedConstants; +import org.jspecify.annotations.Nullable; + +import static org.jsoup.helper.HttpConnection.Request; +import static org.jsoup.helper.HttpConnection.Response; + +import java.lang.reflect.Constructor; + +/** + Dispatches requests to either HttpClient (JDK 11+) or HttpURLConnection implementations. At startup, if we + can instantiate the HttpClientExecutor class, requests will use that if the system property + {@link SharedConstants#UseHttpClient} is set to {@code true}. + */ +class RequestDispatch { + + @Nullable + static Constructor clientConstructor; + + static { + try { + //noinspection unchecked + Class httpClass = + (Class) Class.forName("org.jsoup.helper.HttpClientExecutor"); + clientConstructor = httpClass.getConstructor(Request.class, Response.class); + } catch (Exception ignored) { + // either not on Java11+, or on Android; will provide UrlConnectionExecutor + } + + } + + static RequestExecutor get(Request request, @Nullable Response previousResponse) { + if (Boolean.getBoolean(SharedConstants.UseHttpClient) && clientConstructor != null) { + try { + return clientConstructor.newInstance(request, previousResponse); + } catch (Exception e) { + return new UrlConnectionExecutor(request, previousResponse); + } + } else { + return new UrlConnectionExecutor(request, previousResponse); + } + } +} diff --git a/src/main/java/org/jsoup/helper/RequestExecutor.java b/src/main/java/org/jsoup/helper/RequestExecutor.java new file mode 100644 index 0000000000..a980d630d8 --- /dev/null +++ b/src/main/java/org/jsoup/helper/RequestExecutor.java @@ -0,0 +1,28 @@ +package org.jsoup.helper; + +import static org.jsoup.helper.HttpConnection.Request; +import static org.jsoup.helper.HttpConnection.Response; + +import org.jspecify.annotations.Nullable; + +import java.io.IOException; +import java.io.InputStream; + +/** + A shim interface to support both HttpURLConnection and HttpClient implementations, in a multi-version jar. + */ +abstract class RequestExecutor { + final Request req; + final @Nullable Response prevRes; + + RequestExecutor(Request request, @Nullable Response previousResponse) { + this.req = request; + this.prevRes = previousResponse; + } + + abstract Response execute() throws IOException; + + abstract InputStream responseBody() throws IOException; + + abstract void safeClose(); +} diff --git a/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java b/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java new file mode 100644 index 0000000000..2499337dbf --- /dev/null +++ b/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java @@ -0,0 +1,125 @@ +package org.jsoup.helper; + +import org.jsoup.Connection; +import org.jsoup.internal.Functions; +import org.jspecify.annotations.Nullable; + +import javax.net.ssl.HttpsURLConnection; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.Proxy; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import static org.jsoup.helper.HttpConnection.Response; + +/** + Execute HTTP requests using the HttpURLConnection implementation. Currently used by default; set system property + {@code jsoup.useHttpClient} to {@code false} to explicitly set. + */ +class UrlConnectionExecutor extends RequestExecutor { + @Nullable + HttpURLConnection conn; + + UrlConnectionExecutor(HttpConnection.Request req, HttpConnection.@Nullable Response prevRes) { + super(req, prevRes); + } + + @Override + HttpConnection.Response execute() throws IOException { + try { + conn = createConnection(req); + conn.connect(); + if (conn.getDoOutput()) { + try (OutputStream out = conn.getOutputStream()) { + Response.writePost(req, out); + } catch (IOException e) { + conn.disconnect(); + throw e; + } + } + + // set up url, method, header, cookies + Response res = new Response(req); + res.executor = this; + res.method = Connection.Method.valueOf(conn.getRequestMethod()); + res.url = conn.getURL(); + res.statusCode = conn.getResponseCode(); + res.statusMessage = conn.getResponseMessage(); + res.contentType = conn.getContentType(); + res.contentLength = conn.getContentLength(); + Map> resHeaders = createHeaderMap(conn); + res.prepareResponse(resHeaders, prevRes); + + return res; + } catch (IOException e) { + safeClose(); + throw e; + } + } + + @Override + InputStream responseBody() throws IOException { + if (conn == null) throw new IllegalStateException("Not yet executed"); + return conn.getErrorStream() != null ? conn.getErrorStream() : conn.getInputStream(); + } + + @Override + void safeClose() { + if (conn != null) { + conn.disconnect(); + conn = null; + } + } + + // set up connection defaults, and details from request + private static HttpURLConnection createConnection(HttpConnection.Request req) throws IOException { + Proxy proxy = req.proxy(); + final HttpURLConnection conn = (HttpURLConnection) ( + proxy == null ? + req.url().openConnection() : + req.url().openConnection(proxy) + ); + + conn.setRequestMethod(req.method().name()); + conn.setInstanceFollowRedirects(false); // don't rely on native redirection support + conn.setConnectTimeout(req.timeout()); + conn.setReadTimeout(req.timeout() / 2); // gets reduced after connection is made and status is read + + if (req.sslSocketFactory() != null && conn instanceof HttpsURLConnection) + ((HttpsURLConnection) conn).setSSLSocketFactory(req.sslSocketFactory()); + if (req.authenticator != null) + AuthenticationHandler.handler.enable(req.authenticator, conn); // removed in finally + if (req.method().hasBody()) + conn.setDoOutput(true); + CookieUtil.applyCookiesToRequest(req, conn::addRequestProperty); // from the Request key/val cookies and the Cookie Store + for (Map.Entry> header : req.multiHeaders().entrySet()) { + for (String value : header.getValue()) { + conn.addRequestProperty(header.getKey(), value); + } + } + return conn; + } + + private static LinkedHashMap> createHeaderMap(HttpURLConnection conn) { + // the default sun impl of conn.getHeaderFields() returns header values out of order + final LinkedHashMap> headers = new LinkedHashMap<>(); + int i = 0; + while (true) { + final String key = conn.getHeaderFieldKey(i); + final String val = conn.getHeaderField(i); + if (key == null && val == null) + break; + i++; + if (key == null || val == null) + continue; // skip http1.1 line + + final List vals = headers.computeIfAbsent(key, Functions.listFunction()); + vals.add(val); + } + return headers; + } +} diff --git a/src/main/java/org/jsoup/internal/SharedConstants.java b/src/main/java/org/jsoup/internal/SharedConstants.java index a0c45c7b5f..2b4b9e2976 100644 --- a/src/main/java/org/jsoup/internal/SharedConstants.java +++ b/src/main/java/org/jsoup/internal/SharedConstants.java @@ -18,5 +18,7 @@ public final class SharedConstants { public static final String DummyUri = "https://dummy.example/"; // used as a base URI if none provided, to allow abs url resolution to preserve relative links + public static final String UseHttpClient = "jsoup.useHttpClient"; + private SharedConstants() {} } diff --git a/src/main/java11/module-info.java b/src/main/java11/module-info.java new file mode 100644 index 0000000000..4b837b0423 --- /dev/null +++ b/src/main/java11/module-info.java @@ -0,0 +1,12 @@ +module org.jsoup { + exports org.jsoup; + exports org.jsoup.helper; + exports org.jsoup.nodes; + exports org.jsoup.parser; + exports org.jsoup.safety; + exports org.jsoup.select; + + requires transitive java.xml; // for org.w3c.dom out of W3CDom + requires static org.jspecify; // nullability annotations + requires static java.net.http; // HttpClient on Java 11; guarded +} diff --git a/src/main/java11/org/jsoup/helper/HttpClientExecutor.java b/src/main/java11/org/jsoup/helper/HttpClientExecutor.java new file mode 100644 index 0000000000..0b29c594ad --- /dev/null +++ b/src/main/java11/org/jsoup/helper/HttpClientExecutor.java @@ -0,0 +1,136 @@ +package org.jsoup.helper; + +import org.jsoup.Connection; +import org.jspecify.annotations.Nullable; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.Proxy; +import java.net.ProxySelector; +import java.net.SocketAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.http.HttpClient; +import java.net.http.HttpHeaders; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import static org.jsoup.helper.HttpConnection.Response; +import static org.jsoup.helper.HttpConnection.Response.writePost; + +/** + Executes requests using the HttpClient, for http/2 support. Currently disabled by default; enable by setting system + property {@code jsoup.useHttpClient} to {@code true}. + */ +class HttpClientExecutor extends RequestExecutor { + @Nullable + HttpResponse hRes; + + public HttpClientExecutor(HttpConnection.Request request, HttpConnection.@Nullable Response previousResponse) { + super(request, previousResponse); + } + + @Override + HttpConnection.Response execute() throws IOException { + try { + HttpClient.Builder builder = HttpClient.newBuilder(); + Proxy proxy = req.proxy(); + if (proxy != null) builder.proxy(new ProxyWrap(proxy)); + builder.followRedirects(HttpClient.Redirect.NEVER); // customized redirects + //builder.connectTimeout(Duration.ofMillis(req.timeout()/2)); // jsoup timeout is total connect + all reads + // todo - how to handle socketfactory? HttpClient wants SSLContext... + if (req.authenticator != null) { + AuthenticationHandler.AuthShim handler = new RequestAuthHandler(); + handler.enable(req.authenticator, builder); + } + HttpClient client = builder.build(); + + HttpRequest.Builder reqBuilder = + HttpRequest.newBuilder(req.url.toURI()).method(req.method.name(), requestBody(req)); + if (req.timeout() > 0) reqBuilder.timeout( + Duration.ofMillis(req.timeout())); // infinite if unset (UrlConnection / jsoup uses 0 for same) + CookieUtil.applyCookiesToRequest(req, reqBuilder::header); + + // headers: + for (Map.Entry> header : req.multiHeaders().entrySet()) { + for (String value : header.getValue()) { + reqBuilder.header(header.getKey(), value); + } + } + + HttpRequest hReq = reqBuilder.build(); + hRes = client.send(hReq, HttpResponse.BodyHandlers.ofInputStream()); + HttpHeaders headers = hRes.headers(); + + // set up the response + Response res = new Response(req); + res.executor = this; + res.method = Connection.Method.valueOf(hRes.request().method()); + res.url = hRes.uri().toURL(); + res.statusCode = hRes.statusCode(); + res.contentType = headers.firstValue("content-type").orElse(""); + long length = headers.firstValueAsLong("content-length").orElse(-1); + res.contentLength = length < Integer.MAX_VALUE ? (int) length : -1; + res.prepareResponse(headers.map(), prevRes); + + return res; + } catch (IOException e) { + safeClose(); + throw e; + } catch (InterruptedException e) { + safeClose(); + throw new IOException(e); + } catch (URISyntaxException e) { + throw new IllegalArgumentException("Malformed URL: " + req.url, e); + } + } + + @Override + InputStream responseBody() throws IOException { + if (hRes == null) throw new IllegalStateException("Not yet executed"); + return hRes.body(); + } + + @Override + void safeClose() { + if (hRes != null) { + // no real closer + // todo - review + hRes = null; + } + } + + static HttpRequest.BodyPublisher requestBody(final HttpConnection.Request req) throws IOException { + if (req.method.hasBody()) { + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + writePost(req, buf); + return HttpRequest.BodyPublishers.ofByteArray(buf.toByteArray()); + } else { + return HttpRequest.BodyPublishers.noBody(); + } + } + + static class ProxyWrap extends ProxySelector { + final List proxies; + + public ProxyWrap(Proxy proxy) { + this.proxies = new ArrayList<>(1); + proxies.add(proxy); + } + + @Override + public List select(URI uri) { + return proxies; + } + + @Override + public void connectFailed(URI uri, SocketAddress sa, IOException ioe) { + // no-op + } + } +} diff --git a/src/main/java11/org/jsoup/helper/RequestAuthHandler.java b/src/main/java11/org/jsoup/helper/RequestAuthHandler.java new file mode 100644 index 0000000000..5e543bc3a7 --- /dev/null +++ b/src/main/java11/org/jsoup/helper/RequestAuthHandler.java @@ -0,0 +1,35 @@ +package org.jsoup.helper; + +import java.net.HttpURLConnection; +import java.net.http.HttpClient; + +/** + A per-request authentication shim, used in Java 9+. + */ +class RequestAuthHandler implements AuthenticationHandler.AuthShim { + public RequestAuthHandler() {} + + @Override public void enable(RequestAuthenticator auth, Object connOrHttp) { + AuthenticationHandler authenticator = new AuthenticationHandler(auth); + + // this is a bit ugly, but a simple way to support setting authentication on both urlconnection and httpclient without more multi-version shims + if (connOrHttp instanceof HttpURLConnection) { + HttpURLConnection conn = (HttpURLConnection) connOrHttp; + conn.setAuthenticator(authenticator); + } else if (connOrHttp instanceof HttpClient.Builder) { + HttpClient.Builder builder = (HttpClient.Builder) connOrHttp; + builder.authenticator(authenticator); + } else { + throw new IllegalArgumentException("Unsupported executor: " + connOrHttp.getClass().getName()); + } + } + + @Override public void remove() { + // noop; would remove thread-local in Global Handler + } + + @Override public AuthenticationHandler get(AuthenticationHandler helper) { + // would get thread-local in Global Handler + return helper; + } +} diff --git a/src/main/java9/module-info.java b/src/main/java9/module-info.java deleted file mode 100644 index 31bd333877..0000000000 --- a/src/main/java9/module-info.java +++ /dev/null @@ -1,11 +0,0 @@ -module org.jsoup { - exports org.jsoup; - exports org.jsoup.helper; - exports org.jsoup.nodes; - exports org.jsoup.parser; - exports org.jsoup.safety; - exports org.jsoup.select; - - requires transitive java.xml; // for org.w3c.dom out of W3CDom - requires static org.jspecify; // nullability annotations -} diff --git a/src/main/java9/org/jsoup/helper/RequestAuthHandler.java b/src/main/java9/org/jsoup/helper/RequestAuthHandler.java deleted file mode 100644 index 0df80de209..0000000000 --- a/src/main/java9/org/jsoup/helper/RequestAuthHandler.java +++ /dev/null @@ -1,24 +0,0 @@ -package org.jsoup.helper; - -import java.net.HttpURLConnection; - -/** - A per-request authentication shim, used in Java 9+. - */ -class RequestAuthHandler implements AuthenticationHandler.AuthShim { - public RequestAuthHandler() {} - - @Override public void enable(RequestAuthenticator auth, HttpURLConnection con) { - AuthenticationHandler authenticator = new AuthenticationHandler(auth); - con.setAuthenticator(authenticator); - } - - @Override public void remove() { - // noop; would remove thread-local in Global Handler - } - - @Override public AuthenticationHandler get(AuthenticationHandler helper) { - // would get thread-local in Global Handler - return helper; - } -} diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java index 8929dcac2a..b41ade915d 100644 --- a/src/test/java/org/jsoup/integration/ConnectTest.java +++ b/src/test/java/org/jsoup/integration/ConnectTest.java @@ -8,6 +8,7 @@ import org.jsoup.helper.DataUtil; import org.jsoup.helper.W3CDom; import org.jsoup.integration.servlets.*; +import org.jsoup.internal.SharedConstants; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -59,6 +60,7 @@ public class ConnectTest { public static void setUp() { TestServer.start(); echoUrl = EchoServlet.Url; + System.setProperty(SharedConstants.UseHttpClient, "false"); // use the default UrlConnection. See HttpClientConnectTest for other version } @BeforeEach @@ -475,9 +477,15 @@ public void handlesLargerContentLengthParseRead() throws IOException { .timeout(400) .execute(); - Document document = res.parse(); - assertEquals("Something", document.title()); - assertEquals(0, document.select("p").size()); + try { + Document document = res.parse(); + assertEquals("Something", document.title()); + assertEquals(0, document.select("p").size()); + } catch (IOException ignored) { + // HttpUrlConnection will read the amount provided and the tests in the try will pass + // HttpClient will throw unexpected EOF during the read, and this will catch it + // Either are OK + } // current impl, jetty won't write past content length // todo - find way to trick jetty into writing larger than set header. Take over the stream? } @@ -944,14 +952,20 @@ void incorrectAuth(String url) throws IOException { assertEquals(HttpServletResponse.SC_UNAUTHORIZED, code); AtomicInteger count = new AtomicInteger(0); - Connection.Response res = session.newRequest(url) - .auth(ctx -> { - count.incrementAndGet(); - return ctx.credentials(AuthFilter.ServerUser, password + "wrong"); // incorrect - }) - .execute(); + try { + Connection.Response res = session.newRequest(url) + .auth(ctx -> { + count.incrementAndGet(); + return ctx.credentials(AuthFilter.ServerUser, password + "wrong"); // incorrect + }) + .execute(); + assertEquals(HttpServletResponse.SC_UNAUTHORIZED, res.statusCode()); + } catch (IOException e) { + assertEquals("No credentials provided", e.getMessage()); + // In HttpClient, will throw IOE if our password delegate stops providing credentials after too many attempts. So we'll get this error. + // In HttpUrlConnection, which would otherwise try 20 times, when the auth stops providing it will cascade to the underyling 401 response (which seems a better path IMO) + } assertEquals(MaxAttempts, count.get()); - assertEquals(HttpServletResponse.SC_UNAUTHORIZED, res.statusCode()); AtomicInteger successCount = new AtomicInteger(0); Connection.Response successRes = session.newRequest(url) diff --git a/src/test/java/org/jsoup/integration/TestServer.java b/src/test/java/org/jsoup/integration/TestServer.java index 777ccd3050..d36a34e272 100644 --- a/src/test/java/org/jsoup/integration/TestServer.java +++ b/src/test/java/org/jsoup/integration/TestServer.java @@ -182,7 +182,6 @@ private static void setupDefaultTrust(File keystoreFile) throws KeyStoreExceptio TrustManager[] managers = trustManagerFactory.getTrustManagers(); SSLContext tls = SSLContext.getInstance("TLS"); tls.init(null, managers, null); - SSLSocketFactory socketFactory = tls.getSocketFactory(); - HttpsURLConnection.setDefaultSSLSocketFactory(socketFactory); + SSLContext.setDefault(tls); } } diff --git a/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java b/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java new file mode 100644 index 0000000000..4bfd5a7eb2 --- /dev/null +++ b/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java @@ -0,0 +1,33 @@ +package org.jsoup.helper; +import org.jsoup.internal.SharedConstants; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; + +public class HttpClientExecutorTest { + @Test void getsHttpClient() { + try { + enableHttpClient(); + RequestExecutor executor = RequestDispatch.get(null, null); + //assertInstanceOf(HttpClientExecutor.class, executor); + assertEquals("org.jsoup.helper.HttpClientExecutor", executor.getClass().getName()); + // Haven't figured out how to get Maven to allow this mjar code to be on the classpath for the surefire tests, hence not instanceof + } finally { + disableHttpClient(); // reset to default off (currently) + } + } + + @Test void getsHttpUrlConnectionByDefault() { + RequestExecutor executor = RequestDispatch.get(null, null); + assertInstanceOf(UrlConnectionExecutor.class, executor); + } + + public static void enableHttpClient() { + System.setProperty(SharedConstants.UseHttpClient, "true"); + } + + public static void disableHttpClient() { + System.setProperty(SharedConstants.UseHttpClient, "false"); + } +} diff --git a/src/test/java11/org/jsoup/integration/HttpClientConnectIT.java b/src/test/java11/org/jsoup/integration/HttpClientConnectIT.java new file mode 100644 index 0000000000..4bb4d31ae7 --- /dev/null +++ b/src/test/java11/org/jsoup/integration/HttpClientConnectIT.java @@ -0,0 +1,31 @@ +package org.jsoup.integration; + +import org.jsoup.helper.HttpClientExecutorTest; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; + +public class HttpClientConnectIT extends ConnectIT { + @BeforeAll + static void useHttpClient() { + HttpClientExecutorTest.enableHttpClient(); + } + + @AfterAll + static void resetClient() { + HttpClientExecutorTest.disableHttpClient(); + } + + @Override @Disabled + public void canInterruptBodyStringRead() throws InterruptedException { + // noop; can't interrupt the client via the calling thread; probably not required as timeouts are robust + } + + @Override @Disabled + public void canInterruptDocumentRead() throws InterruptedException { + } + + @Override @Disabled + public void canInterruptThenJoinASpawnedThread() throws InterruptedException { + } +} diff --git a/src/test/java11/org/jsoup/integration/HttpClientConnectTest.java b/src/test/java11/org/jsoup/integration/HttpClientConnectTest.java new file mode 100644 index 0000000000..c556c271ca --- /dev/null +++ b/src/test/java11/org/jsoup/integration/HttpClientConnectTest.java @@ -0,0 +1,18 @@ +package org.jsoup.integration; + +import org.jsoup.helper.HttpClientExecutorTest; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; + +public class HttpClientConnectTest extends ConnectTest { + @BeforeAll + static void useHttpClient() { + HttpClientExecutorTest.enableHttpClient(); + } + + @AfterAll + static void resetClient() { + HttpClientExecutorTest.disableHttpClient(); + } + +} diff --git a/src/test/java11/org/jsoup/integration/HttpClientSessionIT.java b/src/test/java11/org/jsoup/integration/HttpClientSessionIT.java new file mode 100644 index 0000000000..ac188ea220 --- /dev/null +++ b/src/test/java11/org/jsoup/integration/HttpClientSessionIT.java @@ -0,0 +1,17 @@ +package org.jsoup.integration; + +import org.jsoup.helper.HttpClientExecutorTest; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; + +public class HttpClientSessionIT extends SessionIT { + @BeforeAll + static void useHttpClient() { + HttpClientExecutorTest.enableHttpClient(); + } + + @AfterAll + static void resetClient() { + HttpClientExecutorTest.disableHttpClient(); + } +} diff --git a/src/test/java11/org/jsoup/integration/HttpClientSessionTest.java b/src/test/java11/org/jsoup/integration/HttpClientSessionTest.java new file mode 100644 index 0000000000..3f1fdb03d8 --- /dev/null +++ b/src/test/java11/org/jsoup/integration/HttpClientSessionTest.java @@ -0,0 +1,17 @@ +package org.jsoup.integration; + +import org.jsoup.helper.HttpClientExecutorTest; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; + +public class HttpClientSessionTest extends SessionTest { + @BeforeAll + static void useHttpClient() { + HttpClientExecutorTest.enableHttpClient(); + } + + @AfterAll + static void resetClient() { + HttpClientExecutorTest.disableHttpClient(); + } +} From cd879d73ee2d42665e9b49a55c54ea4957495c51 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Tue, 7 Jan 2025 18:30:09 +1100 Subject: [PATCH 2/2] OK if there are more progress events Java 21 exhibits more progress events when using HttpClient --- src/test/java/org/jsoup/integration/ConnectTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java index b41ade915d..eb7ae342bd 100644 --- a/src/test/java/org/jsoup/integration/ConnectTest.java +++ b/src/test/java/org/jsoup/integration/ConnectTest.java @@ -1030,11 +1030,11 @@ void progressListener(String path) throws IOException { int num = numProgress.get(); // debug log if not in those ranges: - if (num < expected * 0.75 || num > expected * 1.5) { + if (num < expected * 0.75 || num > expected * 2.5) { System.err.println("Expected: " + expected + ", got: " + num); } assertTrue(num > expected * 0.75); - assertTrue(num < expected * 1.5); + assertTrue(num < expected * 2.5); // check the document works assertEquals(LargeDocTextLen, document.text().length());