Skip to content

Commit db1a32b

Browse files
committed
Merge #513 from branch '511-enableGzipInHttpOpener' of github.com:metafacture/metafacture-core
2 parents b597d30 + 8621e7c commit db1a32b

File tree

2 files changed

+188
-71
lines changed

2 files changed

+188
-71
lines changed

metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java

Lines changed: 142 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2013, 2022 Deutsche Nationalbibliothek et al
2+
* Copyright 2013, 2023 Deutsche Nationalbibliothek et al
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -32,40 +32,52 @@
3232
import java.io.SequenceInputStream;
3333
import java.net.HttpURLConnection;
3434
import java.net.URL;
35+
import java.net.URLDecoder;
3536
import java.util.Arrays;
3637
import java.util.HashMap;
3738
import java.util.Map;
3839
import java.util.regex.Pattern;
40+
import java.util.zip.GZIPInputStream;
3941

4042
/**
4143
* Opens an {@link HttpURLConnection} and passes a reader to the receiver.
4244
*
4345
* @author Christoph Böhme
4446
* @author Jan Schnasse
4547
* @author Jens Wille
48+
* @author Pascal Christoph (dr0i)
4649
*/
47-
@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `.")
50+
@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset`, `Accept-Encoding`, `Content-Encoding` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header (`accept`) = `*/*`, `Accept-Charset` header (`acceptcharset`) = `UTF-8`, `errorprefix` = `ERROR: `.")
4851
@In(String.class)
4952
@Out(Reader.class)
5053
@FluxCommand("open-http")
5154
public final class HttpOpener extends DefaultObjectPipe<String, ObjectReceiver<Reader>> {
5255

53-
public static final String ACCEPT_DEFAULT = "*/*";
5456
public static final String ACCEPT_HEADER = "accept";
57+
public static final String ACCEPT_CHARSET_HEADER = "accept-charset";
58+
public static final String ACCEPT_ENCODING_HEADER = "accept-encoding";
59+
public static final String CONTENT_ENCODING_HEADER = "content-encoding";
5560
public static final String CONTENT_TYPE_HEADER = "content-type";
61+
62+
public static final String ACCEPT_DEFAULT = "*/*";
63+
public static final String CHARSET_DEFAULT = "UTF-8";
5664
public static final String DEFAULT_PREFIX = "ERROR: ";
57-
public static final String ENCODING_DEFAULT = "UTF-8";
58-
public static final String ENCODING_HEADER = "accept-charset";
65+
public static final String HEADER_FIELD_SEPARATOR = "\n";
66+
public static final String HEADER_VALUE_SEPARATOR = ":";
5967
public static final String INPUT_DESIGNATOR = "@-";
68+
public static final String MIME_PARAMETER_CHARSET = "charset";
69+
public static final String MIME_PARAMETER_SEPARATOR = ";";
70+
public static final String MIME_PARAMETER_VALUE_SEPARATOR = "=";
6071

6172
public static final String DEFAULT_METHOD_NAME = "GET";
6273
public static final Method DEFAULT_METHOD = Method.valueOf(DEFAULT_METHOD_NAME);
6374

64-
public static final String HEADER_FIELD_SEPARATOR = "\n";
65-
public static final String HEADER_VALUE_SEPARATOR = ":";
66-
6775
private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern.compile(HEADER_FIELD_SEPARATOR);
6876
private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern.compile(HEADER_VALUE_SEPARATOR);
77+
private static final Pattern MIME_PARAMETER_SEPARATOR_PATTERN = Pattern.compile(MIME_PARAMETER_SEPARATOR);
78+
79+
private static final int ALLOWED_REDIRECTIONS = 3;
80+
private static final int CONNECTION_TIMEOUT = 11000;
6981

7082
private final Map<String, String> headers = new HashMap<>();
7183

@@ -118,7 +130,7 @@ public boolean getResponseHasBody() {
118130
*/
119131
public HttpOpener() {
120132
setAccept(ACCEPT_DEFAULT);
121-
setEncoding(ENCODING_DEFAULT);
133+
setAcceptCharset(CHARSET_DEFAULT);
122134
setErrorPrefix(DEFAULT_PREFIX);
123135
setMethod(DEFAULT_METHOD);
124136
setUrl(INPUT_DESIGNATOR);
@@ -137,43 +149,59 @@ public void setAccept(final String accept) {
137149
}
138150

139151
/**
140-
* Sets the HTTP request body. The default value for the request body is
141-
* {@value INPUT_DESIGNATOR} <i>if the {@link #setMethod(Method) request
142-
* method} accepts a request body</i>, which means it will use the {@link
143-
* #process(String) input data} data as request body <i>if the input has
144-
* not already been used</i>; otherwise, no request body will be set by
145-
* default.
152+
* Sets the HTTP {@value CONTENT_TYPE_HEADER} header value. This is a
153+
* MIME type such as {@code text/plain} or {@code application/json}.
146154
*
147-
* <p>If a request body has been set, but the request method does not
148-
* accept a body, the method <i>may</i> be changed to {@code POST}.
155+
* @param contentType MIME type to use for the HTTP content-type header
156+
*/
157+
public void setContentType(final String contentType) {
158+
setHeader(CONTENT_TYPE_HEADER, contentType);
159+
}
160+
161+
/**
162+
* Sets the HTTP {@value ACCEPT_CHARSET_HEADER} header value. This is the
163+
* preferred charset for the HTTP response.
164+
* The default charset is {@value CHARSET_DEFAULT}.
149165
*
150-
* @param body the request body
166+
* @param charset name of the charset used for the accept-charset HTTP header
151167
*/
152-
public void setBody(final String body) {
153-
this.body = body;
168+
public void setAcceptCharset(final String charset) {
169+
setHeader(ACCEPT_CHARSET_HEADER, charset);
154170
}
155171

156172
/**
157-
* Sets the HTTP {@value CONTENT_TYPE_HEADER} header value. This is a
158-
* MIME type such as {@code text/plain} or {@code application/json}.
173+
* @deprecated Use {@link #setAcceptCharset} instead.
174+
* @param charset name of the charset used for the accept-charset HTTP header
175+
*/
176+
@Deprecated
177+
public void setEncoding(final String charset) {
178+
setAcceptCharset(charset);
179+
}
180+
181+
/**
182+
* Sets the HTTP {@value ACCEPT_ENCODING_HEADER} header value. This is the
183+
* preferred content encoding for the HTTP response. It accepts HTTP compression.
184+
* Allowed values include, among others, "gzip" and "br" (Brotli).
185+
* The default for the content encoding is null, which means "no compression".
159186
*
160-
* @param contentType MIME type to use for the HTTP content-type header
187+
* @param acceptEncoding name of content encoding used for the accept-encoding HTTP
188+
* header
161189
*/
162-
public void setContentType(final String contentType) {
163-
setHeader(CONTENT_TYPE_HEADER, contentType);
190+
public void setAcceptEncoding(final String acceptEncoding) {
191+
setHeader(ACCEPT_ENCODING_HEADER, acceptEncoding);
164192
}
165193

166194
/**
167-
* Sets the HTTP {@value ENCODING_HEADER} header value. This is the
168-
* preferred encoding for the HTTP response. Additionally, the encoding
169-
* is used for reading the HTTP response if it does not specify a content
170-
* encoding. The default for the encoding is {@value ENCODING_DEFAULT}.
195+
* Sets the HTTP {@value CONTENT_ENCODING_HEADER} header value. This is the
196+
* content encoding for the HTTP request. It enables HTTP compression.
197+
* The only allowed value is "gzip".
198+
* The default for the content encoding is null, which means "no compression".
171199
*
172-
* @param encoding name of the encoding used for the accept-charset HTTP
200+
* @param contentEncoding name of content encoding used for the content-encoding HTTP
173201
* header
174202
*/
175-
public void setEncoding(final String encoding) {
176-
setHeader(ENCODING_HEADER, encoding);
203+
public void setContentEncoding(final String contentEncoding) {
204+
setHeader(CONTENT_ENCODING_HEADER, contentEncoding);
177205
}
178206

179207
/**
@@ -239,28 +267,40 @@ public void setUrl(final String url) {
239267
this.url = url;
240268
}
241269

270+
/**
271+
* Sets the HTTP request body. The default value for the request body is
272+
* {@value INPUT_DESIGNATOR} <i>if the {@link #setMethod(Method) request
273+
* method} accepts a request body</i>, which means it will use the {@link
274+
* #process(String) input data} as request body <i>if the input has
275+
* not already been used</i>; otherwise, no request body will be set by
276+
* default.
277+
*
278+
* <p>If a request body has been set, but the request method does not
279+
* accept a body, the method <i>may</i> be changed to {@code POST}.
280+
*
281+
* @param body the request body
282+
*/
283+
public void setBody(final String body) {
284+
this.body = body;
285+
}
286+
242287
@Override
243288
public void process(final String input) {
244289
try {
245290
final String requestUrl = getInput(input, url);
246291
final String requestBody = getInput(input,
247-
body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body);
248-
249-
final HttpURLConnection connection =
250-
(HttpURLConnection) new URL(requestUrl).openConnection();
292+
body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body);
251293

252-
connection.setRequestMethod(method.name());
253-
headers.forEach(connection::addRequestProperty);
254-
255-
if (requestBody != null) {
256-
connection.setDoOutput(true);
257-
connection.getOutputStream().write(requestBody.getBytes());
258-
}
294+
final URL urlToOpen = new URL(requestUrl);
295+
final HttpURLConnection connection = requestBody != null ?
296+
doOutput(urlToOpen, requestBody) : doRedirects(urlToOpen);
259297

260298
final InputStream inputStream = getInputStream(connection);
261-
final String contentEncoding = getEncoding(connection.getContentEncoding());
299+
final String charset = getContentCharset(connection);
262300

263-
getReceiver().process(new InputStreamReader(inputStream, contentEncoding));
301+
getReceiver().process(new InputStreamReader(
302+
"gzip".equalsIgnoreCase(connection.getContentEncoding()) ?
303+
new GZIPInputStream(inputStream) : inputStream, charset));
264304
}
265305
catch (final IOException e) {
266306
throw new MetafactureException(e);
@@ -287,6 +327,46 @@ else if (inputUsed) {
287327
return result;
288328
}
289329

330+
private HttpURLConnection doOutput(final URL urlToOpen, final String requestBody) throws IOException {
331+
final HttpURLConnection connection = openConnection(urlToOpen);
332+
333+
connection.setDoOutput(true);
334+
connection.getOutputStream().write(requestBody.getBytes());
335+
336+
return connection;
337+
}
338+
339+
private HttpURLConnection doRedirects(final URL startingUrl) throws IOException {
340+
URL urlToFollow = startingUrl;
341+
342+
for (int i = 0; i < ALLOWED_REDIRECTIONS; ++i) {
343+
final HttpURLConnection connection = openConnection(urlToFollow);
344+
connection.setInstanceFollowRedirects(false); // Make the logic below easier to detect redirections
345+
346+
switch (connection.getResponseCode()) {
347+
case HttpURLConnection.HTTP_MOVED_PERM:
348+
case HttpURLConnection.HTTP_MOVED_TEMP:
349+
final String location = URLDecoder.decode(connection.getHeaderField("Location"), "UTF-8");
350+
urlToFollow = new URL(urlToFollow, location); // Deal with relative URLs
351+
break;
352+
default:
353+
return connection;
354+
}
355+
}
356+
357+
throw new IOException("Too many redirects");
358+
}
359+
360+
private HttpURLConnection openConnection(final URL urlToOpen) throws IOException {
361+
final HttpURLConnection connection = (HttpURLConnection) urlToOpen.openConnection();
362+
363+
connection.setRequestMethod(method.name());
364+
connection.setConnectTimeout(CONNECTION_TIMEOUT);
365+
headers.forEach(connection::setRequestProperty);
366+
367+
return connection;
368+
}
369+
290370
private InputStream getInputStream(final HttpURLConnection connection) throws IOException {
291371
try {
292372
return connection.getInputStream();
@@ -312,8 +392,23 @@ private InputStream getErrorStream(final InputStream errorStream) {
312392
}
313393
}
314394

315-
private String getEncoding(final String contentEncoding) {
316-
return contentEncoding != null ? contentEncoding : headers.get(ENCODING_HEADER);
395+
private String getContentCharset(final HttpURLConnection connection) {
396+
final String contentType = connection.getContentType();
397+
398+
if (contentType != null) {
399+
final String[] parts = MIME_PARAMETER_SEPARATOR_PATTERN.split(contentType);
400+
401+
for (int i = 1; i < parts.length; ++i) {
402+
final String parameter = parts[i].trim();
403+
final int index = parameter.indexOf(MIME_PARAMETER_VALUE_SEPARATOR);
404+
405+
if (index != -1 && MIME_PARAMETER_CHARSET.equalsIgnoreCase(parameter.substring(0, index))) {
406+
return parameter.substring(index + 1);
407+
}
408+
}
409+
}
410+
411+
return CHARSET_DEFAULT;
317412
}
318413

319414
}

0 commit comments

Comments
 (0)