1
1
/*
2
- * Copyright 2013, 2022 Deutsche Nationalbibliothek et al
2
+ * Copyright 2013, 2023 Deutsche Nationalbibliothek et al
3
3
*
4
4
* Licensed under the Apache License, Version 2.0 the "License";
5
5
* you may not use this file except in compliance with the License.
32
32
import java .io .SequenceInputStream ;
33
33
import java .net .HttpURLConnection ;
34
34
import java .net .URL ;
35
+ import java .net .URLDecoder ;
35
36
import java .util .Arrays ;
36
37
import java .util .HashMap ;
37
38
import java .util .Map ;
38
39
import java .util .regex .Pattern ;
40
+ import java .util .zip .GZIPInputStream ;
39
41
40
42
/**
41
43
* Opens an {@link HttpURLConnection} and passes a reader to the receiver.
42
44
*
43
45
* @author Christoph Böhme
44
46
* @author Jan Schnasse
45
47
* @author Jens Wille
48
+ * @author Pascal Christoph (dr0i)
46
49
*/
47
- @ Description ("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset` and `Content-Type`, as well as generic headers (separated by `\\ n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding `) = `UTF-8`, `errorPrefix ` = `ERROR: `." )
50
+ @ Description ("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset`, `Accept-Encoding`, `Content-Encoding` and `Content-Type`, as well as generic headers (separated by `\\ n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header (`accept`) = `*/*`, `Accept-Charset` header (`acceptcharset `) = `UTF-8`, `errorprefix ` = `ERROR: `." )
48
51
@ In (String .class )
49
52
@ Out (Reader .class )
50
53
@ FluxCommand ("open-http" )
51
54
public final class HttpOpener extends DefaultObjectPipe <String , ObjectReceiver <Reader >> {
52
55
53
- public static final String ACCEPT_DEFAULT = "*/*" ;
54
56
public static final String ACCEPT_HEADER = "accept" ;
57
+ public static final String ACCEPT_CHARSET_HEADER = "accept-charset" ;
58
+ public static final String ACCEPT_ENCODING_HEADER = "accept-encoding" ;
59
+ public static final String CONTENT_ENCODING_HEADER = "content-encoding" ;
55
60
public static final String CONTENT_TYPE_HEADER = "content-type" ;
61
+
62
+ public static final String ACCEPT_DEFAULT = "*/*" ;
63
+ public static final String CHARSET_DEFAULT = "UTF-8" ;
56
64
public static final String DEFAULT_PREFIX = "ERROR: " ;
57
- public static final String ENCODING_DEFAULT = "UTF-8 " ;
58
- public static final String ENCODING_HEADER = "accept-charset " ;
65
+ public static final String HEADER_FIELD_SEPARATOR = "\n " ;
66
+ public static final String HEADER_VALUE_SEPARATOR = ": " ;
59
67
public static final String INPUT_DESIGNATOR = "@-" ;
68
+ public static final String MIME_PARAMETER_CHARSET = "charset" ;
69
+ public static final String MIME_PARAMETER_SEPARATOR = ";" ;
70
+ public static final String MIME_PARAMETER_VALUE_SEPARATOR = "=" ;
60
71
61
72
public static final String DEFAULT_METHOD_NAME = "GET" ;
62
73
public static final Method DEFAULT_METHOD = Method .valueOf (DEFAULT_METHOD_NAME );
63
74
64
- public static final String HEADER_FIELD_SEPARATOR = "\n " ;
65
- public static final String HEADER_VALUE_SEPARATOR = ":" ;
66
-
67
75
private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern .compile (HEADER_FIELD_SEPARATOR );
68
76
private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern .compile (HEADER_VALUE_SEPARATOR );
77
+ private static final Pattern MIME_PARAMETER_SEPARATOR_PATTERN = Pattern .compile (MIME_PARAMETER_SEPARATOR );
78
+
79
+ private static final int ALLOWED_REDIRECTIONS = 3 ;
80
+ private static final int CONNECTION_TIMEOUT = 11000 ;
69
81
70
82
private final Map <String , String > headers = new HashMap <>();
71
83
@@ -118,7 +130,7 @@ public boolean getResponseHasBody() {
118
130
*/
119
131
public HttpOpener () {
120
132
setAccept (ACCEPT_DEFAULT );
121
- setEncoding ( ENCODING_DEFAULT );
133
+ setAcceptCharset ( CHARSET_DEFAULT );
122
134
setErrorPrefix (DEFAULT_PREFIX );
123
135
setMethod (DEFAULT_METHOD );
124
136
setUrl (INPUT_DESIGNATOR );
@@ -137,43 +149,59 @@ public void setAccept(final String accept) {
137
149
}
138
150
139
151
/**
140
- * Sets the HTTP request body. The default value for the request body is
141
- * {@value INPUT_DESIGNATOR} <i>if the {@link #setMethod(Method) request
142
- * method} accepts a request body</i>, which means it will use the {@link
143
- * #process(String) input data} data as request body <i>if the input has
144
- * not already been used</i>; otherwise, no request body will be set by
145
- * default.
152
+ * Sets the HTTP {@value CONTENT_TYPE_HEADER} header value. This is a
153
+ * MIME type such as {@code text/plain} or {@code application/json}.
146
154
*
147
- * <p>If a request body has been set, but the request method does not
148
- * accept a body, the method <i>may</i> be changed to {@code POST}.
155
+ * @param contentType MIME type to use for the HTTP content-type header
156
+ */
157
+ public void setContentType (final String contentType ) {
158
+ setHeader (CONTENT_TYPE_HEADER , contentType );
159
+ }
160
+
161
+ /**
162
+ * Sets the HTTP {@value ACCEPT_CHARSET_HEADER} header value. This is the
163
+ * preferred charset for the HTTP response.
164
+ * The default charset is {@value CHARSET_DEFAULT}.
149
165
*
150
- * @param body the request body
166
+ * @param charset name of the charset used for the accept-charset HTTP header
151
167
*/
152
- public void setBody (final String body ) {
153
- this . body = body ;
168
+ public void setAcceptCharset (final String charset ) {
169
+ setHeader ( ACCEPT_CHARSET_HEADER , charset ) ;
154
170
}
155
171
156
172
/**
157
- * Sets the HTTP {@value CONTENT_TYPE_HEADER} header value. This is a
158
- * MIME type such as {@code text/plain} or {@code application/json}.
173
+ * @deprecated Use {@link #setAcceptCharset} instead.
174
+ * @param charset name of the charset used for the accept-charset HTTP header
175
+ */
176
+ @ Deprecated
177
+ public void setEncoding (final String charset ) {
178
+ setAcceptCharset (charset );
179
+ }
180
+
181
+ /**
182
+ * Sets the HTTP {@value ACCEPT_ENCODING_HEADER} header value. This is the
183
+ * preferred content encoding for the HTTP response. It accepts HTTP compression.
184
+ * Allowed values include, among others, "gzip" and "br" (Brotli).
+ * Note that only "gzip" responses are decompressed when the response is read.
185
+ * The default for the content encoding is null, which means "no compression".
159
186
*
160
- * @param contentType MIME type to use for the HTTP content-type header
187
+ * @param acceptEncoding name of content encoding used for the accept-encoding HTTP
188
+ * header
161
189
*/
162
- public void setContentType (final String contentType ) {
163
- setHeader (CONTENT_TYPE_HEADER , contentType );
190
+ public void setAcceptEncoding (final String acceptEncoding ) {
191
+ setHeader (ACCEPT_ENCODING_HEADER , acceptEncoding );
164
192
}
165
193
166
194
/**
167
- * Sets the HTTP {@value ENCODING_HEADER } header value. This is the
168
- * preferred encoding for the HTTP response. Additionally, the encoding
169
- * is used for reading the HTTP response if it does not specify a content
170
- * encoding. The default for the encoding is {@value ENCODING_DEFAULT} .
195
+ * Sets the HTTP {@value CONTENT_ENCODING_HEADER } header value. This is the
196
+ * content encoding for the HTTP request. It enables HTTP compression.
197
+ * Allowed values are "gzip".
198
+ * The default for the content encoding is null, which means "no compression" .
171
199
*
172
- * @param encoding name of the encoding used for the accept-charset HTTP
200
+ * @param contentEncoding name of content encoding used for the content-encoding HTTP
173
201
* header
174
202
*/
175
- public void setEncoding (final String encoding ) {
176
- setHeader (ENCODING_HEADER , encoding );
203
+ public void setContentEncoding (final String contentEncoding ) {
204
+ setHeader (CONTENT_ENCODING_HEADER , contentEncoding );
177
205
}
178
206
179
207
/**
@@ -239,28 +267,40 @@ public void setUrl(final String url) {
239
267
this .url = url ;
240
268
}
241
269
270
+ /**
271
+ * Sets the HTTP request body. The default value for the request body is
272
+ * {@value INPUT_DESIGNATOR} <i>if the {@link #setMethod(Method) request
273
+ * method} accepts a request body</i>, which means it will use the {@link
274
+ * #process(String) input data} data as request body <i>if the input has
275
+ * not already been used</i>; otherwise, no request body will be set by
276
+ * default.
277
+ *
278
+ * <p>If a request body has been set, but the request method does not
279
+ * accept a body, the method <i>may</i> be changed to {@code POST}.
280
+ *
281
+ * @param body the request body
282
+ */
283
+ public void setBody (final String body ) {
284
+ this .body = body ;
285
+ }
286
+
242
287
@ Override
243
288
public void process (final String input ) {
244
289
try {
245
290
final String requestUrl = getInput (input , url );
246
291
final String requestBody = getInput (input ,
247
- body == null && method .getRequestHasBody () ? INPUT_DESIGNATOR : body );
248
-
249
- final HttpURLConnection connection =
250
- (HttpURLConnection ) new URL (requestUrl ).openConnection ();
292
+ body == null && method .getRequestHasBody () ? INPUT_DESIGNATOR : body );
251
293
252
- connection .setRequestMethod (method .name ());
253
- headers .forEach (connection ::addRequestProperty );
254
-
255
- if (requestBody != null ) {
256
- connection .setDoOutput (true );
257
- connection .getOutputStream ().write (requestBody .getBytes ());
258
- }
294
+ final URL urlToOpen = new URL (requestUrl );
295
+ final HttpURLConnection connection = requestBody != null ?
296
+ doOutput (urlToOpen , requestBody ) : doRedirects (urlToOpen );
259
297
260
298
final InputStream inputStream = getInputStream (connection );
261
- final String contentEncoding = getEncoding (connection . getContentEncoding () );
299
+ final String charset = getContentCharset (connection );
262
300
263
- getReceiver ().process (new InputStreamReader (inputStream , contentEncoding ));
301
+ getReceiver ().process (new InputStreamReader (
302
+ "gzip" .equalsIgnoreCase (connection .getContentEncoding ()) ?
303
+ new GZIPInputStream (inputStream ) : inputStream , charset ));
264
304
}
265
305
catch (final IOException e ) {
266
306
throw new MetafactureException (e );
@@ -287,6 +327,46 @@ else if (inputUsed) {
287
327
return result ;
288
328
}
289
329
330
+ private HttpURLConnection doOutput (final URL urlToOpen , final String requestBody ) throws IOException {
331
+ final HttpURLConnection connection = openConnection (urlToOpen );
332
+
333
+ connection .setDoOutput (true );
334
+ connection .getOutputStream ().write (requestBody .getBytes ());
335
+
336
+ return connection ;
337
+ }
338
+
339
+ private HttpURLConnection doRedirects (final URL startingUrl ) throws IOException {
340
+ URL urlToFollow = startingUrl ;
341
+
342
+ for (int i = 0 ; i < ALLOWED_REDIRECTIONS ; ++i ) {
343
+ final HttpURLConnection connection = openConnection (urlToFollow );
344
+ connection .setInstanceFollowRedirects (false ); // Make the logic below easier to detect redirections
345
+
346
+ switch (connection .getResponseCode ()) {
347
+ case HttpURLConnection .HTTP_MOVED_PERM :
348
+ case HttpURLConnection .HTTP_MOVED_TEMP :
349
+ final String location = URLDecoder .decode (connection .getHeaderField ("Location" ), "UTF-8" );
350
+ urlToFollow = new URL (urlToFollow , location ); // Deal with relative URLs
351
+ break ;
352
+ default :
353
+ return connection ;
354
+ }
355
+ }
356
+
357
+ throw new IOException ("Too many redirects" );
358
+ }
359
+
360
+ private HttpURLConnection openConnection (final URL urlToOpen ) throws IOException {
361
+ final HttpURLConnection connection = (HttpURLConnection ) urlToOpen .openConnection ();
362
+
363
+ connection .setRequestMethod (method .name ());
364
+ connection .setConnectTimeout (CONNECTION_TIMEOUT );
365
+ headers .forEach (connection ::setRequestProperty );
366
+
367
+ return connection ;
368
+ }
369
+
290
370
private InputStream getInputStream (final HttpURLConnection connection ) throws IOException {
291
371
try {
292
372
return connection .getInputStream ();
@@ -312,8 +392,23 @@ private InputStream getErrorStream(final InputStream errorStream) {
312
392
}
313
393
}
314
394
315
- private String getEncoding (final String contentEncoding ) {
316
- return contentEncoding != null ? contentEncoding : headers .get (ENCODING_HEADER );
395
+ private String getContentCharset (final HttpURLConnection connection ) {
396
+ final String contentType = connection .getContentType ();
397
+
398
+ if (contentType != null ) {
399
+ final String [] parts = MIME_PARAMETER_SEPARATOR_PATTERN .split (contentType );
400
+
401
+ for (int i = 1 ; i < parts .length ; ++i ) {
402
+ final String parameter = parts [i ].trim ();
403
+ final int index = parameter .indexOf (MIME_PARAMETER_VALUE_SEPARATOR );
404
+
405
+ if (index != -1 && MIME_PARAMETER_CHARSET .equalsIgnoreCase (parameter .substring (0 , index ))) {
406
+ return parameter .substring (index + 1 );
407
+ }
408
+ }
409
+ }
410
+
411
+ return CHARSET_DEFAULT ;
317
412
}
318
413
319
414
}
0 commit comments