Skip to content

Commit ffa3d4a

Browse files
author
Chris Li
committed
Merge SFTP protocol and default factory
1 parent 9365e8b commit ffa3d4a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1081
-231
lines changed

cdi-core/src/main/java/com/linkedin/cdi/configuration/MultistageProperties.java

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77
import com.google.gson.Gson;
88
import com.google.gson.JsonArray;
99
import com.google.gson.JsonObject;
10+
import com.linkedin.cdi.factory.DefaultConnectionClientFactory;
1011
import lombok.Getter;
1112
import lombok.extern.slf4j.Slf4j;
1213
import org.apache.commons.lang.StringUtils;
1314
import org.apache.gobblin.configuration.State;
14-
import com.linkedin.cdi.factory.DefaultS3ClientFactory;
1515

1616

1717
/**
@@ -168,6 +168,18 @@ public <T> T getDefaultValue() {
168168
return (T) ",";
169169
}
170170
},
171+
/**
172+
* By default, CsvExtractor tries to infer the true type of fields when inferring schema.
173+
* However, in some cases, the inference is not accurate, and users may prefer to keep all fields as strings.
174+
* In this case, set ms.csv.default.field.type = string
175+
* Supported types: string | int | long | double | boolean | float
176+
*/
177+
MSTAGE_CSV_DEFAULT_FIELD_TYPE("ms.csv.default.field.type", String.class) {
178+
@Override
179+
public <T> T getDefaultValue() {
180+
return (T) StringUtils.EMPTY;
181+
}
182+
},
171183
/**
172184
* if csv.column.header is true, csv.skip.lines will be 1 by default, if more than 1
173185
* row to be skipped, then set this parameter explicitly.
@@ -345,13 +357,13 @@ public Long getMillis(State state) {
345357
}
346358
},
347359
/**
348-
* http.client.factory define an indirect way to specify the type of HttpClient to use.
349-
* default = {@link com.linkedin.cdi.factory.ApacheHttpClientFactory}
360+
* Define an indirect way to specify the type of connection clients
361+
* default = {@link DefaultConnectionClientFactory}
350362
*/
351-
MSTAGE_HTTP_CLIENT_FACTORY("ms.http.client.factory", String.class) {
363+
MSTAGE_CONNECTION_CLIENT_FACTORY("ms.connection.client.factory", String.class) {
352364
@Override
353365
public <T> T getDefaultValue() {
354-
return (T) "com.linkedin.cdi.factory.ApacheHttpClientFactory";
366+
return (T) "com.linkedin.cdi.factory.DefaultConnectionClientFactory";
355367
}
356368
},
357369
/**
@@ -400,17 +412,6 @@ public <T> T getDefaultValue() {
400412
* Currently, we don't allow exceptions being made to revert errors by using reason code.
401413
*/
402414
MSTAGE_HTTP_STATUS_REASONS("ms.http.status.reasons", JsonObject.class),
403-
/**
404-
* jdbc.client.factory define an indirect way to specify the type of JDBC Client to use.
405-
* default = {@link com.linkedin.cdi.factory.DefaultJdbcClientFactory}
406-
*/
407-
MSTAGE_JDBC_CLIENT_FACTORY("ms.jdbc.client.factory", String.class) {
408-
@Override
409-
public <T> T getDefaultValue() {
410-
return (T) "com.linkedin.cdi.factory.DefaultJdbcClientFactory";
411-
}
412-
},
413-
414415
MSTAGE_JDBC_SCHEMA_REFACTOR("ms.jdbc.schema.refactor", String.class) {
415416
@Override
416417
public <T> T getDefaultValue() {
@@ -538,16 +539,6 @@ public <T> T getDefaultValue() {
538539
return (T) retention;
539540
}
540541
},
541-
/**
542-
* s3.client.factory define an indirect way to specify the type of S3 Client to use.
543-
* default = {@link DefaultS3ClientFactory}
544-
*/
545-
MSTAGE_S3_CLIENT_FACTORY("ms.s3.client.factory", String.class) {
546-
@Override
547-
public <T> T getDefaultValue() {
548-
return (T) "com.linkedin.cdi.factory.DefaultS3ClientFactory";
549-
}
550-
},
551542
/**
552543
* Schema cleansing will replace special characters in the schema element names based
553544
* on a pattern. By default it will replace all blank spaces, $, and @ to underscores.
@@ -764,6 +755,27 @@ public <T> T getDefaultValue() {
764755
*/
765756
MSTAGE_WATERMARK("ms.watermark", JsonArray.class),
766757
MSTAGE_WATERMARK_GROUPS("ms.watermark.groups", JsonArray.class),
758+
/**
759+
* Minimum records to be present in order for the work unit to be successful,
760+
* below the minimum value, the work unit will be failed.
761+
*/
762+
MSTAGE_WORK_UNIT_MIN_RECORDS("ms.work.unit.min.records", Long.class) {
763+
@Override
764+
public <T> T getDefaultValue() {
765+
return (T) Long.valueOf(0);
766+
}
767+
},
768+
/**
769+
* Minimum number of work units to be present in order for the job to proceed,
770+
* below the minimum value, the job will be failed. This parameter should be used
771+
* only when there is a unit watermark.
772+
*/
773+
MSTAGE_WORK_UNIT_MIN_UNITS("ms.work.unit.min.units", Long.class) {
774+
@Override
775+
public <T> T getDefaultValue() {
776+
return (T) Long.valueOf(0);
777+
}
778+
},
767779
MSTAGE_WORK_UNIT_PARALLELISM_MAX("ms.work.unit.parallelism.max", Integer.class) {
768780
@Override
769781
public boolean validateNonblank(State state) {
@@ -830,6 +842,16 @@ public <T> T getDefaultValue() {
830842
return (T) Long.valueOf(500L);
831843
}
832844
},
845+
MSTAGE_AUDIT_ENABLED("ms.audit.enabled", Boolean.class) {
846+
@Override
847+
public <T> T getDefaultValue() {
848+
return (T) Boolean.FALSE;
849+
}
850+
},
851+
MSTAGE_KAFKA_BROKERS("ms.kafka.brokers", String.class),
852+
MSTAGE_KAFKA_SCHEMA_REGISTRY_URL("ms.kafka.schema.registry.url", String.class),
853+
MSTAGE_KAFKA_CLIENT_ID("ms.kafka.clientId", String.class),
854+
MSTAGE_KAFKA_TOPIC_NAME("ms.kafka.audit.topic.name", String.class),
833855
// Properties defined in Gobblin, redefine here to leverage the new features like validation
834856
CONVERTER_CLASSES("converter.classes", String.class),
835857
DATASET_URN_KEY("dataset.urn", String.class),

cdi-core/src/main/java/com/linkedin/cdi/configuration/StaticConstants.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,5 +60,20 @@ public interface StaticConstants {
6060
String KEY_WORD_FAIL = "fail";
6161
String KEY_WORD_SUCCESS = "success";
6262
String KEY_WORD_ERROR_COLUMN = "errorColumn";
63+
String KEY_WORD_INT = "int";
64+
String KEY_WORD_LONG = "long";
65+
String KEY_WORD_DOUBLE = "double";
66+
String KEY_WORD_FLOAT = "float";
67+
String KEY_WORD_JSON = "json";
68+
String KEY_WORD_CSV = "csv";
69+
String KEY_WORD_AVRO = "avro";
70+
71+
String EXCEPTION_WORK_UNIT_MINIMUM = "Job requires a minimum of %s work unit(s) to proceed because ms.work.unit.min.units = %s.";
72+
String EXCEPTION_RECORD_MINIMUM = "Work unit requires a minimum of %s record(s) to succeed because ms.work.unit.min.records = %s.";
73+
74+
String MSG_ROWS_PROCESSED = "Processed %s records, work unit: %s";
75+
String MSG_WORK_UNIT_ALWAYS = "There should be a work unit.";
76+
String MSG_LOW_WATER_MARK_ALWAYS = "There should be a low watermark.";
77+
String MSG_WORK_UNIT_INFO = "Generating Work Unit: %s, watermark: %s";
6378
Gson GSON = new Gson();
6479
}

cdi-core/src/main/java/com/linkedin/cdi/connection/HttpConnection.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import org.apache.gobblin.configuration.State;
2424
import com.linkedin.cdi.configuration.MultistageProperties;
2525
import com.linkedin.cdi.exception.RetriableAuthenticationException;
26-
import com.linkedin.cdi.factory.HttpClientFactory;
26+
import com.linkedin.cdi.factory.ConnectionClientFactory;
2727
import com.linkedin.cdi.keys.ExtractorKeys;
2828
import com.linkedin.cdi.keys.HttpKeys;
2929
import com.linkedin.cdi.keys.JobKeys;
@@ -79,9 +79,9 @@ synchronized HttpClient getHttpClient(State state) {
7979
if (httpClient == null) {
8080
try {
8181
Class<?> factoryClass = Class.forName(
82-
MultistageProperties.MSTAGE_HTTP_CLIENT_FACTORY.getValidNonblankWithDefault(state));
83-
HttpClientFactory factory = (HttpClientFactory) factoryClass.newInstance();
84-
httpClient = factory.get(state);
82+
MultistageProperties.MSTAGE_CONNECTION_CLIENT_FACTORY.getValidNonblankWithDefault(state));
83+
ConnectionClientFactory factory = (ConnectionClientFactory) factoryClass.newInstance();
84+
httpClient = factory.getHttpClient(state);
8585
} catch (Exception e) {
8686
log.error("Error creating HttpClient: {}", e.getMessage());
8787
}

cdi-core/src/main/java/com/linkedin/cdi/connection/JdbcConnection.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import com.google.gson.JsonArray;
88
import com.google.gson.JsonObject;
9+
import com.linkedin.cdi.factory.ConnectionClientFactory;
910
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
1011
import java.io.ByteArrayInputStream;
1112
import java.nio.charset.StandardCharsets;
@@ -26,7 +27,6 @@
2627
import org.apache.gobblin.configuration.State;
2728
import com.linkedin.cdi.configuration.MultistageProperties;
2829
import com.linkedin.cdi.exception.RetriableAuthenticationException;
29-
import com.linkedin.cdi.factory.JdbcClientFactory;
3030
import com.linkedin.cdi.keys.ExtractorKeys;
3131
import com.linkedin.cdi.keys.JdbcKeys;
3232
import com.linkedin.cdi.keys.JobKeys;
@@ -101,10 +101,10 @@ public WorkUnitStatus executeNext(WorkUnitStatus workUnitStatus) throws Retriabl
101101
*/
102102
private synchronized Connection getJdbcConnection(State state) {
103103
try {
104-
Class<?> factoryClass = Class.forName(MultistageProperties.MSTAGE_JDBC_CLIENT_FACTORY.getValidNonblankWithDefault(state));
105-
JdbcClientFactory factory = (JdbcClientFactory) factoryClass.newInstance();
104+
Class<?> factoryClass = Class.forName(MultistageProperties.MSTAGE_CONNECTION_CLIENT_FACTORY.getValidNonblankWithDefault(state));
105+
ConnectionClientFactory factory = (ConnectionClientFactory) factoryClass.newInstance();
106106

107-
return factory.getConnection(
107+
return factory.getJdbcConnection(
108108
jdbcSourceKeys.getSourceUri(),
109109
MultistageProperties.SOURCE_CONN_USERNAME.getValidNonblankWithDefault(state),
110110
MultistageProperties.SOURCE_CONN_PASSWORD.getValidNonblankWithDefault(state),

cdi-core/src/main/java/com/linkedin/cdi/connection/S3Connection.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package com.linkedin.cdi.connection;
66

77
import com.google.common.collect.Lists;
8+
import com.linkedin.cdi.factory.ConnectionClientFactory;
89
import java.net.URI;
910
import java.time.Duration;
1011
import java.util.List;
@@ -16,7 +17,6 @@
1617
import org.apache.gobblin.configuration.State;
1718
import com.linkedin.cdi.configuration.MultistageProperties;
1819
import com.linkedin.cdi.exception.RetriableAuthenticationException;
19-
import com.linkedin.cdi.factory.S3ClientFactory;
2020
import com.linkedin.cdi.keys.ExtractorKeys;
2121
import com.linkedin.cdi.keys.JobKeys;
2222
import com.linkedin.cdi.keys.S3Keys;
@@ -135,8 +135,8 @@ public WorkUnitStatus executeFirst(WorkUnitStatus workUnitStatus) throws Retriab
135135
synchronized S3Client getS3HttpClient(State state) {
136136
if (s3Client == null) {
137137
try {
138-
Class<?> factoryClass = Class.forName(MultistageProperties.MSTAGE_S3_CLIENT_FACTORY.getValidNonblankWithDefault(state));
139-
S3ClientFactory factory = (S3ClientFactory) factoryClass.newInstance();
138+
Class<?> factoryClass = Class.forName(MultistageProperties.MSTAGE_CONNECTION_CLIENT_FACTORY.getValidNonblankWithDefault(state));
139+
ConnectionClientFactory factory = (ConnectionClientFactory) factoryClass.getDeclaredConstructor().newInstance();
140140

141141
Integer connectionTimeout = s3SourceV2Keys.getConnectionTimeout();
142142
AttributeMap config = connectionTimeout == null ? GLOBAL_HTTP_DEFAULTS
@@ -147,7 +147,7 @@ synchronized S3Client getS3HttpClient(State state) {
147147
s3Client = S3Client.builder()
148148
.region(this.s3SourceV2Keys.getRegion())
149149
.endpointOverride(URI.create(s3SourceV2Keys.getEndpoint()))
150-
.httpClient(factory.getHttpClient(state, config))
150+
.httpClient(factory.getS3Client(state, config))
151151
.credentialsProvider(getCredentialsProvider(state))
152152
.build();
153153
} catch (Exception e) {

0 commit comments

Comments
 (0)