Skip to content

Commit 9365e8b

Browse files
authored
Merge pull request #3 from chris9692/master
Merge up to RB 2639094
2 parents d569f39 + 4b440c6 commit 9365e8b

20 files changed

+344
-144
lines changed

cdi-core/src/main/java/com/linkedin/cdi/configuration/MultistageProperties.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,12 @@ public <T> T getDefaultValue() {
125125
}
126126
},
127127
MSTAGE_CSV_COLUMN_HEADER("ms.csv.column.header", Boolean.class),
128+
MSTAGE_CSV_COLUMN_HEADER_INDEX("ms.csv.column.header.index", Integer.class) {
129+
@Override
130+
public <T> T getDefaultValue() {
131+
return (T) Integer.valueOf(0);
132+
}
133+
},
128134
/**
129135
* a comma-separated string, where each value is either an integer or a range
130136
* representing the index of the field to include

cdi-core/src/main/java/com/linkedin/cdi/configuration/StaticConstants.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ public interface StaticConstants {
1919
String KEY_WORD_DATA_TYPE = "dataType";
2020
String KEY_WORD_DATA_TYPE_TYPE = "dataType.type";
2121
String KEY_WORD_EOF = "EOF";
22+
String KEY_WORD_EPOC = "epoc";
2223
String KEY_WORD_FIELDS = "fields";
2324
String KEY_WORD_RANGE_FROM = "from";
2425
String KEY_WORD_HTTP_OK = "ok";
@@ -38,6 +39,7 @@ public interface StaticConstants {
3839
String KEY_WORD_PROPERTIES = "properties";
3940
String KEY_WORD_RANGE = "range";
4041
String KEY_WORD_RECORD = "record";
42+
String KEY_WORD_REGEXP = "regexp";
4143
String KEY_WORD_RETRY = "retry";
4244
String KEY_WORD_RETRY_COUNT = "retryCount";
4345
String KEY_WORD_RETRY_DELAY_IN_SEC = "delayInSec";

cdi-core/src/main/java/com/linkedin/cdi/connection/HdfsConnection.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package com.linkedin.cdi.connection;
66

77
import com.google.common.annotations.VisibleForTesting;
8+
import com.google.common.base.Preconditions;
89
import com.google.common.collect.Lists;
910
import java.io.InputStream;
1011
import java.net.URI;
@@ -66,7 +67,7 @@ public HdfsConnection(State state, JobKeys jobKeys, ExtractorKeys extractorKeys)
6667
*/
6768
@Override
6869
public WorkUnitStatus execute(final WorkUnitStatus status) {
69-
assert hdfsKeys.getSourceUri() != null;
70+
Preconditions.checkNotNull(hdfsKeys.getSourceUri(), "ms.source.uri is missing or of wrong format");
7071
URI uri = URI.create(getWorkUnitSpecificString(hdfsKeys.getSourceUri(),
7172
getExtractorKeys().getDynamicParameters()));
7273

@@ -142,6 +143,7 @@ private List<String> readFileList(final String path, final String pattern) {
142143
* @return the file content in an InputStream
143144
*/
144145
private InputStream readSingleFile(final String path) {
146+
log.info("Processing file: {}", path);
145147
try {
146148
return fsHelper.getFileStream(path);
147149
} catch (FileBasedHelperException e) {

cdi-core/src/main/java/com/linkedin/cdi/connection/HttpConnection.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
import org.apache.http.HttpResponse;
3636
import org.apache.http.client.HttpClient;
3737
import org.apache.http.client.methods.CloseableHttpResponse;
38+
import org.apache.http.client.methods.HttpUriRequest;
39+
import org.apache.http.client.protocol.HttpClientContext;
3840
import org.apache.http.util.EntityUtils;
3941

4042
import static com.linkedin.cdi.configuration.StaticConstants.*;
@@ -212,6 +214,7 @@ private Pair<String, CloseableHttpResponse> executeHttpRequest(final HttpRequest
212214
// trying to make a Http request, capture the client side error and
213215
// fail the task if any encoding exception or IO exception
214216
CloseableHttpResponse response;
217+
HttpClientContext context = HttpClientContext.create();
215218
try {
216219
JsonObject payloads = new JsonObject();
217220
JsonObject queryParameters = new JsonObject();
@@ -222,8 +225,9 @@ private Pair<String, CloseableHttpResponse> executeHttpRequest(final HttpRequest
222225
queryParameters.add(entry.getKey(), entry.getValue());
223226
}
224227
}
225-
response = (CloseableHttpResponse) httpClient.execute(
226-
command.getHttpRequest(httpUriTemplate, queryParameters, headers, payloads));
228+
HttpUriRequest request = command.getHttpRequest(httpUriTemplate, queryParameters, headers, payloads);
229+
response = (CloseableHttpResponse) httpClient.execute(request, context);
230+
log.debug(context.toString());
227231
} catch (Exception e) {
228232
throw new RuntimeException(e.getMessage(), e);
229233
}

cdi-core/src/main/java/com/linkedin/cdi/converter/InFlowValidationConverter.java

Lines changed: 61 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010
import com.google.gson.JsonObject;
1111
import com.linkedin.cdi.configuration.MultistageProperties;
1212
import com.linkedin.cdi.configuration.StaticConstants;
13+
import com.linkedin.cdi.util.JsonUtils;
1314
import java.text.DecimalFormat;
1415
import java.util.ArrayList;
1516
import java.util.List;
17+
import java.util.concurrent.atomic.AtomicInteger;
1618
import lombok.extern.slf4j.Slf4j;
1719
import org.apache.avro.Schema;
1820
import org.apache.avro.generic.GenericData;
@@ -29,20 +31,36 @@
2931

3032
/**
3133
* This converter does basic count validation based on the Failure Records or Success Records criteria.
34+
*
35+
* To use this converter for validation, the main source should be the dataset to be validated,
36+
* and the secondary input should be the base dataset to validate against.
37+
*
38+
* The base dataset can be in a nested column of the secondary input, i.e. a field, which can be
39+
* retrieved through a JSON path, contains the actual base records.
40+
*
41+
* Currently following rules are defined:
42+
*
43+
* fail (upper bound rule): the source should be failed records
44+
* Job succeeds when the row count in validation set / row count in base set < threshold
45+
* Job fails when the row count in validation set / row count in base set >= threshold
46+
*
47+
* success (lower bound rule): the source should be succeeded records
48+
* Job succeeds when the row count in validation set / row count in base set >= threshold
49+
* Job fails when the row count in validation set / row count in base set < threshold
3250
*/
3351
@Slf4j
3452
public class InFlowValidationConverter extends Converter<Schema, Schema, GenericRecord, GenericRecord> {
3553
int expectedRecordsCount;
3654
int actualRecordsCount;
3755
private String field;
38-
private int failurePercentage;
56+
private int threshold;
3957
private String criteria;
4058
private String errorColumn;
4159

4260
@Override
4361
public Converter<Schema, Schema, GenericRecord, GenericRecord> init(WorkUnitState workUnitState) {
4462
//Load the input to memory
45-
getPayloads(workUnitState);
63+
expectedRecordsCount = getBaseRowCount(workUnitState);
4664
fillValidationAttributes(workUnitState);
4765
return super.init(workUnitState);
4866
}
@@ -74,20 +92,15 @@ private void verifyAndUpdateCount(GenericRecord inputRecord) {
7492
actualRecordsCount += ((GenericData.Array<?>) inputRecord.get(fieldList.get(0).name())).size();
7593
}
7694
} else {
77-
if (errorColumn != null) {
78-
updateFailureCount(inputRecord);
79-
} else {
80-
throw new RuntimeException("Invalid ms.data.field/ms.validation.attributes configuration. "
81-
+ "InputRecord should be of type Array or should have errorColumn");
82-
}
95+
actualRecordsCount += (errorColumn == null || inputRecord.get(errorColumn) != null ? 1 : 0);
8396
}
8497
}
8598

8699
private void fillValidationAttributes(WorkUnitState workUnitState) {
87100
JsonObject validationAttributes =
88101
MultistageProperties.MSTAGE_VALIDATION_ATTRIBUTES.getValidNonblankWithDefault(workUnitState);
89102
if (validationAttributes.has(KEY_WORD_THRESHOLD)) {
90-
failurePercentage = validationAttributes.get(KEY_WORD_THRESHOLD).getAsInt();
103+
threshold = validationAttributes.get(KEY_WORD_THRESHOLD).getAsInt();
91104
}
92105
if (validationAttributes.has(KEY_WORD_CRITERIA)) {
93106
criteria = validationAttributes.get(KEY_WORD_CRITERIA).getAsString();
@@ -102,34 +115,41 @@ private void fillValidationAttributes(WorkUnitState workUnitState) {
102115
* If field is configured in the secondary input and field column
103116
* is of type array expected record count with array size
104117
* else use all the input records as expected size
118+
* @param workUnitState the work unit state object containing secondary input parameter
119+
* @return the expected row count
105120
*/
106-
private void getPayloads(WorkUnitState workUnitState) {
107-
JsonArray payloads = MultistageProperties.MSTAGE_SECONDARY_INPUT.getValidNonblankWithDefault(workUnitState);
108-
JsonArray records = new JsonArray();
109-
List<String> fields = new ArrayList<>();
121+
private int getBaseRowCount(WorkUnitState workUnitState) {
122+
JsonArray payloads = JsonUtils.filter(KEY_WORD_CATEGORY, KEY_WORD_PAYLOAD,
123+
MultistageProperties.MSTAGE_SECONDARY_INPUT.getValidNonblankWithDefault(workUnitState));
124+
125+
// by default, we expect 1 record
126+
if (payloads.size() == 0) {
127+
return 1;
128+
}
129+
130+
// secondary input can have multiple payload entries, and each can configure a "fields" element
131+
// but for validation purpose, only the first payload entry, and the first field is used.
132+
JsonElement fields = JsonUtils.get(KEY_WORD_FIELDS, payloads.get(0).getAsJsonObject());
133+
field = StringUtils.EMPTY;
134+
if (fields.isJsonArray() && fields.getAsJsonArray().size() > 0) {
135+
field = fields.getAsJsonArray().get(0).getAsString();
136+
}
137+
138+
AtomicInteger rowCount = new AtomicInteger();
110139
for (JsonElement entry : payloads) {
111-
if (!entry.isJsonObject()) {
112-
log.error("Elements within secondary input should be valid JsonObjects, provided: {}", entry.toString());
113-
}
114140
JsonObject entryJson = entry.getAsJsonObject();
141+
JsonArray records = new JsonArray();
115142
records.addAll(new HdfsReader(workUnitState).readSecondary(entryJson));
116-
if (entryJson.has(StaticConstants.KEY_WORD_FIELDS)) {
117-
if (entryJson.get(StaticConstants.KEY_WORD_FIELDS).isJsonArray()) {
118-
entryJson.get(StaticConstants.KEY_WORD_FIELDS)
119-
.getAsJsonArray()
120-
.forEach(arrayItem -> fields.add(arrayItem.getAsString()));
121-
}
122-
field = fields.size() >= 1 ? fields.get(0) : StringUtils.EMPTY;
123-
}
143+
124144
// No of expected records
125-
if (records.size() > 0 && StringUtils.isNotBlank(field) && (records.get(0)
126-
.getAsJsonObject()
127-
.get(field) instanceof JsonArray)) {
128-
records.forEach(record -> expectedRecordsCount += record.getAsJsonObject().get(field).getAsJsonArray().size());
129-
} else if (records.size() > 0) {
130-
expectedRecordsCount = records.size();
145+
if (records.size() > 0
146+
&& StringUtils.isNotBlank(field)
147+
&& (records.get(0).getAsJsonObject().get(field) instanceof JsonArray)) {
148+
records.forEach(record -> rowCount.addAndGet(record.getAsJsonObject().get(field).getAsJsonArray().size()));
149+
} else {
150+
rowCount.addAndGet(records.size());
131151
}
132-
}
152+
}
return rowCount.get();
133153
}
134154

135155
private void updateFailureCount(GenericRecord record) {
@@ -144,21 +164,20 @@ private void updateFailureCount(GenericRecord record) {
144164
private void validateRule() {
145165
// check the threshold and throw new Runtime Exception
146166
float actualPercentage = ((float) actualRecordsCount / expectedRecordsCount) * 100;
147-
boolean failJob = false;
148-
// validate rules based on type of records
149-
if (criteria.equalsIgnoreCase(KEY_WORD_FAIL)) {
150-
failJob = actualPercentage > failurePercentage;
151-
} else if (criteria.equalsIgnoreCase(KEY_WORD_SUCCESS)) {
152-
failJob = (100 - actualPercentage) > failurePercentage;
153-
}
154-
log.info("Total expectedRecords: {} , failedRecords: {}", expectedRecordsCount, actualRecordsCount);
167+
log.info("base row count: {}, actual row count: {}", expectedRecordsCount, actualRecordsCount);
168+
169+
boolean failJob = criteria.equalsIgnoreCase(KEY_WORD_FAIL) && actualPercentage >= threshold
170+
|| criteria.equalsIgnoreCase(KEY_WORD_SUCCESS) && actualPercentage < threshold;
155171

156172
if (failJob) {
157173
// Fail the validation by throwing runtime exception
158-
throw new RuntimeException("Failure Threshold exceeds more than " + failurePercentage + "%");
174+
throw new RuntimeException("Failure Threshold exceeds more than " + threshold + "%");
159175
} else {
160-
log.info("Validation passed with failure rate {}% less than {}%",
161-
new DecimalFormat("##.##").format(actualPercentage), failurePercentage);
176+
log.info("Validation passed with {} rate {}% {} {}%",
177+
criteria.equalsIgnoreCase(KEY_WORD_FAIL) ? "failure" : "success",
178+
new DecimalFormat("##.##").format(actualPercentage),
179+
criteria.equalsIgnoreCase(KEY_WORD_FAIL) ? "less than" : "greater than or equal",
180+
threshold);
162181
}
163182
}
164183
}

cdi-core/src/main/java/com/linkedin/cdi/converter/JsonNormalizerConverter.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
package com.linkedin.cdi.converter;
66

7+
import com.google.common.base.Preconditions;
78
import com.google.gson.JsonArray;
89
import com.google.gson.JsonElement;
910
import com.google.gson.JsonObject;
@@ -64,7 +65,8 @@ public JsonArray convertSchema(JsonArray inputSchema, WorkUnitState workUnit) {
6465
normalizedField = columnName;
6566
}
6667
}
67-
assert normalizedField != null;
68+
69+
Preconditions.checkNotNull(normalizedField, "Normalized field is NULL.");
6870
JsonObject dataType = JsonUtils.get(KEY_WORD_COLUMN_NAME,
6971
normalizedField, KEY_WORD_DATA_TYPE, targetSchema).getAsJsonObject();
7072
String trueType = JsonUtils.get(KEY_WORD_TYPE, dataType).getAsString();

cdi-core/src/main/java/com/linkedin/cdi/extractor/AvroExtractor.java

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@
3434
import org.apache.gobblin.util.AvroUtils;
3535
import org.testng.Assert;
3636

37+
import static com.linkedin.cdi.configuration.StaticConstants.*;
3738
import static org.apache.avro.Schema.Type.*;
3839

39-
4040
/**
4141
* AvroExtractor reads Avro formatted files from HDFS locations.
4242
*
@@ -129,6 +129,8 @@ protected void setRowFilter(JsonArray schemaArray) {
129129
@Nullable
130130
@Override
131131
public GenericRecord readRecord(GenericRecord reuse) {
132+
super.readRecord(reuse);
133+
132134
if (avroExtractorKeys.getAvroRecordIterator() == null
133135
&& !processInputStream(0)) {
134136
return null;
@@ -168,6 +170,11 @@ protected boolean processInputStream(long starting) {
168170
return false;
169171
}
170172

173+
// returning false to end the work unit if the buffer is null
174+
if (workUnitStatus.getBuffer() == null) {
175+
return false;
176+
}
177+
171178
DataFileStream<GenericRecord> avroRecordIterator;
172179
try {
173180
avroRecordIterator = new DataFileStream<>(workUnitStatus.getBuffer(),
@@ -193,12 +200,10 @@ protected boolean processInputStream(long starting) {
193200
}
194201

195202
// return false to stop the job under these situations
196-
if (workUnitStatus.getBuffer() == null
197-
|| avroExtractorKeys.getAvroRecordIterator() == null) {
203+
if (avroExtractorKeys.getAvroRecordIterator() == null) {
198204
return false;
199205
}
200206
avroExtractorKeys.incrCurrentPageNumber();
201-
202207
avroExtractorKeys.logDebugAll(state.getWorkunit());
203208
workUnitStatus.logDebugAll();
204209
extractorKeys.logDebugAll(state.getWorkunit());
@@ -264,22 +269,22 @@ private Schema addDerivedFieldsToSchema(Schema schema) {
264269
List<Schema.Field> fields = AvroUtils.deepCopySchemaFields(schema);
265270
for (Map.Entry<String, Map<String, String>> derivedField: derivedFields) {
266271
String name = derivedField.getKey();
267-
String type = derivedField.getValue().get("type");
272+
String type = derivedField.getValue().get(KEY_WORD_TYPE);
268273
switch (type) {
269-
case "epoc":
274+
case KEY_WORD_EPOC:
270275
fields.add(new Schema.Field(name, Schema.create(LONG), name, null));
271276
break;
272-
case "string":
273-
case "regexp":
277+
case KEY_WORD_STRING:
278+
case KEY_WORD_REGEXP:
274279
fields.add(new Schema.Field(name, Schema.create(STRING), name, null));
275280
break;
276-
case "boolean":
281+
case KEY_WORD_BOOLEAN:
277282
fields.add(new Schema.Field(name, Schema.create(BOOLEAN), name, null));
278283
break;
279-
case "integer":
284+
case KEY_WORD_INTEGER:
280285
fields.add(new Schema.Field(name, Schema.create(INT), name, null));
281286
break;
282-
case "number":
287+
case KEY_WORD_NUMBER:
283288
fields.add(new Schema.Field(name, Schema.create(DOUBLE), name, null));
284289
break;
285290
default:

0 commit comments

Comments (0)