Skip to content

Commit 9365e8b

Browse files
authored
Merge pull request #3 from chris9692/master
Merge up to RB 2639094
2 parents d569f39 + 4b440c6 commit 9365e8b

20 files changed

+344
-144
lines changed

cdi-core/src/main/java/com/linkedin/cdi/configuration/MultistageProperties.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,12 @@ public <T> T getDefaultValue() {
125125
}
126126
},
127127
MSTAGE_CSV_COLUMN_HEADER("ms.csv.column.header", Boolean.class),
128+
MSTAGE_CSV_COLUMN_HEADER_INDEX("ms.csv.column.header.index", Integer.class) {
129+
@Override
130+
public <T> T getDefaultValue() {
131+
return (T) Integer.valueOf(0);
132+
}
133+
},
128134
/**
129135
* a comma-separated string, where each value is either an integer or a range
130136
* representing the index of the field to include

cdi-core/src/main/java/com/linkedin/cdi/configuration/StaticConstants.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ public interface StaticConstants {
1919
String KEY_WORD_DATA_TYPE = "dataType";
2020
String KEY_WORD_DATA_TYPE_TYPE = "dataType.type";
2121
String KEY_WORD_EOF = "EOF";
22+
String KEY_WORD_EPOC = "epoc";
2223
String KEY_WORD_FIELDS = "fields";
2324
String KEY_WORD_RANGE_FROM = "from";
2425
String KEY_WORD_HTTP_OK = "ok";
@@ -38,6 +39,7 @@ public interface StaticConstants {
3839
String KEY_WORD_PROPERTIES = "properties";
3940
String KEY_WORD_RANGE = "range";
4041
String KEY_WORD_RECORD = "record";
42+
String KEY_WORD_REGEXP = "regexp";
4143
String KEY_WORD_RETRY = "retry";
4244
String KEY_WORD_RETRY_COUNT = "retryCount";
4345
String KEY_WORD_RETRY_DELAY_IN_SEC = "delayInSec";

cdi-core/src/main/java/com/linkedin/cdi/connection/HdfsConnection.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package com.linkedin.cdi.connection;
66

77
import com.google.common.annotations.VisibleForTesting;
8+
import com.google.common.base.Preconditions;
89
import com.google.common.collect.Lists;
910
import java.io.InputStream;
1011
import java.net.URI;
@@ -66,7 +67,7 @@ public HdfsConnection(State state, JobKeys jobKeys, ExtractorKeys extractorKeys)
6667
*/
6768
@Override
6869
public WorkUnitStatus execute(final WorkUnitStatus status) {
69-
assert hdfsKeys.getSourceUri() != null;
70+
Preconditions.checkNotNull(hdfsKeys.getSourceUri(), "ms.source.uri is missing or of wrong format");
7071
URI uri = URI.create(getWorkUnitSpecificString(hdfsKeys.getSourceUri(),
7172
getExtractorKeys().getDynamicParameters()));
7273

@@ -142,6 +143,7 @@ private List<String> readFileList(final String path, final String pattern) {
142143
* @return the file content in an InputStream
143144
*/
144145
private InputStream readSingleFile(final String path) {
146+
log.info("Processing file: {}", path);
145147
try {
146148
return fsHelper.getFileStream(path);
147149
} catch (FileBasedHelperException e) {

cdi-core/src/main/java/com/linkedin/cdi/connection/HttpConnection.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
import org.apache.http.HttpResponse;
3636
import org.apache.http.client.HttpClient;
3737
import org.apache.http.client.methods.CloseableHttpResponse;
38+
import org.apache.http.client.methods.HttpUriRequest;
39+
import org.apache.http.client.protocol.HttpClientContext;
3840
import org.apache.http.util.EntityUtils;
3941

4042
import static com.linkedin.cdi.configuration.StaticConstants.*;
@@ -212,6 +214,7 @@ private Pair<String, CloseableHttpResponse> executeHttpRequest(final HttpRequest
212214
// trying to make a Http request, capture the client side error and
213215
// fail the task if any encoding exception or IO exception
214216
CloseableHttpResponse response;
217+
HttpClientContext context = HttpClientContext.create();
215218
try {
216219
JsonObject payloads = new JsonObject();
217220
JsonObject queryParameters = new JsonObject();
@@ -222,8 +225,9 @@ private Pair<String, CloseableHttpResponse> executeHttpRequest(final HttpRequest
222225
queryParameters.add(entry.getKey(), entry.getValue());
223226
}
224227
}
225-
response = (CloseableHttpResponse) httpClient.execute(
226-
command.getHttpRequest(httpUriTemplate, queryParameters, headers, payloads));
228+
HttpUriRequest request = command.getHttpRequest(httpUriTemplate, queryParameters, headers, payloads);
229+
response = (CloseableHttpResponse) httpClient.execute(request, context);
230+
log.debug(context.toString());
227231
} catch (Exception e) {
228232
throw new RuntimeException(e.getMessage(), e);
229233
}

cdi-core/src/main/java/com/linkedin/cdi/converter/InFlowValidationConverter.java

Lines changed: 61 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010
import com.google.gson.JsonObject;
1111
import com.linkedin.cdi.configuration.MultistageProperties;
1212
import com.linkedin.cdi.configuration.StaticConstants;
13+
import com.linkedin.cdi.util.JsonUtils;
1314
import java.text.DecimalFormat;
1415
import java.util.ArrayList;
1516
import java.util.List;
17+
import java.util.concurrent.atomic.AtomicInteger;
1618
import lombok.extern.slf4j.Slf4j;
1719
import org.apache.avro.Schema;
1820
import org.apache.avro.generic.GenericData;
@@ -29,20 +31,36 @@
2931

3032
/**
3133
* This converter does basic count validation based on the Failure Records or Success Records criteria.
34+
*
35+
* To use this converter for validation, the main source should be the dataset to be validated,
36+
* and the secondary input should be the base dataset to validate against.
37+
*
38+
* The base dataset can be in a nested column of the secondary input, i.e. a field, which can be
39+
* retrieved through a JSON path, contains the actual base records.
40+
*
41+
* Currently following rules are defined:
42+
*
43+
* fail (upper bound rule): the source should be failed records
44+
* Job succeeds when the row count in validation set / row count in base set < threshold
45+
* Job fails when the row count in validation set / row count in base set >= threshold
46+
*
47+
* success (lower bound rule): the source should be succeeded records
48+
* Job succeeds when the row count in validation set / row count in base set >= threshold
49+
* Job fails when the row count in validation set / row count in base set < threshold
3250
*/
3351
@Slf4j
3452
public class InFlowValidationConverter extends Converter<Schema, Schema, GenericRecord, GenericRecord> {
3553
int expectedRecordsCount;
3654
int actualRecordsCount;
3755
private String field;
38-
private int failurePercentage;
56+
private int threshold;
3957
private String criteria;
4058
private String errorColumn;
4159

4260
@Override
4361
public Converter<Schema, Schema, GenericRecord, GenericRecord> init(WorkUnitState workUnitState) {
4462
//Load the input to memory
45-
getPayloads(workUnitState);
63+
expectedRecordsCount = getBaseRowCount(workUnitState);
4664
fillValidationAttributes(workUnitState);
4765
return super.init(workUnitState);
4866
}
@@ -74,20 +92,15 @@ private void verifyAndUpdateCount(GenericRecord inputRecord) {
7492
actualRecordsCount += ((GenericData.Array<?>) inputRecord.get(fieldList.get(0).name())).size();
7593
}
7694
} else {
77-
if (errorColumn != null) {
78-
updateFailureCount(inputRecord);
79-
} else {
80-
throw new RuntimeException("Invalid ms.data.field/ms.validation.attributes configuration. "
81-
+ "InputRecord should be of type Array or should have errorColumn");
82-
}
95+
actualRecordsCount += (errorColumn == null || inputRecord.get(errorColumn) != null ? 1 : 0);
8396
}
8497
}
8598

8699
private void fillValidationAttributes(WorkUnitState workUnitState) {
87100
JsonObject validationAttributes =
88101
MultistageProperties.MSTAGE_VALIDATION_ATTRIBUTES.getValidNonblankWithDefault(workUnitState);
89102
if (validationAttributes.has(KEY_WORD_THRESHOLD)) {
90-
failurePercentage = validationAttributes.get(KEY_WORD_THRESHOLD).getAsInt();
103+
threshold = validationAttributes.get(KEY_WORD_THRESHOLD).getAsInt();
91104
}
92105
if (validationAttributes.has(KEY_WORD_CRITERIA)) {
93106
criteria = validationAttributes.get(KEY_WORD_CRITERIA).getAsString();
@@ -102,34 +115,41 @@ private void fillValidationAttributes(WorkUnitState workUnitState) {
102115
* If field is configured in the secondary input and field column
103116
* is of type array expected record count with array size
104117
* else use all the input records as expected size
118+
* @param workUnitState the work unit state object containing secondary input parameter
119+
* @return the expected row count
105120
*/
106-
private void getPayloads(WorkUnitState workUnitState) {
107-
JsonArray payloads = MultistageProperties.MSTAGE_SECONDARY_INPUT.getValidNonblankWithDefault(workUnitState);
108-
JsonArray records = new JsonArray();
109-
List<String> fields = new ArrayList<>();
121+
private int getBaseRowCount(WorkUnitState workUnitState) {
122+
JsonArray payloads = JsonUtils.filter(KEY_WORD_CATEGORY, KEY_WORD_PAYLOAD,
123+
MultistageProperties.MSTAGE_SECONDARY_INPUT.getValidNonblankWithDefault(workUnitState));
124+
125+
// by default, we expect 1 record
126+
if (payloads.size() == 0) {
127+
return 1;
128+
}
129+
130+
// secondary input can have multiple payload entries, and each can configure a "fields" element
131+
// but for validation purpose, only the first payload entry, and the first field is used.
132+
JsonElement fields = JsonUtils.get(KEY_WORD_FIELDS, payloads.get(0).getAsJsonObject());
133+
field = StringUtils.EMPTY;
134+
if (fields.isJsonArray() && fields.getAsJsonArray().size() > 0) {
135+
field = fields.getAsJsonArray().get(0).getAsString();
136+
}
137+
138+
AtomicInteger rowCount = new AtomicInteger();
110139
for (JsonElement entry : payloads) {
111-
if (!entry.isJsonObject()) {
112-
log.error("Elements within secondary input should be valid JsonObjects, provided: {}", entry.toString());
113-
}
114140
JsonObject entryJson = entry.getAsJsonObject();
141+
JsonArray records = new JsonArray();
115142
records.addAll(new HdfsReader(workUnitState).readSecondary(entryJson));
116-
if (entryJson.has(StaticConstants.KEY_WORD_FIELDS)) {
117-
if (entryJson.get(StaticConstants.KEY_WORD_FIELDS).isJsonArray()) {
118-
entryJson.get(StaticConstants.KEY_WORD_FIELDS)
119-
.getAsJsonArray()
120-
.forEach(arrayItem -> fields.add(arrayItem.getAsString()));
121-
}
122-
field = fields.size() >= 1 ? fields.get(0) : StringUtils.EMPTY;
123-
}
143+
124144
// No of expected records
125-
if (records.size() > 0 && StringUtils.isNotBlank(field) && (records.get(0)
126-
.getAsJsonObject()
127-
.get(field) instanceof JsonArray)) {
128-
records.forEach(record -> expectedRecordsCount += record.getAsJsonObject().get(field).getAsJsonArray().size());
129-
} else if (records.size() > 0) {
130-
expectedRecordsCount = records.size();
145+
if (records.size() > 0
146+
&& StringUtils.isNotBlank(field)
147+
&& (records.get(0).getAsJsonObject().get(field) instanceof JsonArray)) {
148+
records.forEach(record -> rowCount.addAndGet(record.getAsJsonObject().get(field).getAsJsonArray().size()));
149+
} else {
150+
rowCount.addAndGet(records.size());
131151
}
132-
}
152+
}
return rowCount.get();
133153
}
134154

135155
private void updateFailureCount(GenericRecord record) {
@@ -144,21 +164,20 @@ private void updateFailureCount(GenericRecord record) {
144164
private void validateRule() {
145165
// check the threshold and throw new Runtime Exception
146166
float actualPercentage = ((float) actualRecordsCount / expectedRecordsCount) * 100;
147-
boolean failJob = false;
148-
// validate rules based on type of records
149-
if (criteria.equalsIgnoreCase(KEY_WORD_FAIL)) {
150-
failJob = actualPercentage > failurePercentage;
151-
} else if (criteria.equalsIgnoreCase(KEY_WORD_SUCCESS)) {
152-
failJob = (100 - actualPercentage) > failurePercentage;
153-
}
154-
log.info("Total expectedRecords: {} , failedRecords: {}", expectedRecordsCount, actualRecordsCount);
167+
log.info("base row count: {}, actual row count: {}", expectedRecordsCount, actualRecordsCount);
168+
169+
boolean failJob = criteria.equalsIgnoreCase(KEY_WORD_FAIL) && actualPercentage >= threshold
170+
|| criteria.equalsIgnoreCase(KEY_WORD_SUCCESS) && actualPercentage < threshold;
155171

156172
if (failJob) {
157173
// Fail the validation by throwing runtime exception
158-
throw new RuntimeException("Failure Threshold exceeds more than " + failurePercentage + "%");
174+
throw new RuntimeException("Failure Threshold exceeds more than " + threshold + "%");
159175
} else {
160-
log.info("Validation passed with failure rate {}% less than {}%",
161-
new DecimalFormat("##.##").format(actualPercentage), failurePercentage);
176+
log.info("Validation passed with {} rate {}% {} {}%",
177+
criteria.equalsIgnoreCase(KEY_WORD_FAIL) ? "failure" : "success",
178+
new DecimalFormat("##.##").format(actualPercentage),
179+
criteria.equalsIgnoreCase(KEY_WORD_FAIL) ? "less than" : "greater than or equal",
180+
threshold);
162181
}
163182
}
164183
}

cdi-core/src/main/java/com/linkedin/cdi/converter/JsonNormalizerConverter.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
package com.linkedin.cdi.converter;
66

7+
import com.google.common.base.Preconditions;
78
import com.google.gson.JsonArray;
89
import com.google.gson.JsonElement;
910
import com.google.gson.JsonObject;
@@ -64,7 +65,8 @@ public JsonArray convertSchema(JsonArray inputSchema, WorkUnitState workUnit) {
6465
normalizedField = columnName;
6566
}
6667
}
67-
assert normalizedField != null;
68+
69+
Preconditions.checkNotNull(normalizedField, "Normalized field is NULL.");
6870
JsonObject dataType = JsonUtils.get(KEY_WORD_COLUMN_NAME,
6971
normalizedField, KEY_WORD_DATA_TYPE, targetSchema).getAsJsonObject();
7072
String trueType = JsonUtils.get(KEY_WORD_TYPE, dataType).getAsString();

cdi-core/src/main/java/com/linkedin/cdi/extractor/AvroExtractor.java

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@
3434
import org.apache.gobblin.util.AvroUtils;
3535
import org.testng.Assert;
3636

37+
import static com.linkedin.cdi.configuration.StaticConstants.*;
3738
import static org.apache.avro.Schema.Type.*;
3839

39-
4040
/**
4141
* AvroExtractor reads Avro formatted files from HDFS locations.
4242
*
@@ -129,6 +129,8 @@ protected void setRowFilter(JsonArray schemaArray) {
129129
@Nullable
130130
@Override
131131
public GenericRecord readRecord(GenericRecord reuse) {
132+
super.readRecord(reuse);
133+
132134
if (avroExtractorKeys.getAvroRecordIterator() == null
133135
&& !processInputStream(0)) {
134136
return null;
@@ -168,6 +170,11 @@ protected boolean processInputStream(long starting) {
168170
return false;
169171
}
170172

173+
// returning false to end the work unit if the buffer is null
174+
if (workUnitStatus.getBuffer() == null) {
175+
return false;
176+
}
177+
171178
DataFileStream<GenericRecord> avroRecordIterator;
172179
try {
173180
avroRecordIterator = new DataFileStream<>(workUnitStatus.getBuffer(),
@@ -193,12 +200,10 @@ protected boolean processInputStream(long starting) {
193200
}
194201

195202
// return false to stop the job under these situations
196-
if (workUnitStatus.getBuffer() == null
197-
|| avroExtractorKeys.getAvroRecordIterator() == null) {
203+
if (avroExtractorKeys.getAvroRecordIterator() == null) {
198204
return false;
199205
}
200206
avroExtractorKeys.incrCurrentPageNumber();
201-
202207
avroExtractorKeys.logDebugAll(state.getWorkunit());
203208
workUnitStatus.logDebugAll();
204209
extractorKeys.logDebugAll(state.getWorkunit());
@@ -264,22 +269,22 @@ private Schema addDerivedFieldsToSchema(Schema schema) {
264269
List<Schema.Field> fields = AvroUtils.deepCopySchemaFields(schema);
265270
for (Map.Entry<String, Map<String, String>> derivedField: derivedFields) {
266271
String name = derivedField.getKey();
267-
String type = derivedField.getValue().get("type");
272+
String type = derivedField.getValue().get(KEY_WORD_TYPE);
268273
switch (type) {
269-
case "epoc":
274+
case KEY_WORD_EPOC:
270275
fields.add(new Schema.Field(name, Schema.create(LONG), name, null));
271276
break;
272-
case "string":
273-
case "regexp":
277+
case KEY_WORD_STRING:
278+
case KEY_WORD_REGEXP:
274279
fields.add(new Schema.Field(name, Schema.create(STRING), name, null));
275280
break;
276-
case "boolean":
281+
case KEY_WORD_BOOLEAN:
277282
fields.add(new Schema.Field(name, Schema.create(BOOLEAN), name, null));
278283
break;
279-
case "integer":
284+
case KEY_WORD_INTEGER:
280285
fields.add(new Schema.Field(name, Schema.create(INT), name, null));
281286
break;
282-
case "number":
287+
case KEY_WORD_NUMBER:
283288
fields.add(new Schema.Field(name, Schema.create(DOUBLE), name, null));
284289
break;
285290
default:

0 commit comments

Comments (0)