Skip to content

Commit 40c13bf

Browse files
authored
CDAP-19939 Fix handling of high precision datetime (#190)
* Fix handling of high precision datetime * Review fixes for handling of high precision datetime * CDAP-19939 - Minor refactoring * Review fixes for handling of high precision datetime
1 parent 7657efb commit 40c13bf

File tree

3 files changed

+145
-6
lines changed

3 files changed

+145
-6
lines changed

src/main/java/io/cdap/delta/bigquery/StructuredRecordToJson.java

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import io.cdap.cdap.api.common.Bytes;
2121
import io.cdap.cdap.api.data.format.StructuredRecord;
2222
import io.cdap.cdap.api.data.schema.Schema;
23+
import org.slf4j.Logger;
24+
import org.slf4j.LoggerFactory;
2325

2426
import java.io.IOException;
2527
import java.math.BigDecimal;
@@ -39,22 +41,35 @@
3941
import java.util.Objects;
4042
import java.util.Set;
4143
import java.util.concurrent.TimeUnit;
44+
import java.util.regex.Matcher;
45+
import java.util.regex.Pattern;
4246
import javax.annotation.Nullable;
4347

4448
/**
4549
* Util class to convert structured record into json.
4650
*/
4751
public final class StructuredRecordToJson {
52+
private static final Logger LOG = LoggerFactory.getLogger(StructuredRecordToJson.class);
53+
4854
private static final DateTimeFormatter DATETIME_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSS");
4955
private static final DateTimeFormatter TIME_FORMATTER = DateTimeFormatter.ofPattern("HH:mm:ss.SSSSSS");
5056
// array of arrays and map of arrays are not supported by big query
5157
private static final Set<Schema.Type> UNSUPPORTED_ARRAY_TYPES = ImmutableSet.of(Schema.Type.ARRAY, Schema.Type.MAP);
5258

59+
private static final int MAX_LOGICAL_DATE_TIME_FRACTION_PRECISION = 6;
60+
/* BigQuery format for DateTime: YYYY-[M]M-[D]D[( |T)[H]H:[M]M:[S]S[.F]]
61+
* [.F]: Up to six fractional digits (microsecond precision)
62+
*/
63+
private static final Pattern LOGICAL_DATE_PATTERN =
64+
Pattern.compile("\\d{4}-\\d{1,2}-\\d{1,2}([ T]\\d{1,2}:\\d{1,2}:\\d{1,2}(\\.(\\d+))?)?");
65+
private static final int TIME_FRACTION_GROUP = 3;
66+
5367
/**
5468
* Writes object and writes to json writer.
55-
* @param writer json writer to write the object to
56-
* @param name name of the field to be written
57-
* @param object object to be written
69+
*
70+
* @param writer json writer to write the object to
71+
* @param name name of the field to be written
72+
* @param object object to be written
5873
* @param fieldSchema field schema to be written
5974
*/
6075
public static void write(JsonWriter writer, String name, Object object, Schema fieldSchema) throws IOException {
@@ -143,8 +158,14 @@ private static void writeSimpleTypes(JsonWriter writer, String name, boolean isA
143158
writer.value(Objects.requireNonNull(getDecimal((byte[]) object, schema)).toPlainString());
144159
break;
145160
case DATETIME:
146-
//datetime should be already an ISO-8601 string
147-
writer.value(Objects.requireNonNull(object.toString()));
161+
String strValue = object.toString();
162+
// Datetime should be already an ISO-8601 string
163+
// But BigQuery format is stricter than ISO-8601 and does not support Zone and Offset
164+
// Hence it is more closer to DateTimeFormatter.ISO_LOCAL_DATE_TIME but with microsecond precision
165+
// Check if the value matches expected format for DateTime and trim time fraction to
166+
// MAX_TIME_FRACTION_PRECISION if it exceeds it
167+
strValue = checkAndTrimToMaxSupportedPrecision(strValue);
168+
writer.value(Objects.requireNonNull(strValue));
148169
break;
149170
default:
150171
throw new IllegalStateException(
@@ -185,6 +206,27 @@ private static void writeSimpleTypes(JsonWriter writer, String name, boolean isA
185206
}
186207
}
187208

209+
private static String checkAndTrimToMaxSupportedPrecision(String strValue) {
210+
Matcher matcher = LOGICAL_DATE_PATTERN.matcher(strValue);
211+
if (matcher.matches()) {
212+
String timeFraction = matcher.group(TIME_FRACTION_GROUP);
213+
//matcher.group returns null for a group if an optional group did not exist in the string
214+
if (timeFraction != null && timeFraction.length() > MAX_LOGICAL_DATE_TIME_FRACTION_PRECISION) {
215+
//Trim the time fraction to max supported precision
216+
String trimmedTimeFraction = timeFraction.substring(0, MAX_LOGICAL_DATE_TIME_FRACTION_PRECISION);
217+
strValue = new StringBuilder(strValue)
218+
.replace(matcher.start(TIME_FRACTION_GROUP), matcher.end(TIME_FRACTION_GROUP), trimmedTimeFraction)
219+
.toString();
220+
}
221+
} else {
222+
//Don't throw exception for now as we might be missing some scenario in the format
223+
//Let it fail during BigQuery insert in case of wrong format
224+
LOG.warn("Invalid value {} for DATETIME type, it should match the " +
225+
"format YYYY-[M]M-[D]D[( |T)[H]H:[M]M:[S]S[.F]]", strValue);
226+
}
227+
return strValue;
228+
}
229+
188230
private static void writeArray(JsonWriter writer,
189231
String name,
190232
@Nullable Object value,

src/test/java/io/cdap/delta/bigquery/BigQueryEventConsumerTest.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,10 @@
6363
import java.nio.charset.StandardCharsets;
6464
import java.time.Instant;
6565
import java.time.LocalDate;
66+
import java.time.LocalDateTime;
6667
import java.time.ZoneId;
6768
import java.time.ZonedDateTime;
69+
import java.time.temporal.ChronoUnit;
6870
import java.util.ArrayList;
6971
import java.util.Arrays;
7072
import java.util.Collections;
@@ -84,7 +86,7 @@
8486
* Tests for BigQueryEventConsumer. In order to run these tests, service account credentials must be set in the system
8587
* properties. The service account must have permission to create and write to BigQuery datasets and tables,
8688
* as well as permission to write to GCS.
87-
*
89+
* <p>
8890
* The tests create real resources in GCP and will cost some small amount of money for each run.
8991
*/
9092
public class BigQueryEventConsumerTest {
@@ -93,6 +95,7 @@ public class BigQueryEventConsumerTest {
9395
Schema.Field.of("id", Schema.of(Schema.Type.INT)),
9496
Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
9597
Schema.Field.of("created", Schema.of(Schema.LogicalType.TIMESTAMP_MILLIS)),
98+
Schema.Field.of("updated", Schema.nullableOf(Schema.of(Schema.LogicalType.DATETIME))),
9699
Schema.Field.of("bday", Schema.of(Schema.LogicalType.DATE)),
97100
Schema.Field.of("score", Schema.nullableOf(Schema.of(Schema.Type.DOUBLE))),
98101
Schema.Field.of("partition", Schema.nullableOf(Schema.of(Schema.Type.INT))));
@@ -794,6 +797,8 @@ private void insertTruncate(BigQueryEventConsumer eventConsumer, String dataset)
794797
.set("id", 0)
795798
.set("name", "alice")
796799
.setTimestamp("created", ZonedDateTime.ofInstant(Instant.EPOCH, ZoneId.of("UTC")))
800+
//check nano precision value is correctly handled, minusNanos is required otherwise zeroes in nanos are truncated
801+
.setDateTime("updated", LocalDateTime.ofInstant(Instant.now(), ZoneId.of("UTC")).minusNanos(1))
797802
.setDate("bday", LocalDate.ofEpochDay(0))
798803
.set("score", 0.0d)
799804
.build();
@@ -814,6 +819,8 @@ private void insertTruncate(BigQueryEventConsumer eventConsumer, String dataset)
814819
.set("id", 1)
815820
.set("name", "bob")
816821
.setTimestamp("created", ZonedDateTime.ofInstant(Instant.ofEpochSecond(86400), ZoneId.of("UTC")))
822+
.setDateTime("updated", LocalDateTime.ofInstant(Instant.now().truncatedTo(ChronoUnit.SECONDS),
823+
ZoneId.of("UTC")))
817824
.setDate("bday", LocalDate.ofEpochDay(1))
818825
.set("score", 1.0d)
819826
.build();
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
* Copyright © 2022 Cask Data, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
5+
* use this file except in compliance with the License. You may obtain a copy of
6+
* the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12+
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13+
* License for the specific language governing permissions and limitations under
14+
* the License.
15+
*/
16+
package io.cdap.delta.bigquery;
17+
18+
import com.google.gson.stream.JsonWriter;
19+
import io.cdap.cdap.api.data.schema.Schema;
20+
import org.junit.Test;
21+
import org.junit.runner.RunWith;
22+
import org.mockito.Mock;
23+
import org.mockito.Mockito;
24+
import org.mockito.junit.MockitoJUnitRunner;
25+
26+
import java.io.IOException;
27+
28+
@RunWith(MockitoJUnitRunner.class)
29+
public class StructuredRecordToJsonTest {
30+
31+
private static final String UPDATED_COLUMN = "updated";
32+
public static final Schema DATETIME_SCHEMA = Schema.of(Schema.LogicalType.DATETIME);
33+
34+
@Mock
35+
private JsonWriter jsonWriter;
36+
37+
@Test
38+
public void testWriteDateTimeNanoPrecision() throws IOException {
39+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-04-19 14:31:12.123456789", DATETIME_SCHEMA);
40+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-04-19 14:31:12.123456");
41+
}
42+
43+
@Test
44+
public void testWriteDateTimeNanoPrecisionAlternateFormat() throws IOException {
45+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-04-19T14:31:12.123456789", DATETIME_SCHEMA);
46+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-04-19T14:31:12.123456");
47+
}
48+
49+
@Test
50+
public void testWriteDateTimeHundredNanoPrecision() throws IOException {
51+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-04-19 14:31:12.1234567", DATETIME_SCHEMA);
52+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-04-19 14:31:12.123456");
53+
}
54+
55+
@Test
56+
public void testWriteDateTimeMicrosecondPrecision() throws IOException {
57+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-04-19 14:31:12.123456", DATETIME_SCHEMA);
58+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-04-19 14:31:12.123456");
59+
}
60+
61+
@Test
62+
public void testWriteDateTimeMillisPrecision() throws IOException {
63+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-04-19 14:31:12.123", DATETIME_SCHEMA);
64+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-04-19 14:31:12.123");
65+
}
66+
67+
@Test
68+
public void testWriteDateTimeSecondPrecision() throws IOException {
69+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-04-19 14:31:12", DATETIME_SCHEMA);
70+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-04-19 14:31:12");
71+
}
72+
73+
@Test
74+
public void testWriteDateTimeSecondPrecisionSingleDigit() throws IOException {
75+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-04-19 4:1:2", DATETIME_SCHEMA);
76+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-04-19 4:1:2");
77+
}
78+
79+
@Test
80+
public void testWriteDateTimeWithoutTime() throws IOException {
81+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-04-19", DATETIME_SCHEMA);
82+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-04-19");
83+
}
84+
85+
@Test
86+
public void testWriteDateTimeSingleDigitDate() throws IOException {
87+
StructuredRecordToJson.write(jsonWriter, UPDATED_COLUMN, "2022-4-1", DATETIME_SCHEMA);
88+
Mockito.verify(jsonWriter, Mockito.times(1)).value("2022-4-1");
89+
}
90+
}

0 commit comments

Comments
 (0)