Skip to content
This repository was archived by the owner on Jan 12, 2026. It is now read-only.

Commit ceef217

Browse files
committed
[BitSail#106][Connector] Migrate hadoop source connector to v1 interface & support more InputFormat.
1 parent ddb35af commit ceef217

File tree

23 files changed

+647
-190
lines changed

23 files changed

+647
-190
lines changed

bitsail-connectors/connector-hadoop/pom.xml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,26 @@
148148
<scope>provided</scope>
149149
</dependency>
150150

151+
<dependency>
152+
<groupId>com.bytedance.bitsail</groupId>
153+
<artifactId>bitsail-shaded-hive</artifactId>
154+
<version>${revision}</version>
155+
<exclusions>
156+
<exclusion>
157+
<groupId>org.apache.ant</groupId>
158+
<artifactId>ant</artifactId>
159+
</exclusion>
160+
<exclusion>
161+
<artifactId>log4j</artifactId>
162+
<groupId>log4j</groupId>
163+
</exclusion>
164+
<exclusion>
165+
<artifactId>commons-net</artifactId>
166+
<groupId>commons-net</groupId>
167+
</exclusion>
168+
</exclusions>
169+
</dependency>
170+
151171
<dependency>
152172
<groupId>com.bytedance.bitsail</groupId>
153173
<artifactId>bitsail-connector-test</artifactId>

bitsail-connectors/connector-hadoop/src/main/java/com/bytedance/bitsail/connector/hadoop/constant/HadoopConstants.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,4 @@
1818

1919
/**
 * Shared constants for the hadoop connector.
 */
public class HadoopConstants {
  // Connector name used for registration/lookup; final so it cannot be reassigned at runtime.
  public static final String HADOOP_CONNECTOR_NAME = "hadoop";
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/*
2+
* Copyright 2022 Bytedance Ltd. and/or its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.bytedance.bitsail.connector.hadoop.format;
18+
19+
import com.bytedance.bitsail.base.format.DeserializationSchema;
20+
import com.bytedance.bitsail.common.BitSailException;
21+
import com.bytedance.bitsail.common.configuration.BitSailConfiguration;
22+
import com.bytedance.bitsail.common.model.ColumnInfo;
23+
import com.bytedance.bitsail.common.row.Row;
24+
import com.bytedance.bitsail.common.typeinfo.TypeInfo;
25+
import com.bytedance.bitsail.connector.hadoop.error.HadoopErrorCode;
26+
import com.bytedance.bitsail.connector.hadoop.option.HadoopReaderOptions;
27+
28+
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
29+
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
30+
import org.apache.hadoop.hive.serde2.SerDeException;
31+
import org.apache.hadoop.hive.serde2.io.DateWritable;
32+
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
33+
import org.apache.hadoop.hive.serde2.io.ShortWritable;
34+
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
35+
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
36+
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
37+
import org.apache.hadoop.io.BooleanWritable;
38+
import org.apache.hadoop.io.ByteWritable;
39+
import org.apache.hadoop.io.BytesWritable;
40+
import org.apache.hadoop.io.DoubleWritable;
41+
import org.apache.hadoop.io.FloatWritable;
42+
import org.apache.hadoop.io.IntWritable;
43+
import org.apache.hadoop.io.LongWritable;
44+
import org.apache.hadoop.io.Text;
45+
import org.apache.hadoop.io.Writable;
46+
import org.apache.hadoop.mapred.JobConf;
47+
48+
import java.util.List;
49+
import java.util.Properties;
50+
import java.util.stream.Collectors;
51+
52+
public class HiveInputFormatDeserializationSchema implements DeserializationSchema<Writable, Row> {
53+
private final BitSailConfiguration deserializationConfiguration;
54+
private final TypeInfo<?>[] typeInfos;
55+
private final String[] fieldNames;
56+
private final StructObjectInspector inspector;
57+
public HiveInputFormatDeserializationSchema(BitSailConfiguration deserializationConfiguration,
58+
TypeInfo<?>[] typeInfos,
59+
String[] fieldNames) {
60+
61+
this.deserializationConfiguration = deserializationConfiguration;
62+
this.typeInfos = typeInfos;
63+
this.fieldNames = fieldNames;
64+
65+
List<ColumnInfo> columnInfos = deserializationConfiguration.get(HadoopReaderOptions.COLUMNS);
66+
Properties p = new Properties();
67+
String columns = columnInfos.stream().map(ColumnInfo::getName).collect(Collectors.joining(","));
68+
String columnsTypes = columnInfos.stream().map(ColumnInfo::getType).collect(Collectors.joining(":"));
69+
p.setProperty("columns", columns);
70+
p.setProperty("columns.types", columnsTypes);
71+
String inputFormatClass = deserializationConfiguration.get(HadoopReaderOptions.HADOOP_INPUT_FORMAT_CLASS);
72+
try {
73+
switch (inputFormatClass) {
74+
case "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat": {
75+
OrcSerde serde = new OrcSerde();
76+
serde.initialize(new JobConf(), p);
77+
this.inspector = (StructObjectInspector) serde.getObjectInspector();
78+
break;
79+
}
80+
case "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat": {
81+
ParquetHiveSerDe serde = new ParquetHiveSerDe();
82+
serde.initialize(new JobConf(), p);
83+
this.inspector = (StructObjectInspector) serde.getObjectInspector();
84+
break;
85+
}
86+
default:
87+
throw BitSailException.asBitSailException(HadoopErrorCode.UNSUPPORTED_ENCODING, "unsupported input format class: " + inputFormatClass);
88+
}
89+
} catch (SerDeException e) {
90+
throw BitSailException.asBitSailException(HadoopErrorCode.UNSUPPORTED_COLUMN_TYPE, "unsupported column information.");
91+
}
92+
}
93+
94+
@Override
95+
public Row deserialize(Writable message) {
96+
int arity = fieldNames.length;
97+
List<? extends StructField> fields = inspector.getAllStructFieldRefs();
98+
Row row = new Row(arity);
99+
for (int i = 0; i < arity; ++i) {
100+
Object writableData = inspector.getStructFieldData(message, fields.get(i));
101+
row.setField(i, getWritableValue(writableData));
102+
}
103+
return row;
104+
}
105+
106+
@Override
107+
public boolean isEndOfStream(Row nextElement) {
108+
return false;
109+
}
110+
111+
private Object getWritableValue(Object writable) {
112+
Object ret;
113+
114+
if (writable == null) {
115+
ret = null;
116+
} else if (writable instanceof IntWritable) {
117+
ret = ((IntWritable) writable).get();
118+
} else if (writable instanceof Text) {
119+
ret = writable.toString();
120+
} else if (writable instanceof LongWritable) {
121+
ret = ((LongWritable) writable).get();
122+
} else if (writable instanceof ByteWritable) {
123+
ret = ((ByteWritable) writable).get();
124+
} else if (writable instanceof DateWritable) {
125+
ret = ((DateWritable) writable).get();
126+
} else if (writable instanceof DoubleWritable) {
127+
ret = ((DoubleWritable) writable).get();
128+
} else if (writable instanceof TimestampWritable) {
129+
ret = ((TimestampWritable) writable).getTimestamp();
130+
} else if (writable instanceof FloatWritable) {
131+
ret = ((FloatWritable) writable).get();
132+
} else if (writable instanceof BooleanWritable) {
133+
ret = ((BooleanWritable) writable).get();
134+
} else if (writable instanceof BytesWritable) {
135+
BytesWritable bytesWritable = (BytesWritable) writable;
136+
byte[] bytes = bytesWritable.getBytes();
137+
ret = new byte[bytesWritable.getLength()];
138+
System.arraycopy(bytes, 0, ret, 0, bytesWritable.getLength());
139+
} else if (writable instanceof HiveDecimalWritable) {
140+
ret = ((HiveDecimalWritable) writable).getHiveDecimal().bigDecimalValue();
141+
} else if (writable instanceof ShortWritable) {
142+
ret = ((ShortWritable) writable).get();
143+
} else {
144+
ret = writable.toString();
145+
}
146+
return ret;
147+
}
148+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Copyright 2022 Bytedance Ltd. and/or its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.bytedance.bitsail.connector.hadoop.format;
18+
19+
import com.bytedance.bitsail.base.enumerate.ContentType;
20+
import com.bytedance.bitsail.base.format.DeserializationSchema;
21+
import com.bytedance.bitsail.common.BitSailException;
22+
import com.bytedance.bitsail.common.configuration.BitSailConfiguration;
23+
import com.bytedance.bitsail.common.row.Row;
24+
import com.bytedance.bitsail.common.typeinfo.TypeInfo;
25+
import com.bytedance.bitsail.component.format.csv.CsvDeserializationSchema;
26+
import com.bytedance.bitsail.component.format.json.JsonDeserializationSchema;
27+
import com.bytedance.bitsail.connector.hadoop.error.HadoopErrorCode;
28+
import com.bytedance.bitsail.connector.hadoop.option.HadoopReaderOptions;
29+
30+
import org.apache.hadoop.io.Writable;
31+
32+
public class TextInputFormatDeserializationSchema implements DeserializationSchema<Writable, Row> {
33+
private final BitSailConfiguration deserializationConfiguration;
34+
private final TypeInfo<?>[] typeInfos;
35+
private final String[] fieldNames;
36+
private final DeserializationSchema<byte[], Row> deserializationSchema;
37+
38+
public TextInputFormatDeserializationSchema(BitSailConfiguration deserializationConfiguration,
39+
TypeInfo<?>[] typeInfos,
40+
String[] fieldNames) {
41+
this.deserializationConfiguration = deserializationConfiguration;
42+
this.typeInfos = typeInfos;
43+
this.fieldNames = fieldNames;
44+
45+
ContentType contentType = ContentType.valueOf(
46+
deserializationConfiguration.getNecessaryOption(HadoopReaderOptions.CONTENT_TYPE, HadoopErrorCode.REQUIRED_VALUE).toUpperCase());
47+
switch (contentType) {
48+
case CSV:
49+
this.deserializationSchema =
50+
new CsvDeserializationSchema(deserializationConfiguration, typeInfos, fieldNames);
51+
break;
52+
case JSON:
53+
this.deserializationSchema =
54+
new JsonDeserializationSchema(deserializationConfiguration, typeInfos, fieldNames);
55+
break;
56+
default:
57+
throw BitSailException.asBitSailException(HadoopErrorCode.UNSUPPORTED_ENCODING, "unsupported parser type: " + contentType);
58+
}
59+
}
60+
61+
@Override
62+
public Row deserialize(Writable message) {
63+
return deserializationSchema.deserialize((message.toString()).getBytes());
64+
}
65+
66+
@Override
67+
public boolean isEndOfStream(Row nextElement) {
68+
return false;
69+
}
70+
}

bitsail-connectors/connector-hadoop/src/main/java/com/bytedance/bitsail/connector/hadoop/option/HadoopReaderOptions.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,10 @@
2525

2626
public interface HadoopReaderOptions extends ReaderOptions.BaseReaderOptions {
2727
@Essential
28-
ConfigOption<String> DEFAULT_FS =
29-
key(READER_PREFIX + "defaultFS")
30-
.noDefaultValue(String.class);
31-
@Essential
3228
ConfigOption<String> PATH_LIST =
3329
key(READER_PREFIX + "path_list")
3430
.noDefaultValue(String.class);
3531

36-
@Essential
3732
ConfigOption<String> CONTENT_TYPE =
3833
key(READER_PREFIX + "content_type")
3934
.noDefaultValue(String.class);
@@ -45,4 +40,8 @@ public interface HadoopReaderOptions extends ReaderOptions.BaseReaderOptions {
4540
ConfigOption<Integer> DEFAULT_HADOOP_PARALLELISM_THRESHOLD =
4641
key(READER_PREFIX + "default_hadoop_parallelism_threshold")
4742
.defaultValue(2);
48-
}
43+
44+
ConfigOption<String> HADOOP_INPUT_FORMAT_CLASS =
45+
key(READER_PREFIX + "hadoop_inputformat_class")
46+
.defaultValue("org.apache.hadoop.mapred.TextInputFormat");
47+
}

bitsail-connectors/connector-hadoop/src/main/java/com/bytedance/bitsail/connector/hadoop/source/HadoopSource.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
import java.util.Arrays;
4343
import java.util.List;
4444

45-
public class HadoopSource implements Source<Row, HadoopSourceSplit, EmptyState>, ParallelismComputable {
45+
public class HadoopSource<K, V> implements Source<Row, HadoopSourceSplit, EmptyState>, ParallelismComputable {
4646
private static final Logger LOG = LoggerFactory.getLogger(HadoopSource.class);
4747

4848
private BitSailConfiguration readerConfiguration;
@@ -61,12 +61,12 @@ public Boundedness getSourceBoundedness() {
6161

6262
@Override
6363
public SourceReader<Row, HadoopSourceSplit> createReader(SourceReader.Context readerContext) {
64-
return new HadoopSourceReader(readerConfiguration, readerContext);
64+
return new HadoopSourceReader<K, V>(readerConfiguration, readerContext, hadoopPathList);
6565
}
6666

6767
@Override
6868
public SourceSplitCoordinator<HadoopSourceSplit, EmptyState> createSplitCoordinator(SourceSplitCoordinator.Context<HadoopSourceSplit, EmptyState> coordinatorContext) {
69-
return new HadoopSourceSplitCoordinator(readerConfiguration, coordinatorContext, hadoopPathList);
69+
return new HadoopSourceSplitCoordinator<K, V>(readerConfiguration, coordinatorContext, hadoopPathList);
7070
}
7171

7272
@Override

bitsail-connectors/connector-hadoop/src/main/java/com/bytedance/bitsail/connector/hadoop/source/config/HadoopConf.java

Lines changed: 0 additions & 52 deletions
This file was deleted.

0 commit comments

Comments
 (0)