Skip to content

Commit 32d1e0e

Browse files
author
Yin Ye
committed
TODO
1 parent fcd7e92 commit 32d1e0e

File tree

1 file changed

+11
-24
lines changed

1 file changed

+11
-24
lines changed

tensorflow_io/core/kernels/arrow/arrow_dataset_ops.cc

+11-24
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,10 @@ namespace data {
3232

3333
namespace {
3434

35+
// TODO(yye): implement this func and support reading parquet data in
36+
// streaming way.
3537
Status OpenParquetFile(arrow::fs::FileSystem *fs,
36-
arrow::io::RandomAccessFile *file, )
38+
arrow::io::RandomAccessFile *file) {}
3739

3840
} // namespace
3941

@@ -616,55 +618,37 @@ class ArrowParquetDatasetOp : public ArrowOpKernelBase {
616618
: ArrowBaseIterator<Dataset>(params) {}
617619

618620
private:
621+
// TODO(yye): implementation of getting the first batch.
619622
Status SetupStreamsLocked(Env *env)
620623
TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) override {
621624
TF_RETURN_IF_ERROR(ReadFile(current_file_idx_));
622625

626+
// Open and read parquet file.
623627
while (record_batches_.empty() &&
624628
++current_file_idx_ < dataset()->file_paths_.size()) {
625629
TF_RETURN_IF_ERROR(ReadFile(current_file_idx_));
626630
}
627631

628-
if (current_batch_idx_ < record_batches_.size()) {
629-
current_batch_ = record_batches_[current_batch_idx_];
630-
}
631632
return OkStatus();
632633
}
633634

635+
// TODO(yye): implementation of getting the next batch.
634636
Status NextStreamLocked(Env *env)
635637
TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) override {
636-
ArrowBaseIterator<Dataset>::NextStreamLocked(env);
637-
if (++current_batch_idx_ < record_batches_.size()) {
638-
current_batch_ = record_batches_[current_batch_idx_];
639-
} else if (++current_file_idx_ < dataset()->file_paths_.size()) {
640-
current_batch_idx_ = 0;
641-
while (record_batches_.empty() &&
642-
++current_file_idx_ < dataset()->file_paths_.size()) {
643-
TF_RETURN_IF_ERROR(ReadFile(current_file_idx_));
644-
}
645-
646-
if (!record_batches_.empty()) {
647-
current_batch_ = record_batches_[current_batch_idx_];
648-
} else {
649-
current_batch_ = nullptr;
650-
}
651-
}
652638
return OkStatus();
653639
}
654640

655641
void ResetStreamsLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) override {
656642
ArrowBaseIterator<Dataset>::ResetStreamsLocked();
657643
current_file_idx_ = 0;
658-
current_batch_idx_ = 0;
659644
record_batches_.clear();
660645
}
661646

662647
Status ReadFile(int file_index) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
663648
Status res = OkStatus();
664649
do {
665650
std::shared_ptr<arrow::io::RandomAccessFile> file;
666-
res = ArrowUtil::OpenParquetFile(&fs_, &file,
667-
dataset()->file_paths_[file_index]);
651+
res = OpenParquetFile(&fs_, file.get());
668652
if (!res.ok()) {
669653
break;
670654
}
@@ -682,7 +666,7 @@ class ArrowParquetDatasetOp : public ArrowOpKernelBase {
682666
std::unique_ptr<parquet::arrow::FileReader> reader;
683667
builder->properties(properties)->Build(&reader);
684668

685-
if (column_indices_.empty() || !dataset()->same_header_) {
669+
if (column_indices_.empty()) {
686670
column_indices_.clear();
687671
std::shared_ptr<arrow::Schema> schema;
688672
reader->GetSchema(&schema);
@@ -741,9 +725,12 @@ class ArrowParquetDatasetOp : public ArrowOpKernelBase {
741725
}
742726

743727
size_t current_file_idx_ TF_GUARDED_BY(mu_) = 0;
728+
// TODO(yye): stop maintaining/holding all the record batches.
744729
std::vector<std::shared_ptr<arrow::RecordBatch>> record_batches_
745730
TF_GUARDED_BY(mu_);
746731
std::shared_ptr<arrow::fs::FileSystem> fs_ TF_GUARDED_BY(mu_) = nullptr;
732+
733+
// Maintains the index of the columns to read.
747734
std::vector<int> column_indices_ TF_GUARDED_BY(mu_);
748735
};
749736

0 commit comments

Comments
 (0)