From e1a528c83693616cb2d5b8b0d46c3287e1fec85d Mon Sep 17 00:00:00 2001 From: taidnguyen Date: Mon, 3 Jun 2024 10:11:15 -0400 Subject: [PATCH] add smartopen for gzipped jsonl files --- open_lm/datapreprocess/make_2048.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/open_lm/datapreprocess/make_2048.py b/open_lm/datapreprocess/make_2048.py index 045f60a3..8e025f15 100644 --- a/open_lm/datapreprocess/make_2048.py +++ b/open_lm/datapreprocess/make_2048.py @@ -13,7 +13,7 @@ import argparse from pathlib import Path from transformers import GPTNeoXTokenizerFast - +import smart_open # ======================================== # = Global variables = @@ -52,10 +52,17 @@ def upload_to_s3_and_remove(fname): @contextmanager def get_item_reader(file_name): - """Creates iterator for reading .jsonl files or Zstd compressed .jsonl files""" + """ + Creates iterator for reading .jsonl files, gzip compressed, + or Zstd compressed .jsonl files + """ if file_name.endswith(".jsonl"): with jsonlines.open(file_name) as reader: yield reader + elif file_name.endswith((".jsonl.gz", ".json.gz")): + with smart_open.open(file_name, "r") as f: + with jsonlines.Reader(f) as reader: + yield reader else: dctx = zstd.ZstdDecompressor() with open(file_name, "rb") as compressed_file: