feat: write dataset tar archives uncompressed (.tar instead of .tar.gz)

flavioschneider · flavioschneider · commit 1d2472192ab3 · 2022-10-11T18:12:06.000+02:00
diff --git a/audio_data_pytorch/datasets/audio_web_dataset.py b/audio_data_pytorch/datasets/audio_web_dataset.py
@@ -102,12 +102,12 @@ def str_to_tags(self, str: str) -> List[str]:
 
     async def preprocess(self):
         urls, path = self.urls, self.root
-        tarfile_name = os.path.join(path, f"{self.name}.tar.gz")
+        tarfile_name = os.path.join(path, f"{self.name}.tar")
         waveform_id = 0
 
         async with Downloader(urls, path=path) as files:
             async with Decompressor(files, path=path) as folders:
-                with tarfile.open(tarfile_name, "w:gz") as archive:
+                with tarfile.open(tarfile_name, "w") as archive:
                     for folder in tqdm(folders):
                         for wav in tqdm(glob.glob(folder + "/**/*.wav")):
                             waveform, rate = torchaudio.load(wav)
diff --git a/audio_data_pytorch/datasets/clotho_dataset.py b/audio_data_pytorch/datasets/clotho_dataset.py
@@ -45,7 +45,7 @@ def data_path(self) -> str:
 
     @property
     def tar_file_name(self) -> str:
-        return os.path.join(self.data_path, f"clotho_{self.split}.tar.gz")
+        return os.path.join(self.data_path, f"clotho_{self.split}.tar")
 
     async def preprocess(self):
         urls, path = self.urls, self.data_path
@@ -58,7 +58,7 @@ async def preprocess(self):
                 captions = pd.read_csv(caption_csv_file)
                 length = len(captions.index)
 
-                with tarfile.open(self.tar_file_name, "w:gz") as archive:
+                with tarfile.open(self.tar_file_name, "w") as archive:
                     for i, caption in tqdm(captions.iterrows(), total=length):
                         wav_file_name = caption.file_name
                         wav_path = os.path.join(folders[0], self.split, wav_file_name)
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="audio-data-pytorch",
     packages=find_packages(exclude=[]),
-    version="0.0.15",
+    version="0.0.16",
     license="MIT",
     description="Audio Data - PyTorch",
     long_description_content_type="text/markdown",