[BUG] Fix data loader (#2810)

TonyBagnall · web-flow · commit 06d8e649bea1 · 2025-05-26T10:58:33.000+01:00
* Update _data_loaders.py

* Update _data_loaders.py

* stop deleting pre-existing extract directories when unzipping fails
diff --git a/aeon/datasets/_data_loaders.py b/aeon/datasets/_data_loaders.py
@@ -468,15 +468,17 @@ def _download_and_extract(url, extract_path=None):
         extract_path = os.path.join(extract_path, "%s/" % file_name.split(".")[0])
 
     try:
-        if not os.path.exists(extract_path):
+        already_exists = os.path.exists(extract_path)
+        if not already_exists:
             os.makedirs(extract_path)
         zipfile.ZipFile(zip_file_name, "r").extractall(extract_path)
         shutil.rmtree(dl_dir)
         return extract_path
     except zipfile.BadZipFile:
         shutil.rmtree(dl_dir)
-        if os.path.exists(extract_path):
-            shutil.rmtree(extract_path)
+        if not already_exists:
+            if os.path.exists(extract_path):
+                shutil.rmtree(extract_path)
         raise zipfile.BadZipFile(
             "Could not unzip dataset. Please make sure the URL is valid."
         )
@@ -546,7 +548,7 @@ def _load_tsc_dataset(
             except zipfile.BadZipFile as e:
                 raise ValueError(
                     f"Invalid dataset name ={name} is not available on extract path ="
-                    f"{extract_path}. Nor is it available on {url}",
+                    f"{extract_path} nor is it available on {url}",
                 ) from e
 
     return _load_saved_dataset(
@@ -1342,7 +1344,7 @@ def load_classification(
             try_zenodo = False
             error_str = (
                 f"Invalid dataset name ={name} that is not available on extract path "
-                f"={extract_path}. Nor is it available on "
+                f"={extract_path} nor is it available on "
                 f"https://timeseriesclassification.com/ or zenodo."
             )
             try:
diff --git a/aeon/datasets/tests/test_data_loaders.py b/aeon/datasets/tests/test_data_loaders.py
@@ -6,6 +6,7 @@
 import shutil
 import tempfile
 from urllib.error import URLError
+from zipfile import BadZipFile
 
 import numpy as np
 import pandas as pd
@@ -24,6 +25,7 @@
 from aeon.datasets._data_loaders import (
     CONNECTION_ERRORS,
     _alias_datatype_check,
+    _download_and_extract,
     _get_channel_strings,
     _load_data,
     _load_header_info,
@@ -551,3 +553,21 @@ def test_load_tsc_dataset():
         assert isinstance(X, np.ndarray) and isinstance(y, np.ndarray)
         with pytest.raises(ValueError, match="Invalid dataset name"):
             _load_tsc_dataset("FOO", split="TEST", extract_path=tmp)
+
+
+@pytest.mark.skipif(
+    PR_TESTING,
+    reason="Only run on overnights because of intermittent fail for read/write",
+)
+@pytest.mark.xfail(raises=(URLError, TimeoutError, ConnectionError))
+def test_download_and_extract():
+    """Test that the function does not delete a directory if already present."""
+    name = "Foo"
+    with tempfile.TemporaryDirectory() as tmp:
+        extract_path = os.path.join(tmp, name)
+        os.makedirs(extract_path)
+        url = "https://timeseriesclassification.com/aeon-toolkit/%s.zip" % name
+        try:
+            _download_and_extract(url, extract_path=extract_path)
+        except BadZipFile:
+            assert os.path.exists(extract_path)