UC Merced: redistribute split files on Hugging Face (#2433)

adamjstewart · web-flow · commit 9f2cc0243b8a · 2024-12-03T21:28:40.000+01:00
diff --git a/tests/datasets/test_ucmerced.py b/tests/datasets/test_ucmerced.py
@@ -21,34 +21,11 @@ class TestUCMerced:
     def dataset(
         self, monkeypatch: MonkeyPatch, tmp_path: Path, request: SubRequest
     ) -> UCMerced:
-        md5 = 'a42ef8779469d196d8f2971ee135f030'
-        monkeypatch.setattr(UCMerced, 'md5', md5)
-        url = os.path.join('tests', 'data', 'ucmerced', 'UCMerced_LandUse.zip')
+        url = os.path.join('tests', 'data', 'ucmerced') + os.sep
         monkeypatch.setattr(UCMerced, 'url', url)
-        monkeypatch.setattr(
-            UCMerced,
-            'split_urls',
-            {
-                'train': os.path.join(
-                    'tests', 'data', 'ucmerced', 'uc_merced-train.txt'
-                ),
-                'val': os.path.join('tests', 'data', 'ucmerced', 'uc_merced-val.txt'),
-                'test': os.path.join('tests', 'data', 'ucmerced', 'uc_merced-test.txt'),
-            },
-        )
-        monkeypatch.setattr(
-            UCMerced,
-            'split_md5s',
-            {
-                'train': 'a01fa9f13333bb176fc1bfe26ff4c711',
-                'val': 'a01fa9f13333bb176fc1bfe26ff4c711',
-                'test': 'a01fa9f13333bb176fc1bfe26ff4c711',
-            },
-        )
-        root = tmp_path
         split = request.param
         transforms = nn.Identity()
-        return UCMerced(root, split, transforms, download=True, checksum=True)
+        return UCMerced(tmp_path, split, transforms, download=True)
 
     def test_getitem(self, dataset: UCMerced) -> None:
         x = dataset[0]
@@ -65,14 +42,14 @@ def test_add(self, dataset: UCMerced) -> None:
         assert len(ds) == 8
 
     def test_already_downloaded(self, dataset: UCMerced, tmp_path: Path) -> None:
-        UCMerced(root=tmp_path, download=True)
+        UCMerced(tmp_path)
 
     def test_already_downloaded_not_extracted(
         self, dataset: UCMerced, tmp_path: Path
     ) -> None:
         shutil.rmtree(dataset.root)
-        shutil.copy(dataset.url, tmp_path)
-        UCMerced(root=tmp_path, download=False)
+        shutil.copy(dataset.url + dataset.filename, tmp_path)
+        UCMerced(tmp_path)
 
     def test_not_downloaded(self, tmp_path: Path) -> None:
         with pytest.raises(DatasetNotFoundError, match='Dataset not found'):
diff --git a/torchgeo/datasets/ucmerced.py b/torchgeo/datasets/ucmerced.py
@@ -66,17 +66,17 @@ class UCMerced(NonGeoClassificationDataset):
     * https://dl.acm.org/doi/10.1145/1869790.1869829
     """
 
-    url = 'https://hf.co/datasets/torchgeo/ucmerced/resolve/d0af6e2eeea2322af86078068bd83337148a2149/UCMerced_LandUse.zip'
+    url = 'https://hf.co/datasets/torchgeo/ucmerced/resolve/7c5ef3454d9b1cccfa7ccde0c01fc8f00a45909a/'
     filename = 'UCMerced_LandUse.zip'
     md5 = '5b7ec56793786b6dc8a908e8854ac0e4'
 
     base_dir = os.path.join('UCMerced_LandUse', 'Images')
 
     splits = ('train', 'val', 'test')
-    split_urls: ClassVar[dict[str, str]] = {
-        'train': 'https://storage.googleapis.com/remote_sensing_representations/uc_merced-train.txt',
-        'val': 'https://storage.googleapis.com/remote_sensing_representations/uc_merced-val.txt',
-        'test': 'https://storage.googleapis.com/remote_sensing_representations/uc_merced-test.txt',
+    split_filenames: ClassVar[dict[str, str]] = {
+        'train': 'uc_merced-train.txt',
+        'val': 'uc_merced-val.txt',
+        'test': 'uc_merced-test.txt',
     }
     split_md5s: ClassVar[dict[str, str]] = {
         'train': 'f2fb12eb2210cfb53f93f063a35ff374',
@@ -113,7 +113,7 @@ def __init__(
         self._verify()
 
         valid_fns = set()
-        with open(os.path.join(self.root, f'uc_merced-{split}.txt')) as f:
+        with open(os.path.join(self.root, self.split_filenames[split])) as f:
             for fn in f:
                 valid_fns.add(fn.strip())
 
@@ -173,16 +173,12 @@ def _verify(self) -> None:
     def _download(self) -> None:
         """Download the dataset."""
         download_url(
-            self.url,
-            self.root,
-            filename=self.filename,
-            md5=self.md5 if self.checksum else None,
+            self.url + self.filename, self.root, md5=self.md5 if self.checksum else None
         )
         for split in self.splits:
             download_url(
-                self.split_urls[split],
+                self.url + self.split_filenames[split],
                 self.root,
-                filename=f'uc_merced-{split}.txt',
                 md5=self.split_md5s[split] if self.checksum else None,
             )