Issue #747: add chunked downloading (file size determined with a `bytes=0-0` range GET, falling back on HEAD)

dsamaey · dsamaey · commit 241a01935390 · 2025-04-07T16:32:03.000+02:00
diff --git a/openeo/rest/job.py b/openeo/rest/job.py
@@ -3,12 +3,11 @@
 import datetime
 import json
 import logging
+import re
 import time
 import typing
-import urllib.error
 from pathlib import Path
 from typing import Dict, List, Optional, Union
-from urllib.error import HTTPError
 
 import requests
 import shutil
@@ -35,8 +34,6 @@
 from openeo.rest.models.logs import LogEntry, log_level_name, normalize_log_level
 from openeo.util import ensure_dir
 
-MAX_RETRIES_DOWNLOAD = 3
-
 if typing.TYPE_CHECKING:
     # Imports for type checking only (circular import issue at runtime).
     from openeo.rest.connection import Connection
@@ -45,7 +42,8 @@
 
 
 DEFAULT_JOB_RESULTS_FILENAME = "job-results.json"
-
+MAX_RETRIES_PER_CHUNK = 3  # maximum number of requests per chunk, the first attempt included
+RETRIABLE_STATUSCODES = [408, 429, 500, 501, 502, 503, 504]  # HTTP status codes after which a chunk request is retried
 
 class BatchJob:
     """
@@ -407,40 +405,9 @@ def download(
             target = target / self.name
         ensure_dir(target.parent)
         logger.info("Downloading Job result asset {n!r} from {h!s} to {t!s}".format(n=self.name, h=self.href, t=target))
-        self._download_chunked(target, chunk_size)
+        _download_chunked(self.href, target, chunk_size)
         return target
 
-    def _download_chunked(self, target: Path, chunk_size: int):
-        file_size = None
-        try:
-            head = requests.head(self.href, stream=True)
-            if head.ok:
-                file_size = int(head.headers['Content-Length'])
-            else:
-                head.raise_for_status()
-            with target.open('wb') as f:
-                for from_byte_index in range(0, file_size, chunk_size):
-                    to_byte_index = min(from_byte_index + chunk_size - 1, file_size - 1)
-                    tries_left = MAX_RETRIES_DOWNLOAD
-                    while tries_left > 0:
-                        try:
-                            range_headers = {"Range": f"bytes={from_byte_index}-{to_byte_index}"}
-                            with requests.get(self.href, headers=range_headers, stream=True) as r:
-                                if r.ok:
-                                    shutil.copyfileobj(r.raw, f)
-                                    break
-                                else:
-                                    r.raise_for_status()
-                        except requests.exceptions.HTTPError as error:
-                            tries_left -= 1
-                            if tries_left < 1:
-                                raise error
-                            else:
-                                logger.warning(f"Failed to retrieve chunk {from_byte_index}-{to_byte_index} from {self.href} (status {error.response.status_code}) - retrying")
-                                continue
-        except requests.exceptions.HTTPError as http_error:
-            raise OpenEoApiPlainError(message=f"Failed to download {self.href}", http_status_code=http_error.response.status_code, error_message=http_error.response.text)
-
     def _get_response(self, stream=True) -> requests.Response:
         return self.job.connection.get(self.href, stream=stream)
 
@@ -457,6 +424,51 @@ def load_bytes(self) -> bytes:
     # TODO: more `load` methods e.g.: load GTiff asset directly as numpy array
 
 
+def _download_chunked(url: str, target: Path, chunk_size: int):
+    """Download `url` to `target` in ranges of `chunk_size` bytes; HTTP failures raise `OpenEoApiPlainError`."""
+    try:
+        file_size = _determine_content_length(url)
+        with target.open('wb') as f:
+            for from_byte_index in range(0, file_size, chunk_size):
+                to_byte_index = min(from_byte_index + chunk_size - 1, file_size - 1)
+                range_headers = {"Range": f"bytes={from_byte_index}-{to_byte_index}"}
+                # `tries_left` counts the attempts that remain after the current one.
+                for tries_left in range(MAX_RETRIES_PER_CHUNK - 1, -1, -1):
+                    with requests.get(url, headers=range_headers, stream=True) as r:
+                        if r.status_code == 200:
+                            # `Range` was ignored and the body is the entire file: store it once and stop.
+                            f.seek(0)
+                            f.truncate()
+                            shutil.copyfileobj(r.raw, f)
+                            return
+                        if r.ok:
+                            shutil.copyfileobj(r.raw, f)
+                            break
+                        if tries_left == 0 or r.status_code not in RETRIABLE_STATUSCODES:
+                            r.raise_for_status()
+                    logger.warning(f"Failed to retrieve chunk {from_byte_index}-{to_byte_index} from {url} (status {r.status_code}) - retrying")
+    except requests.exceptions.HTTPError as http_error:
+        raise OpenEoApiPlainError(message=f"Failed to download {url}", http_status_code=http_error.response.status_code, error_message=http_error.response.text)
+
+
+def _determine_content_length(url: str) -> int:
+    """Return the total size in bytes of the resource at `url`."""
+    # Request only the first byte: a 206 answer reports the total size after the "/" of `Content-Range`.
+    # `stream=True` avoids reading the whole body when the server ignores `Range` and answers 200.
+    with requests.get(url, headers={"Range": "bytes=0-0"}, stream=True) as range_0_0_response:
+        if range_0_0_response.status_code == 206:
+            # The header can be absent, or end in "/*" when the total size is unknown to the server.
+            content_range_header = range_0_0_response.headers.get("Content-Range", "")
+            match = re.match(r"^bytes \d+-\d+/(\d+)$", content_range_header)
+            if match:
+                return int(match.group(1))
+    # No usable `Content-Range`: fall back on the `Content-Length` of a HEAD request.
+    head = requests.head(url, stream=True)
+    # Raises `requests.exceptions.HTTPError` when the HEAD request got a 4xx/5xx answer.
+    head.raise_for_status()
+    return int(head.headers['Content-Length'])
+
+
 class MultipleAssetException(OpenEoClientException):
     pass
 
diff --git a/tests/rest/test_job.py b/tests/rest/test_job.py
@@ -543,7 +543,7 @@ def job_with_1_asset(con100, requests_mock, tmp_path) -> BatchJob:
     return job
 
 @pytest.fixture
-def job_with_chunked_asset(con100, requests_mock, tmp_path) -> BatchJob:
+def job_with_chunked_asset_using_head(con100, requests_mock, tmp_path) -> BatchJob:
     requests_mock.get(API_URL + "/jobs/jj1/results", json={"assets": {
         "1.tiff": {"href": API_URL + "/dl/jjr1.tiff", "type": "image/tiff; application=geotiff"},
     }})
@@ -554,8 +554,29 @@ def job_with_chunked_asset(con100, requests_mock, tmp_path) -> BatchJob:
         from_bytes = r
         to_bytes = min(r + chunk_size, len(TIFF_CONTENT)) - 1
         # fail the 1st time, serve the content chunk the 2nd time
+        requests_mock.get(API_URL + "/dl/jjr1.tiff", request_headers={"Range": "bytes=0-0"},
+                          response_list=[{"status_code": 404, "text": "Not found"}])
         requests_mock.get(API_URL + "/dl/jjr1.tiff", request_headers={"Range": f"bytes={from_bytes}-{to_bytes}"},
-                     response_list = [{"status_code": 500, "text": "Server error"},     {"content": TIFF_CONTENT[from_bytes:to_bytes+1]}])
+                     response_list = [{"status_code": 500, "text": "Server error"},
+                                      {"status_code": 206, "content": TIFF_CONTENT[from_bytes:to_bytes+1]}])
+    job = BatchJob("jj1", connection=con100)
+    return job
+
+@pytest.fixture
+def job_with_chunked_asset_using_get_0_0(con100, requests_mock, tmp_path) -> BatchJob:
+    """Job "jj1" with one asset whose size is reported through the `Content-Range` of a `bytes=0-0` request."""
+    asset_url = API_URL + "/dl/jjr1.tiff"
+    assets = {"1.tiff": {"href": asset_url, "type": "image/tiff; application=geotiff"}}
+    requests_mock.get(API_URL + "/jobs/jj1/results", json={"assets": assets})
+    size_response = {"status_code": 206, "text": "", "headers": {"Content-Range": f"bytes 0-0/{len(TIFF_CONTENT)}"}}
+    requests_mock.get(asset_url, request_headers={"Range": "bytes=0-0"}, response_list=[size_response])
+    chunk_size = 1000
+    for from_bytes in range(0, len(TIFF_CONTENT), chunk_size):
+        to_bytes = min(from_bytes + chunk_size, len(TIFF_CONTENT)) - 1
+        # Each chunk answers 408 on the first request and only serves its bytes on the second one.
+        failure = {"status_code": 408, "text": "Server error"}
+        success = {"status_code": 206, "content": TIFF_CONTENT[from_bytes:to_bytes+1]}
+        requests_mock.get(asset_url, request_headers={"Range": f"bytes={from_bytes}-{to_bytes}"}, response_list=[failure, success])
     job = BatchJob("jj1", connection=con100)
     return job
 
@@ -595,8 +616,16 @@ def test_get_results_download_file(job_with_1_asset: BatchJob, tmp_path):
     with target.open("rb") as f:
         assert f.read() == TIFF_CONTENT
 
-def test_get_results_download_chunked_file(job_with_chunked_asset: BatchJob, tmp_path):
-    job = job_with_chunked_asset
+def test_get_results_download_chunked_file_using_get_0_0(job_with_chunked_asset_using_get_0_0: BatchJob, tmp_path):
+    """Chunked download succeeds when the file size comes from the `Content-Range` of a `bytes=0-0` request."""
+    # Every chunk request of the fixture fails once before it is served, so this also covers the retry.
+    destination = tmp_path / "result.tiff"
+    results = job_with_chunked_asset_using_get_0_0.get_results()
+    assert results.download_file(destination, chunk_size=1000) == destination
+    assert destination.read_bytes() == TIFF_CONTENT
+
+def test_get_results_download_chunked_file_using_head(job_with_chunked_asset_using_head: BatchJob, tmp_path):
+    job = job_with_chunked_asset_using_head
     target = tmp_path / "result.tiff"
     res = job.get_results().download_file(target, chunk_size=1000)
     assert res == target