Open-EO · dsamaey · May 12, 2025 · Apr 7, 2025 · Apr 7, 2025 · Apr 10, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `sar_backscatter`: try to retrieve coefficient options from backend ([#693](https://github.com/Open-EO/openeo-python-client/issues/693))
 - Improve error message when OIDC provider is unavailable ([#751](https://github.com/Open-EO/openeo-python-client/issues/751))
 - Added `on_response_headers` argument to `DataCube.download()` and related to handle (e.g. `print`) the response headers ([#560](https://github.com/Open-EO/openeo-python-client/issues/560))
+- Added more robust download of large job result files: when the server supports HTTP range requests (`Accept-Ranges: bytes`), assets are downloaded in chunks with per-chunk retries
 
 ### Changed
 
@@ -21,7 +22,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - `STACAPIJobDatabase.get_by_status()` now always returns a `pandas.DataFrame` with an index compatible with `MultiBackendJobManager`. ([#707](https://github.com/Open-EO/openeo-python-client/issues/707))
 
-
 ## [0.39.1] - 2025-02-26
 
 ### Fixed

diff --git a/openeo/rest/_connection.py b/openeo/rest/_connection.py
@@ -194,6 +194,24 @@ def get(
         """
         return self.request("get", path=path, params=params, stream=stream, auth=auth, **kwargs)
 
+    def head(
+        self,
+        path: str,
+        *,
+        params: Optional[dict] = None,
+        auth: Optional[AuthBase] = None,
+        **kwargs,
+    ) -> Response:
+        """
+        Do HEAD request to REST API.
+
+        :param path: API path (without root url)
+        :param params: Additional query parameters
+        :param auth: optional custom authentication to use instead of the default one
+        :return: response to the HEAD request
+        """
+        return self.request("head", path=path, params=params, auth=auth, **kwargs)
+
     def post(self, path: str, json: Optional[dict] = None, **kwargs) -> Response:
         """
         Do POST request to REST API.

diff --git a/openeo/rest/job.py b/openeo/rest/job.py
@@ -3,7 +3,6 @@
 import datetime
 import json
 import logging
-import re
 import time
 import typing
 from pathlib import Path
@@ -15,7 +14,6 @@
 from openeo.internal.documentation import openeo_endpoint
 from openeo.internal.jupyter import (
     VisualDict,
-    VisualList,
     render_component,
     render_error,
 )
@@ -31,7 +29,7 @@
     OpenEoClientException,
 )
 from openeo.rest.models.general import LogsResponse
-from openeo.rest.models.logs import LogEntry, log_level_name, normalize_log_level
+from openeo.rest.models.logs import log_level_name
 from openeo.util import ensure_dir
 
 if typing.TYPE_CHECKING:
@@ -405,7 +403,7 @@ def download(
             target = target / self.name
         ensure_dir(target.parent)
         logger.info("Downloading Job result asset {n!r} from {h!s} to {t!s}".format(n=self.name, h=self.href, t=target))
-        _download_chunked(self.href, target, chunk_size)
+        self._download_to_file(url=self.href, target=target, chunk_size=chunk_size)
         return target
 
     def _get_response(self, stream=True) -> requests.Response:
@@ -424,49 +422,41 @@ def load_bytes(self) -> bytes:
     # TODO: more `load` methods e.g.: load GTiff asset directly as numpy array
 
 
-def _download_chunked(url: str, target: Path, chunk_size: int):
-    try:
-        file_size = _determine_content_length(url)
+    def _download_to_file(self, url: str, target: Path, chunk_size: int):
+        """Download ``url`` to ``target``: in ranged chunks if the HEAD response advertises byte ranges and a size, else in one request."""
+        head = requests.head(url, stream=True)
+        if head.ok and "bytes" in head.headers.get("Accept-Ranges", "") and "Content-Length" in head.headers:
+            self._download_chunked(url=url, target=target, file_size=int(head.headers["Content-Length"]), chunk_size=chunk_size)
+        else:
+            self._download_unchunked(url=url, target=target)
+
+
+    def _download_chunked(self, url: str, target: Path, file_size: int, chunk_size: int):
         with target.open('wb') as f:
             for from_byte_index in range(0, file_size, chunk_size):
                 to_byte_index = min(from_byte_index + chunk_size - 1, file_size - 1)
                 tries_left = MAX_RETRIES_PER_CHUNK
                 while tries_left > 0:
                     try:
                         range_headers = {"Range": f"bytes={from_byte_index}-{to_byte_index}"}
-                        with requests.get(url, headers=range_headers, stream=True) as r:
-                            if r.ok:
-                                shutil.copyfileobj(r.raw, f)
-                                break
-                            else:
-                                r.raise_for_status()
-                    except requests.exceptions.HTTPError as error:
+                        with self.job.connection.get(path=url, headers=range_headers, stream=True) as r:
+                            r.raise_for_status()
+                            shutil.copyfileobj(r.raw, f)
+                        break
+                    except OpenEoApiPlainError as error:
                         tries_left -= 1
-                        if tries_left > 0 and error.response.status_code in RETRIABLE_STATUSCODES:
-                            logger.warning(f"Failed to retrieve chunk {from_byte_index}-{to_byte_index} from {url} (status {error.response.status_code}) - retrying")
+                        if tries_left > 0 and error.http_status_code in RETRIABLE_STATUSCODES:
+                            logger.warning(f"Failed to retrieve chunk {from_byte_index}-{to_byte_index} from {url} (status {error.http_status_code}) - retrying")
                             continue
                         else:
                             raise error
-    except requests.exceptions.HTTPError as http_error:
-        raise OpenEoApiPlainError(message=f"Failed to download {url}", http_status_code=http_error.response.status_code, error_message=http_error.response.text)
-
-
-def _determine_content_length(url: str) -> int:
-    range_0_0_response = requests.get(url, headers={"Range": f"bytes=0-0"})
-    if range_0_0_response.status_code == 206:
-        content_range_header = range_0_0_response.headers.get("Content-Range")
-        match = re.match(r"^bytes \d+-\d+/(\d+)$", content_range_header)
-        if match:
-            return int(match.group(1))
-
-        content_range_prefix = "bytes 0-0/"
-        if content_range_header.startswith(content_range_prefix):
-            return int(content_range_header[len(content_range_prefix):])
-    head = requests.head(url, stream=True)
-    if head.ok:
-        return int(head.headers['Content-Length'])
-    else:
-        head.raise_for_status()
+
+
+    def _download_unchunked(self, url: str, target: Path):
+        with self.job.connection.get(path=url, stream=True) as r:
+            r.raise_for_status()
+            with target.open("wb") as f:
+                shutil.copyfileobj(r.raw, f)
 
 
 class MultipleAssetException(OpenEoClientException):

diff --git a/tests/rest/test_job.py b/tests/rest/test_job.py
@@ -547,39 +547,18 @@ def job_with_chunked_asset_using_head(con100, requests_mock, tmp_path) -> BatchJ
     requests_mock.get(API_URL + "/jobs/jj1/results", json={"assets": {
         "1.tiff": {"href": API_URL + "/dl/jjr1.tiff", "type": "image/tiff; application=geotiff"},
     }})
-    requests_mock.head(API_URL + "/dl/jjr1.tiff", headers={"Content-Length": f"{len(TIFF_CONTENT)}"})
+    requests_mock.head(API_URL + "/dl/jjr1.tiff", headers={"Content-Length": f"{len(TIFF_CONTENT)}", "Accept-Ranges": "bytes"})
 
     chunk_size = 1000
     for r in range(0, len(TIFF_CONTENT), chunk_size):
         from_bytes = r
         to_bytes = min(r + chunk_size, len(TIFF_CONTENT)) - 1
-        # fail the 1st time, serve the content chunk the 2nd time
-        requests_mock.get(API_URL + "/dl/jjr1.tiff", request_headers={"Range": "bytes=0-0"},
-                          response_list=[{"status_code": 404, "text": "Not found"}])
         requests_mock.get(API_URL + "/dl/jjr1.tiff", request_headers={"Range": f"bytes={from_bytes}-{to_bytes}"},
                      response_list = [{"status_code": 500, "text": "Server error"},
                                       {"status_code": 206, "content": TIFF_CONTENT[from_bytes:to_bytes+1]}])
     job = BatchJob("jj1", connection=con100)
     return job
 
-@pytest.fixture
-def job_with_chunked_asset_using_get_0_0(con100, requests_mock, tmp_path) -> BatchJob:
-    requests_mock.get(API_URL + "/jobs/jj1/results", json={"assets": {
-        "1.tiff": {"href": API_URL + "/dl/jjr1.tiff", "type": "image/tiff; application=geotiff"},
-    }})
-    requests_mock.get(API_URL + "/dl/jjr1.tiff", request_headers={"Range": "bytes=0-0"},
-                      response_list=[{"status_code": 206, "text": "", "headers": {"Content-Range": f"bytes 0-0/{len(TIFF_CONTENT)}"}}])
-    chunk_size = 1000
-    for r in range(0, len(TIFF_CONTENT), chunk_size):
-        from_bytes = r
-        to_bytes = min(r + chunk_size, len(TIFF_CONTENT)) - 1
-        # fail the 1st time, serve the content chunk the 2nd time
-        requests_mock.get(API_URL + "/dl/jjr1.tiff", request_headers={"Range": f"bytes={from_bytes}-{to_bytes}"},
-                     response_list = [{"status_code": 408, "text": "Server error"},
-                                      {"status_code": 206, "content": TIFF_CONTENT[from_bytes:to_bytes+1]}])
-    job = BatchJob("jj1", connection=con100)
-    return job
-
 @pytest.fixture
 def job_with_2_assets(con100, requests_mock, tmp_path) -> BatchJob:
     requests_mock.get(API_URL + "/jobs/jj2/results", json={
@@ -616,14 +595,6 @@ def test_get_results_download_file(job_with_1_asset: BatchJob, tmp_path):
     with target.open("rb") as f:
         assert f.read() == TIFF_CONTENT
 
-def test_get_results_download_chunked_file_using_get_0_0(job_with_chunked_asset_using_get_0_0: BatchJob, tmp_path):
-    job = job_with_chunked_asset_using_get_0_0
-    target = tmp_path / "result.tiff"
-    res = job.get_results().download_file(target, chunk_size=1000)
-    assert res == target
-    with target.open("rb") as f:
-        assert f.read() == TIFF_CONTENT
-
 def test_get_results_download_chunked_file_using_head(job_with_chunked_asset_using_head: BatchJob, tmp_path):
     job = job_with_chunked_asset_using_head
     target = tmp_path / "result.tiff"