Skip to content

Commit f9a75e5

Browse files
authored
Install Spark 4 release version (#2300)
1 parent 08cc9f7 commit f9a75e5

File tree

4 files changed

+8
-11
lines changed

4 files changed

+8
-11
lines changed

images/pyspark-notebook/Dockerfile

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,12 @@ USER ${NB_UID}
 # NOTE: It's important to ensure compatibility between Pandas versions.
 # The pandas version in this Dockerfile should match the version
 # on which the Pandas API for Spark is built.
-# To find the right version:
-# 1. Check out the Spark branch you are on: <https://github.com/apache/spark>
-# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
+# To find the right version, check the pandas version being installed here:
+# https://github.com/apache/spark/blob/<SPARK_VERSION>/dev/infra/Dockerfile
 RUN mamba install --yes \
     'grpcio-status' \
     'grpcio' \
-    'pandas=2.2.2' \
+    'pandas=2.2.3' \
     'pyarrow' && \
     mamba clean --all -f -y && \
     fix-permissions "${CONDA_DIR}" && \

images/pyspark-notebook/setup_spark.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,8 @@ def get_latest_spark_version() -> str:
     LOGGER.info("Downloading Spark versions information")
     all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
     LOGGER.info(f"All refs: {all_refs}")
-    versions = [
-        ref.removeprefix("spark-").removesuffix("/")
-        for ref in all_refs
-        if re.match(r"^spark-\d", ref) is not None and "incubating" not in ref
-    ]
+    pattern = re.compile(r"^spark-(\d+\.\d+\.\d+)/$")
+    versions = [match.group(1) for ref in all_refs if (match := pattern.match(ref))]
     LOGGER.info(f"Available versions: {versions}")

     # Compare versions semantically
@@ -74,6 +71,7 @@ def download_spark(
         spark_dir_name += f"-scala{scala_version}"
     LOGGER.info(f"Spark directory name: {spark_dir_name}")
     spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"
+    LOGGER.info(f"Spark download URL: {spark_url}")

     tmp_file = Path("/tmp/spark.tar.gz")
     subprocess.check_call(

tests/by_image/pyspark-notebook/units/unit_pandas_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
 # Distributed under the terms of the Modified BSD License.
 import pandas

-assert pandas.__version__ == "2.2.2"
+assert pandas.__version__ == "2.2.3"

tests/shared_checks/nbconvert_check.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def check_nbconvert(
     no_warnings: bool = True,
 ) -> str:
     """Check if nbconvert is able to convert a notebook file"""
-    cont_data_file = "/home/jovyan/data/" + host_file.name
+    cont_data_file = "/home/jovyan/" + host_file.name

     output_dir = "/tmp"
     LOGGER.info(

0 commit comments

Comments
 (0)