From 07df38e53d74f83802c3b1fcebcaa8dab8d9b15f Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Mon, 14 Apr 2025 09:24:38 +0100 Subject: [PATCH 01/18] add draft garden size flow script `run_calculate_garden_size_flow.py` --- .../run_calculate_garden_size_flow.py | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py new file mode 100644 index 00000000..102b0cd8 --- /dev/null +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py @@ -0,0 +1,165 @@ +""" +Calculate garden area (m2) where possible for properties in the domestic EPC register using Land Registry data and +Microsoft Building Footprints data. + +To run: +python -i asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size.py --epc [path/to/EPC/data] -y [YYYY] -q [Q] -n ews + +[Set -n nation flag to "ew" or "s" for generating garden size estimates for either England and Wales or Scotland INSPIRE +files only. It is recommended to process England-Wales and Scotland separately due to long run time (2+ days).] + +NB: this pipeline takes the preprocessed and deduplicated EPC dataset in parquet file format. 
+""" + +import logging +import shapely.errors +import polars as pl +import geopandas as gpd +from asf_heat_pump_suitability.utils import save_utils +from asf_heat_pump_suitability.pipeline.prepare_features import ( + lat_lon, + land_extent, + building_footprint, + garden_size, +) +from metaflow import FlowSpec, step, Parameter + + +class CalculateGardenSizeFlow(FlowSpec): + + # Parameters + epc = Parameter( + name="epc", + help="Path to processed and deduplicated EPC dataset in parquet file format", + type=str, + required=True, + ) + + year = Parameter( + name="year", + help="EPC data year. Format YYYY", + type=int, + required=True, + ) + + quarter = Parameter( + name="quarter", + help="EPC data quarter", + type=int, + required=True, + ) + + nations = Parameter( + name="nations", + help="Nations to get INSPIRE land registry file bounds for. Select from England and Wales (ew); Scotland (s); or all (ews).", + type=str, + required=True, + ) + + @step + def start(self): + """ + Load datasets and start flow. 
+ """ + logging.info("Load EPC UPRNs") + epc_df = pl.read_parquet(self.epc, columns=["UPRN"]) + + logging.info("Adding lat/lon data to EPC") + uprn_coords_df = lat_lon.transform_df_osopen_uprn_latlon() + epc_df = epc_df.join(uprn_coords_df, how="left", on="UPRN") + self.epc_gdf = lat_lon.generate_gdf_uprn_coords(epc_df, usecols=["UPRN"])[ + ["UPRN", "geometry"] + ] + + logging.info("Loading land registry file boundaries") + land_file_bounds = gpd.read_file( + f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/inspire_file_bounds_{self.nations.upper()}.geojson" + ) + microsoft_file_bounds = building_footprint.transform_df_uk_dataset_links() + + # Match land extent files with overlapping building footprint files + self.file_matches = garden_size.match_series_files_land_building( + land_files_gdf=land_file_bounds, building_files_gdf=microsoft_file_bounds + ) + + logging.info( + f"Estimating garden size for properties across {len(self.file_matches)} pairs of land extent and building footprint files." + ) + + self.next( + self.estimate_garden_size, foreach=list(self.file_matches.index.unique()) + ) + + @step + def estimate_garden_size(self): + land_file = self.input + building_files = self.file_matches.filter(like=land_file, axis=0).values + + # Prepare land parcel data + land_parcels_gdf = land_extent.transform_gdf_land_parcels(f"s3://{land_file}") + + for building_file in building_files: + # Prepare building footprints data + try: + building_footprints_gdf = ( + building_footprint.transform_gdf_building_footprints(building_file) + ) + except shapely.errors.GEOSException as e: + logging.warning( + f"Error loading building footprint file {building_file}. Error message: {e}.\n" + f"Skipping this land extent & building footprint pairing." 
+ ) + continue + + # Get intersection of building footprint polygons and land polygons + intersection_gdf = garden_size.generate_gdf_land_building_overlay( + land_parcels_gdf=land_parcels_gdf, + building_footprints_gdf=building_footprints_gdf, + ) + + # Get garden size + gardens_gdf = garden_size.generate_gdf_garden_size( + intersection_gdf, land_parcels_gdf + ) + gardens_gdf = gardens_gdf.assign( + inspire_land_extent_file=land_file, + microsoft_building_footprint_file=building_file, + ) + + # Match EPC UPRNs with land parcels and gardens using UPRN coordinates + # This will keep only EPC records for which garden size can be estimated + epc_df = gpd.sjoin( + self.epc_gdf, + gardens_gdf, + how="inner", + predicate="intersects", + ).drop(columns=["geometry", "index_right"]) + + self.epc_df = pl.from_pandas(epc_df) + self.next(self.join) + + @step + def join(self, inputs): + self.epc_gardens_df = pl.concat([input.epc_df for input in inputs]) + logging.info( + f"Garden size calculated for {len(self.epc_gardens_df)} EPC properties in total." 
+ ) + + @step + def end(self): + save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" + save_utils.save_to_s3(self.epc_gardens_df, save_as) + + self.epc_gardens_df = self.epc_gardens_df.with_columns( + pl.col(pl.Float64).round(2) + ) + self.epc_gardens_df = garden_size.deduplicate_df_garden_size( + self.epc_gardens_df + ) + + save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated.parquet" + save_utils.save_to_s3(self.epc_gardens_df, save_as) + + +if __name__ == "__main__": + CalculateGardenSizeFlow() From 2f8c2acfaf07082cf77b6a82277fd49f330c6dbd Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Mon, 14 Apr 2025 16:58:59 +0100 Subject: [PATCH 02/18] fix bugs in `run_calculate_garden_size_flow.py` and add script documentation --- .../run_calculate_garden_size_flow.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py index 102b0cd8..de175185 100644 --- a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py @@ -1,12 +1,12 @@ """ -Calculate garden area (m2) where possible for properties in the domestic EPC register using Land Registry data and +Flow to calculate garden area (m2) where possible for properties in the domestic EPC register using Land Registry data and Microsoft Building Footprints data. 
To run: -python -i asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size.py --epc [path/to/EPC/data] -y [YYYY] -q [Q] -n ews +python asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py run --epc [path/to/EPC/data] --year [YYYY] --quarter [Q] --nations ews --max-num-splits 400 [Set -n nation flag to "ew" or "s" for generating garden size estimates for either England and Wales or Scotland INSPIRE -files only. It is recommended to process England-Wales and Scotland separately due to long run time (2+ days).] +files only.] NB: this pipeline takes the preprocessed and deduplicated EPC dataset in parquet file format. """ @@ -15,6 +15,7 @@ import shapely.errors import polars as pl import geopandas as gpd +import pandas as pd from asf_heat_pump_suitability.utils import save_utils from asf_heat_pump_suitability.pipeline.prepare_features import ( lat_lon, @@ -81,14 +82,13 @@ def start(self): self.file_matches = garden_size.match_series_files_land_building( land_files_gdf=land_file_bounds, building_files_gdf=microsoft_file_bounds ) + self.land_files = list(self.file_matches.index.unique()) logging.info( f"Estimating garden size for properties across {len(self.file_matches)} pairs of land extent and building footprint files." 
) - self.next( - self.estimate_garden_size, foreach=list(self.file_matches.index.unique()) - ) + self.next(self.estimate_garden_size, foreach="land_files") @step def estimate_garden_size(self): @@ -98,10 +98,11 @@ def estimate_garden_size(self): # Prepare land parcel data land_parcels_gdf = land_extent.transform_gdf_land_parcels(f"s3://{land_file}") + building_footprints_gdfs = [] for building_file in building_files: # Prepare building footprints data try: - building_footprints_gdf = ( + _building_footprints_gdf = ( building_footprint.transform_gdf_building_footprints(building_file) ) except shapely.errors.GEOSException as e: @@ -110,6 +111,14 @@ def estimate_garden_size(self): f"Skipping this land extent & building footprint pairing." ) continue + else: + _building_footprints_gdf["microsoft_building_footprint_file"] = ( + building_file + ) + building_footprints_gdfs.append(_building_footprints_gdf) + + if building_footprints_gdfs: + building_footprints_gdf = pd.concat(building_footprints_gdfs) # Get intersection of building footprint polygons and land polygons intersection_gdf = garden_size.generate_gdf_land_building_overlay( @@ -123,7 +132,6 @@ def estimate_garden_size(self): ) gardens_gdf = gardens_gdf.assign( inspire_land_extent_file=land_file, - microsoft_building_footprint_file=building_file, ) # Match EPC UPRNs with land parcels and gardens using UPRN coordinates @@ -136,14 +144,16 @@ def estimate_garden_size(self): ).drop(columns=["geometry", "index_right"]) self.epc_df = pl.from_pandas(epc_df) - self.next(self.join) + + self.next(self.concatenate_garden_size_dfs) @step - def join(self, inputs): + def concatenate_garden_size_dfs(self, inputs): self.epc_gardens_df = pl.concat([input.epc_df for input in inputs]) logging.info( f"Garden size calculated for {len(self.epc_gardens_df)} EPC properties in total." 
) + self.next(self.end) @step def end(self): From 519b615bb1e19bb606aacf97297074ee71b1e75e Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 15 Apr 2025 13:47:32 +0100 Subject: [PATCH 03/18] set default `nations` parameter value in `run_calculate_garden_size_flow.py` --- .../pipeline/run_scripts/run_calculate_garden_size_flow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py index de175185..e46358c6 100644 --- a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py @@ -5,10 +5,10 @@ To run: python asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py run --epc [path/to/EPC/data] --year [YYYY] --quarter [Q] --nations ews --max-num-splits 400 -[Set -n nation flag to "ew" or "s" for generating garden size estimates for either England and Wales or Scotland INSPIRE +[Set --nations flag to "ew" or "s" for generating garden size estimates for either England and Wales or Scotland INSPIRE files only.] -NB: this pipeline takes the preprocessed and deduplicated EPC dataset in parquet file format. +NB: this flow takes the preprocessed and deduplicated EPC dataset in parquet file format. """ import logging @@ -55,6 +55,7 @@ class CalculateGardenSizeFlow(FlowSpec): help="Nations to get INSPIRE land registry file bounds for. 
Select from England and Wales (ew); Scotland (s); or all (ews).", type=str, required=True, + default="ews", ) @step From 3840997be6a3de80c6ff8b6713eb27a660710188 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 15 Apr 2025 15:53:35 +0100 Subject: [PATCH 04/18] add `batch` decorators to `run_calculate_garden_size_flow.py` --- .../pipeline/run_scripts/run_calculate_garden_size_flow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py index e46358c6..76a3d1e1 100644 --- a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py @@ -23,7 +23,7 @@ building_footprint, garden_size, ) -from metaflow import FlowSpec, step, Parameter +from metaflow import FlowSpec, step, Parameter, batch class CalculateGardenSizeFlow(FlowSpec): @@ -91,6 +91,7 @@ def start(self): self.next(self.estimate_garden_size, foreach="land_files") + @batch() @step def estimate_garden_size(self): land_file = self.input @@ -156,6 +157,7 @@ def concatenate_garden_size_dfs(self, inputs): ) self.next(self.end) + @batch() @step def end(self): save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" From d52d12696ea70857918092f088fa8a1678623968 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:30:42 +0100 Subject: [PATCH 05/18] add `batch` resources to `run_calculate_garden_size_flow.py` --- .../pipeline/run_scripts/run_calculate_garden_size_flow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py index 76a3d1e1..02ade92b 100644 --- a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py @@ -91,7 +91,7 @@ def start(self): self.next(self.estimate_garden_size, foreach="land_files") - @batch() + @batch(cpu=2, memory=16000) @step def estimate_garden_size(self): land_file = self.input @@ -157,7 +157,7 @@ def concatenate_garden_size_dfs(self, inputs): ) self.next(self.end) - @batch() + @batch(cpu=2, memory=16000) @step def end(self): save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" From 94267ec9a4d2dfb0760835fe5af873aff04941c8 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:44:15 +0100 Subject: [PATCH 06/18] add MANIFEST.in file and modify setup files to allow pip install of repo --- MANIFEST.in | 1 + asf_heat_pump_suitability/pipeline/flows/__init__.py | 0 .../{run_scripts => flows}/run_calculate_garden_size_flow.py | 0 asf_heat_pump_suitability/utils/parallel_utils.py | 0 setup.cfg | 3 +++ setup.py | 3 +++ 6 files changed, 7 insertions(+) create mode 100644 MANIFEST.in create mode 100644 asf_heat_pump_suitability/pipeline/flows/__init__.py rename asf_heat_pump_suitability/pipeline/{run_scripts => flows}/run_calculate_garden_size_flow.py (100%) create mode 100644 asf_heat_pump_suitability/utils/parallel_utils.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..2cf8782f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include asf_heat_pump_suitability/config/*.yaml diff --git a/asf_heat_pump_suitability/pipeline/flows/__init__.py 
b/asf_heat_pump_suitability/pipeline/flows/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py similarity index 100% rename from asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py rename to asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py diff --git a/asf_heat_pump_suitability/utils/parallel_utils.py b/asf_heat_pump_suitability/utils/parallel_utils.py new file mode 100644 index 00000000..e69de29b diff --git a/setup.cfg b/setup.cfg index 40a0c8cd..888eb3b0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,3 +36,6 @@ per-file-ignores = # Don't pester about parameters for a one-line docstring strictness=short docstring_style=google + +[options] +include_package_data = True diff --git a/setup.py b/setup.py index b5197809..466762dc 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ """asf_heat_pump_suitability.""" + from pathlib import Path from setuptools import find_packages from setuptools import setup @@ -18,7 +19,9 @@ def read_lines(path): long_description=open(BASE_DIR / "README.md").read(), install_requires=read_lines(BASE_DIR / "requirements.txt"), extras_require={"dev": read_lines(BASE_DIR / "requirements_dev.txt")}, + include_package_data=True, packages=find_packages(exclude=["docs"]), + package_data={"": ["*.txt", "*.yaml"]}, version="0.1.0", description="Early-stage scoping of a project to identify which homes/streets are likely to be suitable (or unsuitable) for which types of heat pumps.", author="Nesta", From 857def4f271e1148311bd02389a85cbf415d2113 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:44:59 +0100 Subject: [PATCH 07/18] add metaflow to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt 
b/requirements.txt index 64c1c1aa..5f88d625 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,3 +23,4 @@ tenacity numpy<2.0.0 fiona bs4 +metaflow From 3db749ba81908e99b3a85d2219cf79fb986e67aa Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:45:43 +0100 Subject: [PATCH 08/18] add parallel_utils.py with function to chunk df --- .../utils/parallel_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/asf_heat_pump_suitability/utils/parallel_utils.py b/asf_heat_pump_suitability/utils/parallel_utils.py index e69de29b..7004e502 100644 --- a/asf_heat_pump_suitability/utils/parallel_utils.py +++ b/asf_heat_pump_suitability/utils/parallel_utils.py @@ -0,0 +1,16 @@ +from typing import List +import pandas as pd + + +def chunk_df(df: pd.DataFrame, size: int) -> List[pd.DataFrame]: + """ + Split dataframe into chunks of specified size. + + Args: + df (pl.DataFrame): dataframe + size (int): number of records per chunk + + Returns: + List[pl.DataFrame]: list of dataframe chunks + """ + return [df.iloc[i : i + size] for i in range(0, len(df), size)] From 85f355060436ecbc422c1e1801292eeed35b1d5e Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:47:25 +0100 Subject: [PATCH 09/18] update CalculateGardenSize flow to improve chunking chunk into groups of 30 files instead of single files --- .../flows/run_calculate_garden_size_flow.py | 132 ++++++++++-------- 1 file changed, 77 insertions(+), 55 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index 02ade92b..fc7a55b3 100644 --- a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -11,18 +11,6 @@ NB: this flow takes the preprocessed and 
deduplicated EPC dataset in parquet file format. """ -import logging -import shapely.errors -import polars as pl -import geopandas as gpd -import pandas as pd -from asf_heat_pump_suitability.utils import save_utils -from asf_heat_pump_suitability.pipeline.prepare_features import ( - lat_lon, - land_extent, - building_footprint, - garden_size, -) from metaflow import FlowSpec, step, Parameter, batch @@ -63,6 +51,16 @@ def start(self): """ Load datasets and start flow. """ + import logging + import polars as pl + import geopandas as gpd + from asf_heat_pump_suitability.utils import parallel_utils + from asf_heat_pump_suitability.pipeline.prepare_features import ( + building_footprint, + garden_size, + lat_lon, + ) + logging.info("Load EPC UPRNs") epc_df = pl.read_parquet(self.epc, columns=["UPRN"]) @@ -80,86 +78,110 @@ def start(self): microsoft_file_bounds = building_footprint.transform_df_uk_dataset_links() # Match land extent files with overlapping building footprint files - self.file_matches = garden_size.match_series_files_land_building( + file_matches = garden_size.match_series_files_land_building( land_files_gdf=land_file_bounds, building_files_gdf=microsoft_file_bounds ) - self.land_files = list(self.file_matches.index.unique()) + self.chunked_file_matches = parallel_utils.chunk_df(file_matches, size=30) logging.info( f"Estimating garden size for properties across {len(self.file_matches)} pairs of land extent and building footprint files." 
) - self.next(self.estimate_garden_size, foreach="land_files") + self.next(self.estimate_garden_size, foreach="chunked_file_matches") @batch(cpu=2, memory=16000) @step def estimate_garden_size(self): - land_file = self.input - building_files = self.file_matches.filter(like=land_file, axis=0).values + import os + + os.system( + "pip install git+https://github.com/nestauk/asf_heat_pump_suitability.git@153_parallelise_garden_script" + ) + + import shapely + import geopandas as gpd + from asf_heat_pump_suitability.pipeline.prepare_features import ( + building_footprint, + garden_size, + land_extent, + ) + + prev = None + self.epc_gardens = [] - # Prepare land parcel data - land_parcels_gdf = land_extent.transform_gdf_land_parcels(f"s3://{land_file}") + for land_file, building_file in self.input.items(): + if land_file != prev: + # Prepare land parcel data + land_parcels_gdf = land_extent.transform_gdf_land_parcels( + f"s3://{land_file}" + ) - building_footprints_gdfs = [] - for building_file in building_files: # Prepare building footprints data try: - _building_footprints_gdf = ( + building_footprints_gdf = ( building_footprint.transform_gdf_building_footprints(building_file) ) except shapely.errors.GEOSException as e: - logging.warning( + print( f"Error loading building footprint file {building_file}. Error message: {e}.\n" f"Skipping this land extent & building footprint pairing." 
) continue else: - _building_footprints_gdf["microsoft_building_footprint_file"] = ( + building_footprints_gdf["microsoft_building_footprint_file"] = ( building_file ) - building_footprints_gdfs.append(_building_footprints_gdf) - - if building_footprints_gdfs: - building_footprints_gdf = pd.concat(building_footprints_gdfs) - - # Get intersection of building footprint polygons and land polygons - intersection_gdf = garden_size.generate_gdf_land_building_overlay( - land_parcels_gdf=land_parcels_gdf, - building_footprints_gdf=building_footprints_gdf, - ) - - # Get garden size - gardens_gdf = garden_size.generate_gdf_garden_size( - intersection_gdf, land_parcels_gdf - ) - gardens_gdf = gardens_gdf.assign( - inspire_land_extent_file=land_file, - ) - - # Match EPC UPRNs with land parcels and gardens using UPRN coordinates - # This will keep only EPC records for which garden size can be estimated - epc_df = gpd.sjoin( - self.epc_gdf, - gardens_gdf, - how="inner", - predicate="intersects", - ).drop(columns=["geometry", "index_right"]) - - self.epc_df = pl.from_pandas(epc_df) + + # Get intersection of building footprint polygons and land polygons + intersection_gdf = garden_size.generate_gdf_land_building_overlay( + land_parcels_gdf=land_parcels_gdf, + building_footprints_gdf=building_footprints_gdf, + ) + + # Get garden size + gardens_gdf = garden_size.generate_gdf_garden_size( + intersection_gdf, land_parcels_gdf + ) + gardens_gdf = gardens_gdf.assign( + inspire_land_extent_file=land_file, + microsoft_building_footprint_file=building_file, + ) + + # Match EPC UPRNs with land parcels and gardens using UPRN coordinates + # This will keep only EPC records for which garden size can be estimated + epc_df = gpd.sjoin( + self.epc_gdf, + gardens_gdf, + how="inner", + predicate="intersects", + ).drop(columns=["geometry", "index_right"]) + + epc_df = pl.from_pandas(epc_df) + self.epc_gardens.append(epc_df) self.next(self.concatenate_garden_size_dfs) @step def 
concatenate_garden_size_dfs(self, inputs): - self.epc_gardens_df = pl.concat([input.epc_df for input in inputs]) + import itertools + import polars as pl + import logging + + self.epc_gardens_df = pl.concat( + list(itertools.chain.from_iterable([input.epc_gardens for input in inputs])) + ) logging.info( f"Garden size calculated for {len(self.epc_gardens_df)} EPC properties in total." ) self.next(self.end) - @batch(cpu=2, memory=16000) + # @batch(cpu=2, memory=16000) @step def end(self): + import polars as pl + from asf_heat_pump_suitability.utils import save_utils + from asf_heat_pump_suitability.pipeline.prepare_features import garden_size + save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" save_utils.save_to_s3(self.epc_gardens_df, save_as) From 1dd48715192ff1969d031873d2a8bd49a2e48d3e Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:51:27 +0100 Subject: [PATCH 10/18] add end step to CalculateGardenSize flow --- .../flows/run_calculate_garden_size_flow.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index fc7a55b3..8a7ec70f 100644 --- a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -173,11 +173,14 @@ def concatenate_garden_size_dfs(self, inputs): logging.info( f"Garden size calculated for {len(self.epc_gardens_df)} EPC properties in total." ) - self.next(self.end) + self.next(self.save_outputs) # @batch(cpu=2, memory=16000) @step - def end(self): + def save_outputs(self): + """ + Save outputs to S3. 
+ """ import polars as pl from asf_heat_pump_suitability.utils import save_utils from asf_heat_pump_suitability.pipeline.prepare_features import garden_size @@ -195,6 +198,17 @@ def end(self): save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated.parquet" save_utils.save_to_s3(self.epc_gardens_df, save_as) + self.next(self.end) + + @step + def end(self): + """ + Finish flow. + """ + import logging + + logging.info("Calculate garden size flow complete!") + if __name__ == "__main__": CalculateGardenSizeFlow() From 56bd60ce5ab499a7d0a87ce8daf3b97c1f1b1d00 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:55:04 +0100 Subject: [PATCH 11/18] set up CalculateGardenSize script to use sample dataset for testing --- .../flows/run_calculate_garden_size_flow.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index 8a7ec70f..79aa521f 100644 --- a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -81,7 +81,12 @@ def start(self): file_matches = garden_size.match_series_files_land_building( land_files_gdf=land_file_bounds, building_files_gdf=microsoft_file_bounds ) - self.chunked_file_matches = parallel_utils.chunk_df(file_matches, size=30) + # self.chunked_file_matches = parallel_utils.chunk_df(file_matches, size=30) + + # TODO remove before merge + self.chunked_file_matches = parallel_utils.chunk_df( + file_matches.sample(3), size=1 + ) logging.info( f"Estimating garden size for properties across {len(self.file_matches)} pairs of land extent and building footprint files." 
@@ -89,7 +94,8 @@ def start(self): self.next(self.estimate_garden_size, foreach="chunked_file_matches") - @batch(cpu=2, memory=16000) + # @batch(cpu=2, memory=16000) + @batch(cpu=2, memory=1000) @step def estimate_garden_size(self): import os @@ -100,6 +106,7 @@ def estimate_garden_size(self): import shapely import geopandas as gpd + import polars as pl from asf_heat_pump_suitability.pipeline.prepare_features import ( building_footprint, garden_size, @@ -185,7 +192,8 @@ def save_outputs(self): from asf_heat_pump_suitability.utils import save_utils from asf_heat_pump_suitability.pipeline.prepare_features import garden_size - save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" + # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" + save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_SAMPLE.parquet" save_utils.save_to_s3(self.epc_gardens_df, save_as) self.epc_gardens_df = self.epc_gardens_df.with_columns( @@ -195,7 +203,8 @@ def save_outputs(self): self.epc_gardens_df ) - save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated.parquet" + # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated.parquet" + save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated_SAMPLE.parquet" save_utils.save_to_s3(self.epc_gardens_df, save_as) self.next(self.end) From 
ad6d805ed7e20222a8ae1bd93881f67c5ff5c359 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 3 Jun 2025 17:03:31 +0100 Subject: [PATCH 12/18] fix bug in CalculateGardenSize flow --- .../pipeline/flows/run_calculate_garden_size_flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index 79aa521f..8fa0646b 100644 --- a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -89,7 +89,7 @@ def start(self): ) logging.info( - f"Estimating garden size for properties across {len(self.file_matches)} pairs of land extent and building footprint files." + f"Estimating garden size for properties across {len(file_matches)} pairs of land extent and building footprint files." ) self.next(self.estimate_garden_size, foreach="chunked_file_matches") From d0ae871d979312040527711cd53bd3b1470a4fc8 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Tue, 3 Jun 2025 17:11:47 +0100 Subject: [PATCH 13/18] correct branch name in CalculateGardenSize flow --- .../pipeline/flows/run_calculate_garden_size_flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index 8fa0646b..b870a1f7 100644 --- a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -101,7 +101,7 @@ def estimate_garden_size(self): import os os.system( - "pip install git+https://github.com/nestauk/asf_heat_pump_suitability.git@153_parallelise_garden_script" + "pip install 
git+https://github.com/nestauk/asf_heat_pump_suitability.git@152_parallelise_garden_script" ) import shapely From 9ef1d93427726b9c4b3ead49bc47968e64f38fd5 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Wed, 4 Jun 2025 16:46:57 +0100 Subject: [PATCH 14/18] cast NATIONALCADASTRALREFERENCE as string in CalculateGardenSize flow --- .../pipeline/flows/run_calculate_garden_size_flow.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index b870a1f7..d244ad44 100644 --- a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -95,7 +95,7 @@ def start(self): self.next(self.estimate_garden_size, foreach="chunked_file_matches") # @batch(cpu=2, memory=16000) - @batch(cpu=2, memory=1000) + @batch(cpu=2, memory=16000) @step def estimate_garden_size(self): import os @@ -164,7 +164,11 @@ def estimate_garden_size(self): ).drop(columns=["geometry", "index_right"]) epc_df = pl.from_pandas(epc_df) - self.epc_gardens.append(epc_df) + self.epc_gardens.append( + epc_df.with_columns( + pl.col("NATIONALCADASTRALREFERENCE").cast(pl.String) + ) + ) self.next(self.concatenate_garden_size_dfs) From 45b8a37cd8c753f034dc5354f16ee1363553037f Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Wed, 11 Jun 2025 12:09:10 +0100 Subject: [PATCH 15/18] scale CalculateGardenSizeFlow to full dataset --- .../flows/run_calculate_garden_size_flow.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index d244ad44..a3b4804a 100644 --- 
a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -81,12 +81,12 @@ def start(self): file_matches = garden_size.match_series_files_land_building( land_files_gdf=land_file_bounds, building_files_gdf=microsoft_file_bounds ) - # self.chunked_file_matches = parallel_utils.chunk_df(file_matches, size=30) + self.chunked_file_matches = parallel_utils.chunk_df(file_matches, size=30) - # TODO remove before merge - self.chunked_file_matches = parallel_utils.chunk_df( - file_matches.sample(3), size=1 - ) + # # TODO remove before merge + # self.chunked_file_matches = parallel_utils.chunk_df( + # file_matches.sample(3), size=1 + # ) logging.info( f"Estimating garden size for properties across {len(file_matches)} pairs of land extent and building footprint files." @@ -94,8 +94,8 @@ def start(self): self.next(self.estimate_garden_size, foreach="chunked_file_matches") - # @batch(cpu=2, memory=16000) @batch(cpu=2, memory=16000) + # @batch(cpu=2, memory=32000) @step def estimate_garden_size(self): import os @@ -186,7 +186,7 @@ def concatenate_garden_size_dfs(self, inputs): ) self.next(self.save_outputs) - # @batch(cpu=2, memory=16000) + @batch(cpu=2, memory=16000) @step def save_outputs(self): """ @@ -196,8 +196,8 @@ def save_outputs(self): from asf_heat_pump_suitability.utils import save_utils from asf_heat_pump_suitability.pipeline.prepare_features import garden_size - # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" - save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_SAMPLE.parquet" + save_as = 
f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" + # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_SAMPLE.parquet" save_utils.save_to_s3(self.epc_gardens_df, save_as) self.epc_gardens_df = self.epc_gardens_df.with_columns( @@ -207,8 +207,8 @@ def save_outputs(self): self.epc_gardens_df ) - # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated.parquet" - save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated_SAMPLE.parquet" + save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated.parquet" + # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated_SAMPLE.parquet" save_utils.save_to_s3(self.epc_gardens_df, save_as) self.next(self.end) From 384946871700eb36014a829384e75acb58c51592 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:32:04 +0100 Subject: [PATCH 16/18] add documentation to CalculateGardenSize flow and set to run on sample --- .../flows/run_calculate_garden_size_flow.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index a3b4804a..e72b5e8e 100644 --- 
a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -84,9 +84,9 @@ def start(self): self.chunked_file_matches = parallel_utils.chunk_df(file_matches, size=30) # # TODO remove before merge - # self.chunked_file_matches = parallel_utils.chunk_df( - # file_matches.sample(3), size=1 - # ) + self.chunked_file_matches = parallel_utils.chunk_df( + file_matches.sample(3), size=1 + ) logging.info( f"Estimating garden size for properties across {len(file_matches)} pairs of land extent and building footprint files." @@ -95,11 +95,14 @@ def start(self): self.next(self.estimate_garden_size, foreach="chunked_file_matches") @batch(cpu=2, memory=16000) - # @batch(cpu=2, memory=32000) @step def estimate_garden_size(self): + """ + Estimate garden size per property using land registry polygons and building footprints. + """ import os + # TODO update to dev before merge os.system( "pip install git+https://github.com/nestauk/asf_heat_pump_suitability.git@152_parallelise_garden_script" ) @@ -174,6 +177,9 @@ def estimate_garden_size(self): @step def concatenate_garden_size_dfs(self, inputs): + """ + Concatenate estimated garden size data into one dataframe. 
+ """ import itertools import polars as pl import logging @@ -186,7 +192,7 @@ def concatenate_garden_size_dfs(self, inputs): ) self.next(self.save_outputs) - @batch(cpu=2, memory=16000) + # @batch(cpu=2, memory=16000) @step def save_outputs(self): """ @@ -196,8 +202,8 @@ def save_outputs(self): from asf_heat_pump_suitability.utils import save_utils from asf_heat_pump_suitability.pipeline.prepare_features import garden_size - save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" - # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_SAMPLE.parquet" + # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}.parquet" + save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_SAMPLE.parquet" save_utils.save_to_s3(self.epc_gardens_df, save_as) self.epc_gardens_df = self.epc_gardens_df.with_columns( @@ -207,8 +213,8 @@ def save_outputs(self): self.epc_gardens_df ) - save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated.parquet" - # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated_SAMPLE.parquet" + # save_as = f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated.parquet" + save_as = 
f"s3://asf-heat-pump-suitability/outputs/{self.year}Q{self.quarter}/gardens/{self.year}_Q{self.quarter}_EPC_garden_size_estimates_{self.nations.upper()}_deduplicated_SAMPLE.parquet" save_utils.save_to_s3(self.epc_gardens_df, save_as) self.next(self.end) From a4305834ba94097209a4469f124070ecc973fad7 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:54:28 +0100 Subject: [PATCH 17/18] comment out line in CalculateGardenSizeFlow for running with sample --- .../pipeline/flows/run_calculate_garden_size_flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index e72b5e8e..33fdb943 100644 --- a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -81,7 +81,7 @@ def start(self): file_matches = garden_size.match_series_files_land_building( land_files_gdf=land_file_bounds, building_files_gdf=microsoft_file_bounds ) - self.chunked_file_matches = parallel_utils.chunk_df(file_matches, size=30) + # self.chunked_file_matches = parallel_utils.chunk_df(file_matches, size=30) # # TODO remove before merge self.chunked_file_matches = parallel_utils.chunk_df( From 746f1f9685cb5ccf27c669edefcfecddaa9b8c7c Mon Sep 17 00:00:00 2001 From: crispy-wonton <104171770+crispy-wonton@users.noreply.github.com> Date: Wed, 18 Jun 2025 13:01:07 +0100 Subject: [PATCH 18/18] remove max-num-splits param from run instructions in `run_calculate_garden_size_flow.py` --- .../pipeline/flows/run_calculate_garden_size_flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py index 33fdb943..1fd931a4 100644 --- 
a/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py +++ b/asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py @@ -3,7 +3,7 @@ Microsoft Building Footprints data. To run: -python asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size_flow.py run --epc [path/to/EPC/data] --year [YYYY] --quarter [Q] --nations ews --max-num-splits 400 +python asf_heat_pump_suitability/pipeline/flows/run_calculate_garden_size_flow.py run --epc [path/to/EPC/data] --year [YYYY] --quarter [Q] --nations ews [Set --nations flag to "ew" or "s" for generating garden size estimates for either England and Wales or Scotland INSPIRE files only.]