From e231d969bc2cdcb96d3e85733bd2ce48df31185c Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:25:35 +0100 Subject: [PATCH 1/5] add notebook to create full features dataset for plymouth --- .../create_full_dataset_plymouth.py | 1195 +++++++++++++++++ requirements.txt | 2 +- 2 files changed, 1196 insertions(+), 1 deletion(-) create mode 100644 asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py diff --git a/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py b/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py new file mode 100644 index 00000000..2b30af85 --- /dev/null +++ b/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py @@ -0,0 +1,1195 @@ +# %% [markdown] +# ## Explore creating a full dataset of all residential properties in Plymouth +# +# This notebook explores the methodology for creating a complete residential property dataset in Plymouth using only open-source data. The final dataset includes relevant property-level data to assess physical feasibility for ASHPs, SGLs, and HNs. The features included are: +# - in listed building +# - in building conservation area +# - outdoor space [to be added] +# - in existing / planned heat network zone +# - tenure +# - property type +# - heating fuel [to be added] +# - off gas status + +# %% [markdown] +# ## Import libraries + +# %% +import geopandas as gpd +import polars as pl +import pandas as pd +import fiona +import momepy +import folium +import networkx as nx +import rustworkx as rx +import osmnx as ox +import numpy as np +from scipy.sparse import csr_matrix +from sklearn.cluster import DBSCAN, HDBSCAN +import graphblas_algorithms as ga +import matplotlib.cm as cm +import matplotlib.pyplot as plt +from tqdm import tqdm +from matplotlib import collections +import libpysal +import spopt +from sklearn.preprocessing import OneHotEncoder +from sklearn import metrics +from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier + +from asf_heat_pump_suitability.getters import get_datasets +from asf_heat_pump_suitability.pipeline.prepare_features import ( + lat_lon, + listed_buildings, + protected_areas, + off_gas, + anchor_properties, + output_areas, +) +from asf_heat_pump_suitability.analysis.flats_on_fossils.features import fuel_type +from asf_heat_pump_suitability.pipeline.reweight_epc import prepare_sample + +# %% [markdown] +# ## Load data + +# %% +fiona.listlayers("s3://asf-heat-pump-suitability/source_data/opmplc_gb.gpkg") + +# %% +print("LOADING DATASETS TO GET RESIDENTIAL UPRNS FOR PLYMOUTH...") +print("Loading OS UPRN...") +os_uprn_df = get_datasets.get_df_osopen_uprn_latlon() + +print("\nLoading LA boundaries...") +la_boundaries_gdf = gpd.read_file( + "s3://asf-heat-pump-suitability/source_data/Local_Authority_Districts_December_2023_Boundaries_UK_BFE_-2600600853110041429/LAD_DEC_2023_UK_BFE.shp" +) + +print("\nLoading OS OpenMap Local - Buildings...") +os_openmap_building_gdf = gpd.read_file( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/OS OpenMap Local (ESRI Shape File) SX/data/SX_Building.shp" +) + +print("\nLoading OS OpenMap Local - Important Buildings...") +os_openmap_importantbuilding_gdf = gpd.read_file( + 
"s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/OS OpenMap Local (ESRI Shape File) SX/data/SX_ImportantBuilding.shp" +) + +print("\nLoading OS OpenMap Local - Railways...") +os_openmap_railway_gdf = gpd.read_file( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/OS OpenMap Local (ESRI Shape File) SX/data/SX_RailwayStation.shp" +) + +# --------------------------------------------------------------------------# + +print("\nLOADING PROPERTY-LEVEL FEATURE DATASETS...") +print("\nLoading listed buildings...") +listed_buildings_gdf = gpd.read_file( + "../spatial_clustering/data/National_Heritage_List_for_England_NHLE_v02_VIEW_-464524051049198649/Listed_Building_points.shp" +) + +print("\nLoading building conservation areas...") +cons_areas_gdf = gpd.read_file( + "../spatial_clustering/data/conservation-area (1).geojson" +) + +print("\nLoading existing / planned HN zones...") +hn_zones_gdf = gpd.read_file( + "s3://asf-heat-pump-suitability/heat_network_desnz_data/heat-network-zone-map-Plymouth.gpkg", + layer="heat-network-zone-map-Plymouth", +) + +print("\nLoading outdoor space...") +outdoor_space_df = pl.read_parquet( + "s3://asf-heat-pump-suitability/outputs/2023Q4/gardens/20250224_2023_Q4_EPC_garden_size_estimates_EWS_deduplicated.parquet" +) + +print("\nLoading off gas postcodes...") +off_gas_list = off_gas.process_off_gas_data() + +print("\nLoading EPC...") +epc_df = pl.read_parquet( + "s3://asf-daps/lakehouse/2025_Q1/processed/epc/deduplicated/processed_dedupl-0.parquet", + columns=[ + "UPRN", + "POSTCODE", + "PROPERTY_TYPE", + "BUILT_FORM", + "TENURE", + "MAIN_FUEL", + "MAINHEAT_DESCRIPTION", + ], +) + +print("\nAdding EPC property type...") +epc_df = prepare_sample.add_col_property_type(epc_df) + +# -------------------------------------------------------------------------# + +print("\nLOADING DATASETS TO FILL MISSING DATA") +print("\nLoading OS Code-Point Open...") +code_point_df = gpd.read_file( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/codepo_gb.gpkg", + layers="codepoint", +) + +# %% [markdown] +# ## Filter to Plymouth only + +# %% +# Convert UPRN coordinates to geopoints +os_uprn_gdf = lat_lon.generate_gdf_uprn_coords(os_uprn_df) + +# Get polygon of Plymouth LA +plymouth_polygon = la_boundaries_gdf[ + la_boundaries_gdf["LAD23NM"].str.contains("Plymouth") +]["geometry"].values[0] + +# Filter UPRN and OpenMap datasets to Plymouth only +os_uprn_plymouth_gdf = os_uprn_gdf[os_uprn_gdf.within(plymouth_polygon)] +os_openmap_buildings_plymouth_gdf = os_openmap_building_gdf[ + os_openmap_building_gdf.within(plymouth_polygon) +] +os_openmap_importantbuildings_plymouth_gdf = os_openmap_importantbuilding_gdf[ + os_openmap_importantbuilding_gdf.within(plymouth_polygon) +] + +# %% [markdown] +# ## Identify residential UPRNs and building footprints + +# %% +# View types of building +os_openmap_importantbuildings_plymouth_gdf.groupby("BUILDGTHEM")[ + "CLASSIFICA" +].value_counts(dropna=False) + +# %% +# Identify all UPRNs in a building - these contain residential addresses +uprns_in_buildings = os_openmap_buildings_plymouth_gdf.sjoin( + os_uprn_plymouth_gdf, how="inner", predicate="contains" +)["UPRN"].tolist() + +# Identify building types which are unlikely to contain residential dwellings +no_residential_overlap_building_types = [ + "Port Consisting of Docks and Nautical Berthing", + "Fire Station", + "Hospital", + "Non State Secondary Education", + "Secondary Education", + "Higher or University Education", + "Primary 
Education", + "Post Office", + "Place Of Worship", + "Medical Care Accommodation", + "Museum", + "Special Needs Education", + "Further Education", + "Non State Primary Education", + "Coach Station", + "Police Station", + "Sports And Leisure Centre", + "Vehicular Ferry Terminal", + "Hospice", + "Bus Station", +] + +# Create gdf of important buildings and railway stations which are unlikely to contain residential dwellings +exclude_buildings_plymouth_gdf = os_openmap_importantbuildings_plymouth_gdf[ + os_openmap_importantbuildings_plymouth_gdf["CLASSIFICA"].isin( + no_residential_overlap_building_types + ) +] +railway_stations_gdf = os_openmap_buildings_plymouth_gdf.sjoin( + os_openmap_railway_gdf, how="inner", predicate="contains" +) +exclude_buildings_plymouth_gdf = pd.concat( + [exclude_buildings_plymouth_gdf[["geometry"]], railway_stations_gdf[["geometry"]]] +) + +# Identify UPRNs which are in these buildings - i.e. UPRNs which are likely to represent non-residential addresses +exclude_uprns = os_uprn_plymouth_gdf.sjoin( + exclude_buildings_plymouth_gdf, how="inner", predicate="within" +)["UPRN"].tolist() + +# Filter UPRN dataset to only residential UPRNs - there will still be some commercial UPRNs from mixed-use buildings +plymouth_residential_uprns_gdf = os_uprn_plymouth_gdf[ + (~os_uprn_plymouth_gdf["UPRN"].isin(exclude_uprns)) + & (os_uprn_plymouth_gdf["UPRN"].isin(uprns_in_buildings)) +] + +# Filter building footprints to only those which contain residential UPRNs +plymouth_residential_buildings_gdf = os_openmap_buildings_plymouth_gdf.sjoin( + plymouth_residential_uprns_gdf, how="inner", predicate="contains" +).drop_duplicates(subset=["ID"]) +plymouth_residential_buildings_gdf["building_area_m2"] = ( + plymouth_residential_buildings_gdf.area +) + +# %% +plymouth_residential_buildings_gdf.plot() + +# %% +plymouth_residential_uprns_gdf.shape + +# %% [markdown] +# ## Add features +# +# - in listed building +# - in building conservation area +# - outdoor space +# - in existing / planned heat network zone +# - tenure +# - property type +# - heating fuel +# - off gas status + +# %% +# Identify UPRNs in listed buildings +listed_building_uprns = ( + plymouth_residential_uprns_gdf.sjoin_nearest( + listed_buildings_gdf, how="inner", max_distance=10 + )["UPRN"] + .unique() + .tolist() +) + +# UPRNs in building conservation areas +cons_area_uprns = plymouth_residential_uprns_gdf.sjoin( + cons_areas_gdf.to_crs(epsg=27700), how="inner", predicate="intersects" +)["UPRN"].tolist() + +# Reformat UPRN for join +outdoor_space_df = outdoor_space_df.with_columns( + pl.col("UPRN").str.replace(".0", "", literal=True).cast(pl.Int64).alias("UPRN") +) + +# UPRNs in HNs +uprns_in_hn = plymouth_residential_uprns_gdf.sjoin( + hn_zones_gdf.drop(columns=["index_right"]), predicate="within" +)["UPRN"].tolist() + +# %% +len(listed_building_uprns) / len(plymouth_residential_uprns_gdf) * 100 + +# %% +# Standardise postcode column - remove space and set to upper case +epc_df = output_areas.standardise_col_postcode(epc_df, pcd_col="POSTCODE") +epc_df = epc_df.with_columns( + # Standardise UPRN + pl.col("UPRN") + .str.replace(".0", "", literal=True) + .cast(pl.Int64, strict=False) + .alias("UPRN"), + pl.col("MAIN_FUEL").cast(pl.String).alias("MAIN_FUEL"), + pl.col("PROPERTY_TYPE").cast(pl.String).alias("PROPERTY_TYPE"), +) + +# Process central heating fuel type information +epc_df = fuel_type.extend_df_fuel_type(epc_df, epc_col="MAIN_FUEL", name="fuel_type") +epc_df = fuel_type.extend_df_fuel_type( + epc_df, 
epc_col="MAINHEAT_DESCRIPTION", name="fill_fuel_type" +) +epc_df = epc_df.with_columns( + pl.col("fuel_type").fill_null(pl.col("fill_fuel_type")) +).drop(["fill_fuel_type"]) + +# %% +# Create features dataset with initial property-level features +features_df = ( + pl.from_pandas(plymouth_residential_uprns_gdf[["UPRN"]]) + .with_columns( + pl.when(pl.col("UPRN").is_in(listed_building_uprns)) + .then(True) + .otherwise(False) + .alias("in_listed_building"), + pl.when(pl.col("UPRN").is_in(cons_area_uprns)) + .then(True) + .otherwise(False) + .alias("in_cons_area"), + pl.when(pl.col("UPRN").is_in(uprns_in_hn)) + .then(True) + .otherwise(False) + .alias("in_hn"), + ) + .join(outdoor_space_df, how="left", on="UPRN") + .join(epc_df, how="left", on="UPRN") +) + +# %% +features_df.null_count() / len(features_df) + +# %% +features_df.shape + +# %% [markdown] +# ## Assess how many EPC records missing +# +# As most of the data we are missing for UPRNs originates from EPC, we want to get a sense of how many EPC records are missing per LSOA. + +# %% +# Get Plymouth LA code +plymouth_ladcd = la_boundaries_gdf[la_boundaries_gdf["LAD23NM"] == "Plymouth"][ + "LAD23CD" +].values[0] + +print("Loading postcode lookup...") +postcode_lookup_df = output_areas.load_transform_df_area_info( + use_cols=[ + "pcd", + "lsoa11", + "msoa11", + "lsoa21", + "msoa21", + "ru11ind", + "oslaua", + "oa21", + "imd", + ] +) + +print("\nLoading ONS census number of household per LSOA...") +n_households_df = get_datasets.get_df_ons_number_of_households() + +# Join output area codes to EPC data on postcode +epc_df = epc_df.join( + postcode_lookup_df[ + [ + "POSTCODE", + "oa21", + "lsoa", + "msoa", + "lad_code", + "country_code", + "ruc_two_fold", + "ruc_EW_ten_fold", + "imd", + ] + ], + how="left", + on="POSTCODE", +) + +# Get LSOAs in Plymouth +plymouth_lsoas = postcode_lookup_df.filter(pl.col("lad_code") == plymouth_ladcd)[ + "lsoa" +].unique() + +# Filter EPC to Plymouth LSOAs and get counts per LSOA of households according to EPC data vs census +epc_counts_df = ( + epc_df.filter(pl.col("lsoa").is_in(plymouth_lsoas)) + .group_by("lsoa") + .agg(pl.col("UPRN").count()) + .join( + n_households_df.select(["mnemonic", "2021"]), + how="left", + left_on="lsoa", + right_on="mnemonic", + ) + .rename({"UPRN": "EPC_records_count", "2021": "census_count"}) + .with_columns( + (pl.col("census_count") - pl.col("EPC_records_count")).alias("count_diff") + ) +) + +# Plot differences in household counts +plt.hist(epc_counts_df["count_diff"], bins=20) +plt.title( + "Distribution of diffs between 2021 census count of households and EPC records\nin Plymouth LSOAs" +) +plt.ylabel("Count of LSOAs") +plt.xlabel("Count of households in 2021 census - count of EPC records") +plt.show() + +print("Number of EPC records in Plymouth:") +print(epc_counts_df["EPC_records_count"].sum()) + +print("Number of households in census in Plymouth:") +print(epc_counts_df["census_count"].sum()) + +print("Number of residential UPRNs in Plymouth:") + +print(len(features_df)) + +# %% [markdown] +# ## Fill missing data - off gas + +# %% +# FIND NEAREST POSTCODE TO EACH UPRN +# Remove space from postcode in code point data +code_point_df["POSTCODE"] = code_point_df["postcode"].str.replace(" ", "") + +# Find nearest postcode point for all UPRNs within a 1km radius +nearest_postcode_df = pl.from_pandas( + plymouth_residential_uprns_gdf.sjoin_nearest( + code_point_df[["POSTCODE", "geometry"]], + how="left", + max_distance=1000, + distance_col="distance_to_postcode_m", + 
).drop(columns="index_right")[["UPRN", "POSTCODE", "distance_to_postcode_m"]] +) + +# ----------------------------------------------------------------------------------- # +# ASSIGN OFF GAS LABEL + +# Label all UPRNs with on/off gas where possible +off_gas_df = ( + nearest_postcode_df.with_columns( + # Label postcodes according to on/off gas + pl.when(pl.col("POSTCODE").is_in(off_gas_list)) + .then(True) + .otherwise(False) + .alias("off_gas") + ) + .group_by("UPRN") + .agg( + # Get number of postcodes per UPRN and modal on/off gas value + pl.col("POSTCODE").alias("postcodes"), + pl.col("POSTCODE").len().alias("n_postcodes"), + pl.col("off_gas").mode().alias("modal_off_gas"), + pl.col("distance_to_postcode_m").first().alias("distance_to_postcode_m"), + ) + .with_columns( + # Select single modal on/off gas value + pl.when(pl.col("modal_off_gas").list.len() == 1).then( + pl.col("modal_off_gas").list.get(0) + ) + # If there are multiple modal values, set on/off gas as Null + .otherwise(None).alias("off_gas"), + pl.when(pl.col("n_postcodes") == 1).then(pl.col("postcodes").list.get(0)) + # If there are multiple postcode values, set postcode as Null + .otherwise(None).alias("POSTCODE"), + ) + .select(["UPRN", "POSTCODE", "distance_to_postcode_m", "off_gas"]) +) + +# Join postcode to UPRNs and assign on/off gas label based on postcode +features_df = ( + features_df.with_columns( + pl.when(pl.col("POSTCODE").is_in(off_gas_list)) + .then(True) + .when(pl.col("POSTCODE").is_not_null()) + .then(False) + .alias("off_gas") + ) + .join( + off_gas_df.select(["UPRN", "POSTCODE", "off_gas", "distance_to_postcode_m"]), + how="left", + on="UPRN", + suffix="_nearest", + ) + .with_columns( + (pl.col("POSTCODE").fill_null(pl.col("POSTCODE_nearest"))).alias( + "use_postcode" + ), + pl.col("off_gas").fill_null(pl.col("off_gas_nearest")).alias("filled_off_gas"), + ) +) + +# Fill missing off-gas label +features_df = features_df.with_columns( + pl.when((pl.col("off_gas").is_null()) & (pl.col("fuel_type") == "gas")) + .then(False) + .otherwise(pl.col("off_gas")) + .alias("off_gas") +) + +# ----------------------------------------------------------------------------------- # +# EVALUATE ACCURACY OF RESULTS + +# Evaluate postcode match accuracy +evaluate_postcode_match_df = ( + features_df.select( + ["UPRN", "POSTCODE", "POSTCODE_nearest", "distance_to_postcode_m"] + ) + .filter(pl.col("POSTCODE").is_not_null()) + .with_columns( + pl.when(pl.col("POSTCODE") == pl.col("POSTCODE_nearest")) + .then(True) + .otherwise(False) + .alias("postcode_match") + ) +) + +print(evaluate_postcode_match_df["postcode_match"].value_counts(normalize=True)) +print(features_df.filter(pl.col("POSTCODE").is_not_null()).shape) +print(features_df.filter(pl.col("POSTCODE").is_null()).shape) +print(features_df["UPRN"].n_unique()) +print(len(features_df)) +print(features_df["filled_off_gas"].value_counts(normalize=True)) + +# Plot distance distribution between incorrect and correct matches +plt.hist( + evaluate_postcode_match_df.filter(~pl.col("postcode_match"))[ + "distance_to_postcode_m" + ], + bins=50, + density=True, + alpha=0.5, + label="Postcode different from EPC", +) +plt.hist( + evaluate_postcode_match_df.filter(pl.col("postcode_match"))[ + "distance_to_postcode_m" + ], + bins=50, + density=True, + alpha=0.5, + label="Postcode matches EPC", +) +plt.title("Distribution of distance from UPRN to postcode point (m)") +plt.xlabel("Distance between UPRN and postcode point (m)") +plt.legend() +plt.show() + +# %% [markdown] +# ## Fill missing 
data - property type + +# %% +# Join residential UPRNs with their corresponding buildings +uprns_buildings_df = pl.from_pandas( + plymouth_residential_uprns_gdf.sjoin( + plymouth_residential_buildings_gdf[["ID", "geometry", "building_area_m2"]], + how="left", + predicate="within", + ) + .rename(columns={"ID": "building_id"}) + .drop(columns=["index_right", "geometry"]) +).join(features_df.select(["UPRN", "property_type"]), how="left", on="UPRN") + +# Identify building IDs which have UPRNs missing property type information +missing_property_type_buildings_list = ( + uprns_buildings_df.filter(pl.col("property_type").is_null())["building_id"] + .unique() + .to_list() +) + +# Aggregate summary property type information for the selected building IDs +property_type_df = ( + uprns_buildings_df.filter( + pl.col("building_id").is_in(missing_property_type_buildings_list), + ) + .group_by(pl.col("building_id")) + .agg( + pl.col("UPRN").count().alias("n_UPRNs"), + pl.col("property_type").alias("property_types"), + pl.col("property_type").null_count().alias("null_count"), + pl.col("property_type").n_unique().alias("nunique_property_types"), + pl.col("property_type").mode().alias("modal_property_type"), + pl.col("property_type").mode().len().alias("n_modal_property_type"), + ) + .with_columns( + ((pl.col("n_UPRNs") - pl.col("null_count")) / pl.col("n_UPRNs")).alias( + "non_null_proportion" + ) + ) +) + +print( + "UPRNs that are missing a property type and are in a building with multiple property type labels:" +) +print( + property_type_df.filter(pl.col("nunique_property_types") > 2)["null_count"].sum() + / len(plymouth_residential_uprns_gdf) +) + +print( + "\nUPRNs that are missing a property type and are in a building with multiple MODAL property type labels:" +) +print( + property_type_df.filter(pl.col("n_modal_property_type") > 1)["null_count"].sum() + / len(plymouth_residential_uprns_gdf) +) + +print( + "\nUPRNs that are missing a property type and are in a multi-property building with NO property type labels:" +) +print( + property_type_df.filter( + pl.col("n_UPRNs") > 1, pl.col("non_null_proportion") == 0 + ).height + / len(plymouth_residential_uprns_gdf) +) + +# %% +# Get UPRN density of buildings where all UPRNs have property type labels and they all have the same label +density_df = ( + uprns_buildings_df.filter( + ~pl.col("building_id").is_in(missing_property_type_buildings_list) + ) + .group_by("building_id") + .agg( + pl.col("UPRN").n_unique().alias("n_UPRNs"), + pl.col("building_area_m2").first().alias("building_area_m2"), + pl.col("property_type").unique().alias("property_types"), + pl.col("property_type").n_unique().alias("n_property_types"), + ) + .filter(pl.col("n_property_types") == 1) + .with_columns( + pl.col("property_types").list.get(0).alias("property_type"), + (pl.col("n_UPRNs") / (pl.col("building_area_m2") / 1000)).alias( + "UPRN_density_per_1000m2" + ), + ) +) + +# Plot UPRN density distribution for each property type +fig = plt.figure(figsize=(10, 5)) +for property_type in density_df["property_type"].sort().unique(): + if property_type != "Detached": + plt.hist( + density_df.filter(pl.col("property_type") == property_type)[ + "UPRN_density_per_1000m2" + ], + density=True, + bins=40, + alpha=0.5, + label=f"{property_type},\nN={len(density_df.filter(pl.col('property_type') == property_type))}", + ) +plt.title("Within-building UPRN density in different property types") +plt.xlabel("UPRNs per 1,000 m2 of building footprint") +plt.legend() + +# %% +# Plot building footprint area distribution for 
each property type +fig = plt.figure(figsize=(10, 5)) +for property_type in density_df["property_type"].sort().unique(): + plt.hist( + density_df.filter(pl.col("property_type") == property_type)["building_area_m2"], + density=True, + bins=40, + alpha=0.5, + label=f"{property_type},\nN={len(density_df.filter(pl.col('property_type') == property_type))}", + ) +plt.title("Building footprint area in different property types") +plt.xlabel("Building footprint area (m2)") +plt.xlim(0, 500) +plt.legend() + +# %% +# CALCULATE NEAREST NEIGHBOUR DISTANCE + +# Calculate distance from nearest neighbour for UPRNs with property type label +# Get coordinates of each UPRN +nn_dist_gdf = lat_lon.generate_gdf_uprn_coords( + uprns_buildings_df.filter(pl.col("property_type").is_not_null()) +) + +# Convert coordinates to array +coords = np.array(nn_dist_gdf.geometry.map(lambda p: [p.x, p.y]).tolist()) + +# Find two nearest neighbours to each UPRN +knn = NearestNeighbors(n_neighbors=2, algorithm="kd_tree").fit(coords) +knn_dist, knn_idx = knn.kneighbors(coords) + +# Get distance of second nearest neighbour (the nearest neighbour is itself) +# The second neighbour will only be itself if there is another neighbour that is 0m away +nn_dist_gdf[list(map("neighbour_{}".format, range(1, 2)))] = ( + nn_dist_gdf.geometry.values.to_numpy()[knn_idx[:, 1:]] +) +nn_dist_gdf["nn_distance_m"] = nn_dist_gdf["geometry"].distance( + gpd.GeoSeries(nn_dist_gdf["neighbour_1"], crs="EPSG:27700"), align=True +) + +# ----------------------------------------------------------------------------------- # +# PLOT RESULTS + +# Plot distance of second nearest neighbour per property type +fig = plt.figure(figsize=(10, 5)) +for property_type in [ + "Semi-detached", + "Terraced (including end-terrace)", + "Flat, maisonette or apartment", +]: + plt.hist( + nn_dist_gdf[ + (nn_dist_gdf["property_type"] == property_type) + & (nn_dist_gdf["nn_distance_m"] <= 20) + ]["nn_distance_m"], + density=True, + bins=40, + alpha=0.5, + label=f"{property_type},\nN={len(nn_dist_gdf[nn_dist_gdf['property_type'] == property_type])}", + ) +plt.title("Distance from nearest neighbour in different property types") +plt.xlabel("Distance from nearest neighbour (m)") +plt.xlim(0, 20) +plt.legend() + +# %% +# Table of counts & proportions of buildings with different counts of UPRNs per property type +table_df = ( + density_df.group_by(["property_type", "n_UPRNs"]) + .agg(pl.col("building_id").count().alias("n_buildings")) + .pivot(on="property_type", index="n_UPRNs", values="n_buildings") + .to_pandas() +) + +table_df = table_df.set_index("n_UPRNs").sort_index()[ + [ + "Detached", + "Semi-detached", + "Terraced (including end-terrace)", + "Flat, maisonette or apartment", + "Caravan or other mobile or temporary structure", + ] +] +table_df / table_df.sum() * 100 + +# %% +table_df.div(table_df.sum(axis=1), axis=0) * 100 + +# %% +# ----------------------------------------------------------------------------------- # +# CALCULATE NEAREST NEIGHBOUR DISTANCE + +# Calculate distance from nearest neighbour for all UPRNs +# Get coordinates of each UPRN +nn_dist_gdf = lat_lon.generate_gdf_uprn_coords(uprns_buildings_df) + +# Convert coordinates to array +coords = np.array(nn_dist_gdf.geometry.map(lambda p: [p.x, p.y]).tolist()) + +# Find two nearest neighbours to each UPRN +knn = NearestNeighbors(n_neighbors=2, algorithm="kd_tree").fit(coords) +knn_dist, knn_idx = knn.kneighbors(coords) + +# Get distance of second nearest neighbour (the nearest neighbour is itself) +# The second 
neighbour will only be itself if there is another neighbour that is 0m away +nn_dist_gdf[list(map("neighbour_{}".format, range(1, 2)))] = ( + nn_dist_gdf.geometry.values.to_numpy()[knn_idx[:, 1:]] +) +nn_dist_gdf["nn_distance_m"] = nn_dist_gdf["geometry"].distance( + gpd.GeoSeries(nn_dist_gdf["neighbour_1"], crs="EPSG:27700"), align=True +) + +# ----------------------------------------------------------------------------------- # +# ASSIGN PROPERTY TYPE LABELS + +# Apply rules to determine property type +nn_dist_df = pl.from_pandas(nn_dist_gdf.drop(columns=["geometry", "neighbour_1"])) +density_df = uprns_buildings_df.group_by("building_id").agg( + pl.col("UPRN").n_unique().alias("n_UPRNs") +) + +nn_dist_df = ( + nn_dist_df.join(density_df, how="left", on="building_id") + .with_columns( + pl.when(pl.col("nn_distance_m") <= 2.5) + .then(pl.lit("Flat, maisonette or apartment")) + .when(pl.col("n_UPRNs") == 1) + .then(pl.lit("Detached")) + .when(pl.col("n_UPRNs") == 2) + .then(pl.lit("Semi-detached")) + .otherwise(pl.lit("Terraced (including end-terrace)")) + .alias("predicted_property_type") + ) + .with_columns( + (pl.col("property_type") == pl.col("predicted_property_type")).alias( + "correct_prediction" + ) + ) +) + +property_types = [ + "Detached", + "Semi-detached", + "Terraced (including end-terrace)", + "Flat, maisonette or apartment", +] + +# ----------------------------------------------------------------------------------- # +# CHECK RESULTS OF PREDICTIONS + +# Proportions of correct predictions per property type +print( + nn_dist_df.filter( + pl.col("correct_prediction").is_not_null(), + pl.col("property_type") != "Caravan or other mobile or temporary structure", + )["correct_prediction"].value_counts(normalize=True) +) + +# Proportions of correct predictions per property type and UPRN count per building +results_df = ( + nn_dist_df.filter( + pl.col("correct_prediction").is_not_null(), + pl.col("property_type") != "Caravan or other mobile or temporary structure", + ) + .group_by(["property_type", "correct_prediction"]) + .agg(pl.col("UPRN").count()) + .pivot(on="property_type", index="correct_prediction", values="UPRN") + .with_columns((pl.col(property_types) / pl.col(property_types).sum()) * 100) + .select(["correct_prediction"] + property_types) +) + +print(results_df) + +# Proportions of predicted property type for incorrectly predicted detached properties +print( + nn_dist_df.filter( + pl.col("correct_prediction").is_not_null(), + pl.col("property_type") == "Detached", + ~pl.col("correct_prediction"), + )["predicted_property_type"].value_counts(normalize=True) +) + +# Proportions of predicted property type for incorrectly predicted semi-detached properties +print( + nn_dist_df.filter( + pl.col("correct_prediction").is_not_null(), + pl.col("property_type") == "Semi-detached", + ~pl.col("correct_prediction"), + )["predicted_property_type"].value_counts(normalize=True) +) + +# Join predicted property type to features df +features_df = features_df.join( + nn_dist_df.select(["UPRN", "predicted_property_type"]), how="left", on="UPRN" +) + +# %% [markdown] +# ## Fill missing data - tenure type + +# %% +# Load census data +print("Loading age band data...") +age_bands_df = pl.read_csv( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/2021Census_age_bands_OA_plymouth.csv", + skip_rows=4, +) + +print("\nLoading disability data...") +disability_df = pl.read_csv( + 
"s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/2021Census_disability_OA_plymouth.csv", + skip_rows=6, +) + +print("\nLoading economic status data...") +economic_status_df = pl.read_csv( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/2021Census_economic_status_OA_plymouth.csv", + skip_rows=6, +) + +print("\nLoading hours worked data...") +hours_worked_df = pl.read_csv( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/2021Census_hours_worked_OA_plymouth.csv", + skip_rows=6, +) + +print("\nLoading student binary data...") +student_binary_df = pl.read_csv( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/2021Census_student_binary_OA_plymouth.csv", + skip_rows=6, +) + +print("\nLoading tenure data...") +oa_tenure_df = pl.read_csv( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/2021Census_tenure_OA_plymouth.csv", + skip_rows=6, +) + +# %% +# Preprocess census data to generate features for predictive model +# Process age band data +age_bands_df = ( + age_bands_df.filter( + pl.col("2021 output area").is_not_null(), pl.col("2021 output area") != "" + ) + .rename( + { + k: v + for k, v in zip( + age_bands_df.columns, + [col.lower().replace(" ", "_") for col in age_bands_df.columns], + ) + } + ) + .rename({"2021_output_area": "oa_code"}) + .drop("total") + .with_columns( + pl.sum_horizontal( + "aged_16_to_19_years", "aged_20_to_24_years", "aged_25_to_34_years" + ).alias("prop_aged_16_to_35"), + pl.sum_horizontal("aged_35_to_49_years", "aged_50_to_64_years").alias( + "prop_aged_35_to_64" + ), + pl.sum_horizontal( + "aged_65_to_74_years", "aged_75_to_84_years", "aged_85_years_and_over" + ).alias("prop_aged_65_and_over"), + ) + .with_columns( + pl.sum_horizontal( + "prop_aged_16_to_35", "prop_aged_35_to_64", "prop_aged_65_and_over" + ).alias("total") + ) + .select( + [ + "oa_code", + "prop_aged_16_to_35", + "prop_aged_35_to_64", + "prop_aged_65_and_over", + "total", + ] + ) + .with_columns((pl.col(pl.Int64) / pl.col("total"))) + .drop("total") +) + +# Process disability data +disability_df = ( + disability_df.filter( + pl.col("2021 output area").is_not_null(), pl.col("2021 output area") != "" + ) + .rename({"2021 output area": "oa_code"}) + .with_columns( + ( + pl.col("Disabled under the Equality Act") + / pl.col("Total: All usual residents") + ).alias("prop_disabled_residents") + ) + .select(["oa_code", "prop_disabled_residents"]) +) + +# Process economic status data +economic_status_df = ( + economic_status_df.filter( + pl.col("2021 output area").is_not_null(), pl.col("2021 output area") != "" + ) + .rename({"2021 output area": "oa_code"}) + .with_columns( + ( + pl.col("Economically inactive") + / pl.col("Total: All usual residents aged 16 years and over") + ).alias("prop_economically_inactive_adults") + ) + .select(["oa_code", "prop_economically_inactive_adults"]) +) + +# Process hours worked data +hours_worked_df = ( + hours_worked_df.filter( + pl.col("2021 output area").is_not_null(), pl.col("2021 output area") != "" + ) + .rename({"2021 output area": "oa_code"}) + .with_columns( + ( + pl.col("Full-time") + / pl.col( + "Total: All usual residents aged 16 years and over in employment the week before the census" + ) + ).alias("prop_employed_adults_working_full_time") + ) + .select(["oa_code", "prop_employed_adults_working_full_time"]) +) + +# Process student binary +student_binary_df = ( + student_binary_df.filter( + pl.col("2021 output area").is_not_null(), pl.col("2021 
output area") != "" + ) + .rename({"2021 output area": "oa_code"}) + .with_columns((pl.col("Student") / pl.col("Total")).alias("prop_students")) + .select(["oa_code", "prop_students"]) +) + +# Process tenure +oa_tenure_df = ( + oa_tenure_df.filter( + pl.col("2021 output area").is_not_null(), pl.col("2021 output area") != "" + ) + .rename({"2021 output area": "oa_code"}) + .select( + [ + "oa_code", + "Total: All households", + "Owned", + "Shared ownership", + "Social rented", + "Private rented", + "Lives rent free", + ] + ) + .with_columns( + ( + pl.sum_horizontal("Owned", "Shared ownership") + / pl.col("Total: All households") + ).alias("prop_owner_occupied"), + (pl.col("Social rented") / pl.col("Total: All households")).alias( + "prop_social_rented" + ), + ) + .select(["oa_code", "prop_owner_occupied", "prop_social_rented"]) +) + +# Join census datasets together +census_df = oa_tenure_df +for df in [ + age_bands_df, + disability_df, + economic_status_df, + hours_worked_df, + student_binary_df, +]: + census_df = census_df.join(df, how="left", on="oa_code") + +# %% +# CREATE DATAFRAME WITH FEATURES TO TRAIN AND TEST MODEL +model_df = ( + epc_df.select( + [ + "UPRN", + "property_type", + "TENURE", + "oa21", + ] + ) + .join(census_df, how="left", left_on="oa21", right_on="oa_code") + .join( + features_df.select(["UPRN", "in_cons_area", "in_listed_building"]), + how="left", + on="UPRN", + ) + .filter( + pl.all_horizontal(pl.all().is_not_null()), + pl.col("property_type") != "Caravan or other mobile or temporary structure", + pl.col("TENURE") != "unknown", + ) + .drop(["oa21", "prop_aged_35_to_64"]) + .to_pandas() +) + +X = model_df.drop(columns=["TENURE"]) +y = model_df[["TENURE"]] + +# One-hot encode property type feature +enc = OneHotEncoder() +onehotenc_df = pd.DataFrame(enc.fit_transform(X[["property_type"]]).toarray()) +onehotenc_df.columns = list(col.lower().replace(" ", "_") for col in enc.categories_[0]) + +# Join encodings to X +X = X.join(onehotenc_df, how="left").drop(columns=["property_type", "detached", "UPRN"]) + +# ----------------------------------------------------------------------------------- # +# TRAIN AND TEST MODEL + +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.25, random_state=1 +) + +# Fit and predict with K-nearest neighbours classifier model +classifier = KNeighborsClassifier(n_neighbors=10) +classifier.fit(X_train, y_train) +print("K Nearest Neighbour classifier accuracy score:") +print(classifier.score(X_test, y_test)) +y_pred = classifier.predict(X_test) +y_pred_prob = classifier.predict_proba(X_test) + +# Create dataframe with results of predictions and probabilities +results_df = ( + pl.DataFrame( + { + "actual": y_test["TENURE"], + "predicted": y_pred, + "0": y_pred_prob[:, 0], + "1": y_pred_prob[:, 1], + "2": y_pred_prob[:, 2], + } + ) + .rename({k: v for k, v in zip(["0", "1", "2"], classifier.classes_)}) + .with_columns( + (pl.col("actual") == pl.col("predicted")).alias("correct_prediction"), + max_prob=pl.concat_list( + "owner-occupied", "rental (private)", "rental (social)" + ).list.max(), + ) +) + +# classifier = RandomForestClassifier() +# classifier.fit(X_train, y_train) +# print("\nRandom Forest classifier accuracy score:") +# print(classifier.score(X_test, y_test)) + +# ----------------------------------------------------------------------------------- # +# PLOT RESULTS + +# Plot probability of predictions based on prediction outcome +plot_df = results_df.group_by(["correct_prediction", "max_prob"]).agg( + 
pl.col("predicted").count().alias("n_predictions") +) +fig = plt.figure(figsize=(10, 5)) +plot_df = ( + plot_df.pivot( + on="correct_prediction", + index="max_prob", + values="n_predictions", + ) + .rename({"true": "Correct prediction", "false": "Incorrect prediction"}) + .sort(by="max_prob", descending=False) + .to_pandas() +) + +plot_df.index = plot_df["max_prob"] +plot_df = plot_df.drop("max_prob", axis=1) + +plot_df.plot(kind="bar") + +plt.ylabel("Count of predictions") +plt.xlabel("Prediction probability") +plt.title("Prediction probability by prediction accuracy") +plt.show() + +# %% +# USE MODEL TO PREDICT TENURE TYPE +# Load output area geospatial boundaries +oa_boundaries_gdf = gpd.read_file( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/Output_Areas_2021_EW_BFE_V9_6122634609897870819/OA_2021_EW_BFE_V9.shp" +) + +# Assign UPRNs to their output areas +uprns_with_oa_df = plymouth_residential_uprns_gdf.sjoin( + oa_boundaries_gdf[["OA21CD", "geometry"]], how="left", predicate="intersects" +).drop(columns="index_right") +features_df = features_df.join( + pl.from_pandas(uprns_with_oa_df[["UPRN", "OA21CD"]]), how="left", on="UPRN" +) +pd_features_df = features_df.to_pandas() + +# One hot encode property type data +onehotenc_df = pd.DataFrame( + enc.transform( + pd_features_df[["predicted_property_type"]].rename( + columns={"predicted_property_type": "property_type"} + ) + ).toarray() +) +onehotenc_df.columns = list(col.lower().replace(" ", "_") for col in enc.categories_[0]) +X = ( + pd_features_df.join(onehotenc_df, how="left") + .drop(columns=["property_type", "detached"]) + .merge(census_df.to_pandas(), how="left", left_on="OA21CD", right_on="oa_code")[ + X.columns + ] +) + +# Run model and get results +y_pred = classifier.predict(X) +y_pred_prob = classifier.predict_proba(X) + +results_df = pd.DataFrame( + { + "predicted_tenure": y_pred, + "0": y_pred_prob[:, 0], + "1": y_pred_prob[:, 1], + "2": y_pred_prob[:, 2], + } +).rename(columns={k: v for k, v in zip(["0", "1", "2"], classifier.classes_)}) + +# Join predicted tenure type to features dataset +features_df = pl.from_pandas(pd_features_df.join(results_df, how="left")) +features_df = features_df.with_columns( + tenure_prob=pl.concat_list( + "owner-occupied", "rental (private)", "rental (social)" + ).list.max(), +).drop(["owner-occupied", "rental (private)", "rental (social)"]) diff --git a/requirements.txt b/requirements.txt index 4c9b14da..a2ce2506 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ balance~=0.10 jupytext~=1.17 fastexcel~=0.13 scikit-learn<=1.2.2 -matplotlib==3.5.2 +matplotlib~=3.10.2 tqdm~=4.67 geopandas~=1.0 shapely~=2.1 From 37711929a6db055a729128cb014d6c9048995575 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Thu, 21 Aug 2025 15:07:11 +0100 Subject: [PATCH 2/5] delete unused imports in create_full_dataset_plymouth.py --- .../create_full_dataset_plymouth.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py b/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py index 2b30af85..eb6bfdf1 100644 --- a/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py +++ b/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py @@ -19,21 +19,8 @@ import polars as pl import pandas as pd import 
fiona -import momepy -import folium -import networkx as nx -import rustworkx as rx -import osmnx as ox import numpy as np -from scipy.sparse import csr_matrix -from sklearn.cluster import DBSCAN, HDBSCAN -import graphblas_algorithms as ga -import matplotlib.cm as cm import matplotlib.pyplot as plt -from tqdm import tqdm -from matplotlib import collections -import libpysal -import spopt from sklearn.preprocessing import OneHotEncoder from sklearn import metrics from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier From a70ecd008b919090c0bbc57c0408c0fdc8156658 Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Fri, 29 Aug 2025 12:14:20 +0100 Subject: [PATCH 3/5] TEMP - comment out lines in garden_size.py --- .../pipeline/prepare_features/garden_size.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/garden_size.py b/asf_heat_pump_suitability/pipeline/prepare_features/garden_size.py index 1c95cb36..db71d878 100644 --- a/asf_heat_pump_suitability/pipeline/prepare_features/garden_size.py +++ b/asf_heat_pump_suitability/pipeline/prepare_features/garden_size.py @@ -99,7 +99,12 @@ def generate_gdf_garden_size( """ building_size = ( intersections_gdf.groupby("NATIONALCADASTRALREFERENCE")[ - ["building_id", "building_intersection_area_m2", "height"] + [ + "building_id", + "building_intersection_area_m2", + # "height", + "source", + ] ] .agg( building_ids=( @@ -110,10 +115,10 @@ def generate_gdf_garden_size( "building_intersection_area_m2", "sum", ), # get total building area on land parcel - max_building_height=( - "height", - "max", - ), # get max building height on land parcel + # max_building_height=( + # "height", + # "max", + # ), # get max building height on land parcel ) .reset_index() ) From 5cf03bfeada4d889a22ce388d63a9b8288c0fe5a Mon Sep 17 00:00:00 2001 From: roisin <104171770+crispy-wonton@users.noreply.github.com> Date: Fri, 29 Aug 2025 12:16:13 +0100 Subject: [PATCH 4/5] fill missing data new features in create_full_dataset_plymouth.py --- .../create_full_dataset_plymouth.py | 793 ++++++++++++++++-- 1 file changed, 737 insertions(+), 56 deletions(-) diff --git a/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py b/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py index eb6bfdf1..0618bc4b 100644 --- a/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py +++ b/asf_heat_pump_suitability/analysis/exploratory/create_full_dataset/create_full_dataset_plymouth.py @@ -21,12 +21,17 @@ import fiona import numpy as np import matplotlib.pyplot as plt +from matplotlib import ticker + +from sklearn.cluster import HDBSCAN from sklearn.preprocessing import OneHotEncoder from sklearn import metrics + from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier +from asf_heat_pump_suitability.utils import geo_utils, save_utils from asf_heat_pump_suitability.getters import get_datasets from asf_heat_pump_suitability.pipeline.prepare_features import ( lat_lon, @@ -35,9 +40,13 @@ off_gas, anchor_properties, output_areas, + building_footprint, + land_extent, + garden_size, ) from asf_heat_pump_suitability.analysis.flats_on_fossils.features import fuel_type from asf_heat_pump_suitability.pipeline.reweight_epc 
import prepare_sample +import folium # %% [markdown] # ## Load data @@ -75,12 +84,12 @@ print("\nLOADING PROPERTY-LEVEL FEATURE DATASETS...") print("\nLoading listed buildings...") listed_buildings_gdf = gpd.read_file( - "../spatial_clustering/data/National_Heritage_List_for_England_NHLE_v02_VIEW_-464524051049198649/Listed_Building_points.shp" + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/National_Heritage_List_for_England_NHLE_v02_VIEW_-464524051049198649/Listed_Building_points.shp" ) print("\nLoading building conservation areas...") cons_areas_gdf = gpd.read_file( - "../spatial_clustering/data/conservation-area (1).geojson" + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/conservation-area (1).geojson" ) print("\nLoading existing / planned HN zones...") @@ -89,11 +98,6 @@ layer="heat-network-zone-map-Plymouth", ) -print("\nLoading outdoor space...") -outdoor_space_df = pl.read_parquet( - "s3://asf-heat-pump-suitability/outputs/2023Q4/gardens/20250224_2023_Q4_EPC_garden_size_estimates_EWS_deduplicated.parquet" -) - print("\nLoading off gas postcodes...") off_gas_list = off_gas.process_off_gas_data() @@ -248,11 +252,6 @@ cons_areas_gdf.to_crs(epsg=27700), how="inner", predicate="intersects" )["UPRN"].tolist() -# Reformat UPRN for join -outdoor_space_df = outdoor_space_df.with_columns( - pl.col("UPRN").str.replace(".0", "", literal=True).cast(pl.Int64).alias("UPRN") -) - # UPRNs in HNs uprns_in_hn = plymouth_residential_uprns_gdf.sjoin( hn_zones_gdf.drop(columns=["index_right"]), predicate="within" @@ -301,7 +300,6 @@ .otherwise(False) .alias("in_hn"), ) - .join(outdoor_space_df, how="left", on="UPRN") .join(epc_df, how="left", on="UPRN") ) @@ -334,6 +332,7 @@ "oslaua", "oa21", "imd", + # "LSOA21NM" ] ) @@ -840,7 +839,7 @@ # ## Fill missing data - tenure type # %% -# Load census data +# LOAD CENSUS DATA print("Loading age band data...") age_bands_df = pl.read_csv( "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/2021Census_age_bands_OA_plymouth.csv", @@ -878,7 +877,7 @@ ) # %% -# Preprocess census data to generate features for predictive model +# PREPROCESS CENSUS DATA TO GENERATE FEATURES FOR PREDICTIVE MODEL # Process age band data age_bands_df = ( age_bands_df.filter( @@ -1011,9 +1010,10 @@ ) # Join census datasets together -census_df = oa_tenure_df +# census_df = oa_tenure_df +census_df = age_bands_df for df in [ - age_bands_df, + # age_bands_df, disability_df, economic_status_df, hours_worked_df, @@ -1022,17 +1022,116 @@ census_df = census_df.join(df, how="left", on="oa_code") # %% +# Load output area geospatial boundaries +oa_boundaries_gdf = gpd.read_file( + "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/Output_Areas_2021_EW_BFE_V9_6122634609897870819/OA_2021_EW_BFE_V9.shp" +) + +# Assign UPRNs to their output areas +uprns_with_oa_df = plymouth_residential_uprns_gdf.sjoin( + oa_boundaries_gdf[["OA21CD", "LSOA21CD", "geometry"]], + how="left", + predicate="intersects", +).drop(columns="index_right") + +# Join output areas to features +features_df = features_df.join( + pl.from_pandas(uprns_with_oa_df[["UPRN", "OA21CD", "LSOA21CD"]]), + how="left", + on="UPRN", +).with_columns(pl.col("TENURE").replace("unknown", None)) + +# %% +# CALCULATE PROPORTIONS OF TENURE TYPES FOR NEIGHBOURS WITHIN 100m RADIUS FOR EACH UPRN +nn_tenure_df = features_df.select( + [ + "UPRN", + "TENURE", + "OA21CD", + ] +).join( + os_uprn_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"]), how="left", on="UPRN" +) + 
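+# A minimal sketch, not part of the pipeline, of how the radius-neighbour lookup below behaves (assuming scikit-learn's NearestNeighbors API): each query point is returned as its own neighbour at distance 0, which is why 'self' matches are filtered out further down. Here, with one point 50m away and one 500m away, only the former falls inside the 100m radius. +_demo_nn = NearestNeighbors(radius=100, algorithm="kd_tree").fit( + np.array([[0.0, 0.0], [50.0, 0.0], [500.0, 0.0]]) +) +print(_demo_nn.radius_neighbors(np.array([[0.0, 0.0]]), return_distance=False)) +# Expected: [array([0, 1])] - the query point itself and the 50m neighbour (order not guaranteed) + 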
+# Calculate distance from nearest neighbour for all UPRNs +# Get coordinates of each UPRN +nn_tenure_df = lat_lon.generate_gdf_uprn_coords( + nn_tenure_df, usecols=["UPRN", "TENURE", "X_COORDINATE", "Y_COORDINATE"] +) + +# Convert coordinates to array +coords = np.array(nn_tenure_df.geometry.map(lambda p: [p.x, p.y]).tolist()) + +# Find all neighbours within 100m radius of each UPRN +knn = NearestNeighbors(radius=100, algorithm="kd_tree").fit(coords) +knn_dist, knn_idx = knn.radius_neighbors(coords) + +# Get the tenures of the neighbours within 100m +nn_tenure_df["nearest_tenures"] = [ + nn_tenure_df.TENURE.values[knn_idx[i]] for i in range(0, len(knn_idx)) +] + +# Get the UPRNs of the neighbours within 100m +nn_tenure_df["nearest_UPRNs"] = [ + nn_tenure_df.UPRN.values[knn_idx[i]] for i in range(0, len(knn_idx)) +] + +nn_tenure_df = pd.DataFrame(nn_tenure_df).explode( + column=["nearest_tenures", "nearest_UPRNs"] +) +nn_tenure_df = ( + nn_tenure_df[ + # Filter out neighbours which are 'self' and filter out any neighbours with no tenure + (nn_tenure_df["UPRN"] != nn_tenure_df["nearest_UPRNs"]) + & (nn_tenure_df["nearest_tenures"].notnull()) + # Get the counts of neighbours with each tenure type per UPRN + ] + .groupby(["UPRN", "nearest_tenures"]) + .agg( + count=pd.NamedAgg(column="UPRN", aggfunc="count"), + ) + .reset_index() +) + +# Calculate the proportion of neighbours with each tenure type +total_tenure_df = nn_tenure_df.groupby("UPRN").agg( + total=pd.NamedAgg(column="count", aggfunc="sum") +) +nn_tenure_df = nn_tenure_df.merge(total_tenure_df, how="left", on="UPRN") +nn_tenure_df["prop_neighbours"] = nn_tenure_df["count"] / nn_tenure_df["total"] + +nn_tenure_df = pl.from_pandas( + nn_tenure_df.pivot( + index="UPRN", columns="nearest_tenures", values="prop_neighbours" + ) + .fillna(0) + .reset_index() +).rename( + { + "owner-occupied": "prop_nn_owner_occupied", + "rental (private)": "prop_nn_private_rental", + "rental (social)": "prop_nn_social_rental", + } +) + +# ----------------------------------------------------------------------------------- # # CREATE DATAFRAME WITH FEATURES TO TRAIN AND TEST MODEL model_df = ( - epc_df.select( + features_df.select( [ "UPRN", "property_type", "TENURE", - "oa21", + "OA21CD", ] ) - .join(census_df, how="left", left_on="oa21", right_on="oa_code") + .join(census_df, how="left", left_on="OA21CD", right_on="oa_code") + .join( + os_uprn_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"]), + how="left", + on="UPRN", + ) + .join(nn_tenure_df.drop("prop_nn_private_rental"), how="left", on="UPRN") .join( features_df.select(["UPRN", "in_cons_area", "in_listed_building"]), how="left", @@ -1043,7 +1142,7 @@ pl.col("property_type") != "Caravan or other mobile or temporary structure", pl.col("TENURE") != "unknown", ) - .drop(["oa21", "prop_aged_35_to_64"]) + .drop(["OA21CD", "prop_aged_35_to_64"]) .to_pandas() ) @@ -1065,10 +1164,21 @@ X, y, test_size=0.25, random_state=1 ) +print(y_train["TENURE"].value_counts(normalize=True)) +print(y_test["TENURE"].value_counts(normalize=True)) + # Fit and predict with K-nearest neighbours classifier model -classifier = KNeighborsClassifier(n_neighbors=10) +# classifier = KNeighborsClassifier(n_neighbors=10) +# classifier.fit(X_train, y_train) +# print("K Nearest Neighbour classifier accuracy score:") +# print(classifier.score(X_test, y_test)) +# y_pred = classifier.predict(X_test) +# y_pred_prob = classifier.predict_proba(X_test) + +# Fit and predict with Random Forest classifier model +classifier = 
RandomForestClassifier(class_weight="balanced") classifier.fit(X_train, y_train) -print("K Nearest Neighbour classifier accuracy score:") +print("\nRandom Forest classifier accuracy score:") print(classifier.score(X_test, y_test)) y_pred = classifier.predict(X_test) y_pred_prob = classifier.predict_proba(X_test) @@ -1093,14 +1203,8 @@ ) ) -# classifier = RandomForestClassifier() -# classifier.fit(X_train, y_train) -# print("\nRandom Forest classifier accuracy score:") -# print(classifier.score(X_test, y_test)) - # ----------------------------------------------------------------------------------- # # PLOT RESULTS - # Plot probability of predictions based on prediction outcome plot_df = results_df.group_by(["correct_prediction", "max_prob"]).agg( pl.col("predicted").count().alias("n_predictions") @@ -1117,7 +1221,21 @@ .to_pandas() ) -plot_df.index = plot_df["max_prob"] +bins = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] +labels = [ + "(0.3-0.4]", + "(0.4-0.5]", + "(0.5-0.6]", + "(0.6-0.7]", + "(0.7-0.8]", + "(0.8-0.9]", + "(0.9-1]", +] +plot_df["max_prob_group"] = pd.cut( + plot_df["max_prob"], bins=bins, labels=labels, right=True +) + +plot_df = plot_df.groupby("max_prob_group").sum() plot_df = plot_df.drop("max_prob", axis=1) plot_df.plot(kind="bar") @@ -1128,37 +1246,250 @@ plt.show() # %% -# USE MODEL TO PREDICT TENURE TYPE -# Load output area geospatial boundaries -oa_boundaries_gdf = gpd.read_file( - "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/Output_Areas_2021_EW_BFE_V9_6122634609897870819/OA_2021_EW_BFE_V9.shp" +# Plot stacked bar chart with proportions of prediction probability vs prediction accuracy +plot_df = plot_df.div(plot_df.sum(axis=1), axis=0) * 100 # convert to 0‑100 % + +ax = plot_df.plot( + kind="bar", stacked=True, figsize=(10, 6), width=0.8, edgecolor="none" ) -# Assign UPRNs to their output areas -uprns_with_oa_df = plymouth_residential_uprns_gdf.sjoin( - oa_boundaries_gdf[["OA21CD", "geometry"]], how="left", predicate="intersects" -).drop(columns="index_right") -features_df = features_df.join( - pl.from_pandas(uprns_with_oa_df[["UPRN", "OA21CD"]]), how="left", on="UPRN" +ax.set_ylabel("Percentage of predictions") +ax.set_xlabel("Probability of predictions") +ax.set_title("Prediction probability by prediction accuracy") + +ax.legend(title="", bbox_to_anchor=(1.02, 1), loc="upper left") +ax.grid(axis="y", linestyle=":", linewidth=0.5) +plt.tight_layout() +plt.show() + +# %% +from sklearn.metrics import confusion_matrix +import seaborn as sns + +tenure_mapping = {"owner-occupied": 0, "rental (private)": 1, "rental (social)": 2} +y_true = np.array(y_test["TENURE"].map(tenure_mapping)) + +y_pred_arr = np.copy(y_pred) +for k, v in tenure_mapping.items(): + y_pred_arr[y_pred == k] = v + +matrix = confusion_matrix(list(y_true), list(y_pred_arr), normalize="true") +print(matrix.diagonal() / matrix.sum(axis=1)) + +sns.heatmap( + matrix, + annot=True, + cmap="YlGnBu", + xticklabels=tenure_mapping.keys(), + yticklabels=tenure_mapping.keys(), +) +plt.xlabel("Predicted", fontsize=12) +plt.ylabel("Actual", fontsize=12) +plt.title("Confusion Matrix", fontsize=16) +plt.show() + +# %% +# FEATURE ENGINEERING +# PLOT CORRELATION HEATMAP +import seaborn as sns + +corr = X.corr() +plt.figure(figsize=(15, 8)) +ax = sns.heatmap(corr, annot=True) + +# %% [markdown] +# ### ------- Test feature engineering for tenure prediction model -------- +# +# All the code here is copied directly from an sklearn tutorial with some minor adjustments: 
https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html +# +# Ultimately, I decided not to proceed with feature engineering as the original feature importance looks consistent and good through permutation and engineering didn't improve the accuracy or confusion matrix for the model. + +# %% +# RANDOM FOREST CLASSIFIER FEATURE IMPORTANCE WITH NO FEATURE ENGINEERING +import matplotlib.pyplot as plt +import matplotlib +import numpy as np +import pandas as pd +from sklearn.inspection import permutation_importance +from sklearn.utils.fixes import parse_version + + +def plot_permutation_importance(clf, X, y, ax): + result = permutation_importance(clf, X, y, n_repeats=10, random_state=42, n_jobs=2) + perm_sorted_idx = result.importances_mean.argsort() + + # `labels` argument in boxplot is deprecated in matplotlib 3.9 and has been + # renamed to `tick_labels`. The following code handles this, but as a + # scikit-learn user you probably can write simpler code by using `labels=...` + # (matplotlib < 3.9) or `tick_labels=...` (matplotlib >= 3.9). + tick_labels_parameter_name = ( + "tick_labels" + if parse_version(matplotlib.__version__) >= parse_version("3.9") + else "labels" + ) + tick_labels_dict = {tick_labels_parameter_name: X.columns[perm_sorted_idx]} + ax.boxplot(result.importances[perm_sorted_idx].T, vert=False, **tick_labels_dict) + ax.axvline(x=0, color="k", linestyle="--") + return ax + + +mdi_importances = pd.Series(classifier.feature_importances_, index=X_train.columns) +tree_importance_sorted_idx = np.argsort(classifier.feature_importances_) + +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) +mdi_importances.sort_values().plot.barh(ax=ax1) +ax1.set_xlabel("Gini importance") +plot_permutation_importance(classifier, X_train, y_train, ax2) +ax2.set_xlabel("Decrease in accuracy score") +fig.suptitle( + "Impurity-based vs. permutation importances on multicollinear features (train set)" +) +_ = fig.tight_layout() + +# %% +# PLOT CORRELATED FEATURES IN HEATMAP AND DENDROGRAM +from scipy.cluster import hierarchy +from scipy.spatial.distance import squareform +from scipy.stats import spearmanr + +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) +corr = spearmanr(X).correlation + +# Ensure the correlation matrix is symmetric +corr = (corr + corr.T) / 2 +np.fill_diagonal(corr, 1) + +# We convert the correlation matrix to a distance matrix before performing +# hierarchical clustering using Ward's linkage. 
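+# Illustrative check, not part of the analysis: under this transform a correlation of +/-0.9 maps to a distance of 0.1, so strongly correlated feature pairs sit close together while uncorrelated pairs sit at distance 1. +print(1 - np.abs(np.array([-0.9, 0.0, 0.9])))  # -> [0.1 1.  0.1] 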
+distance_matrix = 1 - np.abs(corr) +dist_linkage = hierarchy.ward(squareform(distance_matrix)) +dendro = hierarchy.dendrogram( + dist_linkage, labels=X.columns.to_list(), ax=ax1, leaf_rotation=90 +) +dendro_idx = np.arange(0, len(dendro["ivl"])) + +ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]]) +ax2.set_xticks(dendro_idx) +ax2.set_yticks(dendro_idx) +ax2.set_xticklabels(dendro["ivl"], rotation="vertical") +ax2.set_yticklabels(dendro["ivl"]) +_ = fig.tight_layout() + +# %% +# PICK A THRESHOLD FROM DENDROGRAM AND PERFORM HIERARCHICAL CLUSTERING - DETERMINE A FEATURE FROM EACH CLUSTER TO KEEP +from collections import defaultdict + +# Threshold chosen from manual inspection of dendrogram +threshold = 1 + +cluster_ids = hierarchy.fcluster(dist_linkage, threshold, criterion="distance") +cluster_id_to_feature_ids = defaultdict(list) +for idx, cluster_id in enumerate(cluster_ids): + cluster_id_to_feature_ids[cluster_id].append(idx) +selected_features = [v[0] for v in cluster_id_to_feature_ids.values()] +selected_features_names = X.columns[selected_features] + +X_train_sel = X_train[selected_features_names] +X_test_sel = X_test[selected_features_names] + +clf_selected_features = RandomForestClassifier( + n_estimators=100, random_state=42, class_weight="balanced" +) +clf_selected_features.fit(X_train_sel, y_train) +print( + "Baseline accuracy on test data with features removed:" + f" {clf_selected_features.score(X_test_sel, y_test):.3}" +) + +fig, ax = plt.subplots(figsize=(7, 6)) +plot_permutation_importance(clf_selected_features, X_test_sel, y_test, ax) +ax.set_title("Permutation Importances on selected subset of features\n(test set)") +ax.set_xlabel("Decrease in accuracy score") +ax.figure.tight_layout() +plt.show() + +# %% +from sklearn.metrics import confusion_matrix +import seaborn as sns + +tenure_mapping = {"owner-occupied": 0, "rental (private)": 1, "rental (social)": 2} +y_true = np.array(y_test["TENURE"].map(tenure_mapping)) +y_pred = clf_selected_features.predict(X_test_sel) + +y_pred_arr = np.copy(y_pred) +for k, v in tenure_mapping.items(): + y_pred_arr[y_pred == k] = v + +matrix = confusion_matrix(list(y_true), list(y_pred_arr), normalize="true") +print(matrix.diagonal() / matrix.sum(axis=1)) + +sns.heatmap( + matrix, + annot=True, + cmap="YlGnBu", + xticklabels=tenure_mapping.keys(), + yticklabels=tenure_mapping.keys(), +) +plt.xlabel("Predicted", fontsize=12) +plt.ylabel("Actual", fontsize=12) +plt.title("Confusion Matrix", fontsize=16) +plt.show() + +# %% [markdown] +# ### -------- End of feature engineering test -------- + +# %% +# USE MODEL TO PREDICT TENURE TYPE +features = [ + "prop_aged_16_to_35", + "prop_aged_65_and_over", + "prop_disabled_residents", + "prop_economically_inactive_adults", + "prop_employed_adults_working_full_time", + "prop_students", + "X_COORDINATE", + "Y_COORDINATE", + "prop_nn_owner_occupied", + "prop_nn_social_rental", + "in_cons_area", + "in_listed_building", +] + +# Build feature matrix for tenure prediction (output areas were already joined above) +pd_features_df = ( + # features_df.join( + # pl.from_pandas(uprns_with_oa_df[["UPRN", "OA21CD", "LSOA21CD"]]), how="left", on="UPRN" + # ) + features_df.join( + os_uprn_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"]), + how="left", + on="UPRN", + ) + .join(nn_tenure_df, how="left", on="UPRN") + .join(census_df, how="left", left_on="OA21CD", right_on="oa_code") + .with_columns( + pl.col("property_type") + .fill_null(pl.col("predicted_property_type")) + .alias("use_property_type") + ) + .filter( + pl.col("use_property_type") != "Caravan or other 
+
+# %% [markdown]
+# ### -------- End of feature engineering test --------
+
+# %%
+# USE MODEL TO PREDICT TENURE TYPE
+features = [
+    "prop_aged_16_to_35",
+    "prop_aged_65_and_over",
+    "prop_disabled_residents",
+    "prop_economically_inactive_adults",
+    "prop_employed_adults_working_full_time",
+    "prop_students",
+    "X_COORDINATE",
+    "Y_COORDINATE",
+    "prop_nn_owner_occupied",
+    "prop_nn_social_rental",
+    "in_cons_area",
+    "in_listed_building",
+]
+
+# Join output areas to features
+pd_features_df = (
+    # features_df.join(
+    #     pl.from_pandas(uprns_with_oa_df[["UPRN", "OA21CD", "LSOA21CD"]]), how="left", on="UPRN"
+    # )
+    features_df.join(
+        os_uprn_df.select(["UPRN", "X_COORDINATE", "Y_COORDINATE"]),
+        how="left",
+        on="UPRN",
+    )
+    .join(nn_tenure_df, how="left", on="UPRN")
+    .join(census_df, how="left", left_on="OA21CD", right_on="oa_code")
+    .with_columns(
+        pl.col("property_type")
+        .fill_null(pl.col("predicted_property_type"))
+        .alias("use_property_type")
+    )
+    .filter(
+        pl.col("use_property_type") != "Caravan or other mobile or temporary structure",
+        pl.all_horizontal(pl.col(features).is_not_null()),
+    )
+    .to_pandas()
 )
-pd_features_df = features_df.to_pandas()
 
 # One hot encode property type data
 onehotenc_df = pd.DataFrame(
     enc.transform(
-        pd_features_df[["predicted_property_type"]].rename(
-            columns={"predicted_property_type": "property_type"}
+        pd_features_df[["use_property_type"]].rename(
+            columns={"use_property_type": "property_type"}
         )
     ).toarray()
 )
 onehotenc_df.columns = list(col.lower().replace(" ", "_") for col in enc.categories_[0])
-X = (
-    pd_features_df.join(onehotenc_df, how="left")
-    .drop(columns=["property_type", "detached"])
-    .merge(census_df.to_pandas(), how="left", left_on="OA21CD", right_on="oa_code")[
-        X.columns
-    ]
-)
+
+X = pd_features_df.join(onehotenc_df, how="left")[X.columns]
 
 # Run model and get results
 y_pred = classifier.predict(X)
@@ -1173,10 +1504,360 @@
 }
 ).rename(columns={k: v for k, v in zip(["0", "1", "2"], classifier.classes_)})
 
-# Join predicted tenure type to features dataset
-features_df = pl.from_pandas(pd_features_df.join(results_df, how="left"))
+# Join predicted tenure type to features dataset
+predicted_tenure_df = (
+    pl.from_pandas(pd_features_df[["UPRN"]].join(results_df, how="left"))
+    .with_columns(
+        tenure_prob=pl.concat_list(
+            "owner-occupied", "rental (private)", "rental (social)"
+        ).list.max(),
+    )
+    .drop(["owner-occupied", "rental (private)", "rental (social)"])
+)
+
+features_df = features_df.join(predicted_tenure_df, how="left", on="UPRN")
+
+# %% [markdown]
+# ## Add data - IMD decile
+
+# %%
+# Load IMD deciles per LSOA and load LSOA names
+imd_df = pl.read_csv(
+    "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/societal-wellbeing_imd2019_indices.csv",
+    skip_rows=7,
+)
+lsoa_names_df = pl.read_csv(
+    "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/Lower_Layer_Super_Output_Area_(2021)_to_LAD_(April_2023)_Lookup_in_England_and_Wales.csv"
+)
+
+# %%
+# Join IMD decile to UPRNs via their LSOA
+imd_df = (
+    imd_df.select(["Reference area", "2019"])
+    .rename({"Reference area": "lsoa_name", "2019": "imd_decile"})
+    .join(
+        lsoa_names_df.select(["LSOA21CD", "LSOA21NM"]),
+        how="left",
+        left_on="lsoa_name",
+        right_on="LSOA21NM",
+    )
+)
+
+features_df = features_df.join(
+    imd_df.select(["LSOA21CD", "imd_decile"]), how="left", on="LSOA21CD"
+)
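+
+# %% [markdown]
+# A quick join-coverage sketch: the IMD file is keyed on LSOA names rather than codes, so
+# any unmatched names (or UPRNs without an LSOA code) surface here as null deciles:
+
+# %%
+# Count UPRNs left without an IMD decile after the joins above
+print(
+    features_df.select(
+        pl.len().alias("n_uprns"),
+        pl.col("imd_decile").null_count().alias("n_missing_imd_decile"),
+    )
+)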
+
+# %% [markdown]
+# ## Fill missing data - garden size
+#
+# - use average by property type (+ area)
+# - sum up the different garden sizes
+
+# %%
+# Load land registry polygons for Plymouth
+land_extent_gdf = gpd.read_file(
+    "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/Land_Registry_Cadastral_Parcels.gml",
+    columns=["NATIONALCADASTRALREFERENCE", "geometry"],
+)
+
+# Preprocess land polygons
+land_extent_gdf = geo_utils.transform_gdf_drop_duplicates(land_extent_gdf)
+land_extent_gdf["land_area_m2"] = land_extent_gdf["geometry"].area
+
+# Get Plymouth OS building footprints
+plymouth_building_footprints_gdf = (
+    plymouth_residential_buildings_gdf[["geometry", "ID", "building_area_m2"]]
+    .rename(columns={"ID": "building_id"})
+    .assign(source="os")
+    .copy()
+)
+
+# %%
+# Remove land extent geometries which are not polygons
+print(land_extent_gdf["geometry"].geom_type.value_counts())
+land_extent_gdf = land_extent_gdf[land_extent_gdf.geometry.geom_type == "Polygon"]
+
+# Get intersection of land polygons and building polygons
+intersection_gdf = garden_size.generate_gdf_land_building_overlay(
+    land_parcels_gdf=land_extent_gdf,
+    building_footprints_gdf=plymouth_building_footprints_gdf,
+)
+
+# Get garden size
+gardens_gdf = garden_size.generate_gdf_garden_size(intersection_gdf, land_extent_gdf)
+
+# Join garden size to UPRNs
+uprns_with_garden_gdf = pl.from_pandas(
+    gpd.sjoin(
+        os_uprn_gdf,
+        gardens_gdf,
+        how="inner",
+        predicate="intersects",
+    ).drop(columns=["geometry", "index_right"])
+)
+
+# Identify UPRNs with multiple garden sizes and find the range between the min and max garden size calculated for them
+duplicated_gdf = (
+    uprns_with_garden_gdf.filter(pl.col("UPRN").is_duplicated())
+    .group_by("UPRN")
+    .agg(pl.min("garden_area_m2").alias("min"), pl.max("garden_area_m2").alias("max"))
+    .with_columns((pl.col("max") - pl.col("min")).alias("range"))
+)
+
+# Deduplicate UPRNs to the smallest garden size - the join allows us to retain the other columns in the df
+# TODO - use pl.over instead as it's more efficient
+uprns_with_garden_gdf = uprns_with_garden_gdf.join(
+    uprns_with_garden_gdf.group_by("UPRN").agg(pl.min("garden_area_m2")),
+    how="left",
+    on="UPRN",
+).filter(pl.col("garden_area_m2") == pl.col("garden_area_m2_right"))
+
+features_df = features_df.join(
+    uprns_with_garden_gdf.select(
+        ["UPRN", "NATIONALCADASTRALREFERENCE", "garden_area_m2"]
+    ),
+    how="left",
+    on="UPRN",
+)
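+
+# %% [markdown]
+# The TODO above suggests replacing the self-join deduplication with a window expression.
+# A sketch of the `pl.over` version (it keeps ties on the minimum, just like the join-based
+# version, but needs only a single pass and no extra `_right` column):
+
+# %%
+# Keep the row(s) with the smallest garden size per UPRN using a window minimum
+dedup_with_over_df = uprns_with_garden_gdf.filter(
+    pl.col("garden_area_m2") == pl.col("garden_area_m2").min().over("UPRN")
+)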
+
+# %%
+# Plot the distribution of range in garden size for UPRNs joined to multiple gardens
+plt.boxplot(duplicated_gdf.filter(pl.col("range") <= 250000)["range"])
+plt.title("Range in garden size (under 250,000m2) for duplicate UPRNs")
+plt.ylabel("Range in garden area (m2)")
+plt.show()
+
+print(duplicated_gdf["range"].describe())
+
+# %%
+# Fill any remaining missing garden size with the median size per property type and OA
+fill_garden_df = features_df.group_by(["predicted_property_type", "OA21CD"]).agg(
+    pl.median("garden_area_m2").alias("fill_missing_garden_area_m2")
+)
+features_df = features_df.join(
+    fill_garden_df, how="left", on=["predicted_property_type", "OA21CD"]
+).with_columns(
+    pl.col("garden_area_m2")
+    .fill_null(pl.col("fill_missing_garden_area_m2"))
+    .alias("use_garden_area_m2")
+)
+
+# %% [markdown]
+# ## Communal heating
+# - if any other UPRNs in the building have communal label -> communal
+# - elif any other UPRNs in the building have individual -> individual
+# - else null
+
+# %%
+# Add communal heating column (uses same logic as flats analysis)
+features_df = fuel_type.extend_df_communal_heating(
+    features_df, epc_col="MAIN_FUEL", name="community_heating"
+)
+features_df = fuel_type.extend_df_communal_heating(
+    features_df, epc_col="MAINHEAT_DESCRIPTION", name="fill_community_heating"
+)
+features_df = features_df.with_columns(
+    (pl.col("community_heating").fill_null(pl.col("fill_community_heating"))).alias(
+        "community_heating"
+    )
+).drop("fill_community_heating")
+
+# Map building IDs to UPRNs and join to features df
+plymouth_uprn_to_building_mapping_df = os_openmap_buildings_plymouth_gdf.sjoin(
+    plymouth_residential_uprns_gdf, how="inner", predicate="contains"
+)
+features_df = features_df.join(
+    pl.from_pandas(plymouth_uprn_to_building_mapping_df[["ID", "UPRN"]]).rename(
+        {"ID": "building_id"}
+    ),
+    how="left",
+    on="UPRN",
+)
+
+# Identify buildings that contain UPRNs with communal heating and buildings that contain UPRNs with individual heating
+communal_heating_buildings = list(
+    features_df.filter(pl.col("community_heating"))["building_id"].unique()
+)
+individual_heating_buildings = list(
+    features_df.filter(
+        (~pl.col("community_heating")) & pl.col("community_heating").is_not_null()
+    )["building_id"].unique()
+)
+
+# Check how many buildings carry both communal and individual labels
+len(set(communal_heating_buildings).intersection(set(individual_heating_buildings)))
+
+# Apply communal heating rules outlined above
+features_df = features_df.with_columns(
+    pl.when(pl.col("building_id").is_in(communal_heating_buildings))
+    .then(True)
+    .when(pl.col("building_id").is_in(individual_heating_buildings))
+    .then(False)
+    .otherwise(None)
+    .alias("fill_community_heating")
+).with_columns(
+    pl.col("community_heating")
+    .fill_null(pl.col("fill_community_heating"))
+    .alias("use_community_heating")
+)
+
+# %% [markdown]
+# ## Final feature processing
+
+# %%
+# Fill nulls with predicted types
+features_df = features_df.with_columns(
+    pl.col("property_type")
+    .fill_null(pl.col("predicted_property_type"))
+    .alias("use_property_type"),
+    pl.col("TENURE").fill_null(pl.col("predicted_tenure")).alias("use_tenure"),
+)
+
+# %% [markdown]
+# ## Cluster UPRNs
+
+# %%
+# Cluster UPRNs on distance only
+model = HDBSCAN(
+    min_cluster_size=5,
+    cluster_selection_epsilon=20,
+    algorithm="balltree",
+    metric="euclidean",
+)
+plymouth_residential_uprns_gdf["cluster"] = model.fit_predict(
+    plymouth_residential_uprns_gdf[["X_COORDINATE", "Y_COORDINATE"]]
+)
+print(plymouth_residential_uprns_gdf["cluster"].nunique())
+
+# Join clusters to features df
+features_df = features_df.join(
+    pl.from_pandas(plymouth_residential_uprns_gdf[["UPRN", "cluster"]]),
+    how="left",
+    on="UPRN",
+)
+
+# Add cluster size
 features_df = features_df.with_columns(
-    tenure_prob=pl.concat_list(
-        "owner-occupied", "rental (private)", "rental (social)"
-    ).list.max(),
-).drop(["owner-occupied", "rental (private)", "rental (social)"])
+    cluster_size=pl.col("UPRN").n_unique().over("cluster")
+)
+
+# %%
+# PLOT CLUSTERS FOR STOKE WARD
+lsoa_boundaries = get_datasets.load_gdf_ons_lsoa_bounds()
+stoke_ward_lsoas = [
+    "E01015155",
+    "E01015153",
+    "E01015172",
+    "E01015047",
+    "E01015151",
+    "E01015173",
+    "E01015051",
+    "E01015169",  # used in example
+    "E01015171",
+    "E01015170",
+    "E01015099",
+    "E01015174",
+    "E01015167",
+    "E01015045",
+    "E01015044",
+    "E01015168",
+    "E01015072",
+    "E01015042",
+]
+stoke_ward = (
+    lsoa_boundaries[lsoa_boundaries["LSOA21CD"].isin(stoke_ward_lsoas)]
+    .dissolve()["geometry"]
+    .values[0]
+)
+
+wgs84_uprns_df = plymouth_residential_uprns_gdf[
+    plymouth_residential_uprns_gdf[
+        ["UPRN", "cluster", "LATITUDE", "LONGITUDE", "geometry"]
+    ].within(stoke_ward)
+].to_crs(epsg=4326)
+wgs84_buildings_df = plymouth_residential_buildings_gdf[
+    plymouth_residential_buildings_gdf.within(stoke_ward)
+].to_crs(epsg=4326)
+x, y = (
+    wgs84_uprns_df.dissolve().centroid.values[0].x,
+    wgs84_uprns_df.dissolve().centroid.values[0].y,
+)
+
+import random
+
+get_colors = lambda n: ["#%06x" % random.randint(0, 0xFFFFFF) for _ in range(n + 1)]
+colors = get_colors(wgs84_uprns_df["cluster"].max())
+
+# Add a little jitter so stacked UPRNs (e.g. flats) don't plot on exactly the same point
+sigma = 0.00001
+wgs84_uprns_df["jitter_lat"] = wgs84_uprns_df["LATITUDE"].apply(
+    lambda x: np.random.normal(x, sigma)
+)
+wgs84_uprns_df["jitter_long"] = wgs84_uprns_df["LONGITUDE"].apply(
+    lambda x: np.random.normal(x, sigma)
+)
+
+map = folium.Map(location=[y, x], tiles="OpenStreetMap", zoom_start=15)
+for _, r in wgs84_buildings_df.iterrows():
+    geo_j = gpd.GeoSeries(r["geometry"]).to_json()
+    geo_j = folium.GeoJson(
+        data=geo_j, style_function=lambda x: {"fillColor": "orange", "color": "orange"}
+    )
+    geo_j.add_to(map)
+
+for _, r in wgs84_uprns_df.iterrows():
+    if r.cluster == -1:
+        folium.Marker(
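+
+# %% [markdown]
+# A quick sanity check on the clustering before saving - how many UPRNs HDBSCAN labelled
+# as noise (`cluster == -1`) and how large the biggest clusters are (a diagnostic sketch
+# using the `cluster` labels joined above):
+
+# %%
+cluster_check_df = features_df.group_by("cluster").agg(pl.len().alias("n_uprns"))
+print(
+    "UPRNs labelled as noise:",
+    cluster_check_df.filter(pl.col("cluster") == -1)["n_uprns"].sum(),
+)
+print(cluster_check_df.sort("n_uprns", descending=True).head(10))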
+            location=[r["jitter_lat"], r["jitter_long"]],
+            icon=folium.Icon(icon_color="grey", prefix="fa", icon="ban"),
+            popup="Noise",
+        ).add_to(map)
+    else:
+        folium.CircleMarker(
+            location=[r["jitter_lat"], r["jitter_long"]],
+            radius=5,
+            weight=5,
+            opacity=0.5,  # folium uses Leaflet's `opacity`, not matplotlib's `alpha`
+            color=colors[r.cluster],
+            popup=f"Cluster {r.cluster}",
+        ).add_to(map)
+map
+
+# %%
+# SAVE DATA
+select_features = [
+    "UPRN",
+    "cluster",
+    "OA21CD",
+    "LSOA21CD",
+    "in_listed_building",
+    "in_cons_area",
+    "in_hn",
+    "filled_off_gas",
+    "use_property_type",
+    "use_tenure",
+    # 'tenure_prob',
+    "imd_decile",
+    "NATIONALCADASTRALREFERENCE",
+    "use_garden_area_m2",
+    "building_id",
+    "use_community_heating",
+]
+
+rename_selected = {
+    "OA21CD": "oa21",
+    "in_cons_area": "in_conservation_area",
+    "in_hn": "in_heat_network_zone",
+    "use_garden_area_m2": "garden_area_m2",
+    "use_property_type": "predicted_property_type",
+    "use_tenure": "predicted_tenure",
+    "filled_off_gas": "use_off_gas",
+    "use_community_heating": "on_communal_heating",
+}
+
+save_utils.save_to_s3(
+    features_df,
+    "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/results/plymouth_features_full_with_clusters.parquet",
+)
+save_utils.save_to_s3(
+    features_df.select(select_features).rename(rename_selected),
+    "s3://asf-heat-pump-suitability/exploration/spatial_clustering_plymouth/results/plymouth_features_selected_with_clusters.parquet",
+)
+plymouth_residential_uprns_gdf.to_file("plymouth_residential_uprns_with_clusters.shp")

From 399929e27b6141cc8d05ca171c456443a081e04e Mon Sep 17 00:00:00 2001
From: roisin <104171770+crispy-wonton@users.noreply.github.com>
Date: Fri, 29 Aug 2025 12:27:48 +0100
Subject: [PATCH 5/5] add requirements

---
 requirements.txt | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a2ce2506..bdde4155 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,7 @@
 polars-lts-cpu==1.5.0
-ipfn @ git+https://github.com/nestauk/ipfn.git@master
-balance~=0.10
 jupytext~=1.17
 fastexcel~=0.13
-scikit-learn<=1.2.2
+scikit-learn>=1.3
 matplotlib~=3.10.2
 tqdm~=4.67
 geopandas~=1.0
@@ -25,3 +23,6 @@ fiona~=1.1
 bs4~=0.0
 pandera~=0.24
 metaflow
+folium
+seaborn
+scipy