diff --git a/Agents/FenlandTrajectoryAgent/.gitignore b/Agents/FenlandTrajectoryAgent/.gitignore index 023b95d1611..9f026d97777 100644 --- a/Agents/FenlandTrajectoryAgent/.gitignore +++ b/Agents/FenlandTrajectoryAgent/.gitignore @@ -5,6 +5,7 @@ build/ dist/ *~* .idea/ +!.vscode/ py4jps/ coverage.xml .coverage diff --git a/Agents/FenlandTrajectoryAgent/.vscode/launch.json b/Agents/FenlandTrajectoryAgent/.vscode/launch.json new file mode 100644 index 00000000000..7ef9b79d322 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/.vscode/launch.json @@ -0,0 +1,68 @@ +{ + + "version": "0.2.0", + "configurations": [ + { + "type": "python", + "name": "Build and Debug (Flask)", + "request": "attach", + "host": "localhost", + "port": "${input:debug.port}", + "preLaunchTask": "compose-build-deploy", + "pathMappings": [ + { + "localRoot": "${workspaceFolder}/agent", + "remoteRoot": "/app/agent" + } + ] + }, + { + "type": "python", + "name": "Debug (Flask)", + "request": "attach", + "host": "localhost", + "port": "${input:debug.port}", + "preLaunchTask": "compose-deploy", + "pathMappings": [ + { + "localRoot": "${workspaceFolder}/agent", + "remoteRoot": "/app/agent" + } + ] + }, + + { + "type": "python", + "name": "Reattach and Debug (Flask)", + "request": "attach", + "host": "localhost", + "port": "${input:debug.port.read}", + "pathMappings": [ + { + "localRoot": "${workspaceFolder}/agent", + "remoteRoot": "/app/agent" + } + ] + } + ], + "inputs": [ + { + "id": "debug.port", + "type": "command", + "command": "shellCommand.execute", + "args": { + "command": "bash ./stack.sh ports write", + "useFirstResult": true + } + }, + { + "id": "debug.port.read", + "type": "command", + "command": "shellCommand.execute", + "args": { + "command": "bash ./stack.sh ports read", + "useFirstResult": true + } + } + ] + } \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/.vscode/tasks.json b/Agents/FenlandTrajectoryAgent/.vscode/tasks.json new file mode 100644 index 00000000000..315c693f0b3 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/.vscode/tasks.json @@ -0,0 +1,76 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "shell", + "label": "compose-build", + "command": "./stack.sh", + "args": ["build", "--debug-port", "${input:debug.port}"], + "group": { + "kind": "build", + "isDefault": true + }, + "windows": { + "options": { + "shell": { + "executable": "bash" + } + } + } + }, + { + "type": "shell", + "label": "compose-deploy-internal", + "command": "./stack.sh", + "args": [ + "start", + "${input:stackName}", + "--debug-port", + "${input:debug.port}" + ], + "windows": { + "options": { + "shell": { + "executable": "bash" + } + } + } + }, + { + "type": "shell", + "label": "compose-deploy", + "command": "sleep 1", + "dependsOn": ["compose-deploy-internal"], + "windows": { + "options": { + "shell": { + "executable": "powershell" + } + } + } + }, + { + "type": "shell", + "label": "compose-build-deploy", + "dependsOn": ["compose-build", "compose-deploy"], + "dependsOrder": "sequence" + } + ], + "inputs": [ + { + "id": "stackName", + "type": "promptString", + "description": "Name your stack.", + "default": "CEW-DT" + }, + { + "id": "debug.port", + "type": "command", + "command": "shellCommand.execute", + "args": { + "command": "bash ./stack.sh ports read", + "useFirstResult": true + } + } + ] + } \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/Dockerfile b/Agents/FenlandTrajectoryAgent/Dockerfile index 6866438bfd1..86d4f125c6a 100644 --- a/Agents/FenlandTrajectoryAgent/Dockerfile 
+++ b/Agents/FenlandTrajectoryAgent/Dockerfile @@ -1,10 +1,10 @@ # Install the fenland-trajectory-agent agent in Docker container #================================================================================================== -FROM ghcr.io/cambridge-cares/stack-client:1.27.1 as stackclients +FROM ghcr.io/cambridge-cares/stack-client:1.27.1 AS stackclients #------------------------------------------------------ # Base image to be reused #------------------------------------------------------ -FROM python:3.9.14-slim-buster as base +FROM python:3.9.14-slim-buster AS base # Meta data LABEL authors = "jc2341@cam.ac.uk" LABEL description = "Fenland Trajectory Agent" @@ -43,7 +43,7 @@ EXPOSE 5000 #------------------------------------------------------ # Debugging image reusing the base #------------------------------------------------------ -FROM base as debug +FROM base AS debug # Install additional dependencies for debugging RUN pip install debugpy @@ -59,7 +59,7 @@ CMD python -m debugpy --listen 0.0.0.0:5678 --wait-for-client -m flask run -h 0. #------------------------------------------------------ # Production image reusing the base #------------------------------------------------------ -FROM base as prod +FROM base AS prod # Copy required source code (as code is not attached as volume) WORKDIR /app diff --git a/Agents/FenlandTrajectoryAgent/agent/raw_data/gps_target_folder/.gitignore b/Agents/FenlandTrajectoryAgent/Input/.gitignore similarity index 100% rename from Agents/FenlandTrajectoryAgent/agent/raw_data/gps_target_folder/.gitignore rename to Agents/FenlandTrajectoryAgent/Input/.gitignore diff --git a/Agents/FenlandTrajectoryAgent/README.md b/Agents/FenlandTrajectoryAgent/README.md index 41b2d2efa8b..ec177a11307 100644 --- a/Agents/FenlandTrajectoryAgent/README.md +++ b/Agents/FenlandTrajectoryAgent/README.md @@ -1,9 +1,8 @@ # 1. Description -The `Fenland Trajectory Agent` is a specialized tool for batch instantiation of structured time-series GPS trajectory data. This agent is capable of receiving HTTP POST requests or CURL commands to load GPS trajectory files, which it subsequently instantiates into triples before uploading them into the knowledge graph. - -Presently, the agent focuses on data from the Fenland Study to analyze the interaction between GPS trajectories and environmental features within the context of a digital twin. Example data can be found at Dropbox/CoMo_shared/_Projects/c4e-AI-for-public-health/sample_gps_data.csv. By default, the data instantiated from the Fenland Study using this agent encompasses Speed, Height, Distance, Heading, Latitude, and Longitude. This method is also applicable to other categories of time-series structured data in Fenland Study by replacing or adding the relevant column names. The information instantiated into the knowledge graph adheres to the Ontology of Devices [OntoDevice] in [TheWorldAvatar] project. The instantiation process is executed based on the [TimeSeriesClient]. This agent is implemented as a Docker container, designed for deployment within a stack managed by the [Stack Manager] +The `Fenland Trajectory Agent` is a specialised tool designed to both instantiate structured time-series GPS trajectory data into a knowledge graph and perform quantitative exposure calculations between these trajectories and environmental features. This agent is implemented as a Docker container, designed for deployment within a stack managed by the [Stack Manager]. 
The agent receives [HTTP POST] requests or Client URL ([CURL]) commands to perform specified tasks. These include loading GPS trajectory files, mapping them into RDF triples, and uploading them to the stack. Additionally, for an exposure radius of interest, the agent can interpret environmental features and perform corresponding exposure calculations automatically. Examples of these calculations include determining the number of food retail locations or greenspace objects within the defined radius. +Presently, the agent focuses on data from the Fenland Study to analyze the interaction between GPS trajectories and environmental features within the context of a digital twin. Example gps data can be found at Dropbox/CoMo_shared/_Projects/c4e-AI-for-public-health/sample_gps_data.csv. By default, the data instantiated from the Fenland Study using this agent encompasses Speed, Height, Distance, Heading, Latitude, and Longitude. This method is also applicable to other categories of time-series structured data in Fenland Study by replacing or adding the relevant column names. The information instantiated into the knowledge graph adheres to the Ontology of Devices [OntoDevice] in [TheWorldAvatar] project. The instantiation process is executed based on the [TimeSeriesClient]. In terms of environmental features from the Fenland Study, the raw data is stored in Dropbox/CoMo_shared/_Projects/c4e-AI-for-public-health/Data/Raw data/. Uploading and instantiating environmental data requires the [Stack-data-uploader] tool, and configuration files can be found in the [AI-for-Public-Health] folder. # 2. Agent Setup @@ -43,9 +42,9 @@ However, occasionally commands executed within the Docker environment may not su # 3. Spinning Up the Agent -## 3.1 Placing the GPS Data +## 3.1 Preparing the GPS Data -The GPS trajectory data is structured and stored in tables (in .csv format). Each table consists of a series of record points, with each point containing eight essential pieces of information for instantiation, including UTC Date, UTC Time, Longitude (degrees), Latitude (degrees), Speed (km/h), Heading (degrees), Height (meters), and Distance (meters). Please place GPS files in the [gps_target_folder]. Below is an example of the columns and values in GPS trajectory tables for instantiation: +The GPS trajectory data is structured and stored in tables (in .csv format). Each table consists of a series of record points, with each point containing eight essential pieces of information for instantiation, including UTC Date, UTC Time, Longitude (degrees), Latitude (degrees), Speed (km/h), Heading (degrees), Height (meters), and Distance (meters). Below is an example of the columns and values in GPS trajectory tables for instantiation: | UTC DATE | UTC TIME | LATITUDE | LONGITUDE | SPEED | HEADING | HEIGHT | DISTANCE | |------------|----------|-----------|-----------|--------|---------|--------|----------| @@ -67,7 +66,21 @@ In case of time out issues in automatically building the StackClients resource i ## 3.3 Deploying the Agent ### 1) Configuration and Adding Services -Please place [FenlandTrajectoryAgent.json] in [stack manager configuration service directory]. Afterward, ensure that the 'fenland-trajectory-agent service' is included in your stack configuration files normally placed in [stack manager configuration directory] +Please place [FenlandTrajectoryAgent.json] in the [stack manager configuration service directory]. 
Additionally, create a folder named `fta-input` under the stack manager's [data directory] to store all your GPS trajectory data. This operation ensures that any new or updated data in the fta-input folder is automatically available to the agent. Then, ensure that the fenland-trajectory-agent service is included in your stack configuration files normally located in the [stack manager configuration directory] by adding the following to your stack configuration JSON file: + +``` +{ + "services": { + "includes": [ + "fenland-trajectory-agent" + ] + }, + "volumes": { + "fta-input": "fta-input" + } +} + +``` ### 2) Starting with the Stack Manager To spin up the stack, both a `postgis_password` and `geoserver_password` file need to be created in the `Deploy/stacks/dynamic/stack-manager/inputs/secrets/` directory (see detailed guidance in the [Stack Manager README]). @@ -81,29 +94,69 @@ To spin up the stack, create `postgis_password` and `geoserver_password` files i # 4. Using the Agent -The agent will automatically register a task upon startup to assimilate the data from the target GPS folder into the Knowledge Graph (KG). This background task can be triggered via an HTTP request after the agent has started, accessible at http://localhost:{Port}/fenland-trajectory-agent. Please replace {Port} with the numerical value of the port on which your stack is running. +## 4.1 Data Ingestion + +The agent automatically registers a task at startup to ingest data from the target GPS folder into the Knowledge Graph. This background task is designed to be triggered via HTTP requests after the agent has started. -## Functionality Description +### Functionality Description -The operation of this agent is streamlined into two key steps: **Preprocess** and **Instantiate**. Here is a brief description of each: +The operation of data ingestion is streamlined into two key steps: **Preprocess** and **Instantiate**. Here is a brief description of each: - **Preprocess**: This initial step involves loading and examining the GPS trajectory files in batch to ensure efficient processing. Each file is checked for essential columns (UTC Date, UTC Time, Speed, Heading, Height, Distance, Latitude, and Longitude). Any missing data or misformatted columns, particularly those related to time, are corrected to conform with the [ISO 8601] standard. During this phase, data is extracted into a temporary dataframe, allowing for necessary restructuring and corrections without altering the original files. - **Instantiate**: This further step involves creating geometry classes for the trajectories, generating IRIs, and structuring the data into lists categorized as time-series and non-time-series. These lists serve as inputs for the [TimeSeriesClient] and the [KGClient], respectively. This organized data is then instantiated in both the knowledge graph and the relational database to facilitate queries and analytics. -Importantly, the preprocessing step must be completed before moving on to the instantiation step. This sequence is crucial as it ensures that all data is properly formatted and organized, making it ready for use in the knowledge graph and the relational database. +### Example HTTP Requests +Example requests for preprocessing and instantiating data are available in detailed HTTP files. You can access these examples at the [preprocess] file, the [instantiate] file, and the [layer_generator] file. + +Additionally, services above can be triggered using CURL from a bash terminal. 
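+
+For instance, both ingestion steps can also be triggered programmatically from Python. The snippet below is only an illustrative sketch: the host, port and route names are placeholders, and the authoritative request bodies are those in the .http files linked above.
+
+```python
+# Illustrative sketch only - host, port and route names are placeholders;
+# see the referenced .http files for the actual requests.
+import requests
+
+BASE_URL = "http://localhost:3838/fenland-trajectory-agent"  # replace 3838 with the port your stack exposes
+
+# Step 1: preprocess the GPS files made available to the agent
+resp = requests.post(f"{BASE_URL}/preprocess", timeout=300)
+resp.raise_for_status()
+
+# Step 2: instantiate the preprocessed data into the knowledge graph and relational database
+resp = requests.post(f"{BASE_URL}/instantiate", timeout=600)
+resp.raise_for_status()
+print(resp.text)
+```
+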
An example CURL command used to load the GPS trajectory files is displayed in [CURL commands folder]. + +## 4.2 Trip Detection + +After data instantiation, the agent provides a trip detection route that analyzes GPS trajectories to identify distinct trips and visits. Using Kernel Density Estimation (KDE) and watershed algorithms, the agent processes each trajectory to detect significant locations (hotspots) and movement patterns. The results are stored by updating the "Trip index" column in the timeseries table, where each continuous sequence of GPS points is assigned a unique trip identifier. This functionality is particularly useful for analyzing movement patterns and segmenting long trajectories into meaningful trips. The agent can process multiple trips within a single trajectory and merge the results together, providing a comprehensive view of an individual's movement patterns. For subsequent exposure calculations, the agent focuses exclusively on the detected trips (excluding visits and gaps), and will aggregate the exposure measurements across all trips within the same trajectory to provide a consolidated result. Example requests for trip detection can be found in the [trip_detection] file. + +## 4.3 Count-Based Exposure Calculation + +The agent is configured to calculate exposure for trajectory IRIs and environmental feature IRIs received in a request, along with a specified exposure radius. +The calculation can be triggered via an HTTP request after the agent has started, and its principles and operational procedures are outlined as follows. + +### Functionality Description + +The workflow begins by generating a buffer zone around the trajectory geometry, with the buffer extending outward from the trajectory line by the specified exposure radius. Environmental features, such as food retail locations or greenspaces, are then identified and counted based on their intersections with this buffer zone, providing a quantitative measure of exposure. To accommodate different data storage scenarios, the agent provides two routes for exposure calculations: + +- **Exposure_count**: +This route can operate when the environmental data and the Fenland Trajectory Agent are deployed in different stacks. You need to manually configure the [config.properties] file: + +1. For the environmental data endpoint (`ENV_DATA_ENDPOINT_URL`): + - If the TWA stack containing environmental data is deployed on the same machine on a different port: use `http://host.docker.internal:<PORT>/ontop/ui/sparql` + - If the TWA stack is deployed on an external server: use `http://<SERVER_IP>:<PORT>/ontop/ui/sparql` + +2. For trajectory database configuration: + - If your trajectory data is managed by the same stack as this agent: + ``` + TRAJECTORY_DB_HOST= + TRAJECTORY_DB_PASSWORD= + ``` + (Just leave these fields empty) + - If managed by a TWA stack on a different server: + ``` + TRAJECTORY_DB_HOST=<DB_HOST> + TRAJECTORY_DB_PASSWORD=<DB_PASSWORD> + ``` -In addition to the core functionality, this agent offers a route task called layer_generator that creates GeoServer layers from PostGIS tables for visualisation. Given a table name (formed by UUID) and latitude/longitude column names, the agent deploys the [SQL commands for virtual-table generation], creates vector objects, and communicates with the GeoServer REST API to publish the layer. Please note that this functionality is specialized and typically requires manual examination of SQL commands and layer metadata (such as EPSG) to ensure compatibility with specific requirements.
This functionality will be further updated and refined after the AI for Public Health (Fenland) project's visualisation is switched to [TWA-VF] 5.4.0. +In this project, we use an Ontop SPARQL endpoint as `ENV_DATA_ENDPOINT_URL` in the [config.properties] file to handle data such as Food Retail and Greenspace. In this context, the matching [OBDA mapping] files should be configured in advance by the [Stack-data-uploader]. Once these settings are complete, the agent will fetch the geometry data for both environmental features and trajectories from the specified endpoint and database, temporarily store the data in a dataframe, and perform the calculations within the agent. -## Example HTTP Requests -Example HTTP requests for preprocessing and instantiating data are available in detailed HTTP files. You can access these examples at the [preprocess] file, the [instantiate] file, and the [layer_generator] file. +A typical example involves using an Ontop SPARQL endpoint as `ENV_DATA_ENDPOINT_URL`, where this Ontop endpoint can originate from any other stack created by the [Stack Manager]. Food Retail and Greenspace datasets can be used as demonstration cases. When uploading these structured datasets to the stack, the [Stack-data-uploader] should be preconfigured with particular [OBDA mapping] files containing key metadata, such as FeatureType and SourceTableName. These mappings are essential, as the exposure calculation relies on this metadata to trigger the appropriate computation methods. -Additionally, services above can be triggered using Client URL (CURL) from a bash terminal. An example CURL command used to load the GPS trajectory files is displayed in [CURL commands folder]. +- **Exposure_count_single_stack**: This route is used when the environmental data and the Fenland Trajectory Agent are deployed in the same stack. In this case, the agent will query the Ontop SPARQL endpoint deployed within the same stack to fetch the datasource table name corresponding to the trajectory IRI. The agent will then populate a query and execute the calculation internally within the stack. +### Example HTTP Requests +Example requests are available in detailed HTTP files. You can access these examples at the [exposure_count] file and the [exposure_count_single_stack] file. Services can also be triggered using CURL from a bash terminal. Examples are displayed in [CURL commands folder].
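+
+As an illustration, an exposure calculation request could also be sent from Python. The JSON keys below mirror those read by the agent's exposure utilities (`trajectoryIRIs`, `DataIRIs`, `exposure_radius`); the URL, route name and IRIs are placeholders only.
+
+```python
+# Illustrative sketch only - URL, route name and IRIs are placeholders.
+# exposure_radius is in metres; the agent falls back to 100 if it is omitted.
+import requests
+
+payload = {
+    "trajectoryIRIs": ["https://example.org/kg/trajectory/1234"],
+    "DataIRIs": ["https://example.org/kg/environment/food_retail"],
+    "exposure_radius": 100,
+}
+
+resp = requests.post(
+    "http://localhost:3838/fenland-trajectory-agent/exposure_count_single_stack",
+    json=payload,
+    timeout=600,
+)
+resp.raise_for_status()
+print(resp.json())
+```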
# Authors -Jiying Chen (jc2341@cam.ac.uk), May 2024 +Jiying Chen (jc2341@cam.ac.uk), Jan 2025 @@ -140,6 +193,10 @@ Jiying Chen (jc2341@cam.ac.uk), May 2024 [ISO 8601]: https://www.iso.org/iso-8601-date-and-time-format.html [knowledge graph operations guidance]: https://cambridge-cares.github.io/TheWorldAvatar/examples/sparql/ [TWA-VF]: https://github.com/cambridge-cares/TheWorldAvatar/tree/main/web/twa-vis-framework +[Stack-data-uploader]: https://github.com/cambridge-cares/TheWorldAvatar/tree/main/Deploy/stacks/dynamic/stack-data-uploader +[AI-for-Public-Health]: https://github.com/cambridge-cares/TheWorldAvatar/tree/main/Deploy/stacks/AI4PublicHealth +[HTTP POST]: https://www.w3schools.com/tags/ref_httpmethods.asp +[CURL]: https://learn.microsoft.com/en-us/dynamics365/business-central/dev-itpro/developer/devenv-web-client-urls [Dockerfile]: ./Dockerfile @@ -150,6 +207,7 @@ Jiying Chen (jc2341@cam.ac.uk), May 2024 [FenlandTrajectoryAgent.json]: ./stack-manager-input-config-service/ [stack manager configuration service directory]: https://github.com/cambridge-cares/TheWorldAvatar/tree/main/Deploy/stacks/dynamic/stack-manager/inputs/config/services [stack manager configuration directory]: https://github.com/cambridge-cares/TheWorldAvatar/tree/main/Deploy/stacks/dynamic/stack-manager/inputs/config/ +[data directory]: https://github.com/cambridge-cares/TheWorldAvatar/tree/main/Deploy/stacks/dynamic/stack-manager/inputs/data [CURL commands folder]: ./example-requests/curl [SendHTTP]: ./example-requests/SendHTTP [preprocess]: ./example-requests/SendHTTP/gps_preprocess.http @@ -157,3 +215,10 @@ Jiying Chen (jc2341@cam.ac.uk), May 2024 [layer_generator]: ./example-requests/SendHTTP/layer_generator.http [KGClient]: ./agent/kgutils/kgclient.py [SQL commands for virtual-table generation]: ./agent/layergenerator/virtual_table.sql +[config.properties]:./agent/flaskapp/exposure/config.properties +[exposure_count]: ./example-requests/SendHTTP/exposure_count.http +[exposure_count_single_stack]: ./example-requests/SendHTTP/exposure_count_single_stack.http +[OBDA mapping]: ./resources/ontop.obda +[Input]: ./Input/ +[target_gps_folder.env]:./target_gps_folder.env +[trip_detection]:./example-requests/SendHTTP/trip_detection.http \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/agent/datainstantiation/cleaning_tool.py b/Agents/FenlandTrajectoryAgent/agent/datainstantiation/cleaning_tool.py new file mode 100644 index 00000000000..fefe030a748 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/datainstantiation/cleaning_tool.py @@ -0,0 +1,155 @@ +import os +import re +import pandas as pd +import logging + +logger = logging.getLogger(__name__) + +def clean_gps_data(file_path, output_dir=None): + """ + Logic for quick understanding: + 1. Only the following columns are cleaned: [UTC DATE, UTC TIME, LATITUDE, LONGITUDE, SPEED, HEADING, HEIGHT, DISTANCE]. + Other columns in the raw data from Fenland Study remain unchanged. + 2. For the LATITUDE and LONGITUDE columns: + - First, check if the cell contains an isolated uppercase direction letter (N or S for latitude; E or W for longitude). + If such a letter exists, use it to determine the sign. But for data in Fenland study, this pre-check is unnecessary. We just add this pre-check for general applicability consideration + - Then, extract the numerical value from the cell using a regular expression that supports scientific notation. 
+ - If no direction letter is found in the cell, check the optional additional direction columns (N/S for latitude, E/W for longitude) + to determine the direction. + - Adjust the numerical value's sign based on the determined direction. For data in fenland study, the direction indicated by direction columns (N/S for latitude, E/W for longitude) hold a dominant position to determine the direction + Even if there is any single letters like N or S in cell of lat or lon, the function we defined here still use the infomation indicated by N/S and E/W) + 3. For the other columns, directly extract the numerical value using the regular expression (which also that supports scientific notation) + """ + logger.info("Starting cleaning of GPS data from file: %s", file_path) + try: + df = pd.read_csv(file_path) + logger.info("CSV file loaded successfully with shape: %s", df.shape) + except Exception as e: + logger.error("Error loading CSV file %s: %s", file_path, e, exc_info=True) + raise e + + # Remove leading and trailing spaces from column names + df.columns = [col.strip() for col in df.columns] + logger.info("Normalized columns: %s", df.columns.tolist()) + + def norm_col(col): + return col.strip() + + # Define column names + utc_date_column_name = "UTC DATE" + utc_time_column_name = "UTC TIME" + latitude_column_name = "LATITUDE" + longitude_column_name = "LONGITUDE" + speed_column_name = "SPEED" + heading_column_name = "HEADING" + height_column_name = "HEIGHT" + distance_column_name = "DISTANCE" + ns_column_name = "N/S" # Optional additional direction column for latitude + ew_column_name = "E/W" # Optional additional direction column for longitude + + columns_to_clean = [ + latitude_column_name, + longitude_column_name, + speed_column_name, + heading_column_name, + height_column_name, + distance_column_name + ] + + num_pattern = re.compile(r"[-+]?\d*\.?\d+(?:[eEdD][-+]?\d+)?") + cleaned_df = df.copy() + + def extract_numeric(value): + if isinstance(value, str): + value = value.strip() + matches = num_pattern.findall(value) + if matches: + try: + return float(matches[0]) + except ValueError: + return None + return None + return value + + # Updated process_coord function + def process_coord(val, row, is_latitude): + """ + As mentioned at the beginning of def clean_gps_data, the workflow for this process function using following steps: + 1. Check for an isolated uppercase direction letter in the cell. If it is N/S or E/W, set a provisional direction + 2. Extract the numeric value using a regular expression. + 3. Check the optional additional direction column (N/S or E/W) and override the direction if available. + 4. Adjust the numeric value's sign based on the final direction. + """ + # Step 1: Convert val to string. + val_str = str(val) + + direction = None + # Step 2: Check for a direction letter in the cell. + if is_latitude: + match = re.search(r'(? float: + """Calculate actual surface area in WGS84 coordinate system (unit: square meters)""" + if geom is None or geom.is_empty: + return 0.0 + geod = Geod(ellps='WGS84') + if geom.geom_type == 'Polygon': + lons, lats = zip(*list(geom.exterior.coords)) + area, _ = geod.polygon_area_perimeter(lons, lats) + return abs(area) + elif geom.geom_type == 'MultiPolygon': + return sum(ExposureUtils.geodesic_area(poly) for poly in geom.geoms) + else: + return 0.0 + + @staticmethod + def buffer_point_geodesic(lon: float, lat: float, radius_m: float, n=60) -> Polygon: + """ + Deprecated, no longer called internally. Signature kept for compatibility. 
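+        Calling it raises NotImplementedError; wrap the input in a shapely geometry and use _make_fast_buffer instead.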
+ """ + raise NotImplementedError("Use _make_fast_buffer instead") + + @staticmethod + def buffer_linestring_geodesic(coords: list, radius_m: float, seg_step_m=20.0) -> Polygon: + """ + Deprecated, no longer called internally. Signature kept for compatibility. + """ + raise NotImplementedError("Use _make_fast_buffer instead") + + @staticmethod + def create_geodesic_buffer(coords: list, radius_m: float): + """ + Backward compatibility: Wrap as Geometry, then use _make_fast_buffer. + """ + from shapely.geometry import Point, LineString + eu = ExposureUtils() + if not coords: + return None + if len(coords) == 1: + geom = Point(coords[0]) + else: + geom = LineString(coords) + return eu._make_fast_buffer(geom, radius_m) + + def _make_fast_buffer(self, geom, radius_m: float): + """ + Perform local equidistant projection (+buffer) + inverse projection on given WGS84 Geometry. + Much faster than geodesic fwd version. + """ + # 1) Calculate centroid for projection center + centroid = geom.centroid + # 2) Construct Azimuthal Equidistant projection + aeqd = CRS.from_proj4(f"+proj=aeqd +lat_0={centroid.y} +lon_0={centroid.x} +units=m") + to_aeqd = Transformer.from_crs(4326, aeqd, always_xy=True).transform + to_wgs84 = Transformer.from_crs(aeqd, 4326, always_xy=True).transform + # 3) Project, Euclidean buffer, inverse project + proj = transform(to_aeqd, geom) + buf_proj = proj.buffer(radius_m) + return transform(to_wgs84, buf_proj) + + def __init__(self): + logging.basicConfig(level=logging.INFO) + self.logger = logging.getLogger(self.__class__.__name__) + + config_file = os.path.join( + os.path.dirname(__file__), + "..", "flaskapp", "exposure", "config.properties" + ) + if not os.path.exists(config_file): + self.logger.error(f"Config file not found at {config_file}") + sys.exit(1) + self.config = configparser.ConfigParser() + self.config.read(config_file) + try: + self.ENV_DATA_ENDPOINT_URL = self.config.get("DEFAULT", "ENV_DATA_ENDPOINT_URL") + self.TRAJECTORY_DB_HOST = self.config.get("DEFAULT", "TRAJECTORY_DB_HOST") + self.TRAJECTORY_DB_PASSWORD = self.config.get("DEFAULT", "TRAJECTORY_DB_PASSWORD") + except (configparser.NoOptionError, configparser.NoSectionError) as e: + self.logger.error(f"Config error: {e}") + sys.exit(1) + + # Reuse geodetic and projection instances + self.geod = Geod(ellps='WGS84') + self.trans_27700_to_4326 = Transformer.from_crs(27700, 4326, always_xy=True) + + self.DB_PORT = DB_URL.split(':')[-1].split('/')[0] + self.DB_USER = DB_USER + self.DB_NAME = DATABASE + + self.env_data_cache = {} + self.template_dir = os.path.join(os.path.dirname(__file__), "templates") + + def fetch_env_crs(self, env_data_iri: str, feature_type: str) -> str: + """ + Returns a CRS URI for a geometry; falls back to 4326 for POINT if not found. 
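+        Return values follow the OGC URI form, e.g. "http://www.opengis.net/def/crs/EPSG/0/27700" for data stored in the British National Grid.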
+ """ + # Minimal change: directly load template + template = self.load_template("get_crs_for_env_data.sparql") + sparql = template.format(env_data_iri=env_data_iri) + + try: + resp = requests.post( + self.ENV_DATA_ENDPOINT_URL, + data=sparql, + headers={ + "Content-Type": "application/sparql-query", + "Accept": "application/json", + }, + timeout=10 + ) + resp.raise_for_status() + bnd = resp.json().get("results", {}).get("bindings", []) + if bnd: + return bnd[0]["crs"]["value"] + except Exception: + # Ignore network/SPARQL issues + pass + + # If not found or error, force fallback to WGS84 for POINT + if feature_type.upper() == "POINT": + return "http://www.opengis.net/def/crs/EPSG/0/4326" + # AREA also falls back to 4326 (can be adjusted if needed) + return "http://www.opengis.net/def/crs/EPSG/0/4326" + + def exposure_calculation( + self, + trajectory_df: pd.DataFrame, + env_df: pd.DataFrame, + env_data_iri: str, + domain_label: str, + feature_type: str, + exposure_radius: float, + trajectory_iri: str + ) -> dict: + """ + AEQD-based high-speed exposure calculator. + 2025-06 – POINT: _count_at_time + – AREA : _area_at_time (m², geodesic) + """ + + # 1) ─── Extract trajectory coordinates ───────────────────────────────────────────── + coords_ll = [ + (float(r["LONGITUDE"]), float(r["LATITUDE"])) + for _, r in trajectory_df.iterrows() + if r["LONGITUDE"] and r["LATITUDE"] + ] + if not coords_ll: + return {"envFeatureName": domain_label, + "type": feature_type, + "count": 0, + "totalArea": None} + + # 2) ─── Trajectory geometry / 3) AEQD projection ────────────────────────────── + base_geom = Point(coords_ll[0]) if len(coords_ll) == 1 else LineString(coords_ll) + centroid = base_geom.centroid + aeqd_crs = CRS.from_proj4(f"+proj=aeqd +lat_0={centroid.y} +lon_0={centroid.x} +units=m") + to_aeqd = Transformer.from_crs(4326, aeqd_crs, always_xy=True).transform + to_wgs84 = Transformer.from_crs(aeqd_crs, 4326, always_xy=True).transform + + # 4) ─── Buffer entire trajectory once ───────────────────────────────────── + proj_base = transform(to_aeqd, base_geom) + buf_proj = proj_base.buffer(exposure_radius) + if buf_proj.is_empty: + return {"envFeatureName": domain_label, + "type": feature_type, + "count": 0, + "totalArea": None} + + # 5) ─── Environmental features → same plane ──────────────────────────────────── + proj_env = [] + if feature_type.upper() == "POINT": + for _, row in env_df.iterrows(): + try: + lon, lat = float(row["Longitude"]), float(row["Latitude"]) + proj_env.append((Point(to_aeqd(lon, lat)), row.get("ID", f"{lon},{lat}"))) + except Exception: + continue + else: + crs_uri = self.fetch_env_crs(env_data_iri, feature_type) + m = re.search(r"EPSG/0/(\d+)", crs_uri) + src_epsg = int(m.group(1)) if m else 4326 + to_env = Transformer.from_crs(f"EPSG:{src_epsg}", aeqd_crs, always_xy=True).transform + for _, row in env_df.iterrows(): + wkb_hex = row.get("Geometry", "") + if not wkb_hex or wkb_hex == "N/A": + continue + try: + poly_src = wkb_loads(binascii.unhexlify(wkb_hex), hex=True) + proj_env.append((transform(to_env, poly_src), None)) + except Exception: + continue + + # 6) ─── STRtree + cumulative exposure ───────────────────────────────────── + tree = STRtree([g for g, _ in proj_env]) + spatial_ix = tree.query(buf_proj) + + point_count = area_count = 0 + total_area = 0.0 + + if feature_type.upper() == "POINT": + visited = set() + for i in spatial_ix: + pt_proj, uid = proj_env[i] + if buf_proj.intersects(pt_proj): + visited.add(uid) + point_count = len(visited) + else: + for i in 
spatial_ix: + poly_proj, _ = proj_env[i] + inter_proj = poly_proj.intersection(buf_proj) + if inter_proj.is_empty: + continue + inter_ll = transform(to_wgs84, inter_proj) + total_area += ExposureUtils.geodesic_area(inter_ll) + area_count += 1 + + # 7) ─── Trip-mask (shared) ─────────────────────────────────────── + trip_mask = [True] * len(coords_ll) + for col in trajectory_df.columns: + if col.strip().lower().replace(" ", "_") == "trip_index": + vals = trajectory_df[col].fillna(0).astype(int) + if (vals > 0).any(): + trip_mask = vals > 0 + break + + # 8) ─── Time series exposure (POINT ➜ count; AREA ➜ area) ──────────────── + per_point_counts = None # for POINT + per_area_values = None # for AREA + + if feature_type.upper() == "POINT": + per_point_counts = [] + for (lon, lat), in_trip in zip(coords_ll, trip_mask): + if not in_trip: + per_point_counts.append(None) + continue + pt_proj = Point(to_aeqd(lon, lat)) + idxs_pt = tree.query(pt_proj.buffer(exposure_radius)) + cnt = sum( + 1 for i in idxs_pt + if pt_proj.distance(proj_env[i][0]) <= exposure_radius + ) + per_point_counts.append(cnt) + + elif feature_type.upper() == "AREA": + per_area_values = [] + buf_template = Point(0, 0).buffer(exposure_radius) # Reuse circle template + for (lon, lat), in_trip in zip(coords_ll, trip_mask): + if not in_trip: + per_area_values.append(None) + continue + pt_proj = Point(to_aeqd(lon, lat)) + circ_proj = translate(buf_template, pt_proj.x, pt_proj.y) + idxs_pt = tree.query(circ_proj) + area_m2 = 0.0 + for i in idxs_pt: + poly_proj, _ = proj_env[i] + inter_proj = poly_proj.intersection(circ_proj) + if inter_proj.is_empty: + continue + inter_ll = transform(to_wgs84, inter_proj) + area_m2 += ExposureUtils.geodesic_area(inter_ll) + per_area_values.append(area_m2 if area_m2 > 0 else 0.0) + + # 9) ─── Write back to database ───────────────────────────────────────────── + conn = None + try: + conn = self.connect_to_database(self.TRAJECTORY_DB_HOST, self.DB_PORT, + self.DB_USER, self.TRAJECTORY_DB_PASSWORD, self.DB_NAME) + table_name = self.get_table_name_for_timeseries(conn, trajectory_iri) + safe_label = domain_label.replace(" ", "_").replace("/", "_") + + col_tot_point = f"{safe_label}_point_count" + col_tot_area = f"{safe_label}_area_count" + col_tot_area_m = f"{safe_label}_total_area" + col_pt_time = f"{safe_label}_count_at_time" + col_ar_time = f"{safe_label}_area_at_time" + + # 9a) Ensure columns exist + cols_to_add = [ + (col_tot_point, "INTEGER"), + (col_tot_area, "INTEGER"), + (col_tot_area_m, "DOUBLE PRECISION"), + ] + if per_point_counts is not None: + cols_to_add.append((col_pt_time, "INTEGER")) + if per_area_values is not None: + cols_to_add.append((col_ar_time, "DOUBLE PRECISION")) + + for col, dtype in cols_to_add: + alter_sql = f""" + DO $$ BEGIN + IF NOT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = %s AND column_name = %s + ) THEN + EXECUTE 'ALTER TABLE "{table_name}" ADD COLUMN "{col}" {dtype}'; + END IF; + END $$; + """ + self.execute_query(conn, alter_sql, (table_name, col)) + + # Step 2: use -1 as the flag, reflecting the incomplete calculation + init_sql = f'UPDATE "{table_name}" SET "{col}" = -1 WHERE "{col}" IS NULL;' + self.execute_query(conn, init_sql) + + # 9b) Update cumulative columns + has_tsiri = bool(self.execute_query( + conn, + "SELECT 1 FROM information_schema.columns " + "WHERE table_name = %s AND column_name = 'time_series_iri';", + (table_name,))) + + if has_tsiri: + upd_sql = f""" + UPDATE "{table_name}" + SET "{col_tot_point}" = %s, + 
"{col_tot_area}" = %s, + "{col_tot_area_m}"= %s + WHERE "time_series_iri" = %s; + """ + params = (point_count, area_count, total_area, trajectory_iri) + else: + upd_sql = f""" + UPDATE "{table_name}" + SET "{col_tot_point}" = %s, + "{col_tot_area}" = %s, + "{col_tot_area_m}"= %s; + """ + params = (point_count, area_count, total_area) + + self.execute_query(conn, upd_sql, params) + + # 9c) Write per-time series + def write_series(col_name, series_vals): + if not series_vals: + return + values = [ + (pd.to_datetime(r["time"]), v) + for (_idx, r), v in zip(trajectory_df.iterrows(), series_vals) + if v is not None + ] + if not values: + return + # Determine connection type + is_psycopg = True + try: + _ = conn.cursor() + _.close() + except Exception: + is_psycopg = False + + if is_psycopg: + cur = conn.cursor() + execute_values( + cur, + f""" + UPDATE "{table_name}" AS t + SET "{col_name}" = v.val + FROM (VALUES %s) AS v(ts, val) + WHERE t."time" = v.ts; + """, + values + ) + else: + for ts, val in values: + s_sql = f''' + UPDATE "{table_name}" + SET "{col_name}" = %s + WHERE "time" = %s; + ''' + self.execute_query(conn, s_sql, (val, ts)) + + if per_point_counts is not None: + write_series(col_pt_time, per_point_counts) + if per_area_values is not None: + write_series(col_ar_time, per_area_values) + + # 9d) Commit (psycopg2 only) + self.commit_if_psycopg2(conn) + + except Exception as e: + self.logger.error(f"Failed to update the database: {e}") + finally: + if conn: + conn.close() + + # 10) ─── Return result dict ──────────────────────────────────────── + result = { + "envFeatureName": domain_label, + "type": feature_type, + "count": point_count if feature_type.upper() == "POINT" else area_count, + } + if feature_type.upper() == "AREA": + result["totalArea"] = total_area + if per_area_values is not None: + result["area_at_time"] = per_area_values + if per_point_counts is not None: + result["count_at_time"] = per_point_counts + return result + + def commit_if_psycopg2(self, conn): + try: + test_cursor = conn.cursor() # Will throw an exception if using JDBC + test_cursor.close() + # Success means we're using psycopg2 + conn.commit() + self.logger.debug("Committed (psycopg2).") + except: + self.logger.debug("Skipping commit (TSClient autoCommit).") + + def load_template(self, filename: str) -> str: + filepath = os.path.join(self.template_dir, filename) + with open(filepath, "r", encoding="utf-8") as f: + content = f.read() + return content + + def connect_to_database(self, host: str, port: int, user: str, password: str, database: str) -> psycopg2.extensions.connection: + """ + If TRAJECTORY_DB_HOST or TRAJECTORY_DB_PASSWORD is empty, consider external configuration incomplete, + use internal connection method from TSClient; otherwise use psycopg2 to connect to external server. + Note: Caller needs to manually call conn.close() after using the connection. 
+ """ + if not host.strip() or not password.strip(): + self.logger.info("External DB parameters not provided; using internal TSClient connection via setup_clients().") + try: + # Call setup_clients() from gps_client module, which has already successfully initialized TSClient (internal connection method) + import agent.datainstantiation.gps_client as gdi + kg_client, ts_client, double_class, point_class = gdi.setup_clients() + # Use TSClient's connection method, return internal connection + conn = ts_client.connection.getConnection() + self.logger.info("Internal TSClient connection established successfully.") + return conn + except Exception as ex: + self.logger.error(f"Internal TSClient connection failed: {ex}") + raise ex + else: + self.logger.info(f"Connecting to database {database} at {host}:{port} using external parameters...") + try: + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + database=database, + connect_timeout=10 + ) + self.logger.info("External DB connection established successfully.") + return conn + except psycopg2.Error as e: + self.logger.error(f"External DB connection failed: {e}") + raise e + + def execute_query(self, connection, query: str, params: Optional[tuple] = None): + try: + do_fetch = False + first = query.strip().upper() + if first.startswith("WITH") or first.startswith("SELECT"): + do_fetch = True + + start_time = time.time() + # Try getting a cursor via psycopg2 + try: + cursor = connection.cursor() + using_psycopg = True + except Exception: + # If .cursor() throws an exception (including Py4JException), we assume it's TSClient JDBC + cursor = connection.createStatement() + using_psycopg = False + + self.logger.debug(f"Executing query: {query}, params={params}") + + if using_psycopg: + # psycopg2 mode + cursor.execute(query, params) + else: + # JDBC mode + if params: + # For each parameter: escape special characters and wrap in quotes + safe_params = [] + for p in params: + if p is None: + # For NULL parameters, use the 'NULL' string + safe_params.append("NULL") + continue + p_escaped = str(p).replace("'", "''") + safe_params.append(f"'{p_escaped}'") + query = query % tuple(safe_params) + + cursor.execute(query) + + elapsed = time.time() - start_time + + if do_fetch: + try: + if using_psycopg: + rows = cursor.fetchall() + else: + rs = cursor.getResultSet() + rows = [] + meta = rs.getMetaData() + ncols = meta.getColumnCount() + while rs.next(): + row = tuple(rs.getString(i + 1) for i in range(ncols)) + rows.append(row) + self.logger.info(f"Query returned {len(rows)} rows in {elapsed:.3f}s.") + return rows + except Exception as fetch_e: + self.logger.error(f"Error fetching query results: {fetch_e}") + raise fetch_e + else: + self.logger.info(f"Query executed (no fetch) in {elapsed:.3f}s.") + return None + + except Exception as e: + self.logger.error(f"Query execution error: {e}") + raise e + + def get_table_name_for_timeseries(self, conn, timeseriesIRI: str) -> str: + try: + query_template = self.load_template("get_table_name_for_timeseries.sql") + rows = self.execute_query(conn, query_template, (timeseriesIRI,)) + if rows: + return rows[0][0] + except Exception as e: + self.logger.warning(f"[Fallback] Primary get_table_name failed: {e}") + + # Try fallback + fallback_template = self.load_template("get_table_name_for_timeseries_fallback.sql") + rows = self.execute_query(conn, fallback_template, (timeseriesIRI,)) + if not rows: + raise ValueError(f"No fallback tableName found for timeseriesIRI: {timeseriesIRI}") + return 
rows[0][0] + + def get_timeseries_data(self, conn, table_name: str, trajectory_iri: str) -> pd.DataFrame: + """ + Retrieve raw timeseries data from database. + Automatically detects whether 'Trip index' column exists and selects the correct SQL template. + Returns DataFrame with appropriate columns. + """ + try: + check_sql = self.load_template("check_trip_index_column.sql") + result = self.execute_query(conn, check_sql, (table_name,)) + has_trip_index = bool(result) + except Exception as e: + self.logger.warning(f"[Trip index] Failed to check for 'Trip index' column: {e}") + has_trip_index = False + + try: + if has_trip_index: + self.logger.info(f"[Trip index] Detected in table: {table_name}") + sql_template_name = "get_timeseries_data_with_trip.sql" + else: + self.logger.info(f"[Trip index] Not found, using fallback SQL.") + sql_template_name = "get_timeseries_data.sql" + + query_template = self.load_template(sql_template_name) + query = query_template.format(table_name=table_name) + rows = self.execute_query(conn, query) + + base_cols = ["time", "column1", "column2", "column3", "column4", "column5", "column6", "column7"] + if has_trip_index: + base_cols.append("Trip index") + + return pd.DataFrame(rows, columns=base_cols) + + except Exception as primary_exception: + self.logger.warning(f"[Fallback] Primary SQL failed for {table_name}: {primary_exception}") + + # Run fallback version + try: + fallback_template = self.load_template("get_timeseries_data_fallback.sql") + query = fallback_template.format(table_name=table_name) + rows = self.execute_query(conn, query, (trajectory_iri,)) + + return pd.DataFrame(rows, columns=["time", "LATITUDE", "LONGITUDE"]) + + except Exception as fallback_exception: + self.logger.error(f"[Fallback] Failed for {table_name}: {fallback_exception}") + raise fallback_exception + + + def fetch_trajectory_data_from_db(self, trajectory_iri: str) -> pd.DataFrame: + """ + Fetch and process trajectory data from database. + Handles basic column renaming and time formatting. 
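+        Columns column1..column7 are renamed to SPEED, DISTANCE, HEIGHT, HEADING, LATITUDE, LONGITUDE and POINT, rows are sorted by time, and the earliest timestamp is stored in a 'trajectory_start_time' column.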
+ """ + self.logger.info("==== fetch_trajectory_data_from_db() called ====") + conn = None + try: + conn = self.connect_to_database( + self.TRAJECTORY_DB_HOST, + self.DB_PORT, + self.DB_USER, + self.TRAJECTORY_DB_PASSWORD, + self.DB_NAME + ) + table_name = self.get_table_name_for_timeseries(conn, trajectory_iri) + df = self.get_timeseries_data(conn, table_name, trajectory_iri) + finally: + if conn: + conn.close() + + df = df.rename(columns={ + "column1": "SPEED", + "column2": "DISTANCE", + "column3": "HEIGHT", + "column4": "HEADING", + "column5": "LATITUDE", + "column6": "LONGITUDE", + "column7": "POINT" + }) + + if 'time' in df.columns: + df['time'] = pd.to_datetime(df['time'], errors='coerce') + df = df.sort_values('time').reset_index(drop=True) + + # Find earliest time, store to df['trajectory_start_time'] + if not df.empty: + earliest_dt = df['time'].iloc[0] + earliest_str = earliest_dt.strftime('%Y-%m-%dT%H:%M:%SZ') + df['trajectory_start_time'] = earliest_str + self.logger.info(f"Trajectory earliest time = {earliest_str}") + else: + self.logger.warning("Trajectory DataFrame is empty; no earliest time") + + return df + + def get_domain_and_featuretype(self, env_data_iri: str, endpoint_url: Optional[str] = None) -> Tuple[str, str]: + if endpoint_url is None: + endpoint_url = self.ENV_DATA_ENDPOINT_URL + template = self.load_template("get_domain_and_featuretype.sparql") + query = template.format(env_data_iri=env_data_iri) + headers = {"Content-Type": "application/sparql-query", "Accept": "application/json"} + resp = requests.post(endpoint_url, data=query, headers=headers, timeout=30) + resp.raise_for_status() + results = resp.json().get("results", {}).get("bindings", []) + if not results: + raise ValueError(f"No domainName/featureType found for {env_data_iri}") + domain_name = results[0]["domainName"]["value"] + feature_type = results[0]["featureType"]["value"] + self.logger.info(f"For {env_data_iri}, domainName={domain_name}, featureType={feature_type}") + return domain_name, feature_type + + # ---- NEW OR MODIFIED ---- + # => Here we use absolute-time-filter.sparql + def fetch_env_data(self, env_data_iri: str, endpoint_url: Optional[str] = None, + reference_time: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + if endpoint_url is None: + endpoint_url = self.ENV_DATA_ENDPOINT_URL + + domain_label, feature_type = self.get_domain_and_featuretype(env_data_iri, endpoint_url) + dl_lower = domain_label.strip().lower() + + self.logger.info(f"[fetch_env_data] Processing env_data_iri={env_data_iri}, domain_label={domain_label}, feature_type={feature_type}") + + if env_data_iri in self.env_data_cache and reference_time is None: + self.logger.info(f"[fetch_env_data] Cache hit for env_data_iri: {env_data_iri}") + return self.env_data_cache[env_data_iri] + + if feature_type.upper() == "POINT" and dl_lower.startswith("food hygiene rating") and reference_time is not None: + self.logger.info(f"[fetch_env_data] Using absolute-time-filter for {env_data_iri} at {reference_time}") + template_file = "absolute_time_filter.sparql" # You might have named it absolute-time-filter.sparql + sparql_query = self.load_template(template_file).format(given_time=reference_time) + headers = {"Content-Type": "application/sparql-query", "Accept": "application/json"} + resp = requests.post(endpoint_url, data=sparql_query, headers=headers, timeout=30) + resp.raise_for_status() + json_res = resp.json() + data = [] + for b in json_res["results"]["bindings"]: + lat = b.get("lat", {}).get("value") + lon = b.get("long", 
{}).get("value") + data.append({ + "ID": b.get("be", {}).get("value", "N/A"), + "Latitude": lat if lat else "N/A", + "Longitude": lon if lon else "N/A", + "WKT": b.get("wkt", {}).get("value", "N/A"), + }) + df = pd.DataFrame(data) + self.logger.info(f"[POINT - TimeFilter] Extracted DataFrame with {len(df)} rows for {env_data_iri}") + domain_label_final = domain_label # Keep original domain label + + elif feature_type.upper() == "POINT": + if dl_lower.startswith("food hygiene rating"): + template_file = "get_food_hygiene_rating.sparql" + else: + template_file = "get_point_generic.sparql" + sparql_query = self.load_template(template_file).format(env_data_iri=env_data_iri) + headers = {"Content-Type": "application/sparql-query", "Accept": "application/json"} + resp = requests.post(endpoint_url, data=sparql_query, headers=headers, timeout=30) + resp.raise_for_status() + json_res = resp.json() + data = [ + { + "ID": b.get("id", {}).get("value", "N/A"), + "Longitude": b.get("longitude", {}).get("value", "N/A"), + "Latitude": b.get("latitude", {}).get("value", "N/A"), + } + for b in json_res["results"]["bindings"] + ] + df = pd.DataFrame(data) + self.logger.info(f"[POINT] Extracted DataFrame with {len(df)} rows for {env_data_iri}") + domain_label_final = domain_label + + elif feature_type.upper() == "AREA": + if dl_lower.startswith("greenspace"): + template_file = "get_greenspace_area.sparql" + else: + template_file = "get_area_generic.sparql" + sparql_query = self.load_template(template_file).format(env_data_iri=env_data_iri) + headers = {"Content-Type": "application/sparql-query", "Accept": "application/json"} + resp = requests.post(endpoint_url, data=sparql_query, headers=headers, timeout=30) + resp.raise_for_status() + json_res = resp.json() + data = [ + { + "Feature": b.get("feature", {}).get("value", "N/A"), + "Geometry": b.get("geometry", {}).get("value", "N/A"), + } + for b in json_res["results"]["bindings"] + ] + df = pd.DataFrame(data) + self.logger.info(f"[AREA] Extracted DataFrame with {len(df)} rows for {env_data_iri}") + domain_label_final = domain_label + else: + self.logger.warning(f"featureType={feature_type} unknown for {env_data_iri}, returning empty df.") + df = pd.DataFrame([]) + domain_label_final = domain_label + + if reference_time is None: + self.env_data_cache[env_data_iri] = (df, domain_label_final) + + return df, domain_label_final + + def fetch_domain_and_data_sources(self, env_data_iri: str) -> List[Dict[str, str]]: + template = self.load_template("get_domain_and_data_sources.sparql") + query = template.format(env_data_iri=env_data_iri) + headers = {"Content-Type": "application/sparql-query", "Accept": "application/json"} + resp = requests.post(self.ENV_DATA_ENDPOINT_URL, data=query, headers=headers, timeout=30) + resp.raise_for_status() + res_json = resp.json() + bindings = res_json.get("results", {}).get("bindings", []) + results = [] + for b in bindings: + results.append({ + "domainIRI": b["domainIRI"]["value"], + "domainName": b["domainName"]["value"], + "dataSourceName": b["dataSourceName"]["value"], + }) + return results + + def create_standard_trajectory_table(self, table_name: str, conn): + """ + Creates a standardized trajectory table only for internal use when importing external data. + This SQL is defined inline rather than as a template since it's a utility function + for internal data standardization rather than a query template. 
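+        The table holds only "time", "LATITUDE" and "LONGITUDE" columns and is created with IF NOT EXISTS, so repeated calls are harmless.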
+ """ + create_sql = f''' + CREATE TABLE IF NOT EXISTS "{table_name}" ( + "time" TIMESTAMP, + "LATITUDE" DOUBLE PRECISION, + "LONGITUDE" DOUBLE PRECISION + ); + ''' + self.execute_query(conn, create_sql) + self.logger.info(f"[Fallback] Created table {table_name} with standard structure.") + + + def calculate_exposure_simplified_util(self, data: dict) -> List[Dict]: + trajectoryIRIs = data.get("trajectoryIRIs", []) + exposure_radius = data.get("exposure_radius", 100) + dataIRIs = data.get("DataIRIs", []) + final_results = [] + + try: + conn = self.connect_to_database( + host=self.TRAJECTORY_DB_HOST, + port=self.DB_PORT, + user=self.DB_USER, + password=self.TRAJECTORY_DB_PASSWORD, + database=self.DB_NAME + ) + except Exception as e: + raise Exception(f"Failed to connect DB: {str(e)}") + + for trajectory_iri in trajectoryIRIs: + try: + table_name = self.get_table_name_for_timeseries(conn, trajectory_iri) + except Exception as e: + conn.close() + raise Exception(f"Failed to get table_name for {trajectory_iri}: {str(e)}") + + for env_data_iri in dataIRIs: + try: + domain_name, feature_type = self.get_domain_and_featuretype(env_data_iri, self.ENV_DATA_ENDPOINT_URL) + ds_rows = self.fetch_domain_and_data_sources(env_data_iri) + if not ds_rows: + final_results.append({ + "trajectory_iri": trajectory_iri, + "env_data_iri": env_data_iri, + "error": "No dataSourceName found" + }) + continue + + dl_lower = domain_name.strip().lower() + row_count = 0 + + if feature_type.upper() == "AREA": + if dl_lower.startswith("greenspace"): + sql_template = self.load_template("simplified_get_area_greenspace.sql") + query = sql_template.format(exposure_radius=exposure_radius, table_name=table_name) + rows = self.execute_query(conn, query) + row_count = len(rows) + else: + union_parts = [] + for item in ds_rows: + ds_name = item["dataSourceName"] + union_parts.append(f'SELECT wkb_geometry FROM public."{ds_name}"') + union_sql = "\nUNION ALL\n".join(union_parts) + sql_template = self.load_template("simplified_get_area_generic.sql") + query = sql_template.format(exposure_radius=exposure_radius, table_name=table_name, union_sql=union_sql) + rows = self.execute_query(conn, query) + row_count = len(rows) + elif feature_type.upper() == "POINT": + if dl_lower.startswith("food hygiene rating"): + union_parts = [] + for item in ds_rows: + ds_name = item["dataSourceName"] + union_parts.append(f''' + SELECT "Name", "Address", geom + FROM public."{ds_name}" + ''') + union_sql = "\nUNION ALL\n".join(union_parts) + sql_template = self.load_template("simplified_get_point_food_hygiene.sql") + query = sql_template.format(exposure_radius=exposure_radius, table_name=table_name, union_sql=union_sql) + point_rows = self.execute_query(conn, query) + row_count = sum(r[3] for r in point_rows) + else: + union_parts = [] + for item in ds_rows: + ds_name = item["dataSourceName"] + union_parts.append(f'SELECT geom FROM public."{ds_name}"') + union_sql = "\nUNION ALL\n".join(union_parts) + sql_template = self.load_template("simplified_get_point_generic.sql") + query = sql_template.format(exposure_radius=exposure_radius, table_name=table_name, union_sql=union_sql) + rows = self.execute_query(conn, query) + row_count = len(rows) + else: + final_results.append({ + "trajectory_iri": trajectory_iri, + "env_data_iri": env_data_iri, + "error": f"Unknown featureType={feature_type}" + }) + continue + + safe_column_name = f"{domain_name}_Count".replace(" ", "_").replace("/", "_") + alter_sql = f""" + DO $$ BEGIN + IF NOT EXISTS ( + SELECT FROM 
information_schema.columns + WHERE table_name = %s AND column_name = %s + ) THEN + EXECUTE 'ALTER TABLE "{table_name}" ADD COLUMN "{safe_column_name}" INTEGER'; + END IF; + END $$; + """ + self.execute_query(conn, alter_sql, (table_name, safe_column_name)) + + update_sql = f''' + UPDATE "{table_name}" + SET "{safe_column_name}" = %s + ''' + self.execute_query(conn, update_sql, (row_count,)) + self.commit_if_psycopg2(conn) + + final_results.append({ + "trajectory_iri": trajectory_iri, + "env_data_iri": env_data_iri, + "domain_name": domain_name, + "feature_type": feature_type, + "updated_column": safe_column_name, + "row_count": row_count, + "table_name": table_name + }) + except Exception as e: + final_results.append({ + "trajectory_iri": trajectory_iri, + "env_data_iri": env_data_iri, + "error": str(e) + }) + if conn: + conn.close() + return final_results + + def detect_trips_util(self, trajectory_iri: str) -> dict: + """ + Performs trip detection on a trajectory and updates the database with results. + Args: + trajectory_iri: The IRI of the trajectory to analyze + Returns: + dict: Status of the operation + """ + if not trajectory_iri: + raise ValueError("trajectoryIRI is required") + + try: + trajectory_df = self.fetch_trajectory_data_from_db(trajectory_iri) + if 'time' not in trajectory_df.columns: + raise ValueError("Required column 'time' not found in trajectory data") + if trajectory_df['time'].isna().all(): + raise ValueError("No valid time data found in trajectory") + + trajectory_df['Timestamp'] = pd.to_datetime(trajectory_df['time']) + self.logger.info(f"Created Timestamp column from time data. Sample values:") + self.logger.info(f"First 3 values: {trajectory_df['Timestamp'].head(3).tolist()}") + self.logger.info(f"Total rows: {len(trajectory_df)}, Valid Timestamp values: {trajectory_df['Timestamp'].notna().sum()}") + + if trajectory_df['Timestamp'].isna().all(): + raise ValueError("Failed to create valid Timestamp column from time data") + + for col in ['LATITUDE', 'LONGITUDE', 'SPEED', 'DISTANCE', 'HEIGHT', 'HEADING']: + if col not in trajectory_df.columns: + raise ValueError(f"Required column '{col}' not found in trajectory data") + if trajectory_df[col].isna().all(): + raise ValueError(f"No valid data found in column '{col}'") + trajectory_df[col] = pd.to_numeric(trajectory_df[col], errors='coerce') + if col not in ['LATITUDE', 'LONGITUDE']: + trajectory_df[col] = trajectory_df[col].fillna(0) + + trajectory_df = trajectory_df.sort_values('Timestamp').reset_index(drop=True) + trajectory_df['original_index'] = trajectory_df.index + + except Exception as e: + self.logger.error(f"Failed to prepare trajectory data: {e}") + raise + + columns_mapping = { + "utc_date": "Timestamp", + "lat": "LATITUDE", + "lon": "LONGITUDE", + "utm_n": "y_utm_sd", + "utm_e": "x_utm_sd" + } + + if "y_utm_sd" in trajectory_df.columns: + trajectory_df.drop(columns=["y_utm_sd"], inplace=True) + if "x_utm_sd" in trajectory_df.columns: + trajectory_df.drop(columns=["x_utm_sd"], inplace=True) + + valid_coords = trajectory_df[trajectory_df['LATITUDE'].notna() & trajectory_df['LONGITUDE'].notna()] + if valid_coords.empty: + raise ValueError("No valid coordinates found in trajectory data") + + lat, lon = float(valid_coords.iloc[0]['LATITUDE']), float(valid_coords.iloc[0]['LONGITUDE']) + utm_code = f"EPSG:{wgs_to_utm_code(lat, lon)}" + self.logger.info(f"Using projection {utm_code} based on coordinates: lat={lat}, lon={lon}") + + try: + self.logger.info("Starting trip detection with DataFrame shape: %s", 
trajectory_df.shape) + detected_gps, incidents_table, width, height, cell_size = detect_trips( + trajectory_df.copy(), + trajectory_iri, + columns_mapping, + params=DEFAULT_PARAMS, + interpolate_helper_func=None, + code=utm_code + ) + self.logger.info("Trip detection completed successfully") + self.logger.info("Results - detected_gps shape: %s", detected_gps.shape if detected_gps is not None else "None") + self.logger.info("Results - incidents_table shape: %s", incidents_table.shape if incidents_table is not None else "None") + + if detected_gps is not None: + result_df = pd.DataFrame(index=trajectory_df.index) + result_df['time'] = trajectory_df['time'] + + trip_columns = [ + ("Points in segment", ""), + ("x_utm_sd", None), + ("y_utm_sd", None), + ("kernel", 0.0), + ("zone", -1), + ("norm_kernel", 0.0), + ("modfied_kernel", 0.0), + ("norm_modified_kernel", 0.0), + ("snap_to_hs", 0), + ("Trip index", -1), + ("Visit index", -1), + ("Gap", 0) + ] + + detected_gps = detected_gps.reset_index(drop=True) + result_df = result_df.reset_index(drop=True) + + if len(detected_gps) != len(result_df): + self.logger.warning(f"Length mismatch: detected_gps={len(detected_gps)}, result_df={len(result_df)}") + if 'original_index' in detected_gps.columns: + detected_gps = detected_gps.set_index('original_index') + result_df = result_df.reindex(detected_gps.index) + detected_gps = detected_gps.reset_index() + result_df = result_df.reset_index() + else: + min_len = min(len(detected_gps), len(result_df)) + detected_gps = detected_gps.iloc[:min_len] + result_df = result_df.iloc[:min_len] + + for col, default_value in trip_columns: + try: + result_df[col] = detected_gps[col].values if col in detected_gps.columns else pd.Series([default_value] * len(result_df)) + except Exception as e: + self.logger.warning(f"Error copying column {col}: {e}") + result_df[col] = pd.Series([default_value] * len(result_df)) + + detected_gps = result_df + + except Exception as e: + self.logger.error(f"Trip detection failed: {e}") + self.logger.error("DataFrame state at failure:") + self.logger.error(f"Columns: {trajectory_df.columns.tolist()}") + self.logger.error(f"Data types: {trajectory_df.dtypes.to_dict()}") + raise + + conn = None + try: + conn = self.connect_to_database( + self.TRAJECTORY_DB_HOST, + self.DB_PORT, + self.DB_USER, + self.TRAJECTORY_DB_PASSWORD, + self.DB_NAME + ) + table_name = self.get_table_name_for_timeseries(conn, trajectory_iri) + + add_template = self.load_template("add_trip_detection_columns.sql") + for col_name, col_type in [ + ("Points in segment", "TEXT"), + ("x_utm_sd", "DOUBLE PRECISION"), + ("y_utm_sd", "DOUBLE PRECISION"), + ("kernel", "DOUBLE PRECISION"), + ("zone", "INTEGER"), + ("norm_kernel", "DOUBLE PRECISION"), + ("modfied_kernel", "DOUBLE PRECISION"), + ("norm_modified_kernel", "DOUBLE PRECISION"), + ("snap_to_hs", "INTEGER"), + ("Trip index", "INTEGER"), + ("Visit index", "INTEGER"), + ("Gap", "INTEGER"), + ("utm_zone", "TEXT") + ]: + alter_sql = add_template.format(table_name=table_name, column_name=col_name, col_type=col_type) + self.execute_query(conn, alter_sql) + + update_sql_template = self.load_template("update_trip_detection_results.sql") + update_sql = update_sql_template.format(table_name=table_name) + + for _, row in detected_gps.iterrows(): + values = ( + row.get("Points in segment"), + row.get("x_utm_sd"), + row.get("y_utm_sd"), + row.get("kernel"), + row.get("zone"), + row.get("norm_kernel"), + row.get("modfied_kernel"), + row.get("norm_modified_kernel"), + row.get("snap_to_hs"), 
+ row.get("Trip index"), + row.get("Visit index"), + row.get("Gap"), + row.get("time") + ) + self.execute_query(conn, update_sql, values) + + self.commit_if_psycopg2(conn) + return { + "message": "Trip detection applied and table updated successfully.", + "projection_used": utm_code, + "processed_rows": len(detected_gps) + } + + except Exception as e: + self.logger.error(f"Database operation failed: {e}") + raise + finally: + if conn: + conn.close() + + diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/function_lib.py b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/function_lib.py new file mode 100644 index 00000000000..0c0e0dd5302 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/function_lib.py @@ -0,0 +1,470 @@ +""" +This script demonstrates an exposure calculation pipeline with PROJECTION: +1) Read a trajectory from CSV (time in ms, lat/lon) and reproject to EPSG:27700 +2) Slice the trajectory by a given time interval (in seconds) +3) For each slice, create a buffer geometry (in meters, thanks to EPSG:27700) +4) Construct a SPARQL query to Blazegraph, which in turn federates to Ontop +5) Filter the returned WKT by intersection with the buffer +6) Collect unique business establishments as exposure results + +Now the 100m buffer is truly 100 meters, because we do a reproject. + +Additionally, we reproject each returned geometry from EPSG:4326 to EPSG:27700 +before intersection, ensuring coordinate compatibility. + +Author: Jiying Chen(jc2341@cam.ac.uk) March 2025 +""" + +import csv +import time +import datetime +import requests +import json +import pandas as pd +from shapely.geometry import Point, LineString +from shapely import wkt +from shapely.ops import transform, nearest_points +from pyproj import Transformer + +############################## +# Coordinate Projection Settings +############################## +# If the raw data is in WGS84 (EPSG:4326) and you need distances in meters for the UK, +# then project the points to EPSG:27700. +project_4326_to_27700 = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True).transform +project_27700_to_4326 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True).transform + +def read_trajectory(csv_path): + """ + Reads trajectory data from a CSV file. The CSV must contain the following columns: + - 'time': Unix timestamp in milliseconds + - 'LATITUDE' and 'LONGITUDE' + Converts each point to a datetime and projects it to EPSG:27700 (meters). + Returns a time-sorted list, where each element is (timestamp, (x, y)). + """ + points = [] + df = pd.read_csv(csv_path) + for _, row in df.iterrows(): + ts_ms = int(row['time']) + ts = datetime.datetime.utcfromtimestamp(ts_ms / 1000) + lat = float(row['LATITUDE']) + lon = float(row['LONGITUDE']) + pt_4326 = Point(lon, lat) + pt_27700 = transform(project_4326_to_27700, pt_4326) + points.append((ts, (pt_27700.x, pt_27700.y))) + points.sort(key=lambda x: x[0]) + print(f"[INFO] Loaded {len(points)} trajectory points from {csv_path} and projected them to EPSG:27700.") + return points + +def get_interpolated_point(points, target_time): + """ + For a given sorted list of trajectory points (timestamp, (x, y)) and a target time, + returns the point if the target time exactly exists in the sample; + otherwise, computes the coordinate for the target time using linear interpolation. 
+ """ + if target_time <= points[0][0]: + return points[0] + if target_time >= points[-1][0]: + return points[-1] + for i in range(len(points) - 1): + t1, coord1 = points[i] + t2, coord2 = points[i + 1] + if t1 <= target_time <= t2: + if target_time == t1: + return points[i] + if target_time == t2: + return points[i + 1] + total_seconds = (t2 - t1).total_seconds() + elapsed = (target_time - t1).total_seconds() + fraction = elapsed / total_seconds + x = coord1[0] + fraction * (coord2[0] - coord1[0]) + y = coord1[1] + fraction * (coord2[1] - coord1[1]) + return (target_time, (x, y)) + return points[-1] + +def slice_trajectory_with_interpolation(points, delta_seconds): + """ + Splits the trajectory into segments based on the specified time interval (in seconds). + For each segment, if the start or end time is not exactly a sample, calculate the corresponding point using interpolation. + Returns a list where each element is (slice_points, mid_time), with slice_points including the interpolated start point, + all in-between sample points, and the interpolated end point; mid_time represents the midpoint time of the segment. + """ + slices = [] + if not points: + return slices + current_start = points[0][0] + last_time = points[-1][0] + while current_start < last_time: + current_end = current_start + datetime.timedelta(seconds=delta_seconds) + if current_end > last_time: + current_end = last_time + slice_points = [] + start_point = get_interpolated_point(points, current_start) + slice_points.append(start_point) + for pt in points: + if pt[0] > current_start and pt[0] < current_end: + slice_points.append(pt) + end_point = get_interpolated_point(points, current_end) + slice_points.append(end_point) + duration = current_end - current_start + mid_time = current_start + duration / 2 + print(f"[DEBUG] Created slice: start {current_start.isoformat()}, end {current_end.isoformat()}, midpoint {mid_time.isoformat()}, containing {len(slice_points)} points") + slices.append((slice_points, mid_time)) + current_start = current_end + return slices + +def create_full_buffer(points, buffer_radius): + """ + Generates a buffer for the entire trajectory. + If the trajectory has only one point, apply the buffer directly to that point; otherwise, construct a LineString and buffer it. + """ + coords = [coord for (_, coord) in points] + if len(coords) == 1: + geom = Point(coords[0]) + else: + geom = LineString(coords) + return geom.buffer(buffer_radius) + +def compute_buffer(slice_points, buffer_radius): + """ + Generates a buffer based on all points in a slice (projected to EPSG:27700). + If the slice has only one point, return a circular buffer; otherwise, generate a buffer from the LineString. + """ + coords = [coord for (_, coord) in slice_points] + if len(coords) == 1: + geom = Point(coords[0]) + else: + geom = LineString(coords) + return geom.buffer(buffer_radius) + +def format_timestamp(dt): + """ + Formats a datetime object into a literal string allowed by xsd:dateTimeStamp. + If microseconds are present, truncate to milliseconds and append "Z" to indicate UTC. + Format examples: YYYY-MM-DDTHH:MM:SS.mmmZ or YYYY-MM-DDTHH:MM:SSZ. 
+ """ + if dt.microsecond == 0: + return dt.strftime("%Y-%m-%dT%H:%M:%SZ") + else: + ms = int(dt.microsecond / 1000) + return dt.strftime("%Y-%m-%dT%H:%M:%S") + f".{ms:03d}Z" + +############################################################################### +# SPARQL Query Construction (Dynamic Time Binding and Regex) +############################################################################### +def query_sparql(dt_start, dt_end, blazegraph_endpoint): + """ + Dynamically constructs a SPARQL query using the start and end times of a slice. + Extracts the weekday and time string (HH:MM) from dt_start and dt_end to build a regular expression filter for opening_hours, + while binding the slice's start and end times as xsd:dateTimeStamp. + + Args: + dt_start (datetime): The slice's start time. + dt_end (datetime): The slice's end time. + blazegraph_endpoint (str): The Blazegraph SPARQL endpoint URL. + + Returns: + The JSON data of the query results; returns None if an error occurs. + """ + # Format as a string allowed by xsd:dateTimeStamp + formatted_start = format_timestamp(dt_start) + formatted_end = format_timestamp(dt_end) + + # Dynamically extract weekday and HH:MM strings + weekday_map = {"Mon": "Mo", "Tue": "Tu", "Wed": "We", "Thu": "Th", "Fri": "Fr", "Sat": "Sa", "Sun": "Su"} + weekday_start = weekday_map[dt_start.strftime("%a")] + time_str_start = dt_start.strftime("%H:%M") + weekday_end = weekday_map[dt_end.strftime("%a")] + time_str_end = dt_end.strftime("%H:%M") + + # Construct regex patterns; note that four backslashes represent two backslashes when passed to SPARQL + query = f""" +PREFIX fh: +PREFIX ies4: +PREFIX geo: +PREFIX wgs: +PREFIX xsd: +PREFIX osmkey: + +SELECT DISTINCT ?be ?geometry ?wkt ?lat ?long ?openingHours +WHERE {{ + # Bind the dynamically extracted weekday and time (HH:MM) + BIND("{weekday_start}" AS ?weekdayStart) + BIND("{time_str_start}" AS ?timeStrStart) + BIND("{weekday_end}" AS ?weekdayEnd) + BIND("{time_str_end}" AS ?timeStrEnd) + + # Construct regex patterns to extract operating hours from opening_hours + BIND(CONCAT(".*", ?weekdayStart, "(?:-[A-Za-z]{{2}})?\\\\s+([0-9]{{2}}:[0-9]{{2}})-([0-9]{{2}}:[0-9]{{2}}).*") AS ?patternStart) + BIND(CONCAT(".*", ?weekdayEnd, "(?:-[A-Za-z]{{2}})?\\\\s+([0-9]{{2}}:[0-9]{{2}})-([0-9]{{2}}:[0-9]{{2}}).*") AS ?patternEnd) + + ################################################################## + ## Federated query from Ontop: Get valid BusinessEstablishment (absolute filter) + ################################################################## + SERVICE {{ + SELECT DISTINCT ?be ?geometry ?wkt ?lat ?long + WHERE {{ + # Bind the slice's start and end time + BIND("{formatted_start}"^^xsd:dateTimeStamp AS ?givenTimeStart) + BIND("{formatted_end}"^^xsd:dateTimeStamp AS ?givenTimeEnd) + BIND(SUBSTR(STR(?givenTimeStart), 1, 4) AS ?timeYear) + ?be a fh:BusinessEstablishment ; + geo:hasGeometry ?geometry . + FILTER(CONTAINS(STR(?geometry), CONCAT("/", ?timeYear))) + ?geometry geo:asWKT ?wkt . + OPTIONAL {{ ?geometry wgs:lat ?lat . }} + OPTIONAL {{ ?geometry wgs:long ?long . }} + ?startEvent ies4:isStartOf ?be ; + ies4:inPeriod ?startPeriod . + ?startPeriod ies4:iso8601PeriodRepresentation ?startTime . + ?endEvent ies4:isEndOf ?be ; + ies4:inPeriod ?endPeriod . + ?endPeriod ies4:iso8601PeriodRepresentation ?endTime . 
+ # Ensure the establishment's operating period overlaps with the slice's time interval + FILTER(?startTime <= ?givenTimeEnd && ?endTime >= ?givenTimeStart) + }} + }} + + ################################################################## + OPTIONAL {{ ?be osmkey:opening_hours ?openingHours . }} + + ################################################################## + # Operating hours filtering: Require that the extracted opening time from opening_hours + # is not later than the slice's start time, and the closing time is not earlier than the slice's end time. + # If opening_hours is not provided, keep the establishment by default. + FILTER ( + (!BOUND(?openingHours)) + || + ( + regex(?openingHours, ?patternStart) && + regex(?openingHours, ?patternEnd) && + (?timeStrStart >= REPLACE(?openingHours, ?patternStart, "$1")) && + (?timeStrEnd <= REPLACE(?openingHours, ?patternEnd, "$2")) + ) + ) +}} +""" + headers = {"Accept": "application/sparql-results+json"} + response = requests.post(blazegraph_endpoint, data={"query": query}, headers=headers) + if response.status_code == 200: + return response.json() + else: + print("[ERROR] SPARQL query failed:", response.status_code) + print("[ERROR] Query:\n" + query) + print(response.text) + return None + +############################################################################### +# Approach A: Trip-span Intersecting +############################################################################### +def approach_a(points, buffer_radius, blazegraph_endpoint): + """ + Approach A: + 1) For the entire CSV trajectory, extract the overall start (trip_start) and end times (trip_end). + 2) Build a full buffer using these points. + 3) Query business establishments within the buffer. + 4) For establishments with opening_hours, check if their operating hours intersect with [trip_start, trip_end]; + if opening_hours is not provided, keep the establishment by default. + """ + trip_start = points[0][0] + trip_end = points[-1][0] + full_buffer = create_full_buffer(points, buffer_radius) + + # Use a simplified SPARQL query (here calling naive_sparql_query which queries using the entire time interval) + results_json = naive_sparql_query(trip_start, trip_end, blazegraph_endpoint) + valid_uris = filter_by_buffer(results_json, full_buffer) + + final_uris = set() + for binding in results_json.get("results", {}).get("bindings", []): + be_uri = binding["be"]["value"] + if be_uri not in valid_uris: + continue + opening_hours = binding.get("openingHours", {}).get("value") + if not opening_hours: + final_uris.add(be_uri) + else: + # Call the check function to determine if [trip_start, trip_end] overlaps with the establishment's operating hours + if check_overlap_naive(trip_start, trip_end, binding): + final_uris.add(be_uri) + return final_uris + +############################################################################### +# Approach B: Nearest-Point Filtering +############################################################################### +def approach_b(points, buffer_radius, blazegraph_endpoint): + """ + Approach B: + 1) Construct a buffer for the entire trajectory. + 2) Query business establishments within the buffer. + 3) For each business establishment, find the sensor point closest to it. + - If the establishment has no opening_hours, keep it directly; + - Otherwise, check if the time of the nearest sensor point falls within the establishment's operating hours. + If yes, keep it; otherwise, filter it out. 
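+
+     Example call (sketch only; the endpoint URL is a placeholder, not the deployed one):
+         approach_b(points, buffer_radius=100, blazegraph_endpoint="http://localhost:9999/blazegraph/namespace/kb/sparql")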
+ """ + full_buffer = create_full_buffer(points, buffer_radius) + trip_start = points[0][0] + trip_end = points[-1][0] + results_json = naive_sparql_query(trip_start, trip_end, blazegraph_endpoint) + valid_uris = filter_by_buffer(results_json, full_buffer) + + final_uris = set() + for binding in results_json.get("results", {}).get("bindings", []): + be_uri = binding["be"]["value"] + if be_uri not in valid_uris: + continue + opening_hours = binding.get("openingHours", {}).get("value") + if not opening_hours: + final_uris.add(be_uri) + continue + wkt_str = binding.get("wkt", {}).get("value") + if not wkt_str: + continue + try: + geom_4326 = wkt.loads(wkt_str) + geom_27700 = transform(project_4326_to_27700, geom_4326) + # Find the nearest sensor point (traverse all points and return the closest one along with its time) + nearest_pt, nearest_ts = find_nearest_point_in_space(geom_27700, points) + if check_time_in_opening_hours(nearest_ts, opening_hours): + final_uris.add(be_uri) + except Exception as e: + print("[WARN] Nearest point error:", e) + return final_uris + +############################################################################### +# Approach C: Time-Sliced Matching (Model C) +############################################################################### +def approach_c(points, delta_t_seconds, buffer_radius, blazegraph_endpoint): + """ + Approach C: + Use a time-slicing method: split the trajectory into slices based on Δt, + dynamically construct a SPARQL query for each slice, and merge the results. + """ + slices = slice_trajectory_with_interpolation(points, delta_t_seconds) + union_exposures = set() + total_query_time = 0.0 + for i, (slice_data, mid_time) in enumerate(slices): + dt_start = slice_data[0][0] + dt_end = slice_data[-1][0] + buffer_poly = compute_buffer(slice_data, buffer_radius) + t0 = time.time() + results_json = query_sparql(dt_start, dt_end, blazegraph_endpoint) + elapsed = time.time() - t0 + total_query_time += elapsed + valid_be = filter_by_buffer(results_json, buffer_poly) + union_exposures.update(valid_be) + print(f"[DEBUG] Time-sliced total query time: {total_query_time:.2f} s.") + return union_exposures + +############################################################################### +# The following are helper functions (placeholder examples, extendable as needed) +############################################################################### +def naive_sparql_query(dt_start, dt_end, blazegraph_endpoint): + """ + A simple SPARQL query function that directly calls query_sparql using the entire time interval. + In Approaches A and B, the query can be adjusted as needed. + """ + return query_sparql(dt_start, dt_end, blazegraph_endpoint) + +def filter_by_buffer(results_json, buffer_poly): + """ + Iterates through all business establishment data (with geometry in WKT) returned by the SPARQL query, + parses and projects them to EPSG:27700, + and if they intersect with the given buffer_poly, records their URI. + Returns a set of the intersecting URIs. 
+ """ + valid_be = set() + if results_json is None: + return valid_be + bindings = results_json.get("results", {}).get("bindings", []) + reproj = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True).transform + for binding in bindings: + wkt_str = binding.get("wkt", {}).get("value") + if not wkt_str: + continue + try: + geom_4326 = wkt.loads(wkt_str) + geom_27700 = transform(reproj, geom_4326) + if geom_27700.intersects(buffer_poly): + valid_be.add(binding["be"]["value"]) + except Exception as e: + print("[WARN] WKT parsing error:", e) + return valid_be + +def check_overlap_naive(tstart, tend, binding): + """ + Placeholder extension point for additional overlap validation. + + This function is called in Approaches A and C to perform a secondary + check of whether the time slice [tstart, tend] overlaps with the + business’s opening hours. In the current implementation, the + opening-hours filtering is already handled within the SPARQL query + via regular expressions, so this function simply returns True. + + You can replace this placeholder with custom logic—for example, + parsing binding["openingHours"]["value"] (e.g. "09:00-17:00") + and returning False if the slice lies entirely outside that range. + """ + return True + + +def check_time_in_opening_hours(sensor_time, opening_hours_string): + """ + Placeholder extension point for time-in-hours validation. + + This function is called in Approach B to perform an additional + check on a single representative timestamp (sensor_time) against + the opening_hours_string. The SPARQL query already applies a regex- + based filter on opening hours, so this function currently always + returns True. + + You can extend it to parse opening_hours_string (e.g. "09:00-17:00") + and return True only if sensor_time.time() falls within that interval, + handling cases like overnight hours if needed. + """ + return True + + +def find_nearest_point_in_space(business_geom, points): + """ + Given the geometry of a business establishment (projected to EPSG:27700) and a list of sensor points (EPSG:27700), + finds the point closest to business_geom using an O(n) traversal, + returning (nearest point coordinates, nearest point time). + For large datasets, consider using an R-tree to improve efficiency. 
+ """ + from math import inf + min_dist = inf + nearest_pt = None + nearest_ts = None + for (ts, (x, y)) in points: + pt = Point(x, y) + d = business_geom.distance(pt) + if d < min_dist: + min_dist = d + nearest_pt = pt + nearest_ts = ts + return nearest_pt, nearest_ts + +############################################################################### +# Main Experiment Function +############################################################################### +def main_experiment(csv_file, delta_t_seconds, buffer_radius, blazegraph_endpoint, approach='c'): + """ + Main Process: + The parameter 'approach' controls which model to use: + 'a' -> Approach A: Trip-span intersecting + 'b' -> Approach B: Nearest-point filtering + 'c' -> Approach C: Time-sliced matching (default) + """ + points = read_trajectory(csv_file) + if approach == 'a': + print("[INFO] Using Approach A: Full time interval filtering") + exposures = approach_a(points, buffer_radius, blazegraph_endpoint) + elif approach == 'b': + print("[INFO] Using Approach B: Nearest sensor point filtering") + exposures = approach_b(points, buffer_radius, blazegraph_endpoint) + else: + print("[INFO] Using Approach C: Time-sliced matching") + exposures = approach_c(points, delta_t_seconds, buffer_radius, blazegraph_endpoint) + + print(f"[RESULT] Approach '{approach}' found {len(exposures)} business establishments.") + return exposures \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/opening_hours_cam.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/opening_hours_cam.sparql new file mode 100644 index 00000000000..e4e4629edf5 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/opening_hours_cam.sparql @@ -0,0 +1,41 @@ +PREFIX rdfs: +PREFIX fh: +PREFIX geo: +PREFIX om: +PREFIX osmkey: +PREFIX ogc: + +CONSTRUCT { + ?entity a ?my_type ; + geo:hasGeometry ?defaultGeometry ; + rdfs:label ?name ; + osmkey:opening_hours ?openingHours . +} +WHERE { + ?asset ?osm_pred ?osm_type ; + geo:hasGeometry ?defaultGeometry . + ?region osmkey:name 'Cambridge' ; + ogc:sfContains ?asset . + ?defaultGeometry geo:asWKT ?wkt . + + OPTIONAL { ?asset osmkey:name ?name . } + OPTIONAL { ?asset osmkey:opening_hours ?openingHours . } + OPTIONAL { ?asset osmkey:fhrs:id ?fhrsId . } + + BIND( + IF( + BOUND(?fhrsId), + IRI(CONCAT("http://www.theworldavatar.com/ontology/OntoFHRS/BusinessEstablishment/", STR(?fhrsId))), + ?asset + ) AS ?entity + ) + + VALUES (?osm_pred ?osm_type ?my_type) { + (osmkey:amenity "restaurant" fh:Restaurant) + (osmkey:amenity "fast_food" fh:FastFood) + (osmkey:amenity "cafe" fh:Cafe) + (osmkey:shop "food" fh:FoodShop) + (osmkey:shop "supermarket" fh:Supermarket) + (osmkey:shop "convenience" fh:ConvenienceStore) + } +} diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/preprocess.py b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/preprocess.py new file mode 100644 index 00000000000..09dd8417380 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/preprocess.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Preprocess OpenStreetMap opening_hours data into OntoService TTL. 
+ +Configuration (edit values here): + API_URL = "https://qlever.cs.uni-freiburg.de/api/osm-planet" + SPARQL_FILE = "opening_hours_cam.sparql" + START_YEAR = 2013 + END_YEAR = 2028 + COUNTRY_CODE = "GB" + OUTPUT_FILE = "results/ontoservice_opening_hours.ttl" + +Dependencies: + pip install rdflib requests holidays +""" + +import requests, re, holidays +from rdflib import Graph, Namespace, URIRef, Literal +from rdflib.namespace import RDF, RDFS, XSD +from rdflib.term import _toPythonMapping + +# disable xsd:time auto-conversion +_toPythonMapping[XSD.time] = lambda s: s + +# --------------------------------------- +# Configuration +# --------------------------------------- +API_URL = "https://qlever.cs.uni-freiburg.de/api/osm-planet" +SPARQL_FILE = "opening_hours_cam.sparql" +START_YEAR = 2013 +END_YEAR = 2028 +COUNTRY_CODE = "GB" +OUTPUT_FILE = "results/ontoservice_opening_hours.ttl" + +# --------------------------------------- +# Namespaces +# --------------------------------------- +OS = Namespace("https://theworldavatar.io/kg/") +FIBO_FD = Namespace("https://spec.edmcouncil.org/fibo/ontology/FND/DatesAndTimes/FinancialDates/") +CMNS_DT = Namespace("https://www.omg.org/spec/Commons/DatesAndTimes/") +CMNS_COL= Namespace("https://www.omg.org/spec/Commons/Collections/") +OSMKEY = Namespace("https://www.openstreetmap.org/wiki/Key:") + +WEEKDAYS = ["Mo","Tu","We","Th","Fr","Sa","Su"] +MONTH_MAP = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6, + 'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12} + +def normalize_time(t): + m = re.match(r'(\d{1,2}):(\d{2})', t) + if not m: return None + hh, mm = m.groups() + return f"{int(hh):02d}:{mm}" + +def parse_rule(text): + text = text.strip().replace('–','-') + parts = text.split(None,1) + date_part = parts[0] + time_part = parts[1] if len(parts)>1 else "" + + # OFF rules + if 'off' in time_part.lower(): + if date_part=='PH': + return {'type':'adhoc_ph','key':'PH_off'} + m = re.match(r'([A-Za-z]{3})[- ](\d{1,2})', date_part+' '+time_part) + if m: + mon,day = m.groups() + mm = MONTH_MAP.get(mon) + if mm: + return {'type':'adhoc_md','key':f"MD_{mon}{int(day):02d}",'month':mm,'day':int(day)} + return {'type':'unsupported','key':date_part} + + # parse days (comma separated) + days=[] + for item in date_part.split(','): + item=item.strip() + if '-' in item: + a,b=item.split('-',1) + if a in WEEKDAYS and b in WEEKDAYS: + i1,i2=WEEKDAYS.index(a),WEEKDAYS.index(b) + if i1<=i2: days+=WEEKDAYS[i1:i2+1] + else: days+=WEEKDAYS[i1:]+WEEKDAYS[:i2+1] + elif item in WEEKDAYS: + days.append(item) + + if days: + m = re.search(r'(\d{1,2}:\d{2})\s*-\s*(\d{1,2}:\d{2})', time_part) + if not m: + return {'type':'unsupported','key':date_part} + st_raw,en_raw = m.groups() + st=normalize_time(st_raw); en=normalize_time(en_raw) + if not st or not en: + return {'type':'unsupported','key':date_part} + # determine key: range vs single + if days[0]!=days[-1] and len(days)>1: + key = f"{days[0]}To{days[-1]}" + else: + key = days[0] + return {'type':'weekly','key':key,'days':days, + 'start_time':st,'end_time':en} + + return {'type':'unsupported','key':date_part} + +def main(): + # read SPARQL + query = open(SPARQL_FILE).read() + resp = requests.get(API_URL,params={'query':query},headers={'Accept':'text/turtle'}) + resp.raise_for_status() + + g = Graph().parse(data=resp.text, format='turtle') + out = Graph() + for p,ns in [('twa',OS),('fibo-fnd-dt-fd',FIBO_FD), + ('cmns-dt',CMNS_DT),('cmns-col',CMNS_COL), + ('osmkey',OSMKEY)]: + out.bind(p,ns) + + # process each entity + for entity,oh 
in g.subject_objects(OSMKEY.opening_hours): + oh_str = str(oh) + print(f"Entity {entity} OH: {oh_str}") + rules = [r for r in oh_str.split(';') if r.strip()] + parsed = [parse_rule(r) for r in rules] + for r,pr in zip(rules,parsed): + print(f" rule: {r!r} -> {pr}") + + eid = str(entity).rsplit('/',1)[-1] + base=OS[f"schedule/{eid}"] + + adhocs={} + for pr in parsed: + if pr['type']=='weekly': + uri = URIRef(f"{base}/{pr['key']}") + out.add((entity,FIBO_FD.hasSchedule,uri)) + out.add((uri,RDF.type,FIBO_FD.RegularSchedule)) + out.add((uri,CMNS_DT.hasStartDate,Literal(f"{START_YEAR}-01-01",datatype=XSD.date))) + out.add((uri,CMNS_DT.hasEndDate, Literal(f"{END_YEAR}-12-31",datatype=XSD.date))) + tp=URIRef(f"{uri}/timeperiod") + out.add((uri,CMNS_DT.hasTimePeriod,tp)) + out.add((tp,RDF.type,CMNS_DT.ExplicitTimePeriod)) + st_uri, en_uri = URIRef(f"{uri}/startTime"),URIRef(f"{uri}/endTime") + out.add((tp,CMNS_DT.hasStart,st_uri)) + out.add((tp,CMNS_DT.hasEndTime,en_uri)) + out.add((st_uri,RDF.type,CMNS_DT.TimeOfDay)) + out.add((en_uri,RDF.type,CMNS_DT.TimeOfDay)) + out.add((st_uri,CMNS_DT.hasTimeValue,Literal(pr['start_time'],datatype=XSD.time))) + out.add((en_uri,CMNS_DT.hasTimeValue,Literal(pr['end_time'], datatype=XSD.time))) + for wd in pr['days']: + out.add((uri,FIBO_FD.hasRecurrenceInterval,FIBO_FD[wd])) + elif pr['type'].startswith('adhoc'): + adhocs[pr['key']]=pr + + # build adhoc schedules + for key,info in adhocs.items(): + dates=[] + if info['type']=='adhoc_ph': + hols=holidays.CountryHoliday(COUNTRY_CODE,years=range(START_YEAR,END_YEAR+1)) + dates=sorted(d.isoformat() for d in hols) + elif info['type']=='adhoc_md': + for y in range(START_YEAR,END_YEAR+1): + dates.append(f"{y:04d}-{info['month']:02d}-{info['day']:02d}") + uri=URIRef(f"{base}/{key}") + out.add((entity,FIBO_FD.hasSchedule,uri)) + out.add((uri,RDF.type,FIBO_FD.AdHocSchedule)) + out.add((uri,RDFS.label,Literal(f"Ad hoc: {key}"))) + for d in dates: + euri=URIRef(f"{uri}/entry/{d}") + out.add((uri,CMNS_COL.hasConstituent,euri)) + out.add((euri,RDF.type,FIBO_FD.AdHocScheduleEntry)) + out.add((euri,FIBO_FD.hasDate,Literal(d,datatype=XSD.date))) + + out.serialize(destination=OUTPUT_FILE,format='turtle') + print(f"Serialized to {OUTPUT_FILE}") + +if __name__=='__main__': + main() diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/results/.gitignore b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/results/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/relative_time_filter/results/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/absolute_time_filter.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/absolute_time_filter.sparql new file mode 100644 index 00000000000..a0344a0aa92 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/absolute_time_filter.sparql @@ -0,0 +1,24 @@ +PREFIX fh: +PREFIX ies4: +PREFIX geo: +PREFIX wgs: +PREFIX xsd: + +SELECT DISTINCT ?be ?geometry ?wkt ?lat ?long +WHERE {{ + BIND("{given_time}"^^xsd:dateTimeStamp AS ?givenTime) + BIND(SUBSTR(STR(?givenTime), 1, 4) AS ?timeYear) + ?be a fh:BusinessEstablishment ; + geo:hasGeometry ?geometry . + FILTER(CONTAINS(STR(?geometry), CONCAT("/", ?timeYear))) + ?geometry geo:asWKT ?wkt . + OPTIONAL {{ ?geometry wgs:lat ?lat . }} + OPTIONAL {{ ?geometry wgs:long ?long . 
}} + ?startEvent ies4:isStartOf ?be ; + ies4:inPeriod ?startPeriod . + ?startPeriod ies4:iso8601PeriodRepresentation ?startTime . + ?endEvent ies4:isEndOf ?be ; + ies4:inPeriod ?endPeriod . + ?endPeriod ies4:iso8601PeriodRepresentation ?endTime . + FILTER(?startTime <= ?givenTime && ?endTime >= ?givenTime) +}} diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/add_trip_detection_columns.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/add_trip_detection_columns.sql new file mode 100644 index 00000000000..4d483496327 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/add_trip_detection_columns.sql @@ -0,0 +1,8 @@ +DO $$ BEGIN + IF NOT EXISTS ( + SELECT FROM information_schema.columns + WHERE table_name = '{table_name}' AND column_name = '{column_name}' + ) THEN + EXECUTE 'ALTER TABLE "{table_name}" ADD COLUMN "{column_name}" {col_type}'; + END IF; +END $$; diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/check_trip_index_column.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/check_trip_index_column.sql new file mode 100644 index 00000000000..3a5e790446f --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/check_trip_index_column.sql @@ -0,0 +1,3 @@ +SELECT column_name +FROM information_schema.columns +WHERE table_name = %s AND column_name = 'Trip index'; diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_area_generic.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_area_generic.sparql new file mode 100644 index 00000000000..853944316e9 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_area_generic.sparql @@ -0,0 +1,7 @@ +PREFIX geo: +SELECT ?feature ?geometry +WHERE {{ + ?feature geo:isPartOfDomain <{env_data_iri}> . + ?geoRef geo:isPartOf ?feature ; + geo:hasGeometry ?geometry . +}} diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_crs_for_env_data.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_crs_for_env_data.sparql new file mode 100644 index 00000000000..e4d360b8e5c --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_crs_for_env_data.sparql @@ -0,0 +1,6 @@ +PREFIX geo: + +SELECT DISTINCT ?crs WHERE {{ + ?feature a <{env_data_iri}> ; + geo:hasGeometry/geo:crs ?crs . +}} diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_domain_and_data_sources.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_domain_and_data_sources.sparql new file mode 100644 index 00000000000..a83a06aa3c3 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_domain_and_data_sources.sparql @@ -0,0 +1,10 @@ +PREFIX exposure: +PREFIX fh: +PREFIX gs: +SELECT ?domainIRI ?domainName ?dataSourceName +WHERE {{ + ?domainIRI a exposure:Domain ; + exposure:hasDomainName ?domainName ; + exposure:hasDataSource ?dataSourceName . 
+ FILTER(?domainIRI = <{env_data_iri}>) +}} diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_domain_and_featuretype.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_domain_and_featuretype.sparql new file mode 100644 index 00000000000..930e37fab6e --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_domain_and_featuretype.sparql @@ -0,0 +1,6 @@ +PREFIX exposure: +SELECT ?domainName ?featureType +WHERE {{ + <{env_data_iri}> exposure:hasDomainName ?domainName ; + exposure:hasFeatureType ?featureType . +}} diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_food_hygiene_rating.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_food_hygiene_rating.sparql new file mode 100644 index 00000000000..bb267210b5a --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_food_hygiene_rating.sparql @@ -0,0 +1,13 @@ +PREFIX fh: +PREFIX geo: +SELECT ?id ?businessName ?businessType ?ratingTime ?ratingValue ?longitude ?latitude +WHERE {{ + ?id fh:isPartOfDomain <{env_data_iri}> ; + fh:hasBusinessName ?businessName ; + fh:hasBusinessType ?businessType ; + fh:hasRatingDate ?ratingTime ; + fh:hasRatingValue ?ratingValue . + ?geo geo:isPartOf ?id ; + geo:hasLongitude ?longitude ; + geo:hasLatitude ?latitude . +}} diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_greenspace_area.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_greenspace_area.sparql new file mode 100644 index 00000000000..9d18ce4f5b4 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_greenspace_area.sparql @@ -0,0 +1,12 @@ +PREFIX gs: +PREFIX geo: +PREFIX ex: + +SELECT DISTINCT ?feature ?function ?geometry +WHERE {{ + ?feature a gs:Greenspace ; + a geo:Feature ; + gs:hasFunction ?function ; + geo:hasGeometry ?geomNode . + ?geomNode ex:hasWKB ?geometry . +}} \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_point_generic.sparql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_point_generic.sparql new file mode 100644 index 00000000000..7205e30e137 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_point_generic.sparql @@ -0,0 +1,8 @@ +PREFIX geo: +SELECT ?id ?longitude ?latitude +WHERE {{ + ?id geo:isPartOfDomain <{env_data_iri}> . + ?geo geo:isPartOf ?id ; + geo:hasLongitude ?longitude ; + geo:hasLatitude ?latitude . 
+}} diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_table_name_for_timeseries.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_table_name_for_timeseries.sql new file mode 100644 index 00000000000..acf4bf25370 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_table_name_for_timeseries.sql @@ -0,0 +1,3 @@ +SELECT DISTINCT "tableName" +FROM "dbTable" +WHERE "timeseriesIRI" = %s; diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_table_name_for_timeseries_fallback.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_table_name_for_timeseries_fallback.sql new file mode 100644 index 00000000000..d0024a642ab --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_table_name_for_timeseries_fallback.sql @@ -0,0 +1,3 @@ +SELECT "table_name" +FROM public."time_series_quantities" +WHERE "time_series_iri" = %s; diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data.sql new file mode 100644 index 00000000000..4fe88ff87ea --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data.sql @@ -0,0 +1,11 @@ +SELECT + to_timestamp("UNIX_time" / 1000.0) AT TIME ZONE 'UTC' AS "time", + "column1", + "column2", + "column3", + "column4", + "column5", + "column6", + "column7" +FROM "{table_name}" +ORDER BY "UNIX_time" ASC; \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data_fallback.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data_fallback.sql new file mode 100644 index 00000000000..3ae6cde7cf3 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data_fallback.sql @@ -0,0 +1,7 @@ +SELECT + to_timestamp("time" / 1000.0) AT TIME ZONE 'UTC' AS "time", + ST_Y("column4") AS "LATITUDE", + ST_X("column4") AS "LONGITUDE" +FROM "{table_name}" +WHERE "time_series_iri" = %s +ORDER BY "time" ASC; \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data_with_trip.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data_with_trip.sql new file mode 100644 index 00000000000..8a54a8c2a1d --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/get_timeseries_data_with_trip.sql @@ -0,0 +1,6 @@ +SELECT + to_timestamp("UNIX_time" / 1000.0) AT TIME ZONE 'UTC' AS "time", + "column1", "column2", "column3", "column4", "column5", "column6", "column7", + "Trip index" +FROM "{table_name}" +ORDER BY "UNIX_time" ASC; diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_area_generic.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_area_generic.sql new file mode 100644 index 00000000000..bb9c4a1ba29 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_area_generic.sql @@ -0,0 +1,16 @@ +WITH Trajectory AS ( + SELECT ST_Transform( + ST_Buffer( + ST_MakeLine("column7"::geometry ORDER BY "time")::geography, + {exposure_radius} + )::geometry, + 27700 + ) AS buffered_trajectory + FROM "{table_name}" +), +combined_area AS ( + {union_sql} +) +SELECT ca.wkb_geometry +FROM 
combined_area ca, Trajectory t +WHERE ST_Intersects(ca.wkb_geometry, t.buffered_trajectory); diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_area_greenspace.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_area_greenspace.sql new file mode 100644 index 00000000000..34469666a70 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_area_greenspace.sql @@ -0,0 +1,13 @@ +WITH Trajectory AS ( + SELECT ST_Transform( + ST_Buffer( + ST_MakeLine("column7"::geometry ORDER BY "time")::geography, + {exposure_radius} + )::geometry, + 27700 + ) AS buffered_trajectory + FROM "{table_name}" +) +SELECT g.ogc_fid, g.function, g."distName1", g."distName2", g.wkb_geometry +FROM public."GB_GreenspaceSite" g, Trajectory t +WHERE ST_Intersects(g.wkb_geometry, t.buffered_trajectory); diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_point_food_hygiene.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_point_food_hygiene.sql new file mode 100644 index 00000000000..3fda9d42a06 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_point_food_hygiene.sql @@ -0,0 +1,25 @@ +WITH BufferedLine AS ( + SELECT + ST_Buffer( + ST_MakeLine(gps."column7"::geometry ORDER BY gps."time")::geography, + {exposure_radius} + ) AS buffered_geom + FROM + "{table_name}" gps +), +combined_frs AS ( + {union_sql} +) +SELECT + frs."Name" AS entity_name, + frs."Address" AS address, + ST_AsText(frs.geom) AS entity_geom, + COUNT(frs."Name") AS no_of_entities +FROM + BufferedLine bl +JOIN + combined_frs frs +ON + ST_Intersects(bl.buffered_geom, frs.geom::geography) +GROUP BY + frs."Name", frs."Address", frs.geom; diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_point_generic.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_point_generic.sql new file mode 100644 index 00000000000..8fc0aaead67 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/simplified_get_point_generic.sql @@ -0,0 +1,19 @@ +WITH BufferedLine AS ( + SELECT + ST_Buffer( + ST_MakeLine(gps."column7"::geometry ORDER BY gps."time")::geography, + {exposure_radius} + ) AS buffered_geom + FROM + "{table_name}" gps +), +combined_pts AS ( + {union_sql} +) +SELECT combined_pts.geom +FROM + BufferedLine bl +JOIN + combined_pts +ON + ST_Intersects(bl.buffered_geom, combined_pts.geom::geography); diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/time_slicing_filter.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/time_slicing_filter.sql new file mode 100644 index 00000000000..45fcbdbfbef --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/time_slicing_filter.sql @@ -0,0 +1,90 @@ +DO $$ +DECLARE + column_exists BOOLEAN; + sql_query TEXT; +BEGIN + SELECT EXISTS ( + SELECT 1 + FROM pg_catalog.pg_attribute + WHERE attrelid = '{feature_table}'::regclass + AND attname = 'RatingDate' + AND NOT attisdropped + ) INTO column_exists; + + sql_query := E' + WITH BufferedLine AS ( + SELECT ST_Buffer( + ST_MakeLine(gps."column7"::geometry ORDER BY gps."time")::geography, + {exposure_radius} + ) AS buffered_geom + FROM "{gps_table}" gps + ), + IntersectedObjects AS ( + SELECT + frs."Name"::text AS entity_name, + frs."Address"::text AS address, + 
ST_AsText(frs.geom) AS entity_geom, + frs.geom, + frs.* + FROM "{feature_table}" frs + JOIN BufferedLine bl + ON ST_Intersects(bl.buffered_geom, frs.geom::geography) + )'; + + IF column_exists THEN + sql_query := sql_query || E' + , TrajectoryTimes AS ( + SELECT MIN("time") AS traj_start, MAX("time") AS traj_end + FROM "{gps_table}" + ), + SubBufferedLine AS ( + SELECT ST_Buffer( + ST_MakeLine(gps."column7"::geometry ORDER BY gps."time")::geography, + {exposure_radius} + ) AS sub_buffered_geom + FROM "{gps_table}" gps + JOIN IntersectedObjects frs + ON gps."time" > frs."RatingDate" + GROUP BY frs."RatingDate" + ), + FilteredExposure AS ( + SELECT entity_name, address, entity_geom, + CASE + WHEN "RatingDate" > (SELECT traj_end FROM TrajectoryTimes) THEN 0 + WHEN "RatingDate" > (SELECT traj_start FROM TrajectoryTimes) + AND "RatingDate" <= (SELECT traj_end FROM TrajectoryTimes) THEN ( + SELECT CASE WHEN COUNT(*) > 0 THEN 1 ELSE 0 END + FROM SubBufferedLine sb + WHERE ST_Intersects(sb.sub_buffered_geom, IntersectedObjects.geom::geography) + ) + ELSE 1 + END AS exposure_count + FROM IntersectedObjects + CROSS JOIN TrajectoryTimes + ) + SELECT * FROM FilteredExposure + '; + ELSE + sql_query := sql_query || E' + SELECT entity_name, address, entity_geom, + COUNT(entity_name)::integer AS exposure_count + FROM IntersectedObjects + GROUP BY entity_name, address, entity_geom + '; + END IF; + + EXECUTE ' + CREATE OR REPLACE FUNCTION get_exposure_results() + RETURNS TABLE ( + entity_name text, + address text, + entity_geom text, + exposure_count integer + ) + AS $$ + BEGIN + RETURN QUERY EXECUTE ' || quote_literal(sql_query) || '; + END; + $$ LANGUAGE plpgsql; + '; +END $$; diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/update_trip_detection_results.sql b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/update_trip_detection_results.sql new file mode 100644 index 00000000000..be069bc6d4f --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/templates/update_trip_detection_results.sql @@ -0,0 +1,14 @@ +UPDATE "{table_name}" +SET "Points in segment" = %s, + "x_utm_sd" = %s, + "y_utm_sd" = %s, + "kernel" = %s, + "zone" = %s, + "norm_kernel" = %s, + "modfied_kernel" = %s, + "norm_modified_kernel" = %s, + "snap_to_hs" = %s, + "Trip index" = %s, + "Visit index" = %s, + "Gap" = %s +WHERE "time" = %s; \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/trip_detector/trip_detection.py b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/trip_detector/trip_detection.py new file mode 100644 index 00000000000..8484049a8da --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/trip_detector/trip_detection.py @@ -0,0 +1,1255 @@ +""" +trip_detection.py +Author: Janelle Berscheid, July 2018 + +# MODIFIED BY JIYING CHEN in 2025 +# Focused on improving compatibility and maintainability: +# - Replaced deprecated package use (e.g. pd.np.nan) +# - Adjusted relative imports and modular structure for TWA stack + +Parameters: + +CELL_SIZE: The square cell size of the location grid, in meters. +KERNEL_BANDWIDTH: The uncertainty of the GPS reading, in meters. Used for KDE calculations. +MAX_RASTER_SIZE: If trip detection fails with simply "Killed" printed to the console, + you probably ran out of memory (most likely in hotspot extraction). + Try reducing the maximum raster size. 
If this is found to be + exceeded, the cell size will automatically be increased to meet this + requirement, up to the point it becomes incompatible with the kernel bandwidth. +INTERPOLATE_MAX_DELAY: The time threshold for starting interpolation, in seconds. +INTERPOLATE_MAX_DROP_TIME: The maximum time between points after which not to interpolate, in seconds. +INTERPOLATE_MAX_DISTANCE: The maximum distance between points, in meters, to interpolate over. +MIN_VISIT_DURATION: Minimum duration of a dwell, in seconds, for it to be considered a valid visit +MIN_VISIT_TIMESPAN: Minimum time, in seconds, that must elapse between dwells; otherwise they will be merged into a single visit. +MIN_LOOP_DURATION: Minimum duration of a looping path, in second, to keep it and consider it a valid path. +DOWNSAMPLE_FREQUENCY: The frequency to downsample GPS signals to: one point every ?? seconds. +MIN_HOTSPOT_KEEP_DURATION: Minimum duration of a hotspot, in seconds, to keep it and consider it a hotspot. +BUFFER_WEIGHT: Array for convolving the GPS points in temporal order to weigh the kernel values to identify hotspots. +BUFFER_SNAP: Array for convolving the GPS points in temporal order to determine which should be snapped to hotspots. +MIN_PATH_KEEP_DURATION: (new parameter) The minimum duration of a path to keep it, in seconds. +KERNEL_HOTSPOT_THRESHOLD: (new parameter) A threshold above which kernel values may be considered on hotspots. + The original code set it as 1.0 (standard deviations above the mean zone kernel value), but this + port uses a different kernel and performs better with lower values. + Warning: this value may be tuned to fit the sample data too closely. Test with other participants + to determine the best default value for this parameter. +SIGNIFICANT_GAP_LENGTH: (new parameter) Minimum length, in seconds, of a GPS data gap for it to be labeled as a period + of missing data. Gaps with missing data cannot be labeled as either trips or visits. + In the final results, some gaps may be folded into temporally adjacent visits at the same location + to create a single, longer visit if the length of the gap is less than MIN_VISIT_TIMESPAN. + +Remaining work: +- Implement smoothing on the gap identification, to discard single observations surrounded by large gaps. 
+- Test to see if TinyExtentsError will ever actually trigger; if so, port code which handles it, otherwise remove it +- Test to see if the maximum raster size will ever be a problem; if not, remove code which limits it +- Add option to smooth out the detected missing data gaps, combining long adjacent gaps separated by a few observations + +Quality of life improvements: +- Verbose mode to save more intermediate steps to disk +- Clean up temporary files when finished, if desired +- Write log file of each run +""" + +try: + from utilities import * +except ModuleNotFoundError: + from .utilities import * +import numpy as np +import pandas as pd +import sys +import os +import logging +from time import perf_counter, strftime +import matplotlib.pyplot as plt +from sklearn.neighbors import KernelDensity +from skimage.segmentation import watershed +from skimage.feature import peak_local_max +from scipy import ndimage as ndi +from KDEpy import TreeKDE, NaiveKDE, FFTKDE +from shapely.geometry import Point, LineString + + +# Default Parameters (see explanation above) +DEFAULT_PARAMS = { + "CELL_SIZE": 10, # meters, for grid + "KERNEL_BANDWIDTH": 100, # basically uncertainty of GPS in meters + "MAX_RASTER_SIZE": 14000*14000, # originally, greatest possible number of cells per row/column, default 30000 + "INTERPOLATE_MAX_DELAY": 120, # seconds + "INTERPOLATE_MAX_DROP_TIME": 360000, # seconds, default: 3600 || high value ~ infinity + "INTERPOLATE_MAX_DISTANCE": 100000, # meters, default: 100 || high value ~ infinity + "MIN_VISIT_DURATION": 300, # seconds + "MIN_VISIT_TIMESPAN": 900, #3600, + "MIN_LOOP_DURATION": 300, + "DOWNSAMPLE_FREQUENCY": 0, # default: 5 (0=off) + "MIN_HOTSPOT_KEEP_DURATION": 150, # seconds + "MIN_PATH_KEEP_DURATION": 30, # NEW parameter! seconds + "KERNEL_HOTSPOT_THRESHOLD": 1, #0.1 NEW parameter! May need to be tuned + "BUFFER_WEIGHT": np.array([0.1, 0.25, 0.3, 0.25, 0.1]), + "BUFFER_SNAP": np.array([0.5, 1.0, 1.5, 1.0, 0.5]), + "SIGNIFICANT_GAP_LENGTH": 900 # NEW parameter! Minimum length of a data gap to be considered a missing data period +} + +SENSEDOC_COLUMNS = { + "utc_date": "utc_date", + "lat": "lat", + "lon": "lon", + "utm_n": "y_utm_sd", + "utm_e": "x_utm_sd" +} + +ETHICA_COLUMNS = { + "utc_date": "utc_date", + "lat": "y_wgs_ph", + "lon": "x_wgs_ph", + "utm_n": "y_utm_ph", + "utm_e": "x_utm_ph" +} + +# NB In spite of the name suggesting otherwise, we need a full +# time stamp in the relevant column, not just a date!!! +AI4PH_COLUMNS = { + "utc_date": "Timestamp", + "lat": "LATITUDE", + "lon": "LONGITUDE", + "utm_n": "y_utm_sd", + "utm_e": "x_utm_sd" +} + +# Column headers. +CH_TRIP_INDEX = "Trip index" +CH_VISIT_INDEX = "Visit index" +CH_GAP = "Gap" + +################################## +# Utility methods +################################## + + +def format_gps_data(df, columns, code=None): + """ + :param df: pandas dataframe containing GPS data to format + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. + :param code: the ESPG code of the data to be projected, as a string. + :return: formatted pandas dataframe + """ + # Sort rows by time stamp. 
+ df = df.sort_values(columns["utc_date"]) + + # Add UTM projections to DF if it doesn't exist already + if columns["utm_e"] not in list(df.columns) or columns["utm_n"] not in list(df.columns): + utm_e, utm_n = get_projection(df[columns["lon"]].values, df[columns["lat"]].values, code=code) + df[columns["utm_e"]] = utm_e + df[columns["utm_n"]] = utm_n + + return df + + +def get_data_gaps(df, pid, columns, + gap_length=DEFAULT_PARAMS["SIGNIFICANT_GAP_LENGTH"], smooth_gaps=False): + """ + Utility function for characterizing gaps existing in GPS data prior to trip detection. + Creates a dataframe containing gaps of non-trivial length, marking their start and stop time, and saves as a temporary + file to disk. + :param df: pandas dataframe containing the GPS observations + :param pid: the participant's ID, as an integer + :param gap_length: minimum length of a data gap, in seconds, to be considered as a missing data period + :param smooth_gaps: If True, gaps will be merged if separated by a small number of observations + :return: None. Writes the gaps to a temporary file, to be loaded during later processing. + """ + df = df.sort_values(columns["utc_date"]) + df.loc[:, "gap_time"] = ((df[columns["utc_date"]] - df.shift(1)[columns["utc_date"]]) / np.timedelta64(1, 's')).fillna(0) + gaps = df.loc[df["gap_time"] > 1, (columns["utc_date"], "gap_time")] + sig_gaps = gaps.loc[gaps["gap_time"] > gap_length].copy() + sig_gaps.loc[:, "end_time"] = sig_gaps[columns["utc_date"]] - np.timedelta64(1, 's') + sig_gaps.loc[:, "start_time"] = sig_gaps["end_time"] - pd.to_timedelta(sig_gaps["gap_time"], unit="s") + sig_gaps = sig_gaps.loc[:, ("start_time", "end_time")] + + if smooth_gaps: + # TODO: If a very long gap is broken up by just a couple of observations, discard the observations and fold into the gap + pass + + df = df.drop(["gap_time"], axis=1) + return sig_gaps + + +def resize_cells(w, h, cell_size, kernel_bandwidth, max_raster_size): + """ + Method for resizing the grid cells if the area covered is too + large to compute with the default cell size. The main limiting + factor is the availability of sufficient RAM. + :param w: width of the extents, in metres + :param h: height of the extents, in metres + :param cell_size: current width/height of square grid cells, in meters + :param kernel_bandwidth: width/height of kernel, in meters + :return: integer value of the new grid cell size, in meters + """ + raster_size = ( w / cell_size ) * ( h / cell_size ) + """ + BT: another option would be to implement a multiscale/sparse raster + processing approach and focus on the areas where GPS fixes exist + """ + if raster_size > max_raster_size: + # redefine cellsize to stay within limits + cell_size = int(np.ceil(np.sqrt( w * h / max_raster_size ))) + + # check that new cell size is still compatible with the kernel bandwidth + if kernel_bandwidth / 2.0 >= cell_size: + print(f"Changing raster resolution to {cell_size} m to keep raster size below threshold.") + else: + print(f"FATAL ERROR: New cell size {cell_size} m is incompatible with kernel bandwidth!") + sys.exit() + return cell_size + + +def downsample_trace(df, columns, frequency=5): + """ + Takes the original GPS trace, and downsamples it to one reading every [frequency] seconds. 
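+     For example (illustrative), with frequency=5 a trace sampled once per second is thinned to
+     roughly one fix every 5 seconds, while points that already follow a gap longer than 5 seconds
+     are always kept.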
+ :param df: GPS points as a pandas dataframe + :param frequency: integer describing the number of seconds to elapse between readings in the downsampled data + :return: a new dataframe containing the downsampled data + """ + dates = df[columns["utc_date"]] + # Calculate difference between two adjacent samples + differences = ((dates - dates.shift(1)) / np.timedelta64(1, 's')).fillna(0) + + #compute step between resampled records + step = int(np.ceil(frequency / np.median(differences))) + + # Extract those that have difference over frequency + mask = differences > frequency + keep = df[mask] + lower_diff = df[~mask] + lower_diff = lower_diff.iloc[::step] + downsampled = pd.concat([keep, lower_diff], axis=0) + downsampled = downsampled.sort_values(columns["utc_date"]).reset_index(drop=True) + return downsampled + + +def interpolate_over_period(df, columns, frequency, noise=3.0): + """ + Helper method for interpolate_over_dropped_periods() (see below), which performs actual interpolation + :param df: pandas dataframe (group??) containing the range of points to iterpolate over + :param frequency: how frequently to add interpolated points, in seconds + :param noise: sigma parameter for the random noise distribution, in meters + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. + :return: pandas dataframe containing only the new rows + concatenate these new rows onto your original dataframe + """ + + f_lat, f_lon, f_n, f_e = tuple(df[[columns["lat"], columns["lon"], columns["utm_n"], columns["utm_e"]]].iloc[0]) + new_rows = df.copy().set_index(columns["utc_date"]).resample(f"{int(frequency)}S").asfreq().replace([0.0], np.nan) + new_rows = new_rows.reset_index() + new_rows.at[0, columns["lat"]] = f_lat + new_rows.at[0, columns["lon"]] = f_lon + new_rows.at[0, columns["utm_e"]] = f_e + new_rows.at[0, columns["utm_n"]] = f_n + new_rows = new_rows.interpolate() + + noise = pd.DataFrame(np.random.normal(0, noise, (len(new_rows), 2)), columns=["xnoise", "ynoise"]) + new_rows[columns["utm_n"]] = new_rows[columns["utm_n"]] + noise["ynoise"] + new_rows[columns["utm_e"]] = new_rows[columns["utm_e"]] + noise["xnoise"] + + new_rows = new_rows[(new_rows[columns["utc_date"]] >= df.iloc[0][columns["utc_date"]]) & (new_rows[columns["utc_date"]] <= df.iloc[1][columns["utc_date"]])].copy() + + # TODO: if target dataframe holds both WGS and UTM, convert the columns UTM to WGS and add those (or vice versa) + + return new_rows + +def interpolate_over_period_test1(df, columns, frequency, noise=3.0): + """ + Helper method for interpolate_over_dropped_periods() (see below), which performs actual interpolation + :param df: pandas dataframe (group??) containing the range of points to iterpolate over + :param frequency: how frequently to add interpolated points, in seconds + :param noise: sigma parameter for the random noise distribution, in meters + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. + :return: pandas dataframe containing only the new rows + concatenate these new rows onto your original dataframe + -- + Interpolation done along the road network, shortest path chosen based on GPS track, + linear sampling along path. 
+ """ + from .interpolate import interpolate_graph + from .interpolate import density_sample + + f_lat, f_lon, f_n, f_e = tuple(df[[columns["lat"], columns["lon"], columns["utm_n"], columns["utm_e"]]].iloc[0]) + new_rows = df.copy().set_index(columns["utc_date"]).resample(f"{int(frequency)}S").asfreq().replace([0.0], np.nan) + new_rows = new_rows.reset_index() + + from_pt = Point((df.iloc[0][columns["utm_e"]], df.iloc[0][columns["utm_n"]])) + to_pt = Point((df.iloc[-1][columns["utm_e"]], df.iloc[-1][columns["utm_n"]])) + n_samples = len(new_rows.index) + + ipath = interpolate_graph(from_pt, to_pt, r'test_data/utm_roadnet.bin', impedance='weighted_length') + if ipath.is_empty: + ipath = LineString([from_pt, to_pt]) + igps_fix = density_sample(ipath, n_samples, noise=noise) + igps_coords = map(lambda p: (p.x, p.y), igps_fix) + igps_x, igps_y = zip(*igps_coords) + new_rows.loc[:,columns["utm_e"]] = igps_x + new_rows.loc[:,columns["utm_n"]] = igps_y + + noise = pd.DataFrame(np.random.normal(0, noise, (len(new_rows), 2)), columns=["xnoise", "ynoise"]) + new_rows[columns["utm_n"]] = new_rows[columns["utm_n"]] + noise["ynoise"] + new_rows[columns["utm_e"]] = new_rows[columns["utm_e"]] + noise["xnoise"] + + new_rows = new_rows[(new_rows[columns["utc_date"]] >= df.iloc[0][columns["utc_date"]]) & (new_rows[columns["utc_date"]] <= df.iloc[1][columns["utc_date"]])].copy() + + return new_rows + +def interpolate_over_period_test2(df, columns, frequency, noise=3.0): + """ + Helper method for interpolate_over_dropped_periods() (see below), which performs actual interpolation + :param df: pandas dataframe (group??) containing the range of points to iterpolate over + :param frequency: how frequently to add interpolated points, in seconds + :param noise: sigma parameter for the random noise distribution, in meters + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. + :return: pandas dataframe containing only the new rows + concatenate these new rows onto your original dataframe + -- + Interpolation done along the road network, shortest path chosen based on GPS track, + density sampling along path using KDE50 raster. 
+ """ + from .interpolate import interpolate_graph + from .interpolate import density_sample + + f_lat, f_lon, f_n, f_e = tuple(df[[columns["lat"], columns["lon"], columns["utm_n"], columns["utm_e"]]].iloc[0]) + new_rows = df.copy().set_index(columns["utc_date"]).resample(f"{int(frequency)}S").asfreq().replace([0.0], np.nan) + new_rows = new_rows.reset_index() + + from_pt = Point((df.iloc[0][columns["utm_e"]], df.iloc[0][columns["utm_n"]])) + to_pt = Point((df.iloc[-1][columns["utm_e"]], df.iloc[-1][columns["utm_n"]])) + n_samples = len(new_rows.index) + + ipath = interpolate_graph(from_pt, to_pt, r'test_data/utm_roadnet.bin', impedance='weighted_length') + if ipath.is_empty: + logging.warning('Interpolation returned with no valid path, defaulting to straight line') + ipath = LineString([from_pt, to_pt]) + igps_fix = density_sample(ipath, n_samples, gps_dsty_rst=r'test_data/sd_gps_asap_KDE50c10.tif', noise=noise) + igps_coords = map(lambda p: (p.x, p.y), igps_fix) + igps_x, igps_y = zip(*igps_coords) + new_rows.loc[:,columns["utm_e"]] = igps_x + new_rows.loc[:,columns["utm_n"]] = igps_y + + noise = pd.DataFrame(np.random.normal(0, noise, (len(new_rows), 2)), columns=["xnoise", "ynoise"]) + new_rows[columns["utm_n"]] = new_rows[columns["utm_n"]] + noise["ynoise"] + new_rows[columns["utm_e"]] = new_rows[columns["utm_e"]] + noise["xnoise"] + + new_rows = new_rows[(new_rows[columns["utc_date"]] >= df.iloc[0][columns["utc_date"]]) & (new_rows[columns["utc_date"]] <= df.iloc[1][columns["utc_date"]])].copy() + + return new_rows + +def interpolate_over_period_test3(df, columns, frequency, noise=3.0, min_dist=50): + """ + Helper method for interpolate_over_dropped_periods() (see below), which performs actual interpolation + :param df: pandas dataframe (group??) containing the range of points to iterpolate over + :param frequency: how frequently to add interpolated points, in seconds + :param noise: sigma parameter for the random noise distribution, in meters + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. + :return: pandas dataframe containing only the new rows + concatenate these new rows onto your original dataframe + -- + Interpolation done along the road network, shortest path chosen based on GPS track, + density sampling along path using KDE50 raster. 
+ """ + from .interpolate import interpolate_graph + from .interpolate import density_sample + + f_lat, f_lon, f_n, f_e = tuple(df[[columns["lat"], columns["lon"], columns["utm_n"], columns["utm_e"]]].iloc[0]) + new_rows = df.copy().set_index(columns["utc_date"]).resample(f"{int(frequency)}S").asfreq().replace([0.0], np.nan) + new_rows = new_rows.reset_index() + + from_pt = Point((df.iloc[0][columns["utm_e"]], df.iloc[0][columns["utm_n"]])) + to_pt = Point((df.iloc[-1][columns["utm_e"]], df.iloc[-1][columns["utm_n"]])) + n_samples = len(new_rows.index) + + ipath = interpolate_graph(from_pt, to_pt, r'test_data/utm_roadnet.bin', impedance='weighted_length') + if ipath.is_empty: + logging.warning('Interpolation returned with no valid path, defaulting to straight line') + ipath = LineString([from_pt, to_pt]) + igps_fix = density_sample(ipath, n_samples, gps_dsty_rst=r'test_data/sd_gps_asap_KDE50c10.tif', noise=noise, min_dist=min_dist) + igps_coords = map(lambda p: (p.x, p.y), igps_fix) + igps_x, igps_y = zip(*igps_coords) + new_rows.loc[:,columns["utm_e"]] = igps_x + new_rows.loc[:,columns["utm_n"]] = igps_y + + noise = pd.DataFrame(np.random.normal(0, noise, (len(new_rows), 2)), columns=["xnoise", "ynoise"]) + new_rows[columns["utm_n"]] = new_rows[columns["utm_n"]] + noise["ynoise"] + new_rows[columns["utm_e"]] = new_rows[columns["utm_e"]] + noise["xnoise"] + + new_rows = new_rows[(new_rows[columns["utc_date"]] >= df.iloc[0][columns["utc_date"]]) & (new_rows[columns["utc_date"]] <= df.iloc[1][columns["utc_date"]])].copy() + + return new_rows + +def interpolate_over_dropped_periods(df, columns, frequency, max_delay=120, + max_drop_time=3600, max_distance=100, + interpolate_helper_func=interpolate_over_period): + """ + Interpolate GPS points for periods with large gaps + Criteria: + - if delay with previous point over max_delay: start interpolation + - if dropped period is over max_drop_time: do not interpolate + (unless the distance from last valid point is below max_distance) + :param df: pandas dataframe of GPS points + :param frequency: how frequently to add interpolated points, in seconds + :param max_delay: time threshold for starting interpolation, in seconds + :param max_drop_time: maximum time after which to not interpolate, in seconds + :param max_distance: maximum distance for interpolation between points, in meters + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. 
+ :param interpolate_helper_func: + :return: new dataframe containing the interpolated points + """ + # get time delta between all adjacent pairs of points (in seconds) + df = df.sort_values(columns["utc_date"]).reset_index(drop=True) + differences = ((df[columns["utc_date"]].shift(-1) - df[columns["utc_date"]]).shift(1) / np.timedelta64(1, 's')).fillna(0.0) + + # get Euclidean distance delta between all adjacent pairs of points + df["prev_e"] = df.shift(1)[columns["utm_e"]] + df["prev_n"] = df.shift(1)[columns["utm_n"]] + distance_delta = df[[columns["utm_e"], columns["utm_n"], + "prev_e", "prev_n"]].apply(lambda x: get_euclidean_distance((x["prev_e"], x["prev_n"]), + (x[columns["utm_e"]], x[columns["utm_n"]])), axis=1) + df = df.drop(["prev_e", "prev_n"], axis=1) + + #distance_delta = pd.Series([0.0]).append(distance_delta, ignore_index=True) + distance_delta = pd.Series(distance_delta).fillna(0.0) + + # find all rows where time delta is greater than the max_delay parameter AND + # (the delay is less than max_drop_time OR distance delta is less than max_distance) + delay_mask = differences > max_delay + drop_distance_mask = ((differences < max_drop_time) | (distance_delta < max_distance)) + df['gap'] = delay_mask & drop_distance_mask + + # interpolate over those matching rows with _interpolate_over_period() + print("Previous trace length: ", len(df)) + interpolated_df = df.copy() + for i, row in df.loc[df['gap']].iterrows(): + new_points = interpolate_helper_func(df.loc[i-1:i], columns, frequency) + interpolated_df = pd.concat([interpolated_df, new_points], axis=0) + df = df.drop(['gap'], axis=1) + interpolated_df = interpolated_df.drop(['gap'], axis=1) + interpolated_df = interpolated_df.drop_duplicates([columns["utc_date"]]) + interpolated_df = interpolated_df.sort_values(columns["utc_date"]) + interpolated_df = interpolated_df.reset_index(drop=True) + print("New trace length after interpolation: ", len(interpolated_df)) + return interpolated_df + +def interpolate_over_dropped_periods_median(df, columns, frequency, + max_delay=120, max_drop_time=3600, max_distance=100, + interpolate_helper_func=interpolate_over_period): + """ + Interpolate GPS points for periods with large gaps + Criteria: + - if delay with previous point over max_delay: start interpolation + - if dropped period is over max_drop_time: do not interpolate + (unless the distance from last valid point is below max_distance) + :param df: pandas dataframe of GPS points + :param frequency: how frequently to add interpolated points, in seconds + :param max_delay: time threshold for starting interpolation, in seconds + :param max_drop_time: maximum time after which to not interpolate, in seconds + :param max_distance: maximum distance for interpolation between points, in meters + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. 
+ :param interpolate_helper_func: + :return: new dataframe containing the interpolated points + """ + # get time delta between all adjacent pairs of points (in seconds) + df = df.sort_values(columns["utc_date"]).reset_index(drop=True) + differences = ((df[columns["utc_date"]].shift(-1) - df[columns["utc_date"]]).shift(1) / np.timedelta64(1, 's')).fillna(0.0) + + # compute median point over moving window and get Euclidian distance between those points + bins = 5 # length of window over which median position is computed + med_prev = df.rolling(bins).median().rename(columns={columns["utm_e"]: 'prev_e', columns["utm_n"]: 'prev_n'}) + med_next = df.shift(-bins + 1).rolling(bins).median() + med_df = pd.concat([med_prev, med_next], axis=1, sort=False) + med_distance_delta = med_df[[columns["utm_e"], columns["utm_n"], + "prev_e", "prev_n"]].apply(lambda x: get_euclidean_distance((x["prev_e"], x["prev_n"]), + (x[columns["utm_e"]], x[columns["utm_n"]])), axis=1) + med_distance_delta = pd.Series(med_distance_delta).fillna(0.0) + + # find all rows where time delta is greater than the max_delay parameter AND + # (the delay is less than max_drop_time OR distance delta is less than max_distance) + delay_mask = differences > max_delay + drop_distance_mask = ((differences < max_drop_time) | (med_distance_delta < max_distance)) + df['gap'] = delay_mask & drop_distance_mask + + # interpolate over those matching rows with _interpolate_over_period() + print("Previous trace length: ", len(df)) + interpolated_df = df.copy() + interpolated_df['interpolated'] = False + for i, row in df.loc[df['gap']].iterrows(): + prev_median = df.loc[i-bins:i-1].median() + next_median = df.loc[i:i+bins-1].median() + bounds = df.loc[i-1:i].copy() + for c in [columns["utm_e"], columns["utm_n"], columns["lat"], columns["lon"]]: + bounds.loc[i-1, c] = prev_median.loc[c] + bounds.loc[i, c] = next_median.loc[c] + new_points = interpolate_helper_func(bounds, columns, frequency) + new_points['interpolated'] = True + interpolated_df = pd.concat([interpolated_df, new_points], axis=0) + df = df.drop(['gap'], axis=1) + interpolated_df = interpolated_df.drop(['gap'], axis=1) + interpolated_df = interpolated_df.drop_duplicates([columns["utc_date"]]) + interpolated_df = interpolated_df.sort_values(columns["utc_date"]) + interpolated_df = interpolated_df.reset_index(drop=True) + print("New trace length after interpolation: ", len(interpolated_df)) + return interpolated_df + + +def grid_loc_to_utm(coord, axis): + """ + Utility method to convert a grid location to a UTM value + :param coord: the index value along a given axis, as a floating-point number + :param axis: numpy array containing the axis values for each index + :return: a single value representing the UTM coordinate at that index + """ + edge1 = int(np.floor(coord)) + edge2 = int(np.ceil(coord)) + if edge1 == edge2: + return axis[edge1] + + difference = axis[edge2] - axis[edge1] + return axis[edge1] + np.modf(coord)[0] * difference + +################################## +# Algorithm stages +################################## + + +# Step 1: Extract hotspots from GPS data (may have been resampled, interpolated etc.) + +def extract_hotspots(points, x_bounds, y_bounds, kernel_bandwidth, pid, + qualifiers="5", cell_size=10, save_kernel: bool=False, plot: bool=False): + """ + Extract hotspots from GPS track by identifying the local peaks + on a kernel density surface built from the GPS fixes. 
+ :param points: a numpy array containing the GPS coordinates + :param x_bounds: tuple of the minimum/maximum x value + :param y_bounds: tuple of the minimum/maximum y value + :param kernel_bandwidth: the kernel uncertainty in meters + :param pid: the participant's ID, as an integer + :param qualifiers: a string containing parameter shorthand for the filenames; + "i" for interpolation or "" for none, followed by the downsampling frequency + :param cell_size: square cell size (meters) + :return: a numpy array containing the kernel values for each GPS point, + a dataframe with kernel zonal statistics (mean & std) + a numpy array containing the zone values for each GPS point, + and a pandas dataframe containing the discovered hotspots. + """ + + print("Initializing the grid") + # Step 0a: Initialize everything you need + x_grid = np.arange(x_bounds[0] - cell_size, x_bounds[1] + cell_size, cell_size) + y_grid = np.arange(y_bounds[0] - cell_size, y_bounds[1] + cell_size, cell_size) + print(f"Making {len(x_grid)} x {len(y_grid)} grid...") + x_mesh, y_mesh = np.meshgrid(x_grid, y_grid) + grid = np.vstack((y_mesh.ravel(), x_mesh.ravel())).T + + # Step 1: Run Kernel Density Estimation + # ArcPy assumes planar distance and quartic kernel; using biweight (~Quartic) kernel from KDEpy + # KDEpy kernel bandwidth are expressed as standard-deviation, + # see https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use for the + # characteristics of some common kernels + kde = FFTKDE(kernel='biweight', bw=np.sqrt(1/7) * kernel_bandwidth) + print("Fitting the kernel") + c0 = perf_counter() + result_kernel = kde.fit(points).evaluate(grid) * len(points) + print(f'Done: {perf_counter()-c0:.2f}s') + + # Produce 2D view of KDE + result_k_2d = result_kernel.reshape((len(y_grid), len(x_grid))) + if save_kernel: + print("Saving the kernel.") + np.savetxt(f"{os.getcwd()}/temp/{pid}_temp_kde{str(kernel_bandwidth)}_ds{qualifiers}.asc", + np.flip(result_k_2d, 0), delimiter=" ", comments='',fmt='%.18g', + header=f'NCOLS {len(x_grid)}\nNROWS {len(y_grid)}\nXLLCENTER {x_grid[0]}\nYLLCENTER {y_grid[0]}\nCELLSIZE {cell_size}\nNODATA_VALUE -9999') + if plot: + # Make the plot + levels = np.linspace(0, result_kernel.max(), 255) + plt.contourf(x_mesh, y_mesh, result_k_2d, levels=levels, cmap=plt.cm.Reds) + plt.savefig(f"{os.getcwd()}/temp/{pid}_temp_kernel_k{str(kernel_bandwidth)}_ds{qualifiers}.png") #plt.show() + + # Locating kernel values for GPS points + print("Snapping kernel values.") + y_indices = np.searchsorted(y_grid, points.T[0]) + x_indices = np.searchsorted(x_grid, points.T[1]) + indexes = [y_indices[i] * len(x_grid) + x_indices[i] for i in range(len(y_indices))] + kernel_snapped_values = [result_kernel[x] for x in indexes] + + print("Masking to area of interest") + # redefining area of interest + aoi = np.where(result_kernel == 0.0, np.nan, result_kernel) + + # get the mean of the non-zero values (from area of interest) + mean = np.nanmean(aoi) + + # Step 3: Retain values above average or 1 std, set others to null + aoi = np.where(result_kernel < mean, 0, result_kernel) + k_mask = np.where(aoi != 0, 1.0, 0).reshape((len(y_grid), len(x_grid))) + k_bool = k_mask.astype(bool) + + # Watershed algorithm using skimage: + # See http://scikit-image.org/docs/dev/auto_examples/segmentation/plot_watershed.html + # (removed the line about distance since the kernel function operates as distance) + # Increasing the footprint tends to decrease the number of local maxima + + print("Running watershed.") + # NB 
The last version of scikit-image to support the "indices" keyword + # argument seems to be 0.19.3. + # In order to migrate to a newer version, one would need to manually + # convert the returned coordinates to a boolean array: + # https://stackoverflow.com/questions/75825151/changes-to-peak-local-max-in-skimage-feature-how-do-i-get-the-boolean-array-sha + # local_maxi = peak_local_max(result_k_2d, indices=False, footprint=np.ones((3, 3)), + # labels=k_bool) + + # markers = ndi.label(local_maxi, structure=np.ones((3, 3)))[0] + local_maxi_coords = peak_local_max(result_k_2d, footprint=np.ones((3, 3)), labels=k_bool) + local_maxi = np.zeros_like(result_k_2d, dtype=bool) + if local_maxi_coords.size > 0: + local_maxi[tuple(local_maxi_coords.T)] = True + + markers = ndi.label(local_maxi, structure=np.ones((3, 3)))[0] + + sink_mask = np.where(markers > 0, 1, 0) + temp_markers = ndi.label(sink_mask, structure=np.ones((3, 3)))[0] + + maximum = np.max(temp_markers) + centroids = ndi.center_of_mass(sink_mask, temp_markers, index=range(1, maximum + 1)) + + print("Calculating hotspots.") + #np.set_printoptions(threshold=np.nan) #BT: wrong call to function, which does not allow NaN parameters) + if len(np.unique(markers)) == 2: + hotspots = pd.DataFrame([[1, grid_loc_to_utm(centroids[0][1], x_grid), + grid_loc_to_utm(centroids[0][0], y_grid)]], + columns=["hotspot_id", "centroid_x", "centroid_y"]) + else: + centroids_x = [grid_loc_to_utm(x[1], x_grid) for x in centroids] + centroids_y = [grid_loc_to_utm(x[0], y_grid) for x in centroids] + hotspots = pd.DataFrame([], columns=["hotspot_id", "centroid_x", "centroid_y"]) + hotspots["hotspot_id"] = range(1, np.max(markers) + 1) + hotspots["centroid_x"] = centroids_x + hotspots["centroid_y"] = centroids_y + + # Perform watershed using sinks as markers + labels = watershed(-result_k_2d, markers, mask=k_bool) + zone_labels = [labels.reshape((len(y_grid) * len(x_grid),))[x] for x in indexes] + + # Compute z-kernel for each zone + labels_1d = labels.reshape((1, labels.size)).ravel() + kdf = pd.DataFrame(data=np.vstack((labels_1d, result_kernel)).T, + columns=['zone', 'kernel']).astype({'zone':'int32'}) + zone_groups = kdf.groupby("zone") + aggregate_stats = zone_groups.agg({"kernel": ["mean", "std"]}) + aggregate_stats.columns = ["mean", "std"] + + pd.set_option('display.max_rows', len(hotspots)) + #print("Located hotspots:") + #print(hotspots) + + return kernel_snapped_values, aggregate_stats, zone_labels, hotspots + + +# Step 2: Calculate normal and modified kernel + +def get_norm_modified_kernel(df, aggregate_stats, buffer_weight, pid, kernel_bandwidth=100, qualifiers="5"): + """ + Adds the time-convolved modified kernel values to the GPS data, as well as the normalized values for both. + :param df: a pandas dataframe containing the GPS data (plus zone and kernel values from extract_hotspots()) + :param buffer_weight: a 1D numpy array containing the weight window for convolution + :param pid: the participant's ID, as an integer + :param kernel_bandwidth: the kernel uncertainty in meters + :param qualifiers: a string containing parameter shorthand for the filenames + "i" for interpolation or "" for none, followed by the downsampling frequency + :return: a pandas datafram containing the input data plus modified kernel and normalized kernel values. 
+    """
+    # calculate convolution of kernel values with weight (=modified kernel) --uses weight matrix and buffer values above
+    kernel = df["kernel"].values
+    zones = df["zone"].values
+    modified_kernel = np.convolve(kernel, buffer_weight, "same")
+
+    # calculate normalized values of kernels based on the zone/basin
+    zone_mean = np.array([-9999] * len(zones), float)
+    zone_std = np.array([-9999] * len(zones), float)
+    norm_kernel = np.array([0] * len(kernel), float)
+    for index, row in aggregate_stats.iterrows():
+        mean = row["mean"]
+        std = row["std"]
+        zone_mean = np.where(zones == index, mean, zone_mean)
+        zone_std = np.where(zones == index, std, zone_std)
+
+    try:
+        norm_kernel = np.where(zone_mean == -9999, -9999, (kernel - zone_mean) / zone_std)
+    except RuntimeWarning:
+        print("Divide by zero error.")
+        norm_kernel = np.where(zone_std == 0, zone_mean, norm_kernel)
+    norm_modified_kernel = np.where(zone_mean == -9999, -9999, (modified_kernel - zone_mean) / zone_std)
+
+    df["norm_kernel"] = norm_kernel
+    df["modified_kernel"] = modified_kernel
+    df["norm_modified_kernel"] = norm_modified_kernel
+    if len(df[df["norm_kernel"] == -9999]) > 0 or len(df[df["norm_modified_kernel"] == -9999]) > 0:
+        print("WARNING: Some kernel values are still set to null values.")
+
+    return df
+
+# Step 3: Link GPS data to discovered hotspots
+
+def link_gps_to_hotspots(df, buffer_snap, kernel_hotspot_threshold):
+    """
+    Method for finding GPS points closest to hotspots, to determine which should be "snapped" to those hotspots.
+    :param df: a pandas dataframe containing the GPS data, zone, and normalized kernel values
+    :param buffer_snap: a 1D numpy array containing the weight window for convolution
+    :param kernel_hotspot_threshold: a positive float value, greater than 0, used to find significant kernel values
+    :return: a pandas dataframe with the GPS data, plus a binary "snap" value indicating a hotspot match
+    """
+    # remove 0 ("no zone") from the list of valid zones
+    zones = np.setdiff1d(df["zone"].value_counts().index.values, np.zeros((1,)))
+
+    snap_dict = dict.fromkeys(np.unique(zones))
+    for z in zones:
+        snap_dict[z] = np.zeros(len(df), dtype=int)
+
+    # Find points with kernel values higher than the threshold and assume they belong to hotspots.
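+    # (the comparison uses the zone-normalised kernels, i.e. per-basin z-scores, so the
+    # same KERNEL_HOTSPOT_THRESHOLD works for basins with very different absolute density)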
+ for i, row in df.iterrows(): + if row["zone"] != 0: + if ((row["norm_kernel"] >= kernel_hotspot_threshold) or + (row["norm_modified_kernel"] >= kernel_hotspot_threshold)): + snap_dict[row["zone"]][i] = 1 + + # Convolve the snap condition temporally for each zone + for z in zones: + snap_dict[z] = np.convolve(snap_dict[z], buffer_snap, "same") + + zone_array = df["zone"].values + snap_val = np.zeros(len(df)) + for k, snap in snap_dict.items(): + snap_val = np.where(zone_array == k, snap, snap_val) + + # give a mask of the points where the smoothed kernel values are above 0 + snap_val = np.where(snap_val > 0, 1, 0) + + df["snap_to_hs"] = snap_val + + return df + + +# Step 4: Refine the detected hotspots by extracting dwell times + +def refine_hotspots(df, columns, hotspots, data_gaps, min_keep_duration, + gap_length=DEFAULT_PARAMS["SIGNIFICANT_GAP_LENGTH"]): + """ + Create a list of dwells to find potential visits, find the time spent at each hotspot, and refine list of hotspots + to include only those meeting the minimum duration threshold. + :param df: a pandas dataframe containing the GPS data with the "snap" flag from link_gps_to_hotspots() + :param hotspots: a pandas dataframe containing the hotspots found in extract_hotspots() + Contains a hotspot ID and the centroid coordinates. + :param data_gaps: a pandas dataframe containing the start and stop periods for missing data gaps. + Used to create "dummy" points for breaking up visits + :param min_keep_duration: the number of seconds above which a hotspot must be visited to be considered valid + :param gap_length: minimum length of a missing data gap, in seconds, to be considered significant + :return: the filtered pandas dataframe containing hotspot data, with the total duration added, and + a pandas dataframe containing dwells, with the hotspot ID, start and end times and duration in seconds + """ + + # filter df to only snapped values + filtered = df[df["snap_to_hs"] > 0].copy() + + # Add fake GPS points to the filtered visits to signify data gaps. + # These are used to break up visits with long gaps, and are removed later. + fake_points = data_gaps.copy() + if not fake_points.empty: + fake_points.loc[:, columns["utc_date"]] = fake_points["start_time"] + np.timedelta64(1, 's') + fake_points.loc[:, "zone"] = -999 + filtered = pd.concat([filtered, fake_points], axis=0, sort=True) + + filtered = filtered.sort_values(columns["utc_date"]).reset_index(drop=True) + + # Identify points as beginnings of new visits by their difference in zone + # Fill appropriate values for first row to account for shifts + filtered["diff_visit"] = (filtered["zone"] != filtered["zone"].shift(-1)).shift(1) + if not filtered.empty: + filtered.loc[0, "diff_visit"] = True + + # A visit must be broken into two if it has a period of missing data in the middle. 
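+    # (sig_gap flags the first point whose preceding time gap exceeds gap_length;
+    # together with the dummy gap points added above it forces a new visit to start)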
+ filtered["sig_gap"] = (((filtered[columns["utc_date"]].shift(-1) - filtered[columns["utc_date"]]) / + np.timedelta64(1, 's')).fillna(0) > gap_length).shift(1) + filtered.loc[0, "sig_gap"] = True + filtered["diff_visit"] = (filtered["diff_visit"] | filtered["sig_gap"]) + + # Propagate the start time "downwards" through the results; all points from the same visit will have the same start + current_start_time = pd.NaT + for i, row in filtered.iterrows(): + if row['diff_visit']: + current_start_time = row[columns["utc_date"]] + filtered.at[i, 'start_time'] = current_start_time + + # Remove the fake data points representing gaps + filtered = filtered[filtered["zone"] != -999] + + # Get the end time for each dwell by finding the last timestamp for each dwell + visit_agg = filtered.groupby("start_time") + stop_times = [] + for a, b in visit_agg: + b = b.sort_values(columns["utc_date"]) + stop_times.append(b.iloc[-1][columns["utc_date"]]) + + # select rows where df["diff_visit"] = True to get a summary of the visit + dwell_rows = filtered[filtered["diff_visit"]].copy() + dwell_rows["stop_time"] = stop_times + + # Create dataframe containing dwells + dwell_rows["duration"] = (dwell_rows["stop_time"] - dwell_rows["start_time"]) / np.timedelta64(1, 's') + dwell_rows = dwell_rows.reset_index(drop=True) + dwell_rows = dwell_rows[["zone", "start_time", "stop_time", "duration"]] + + # Add the total visit duration (in seconds) to a location in the hotspot data and filter by duration + # Accounts for shift in hotspot ID and index + duration_agg = dwell_rows[["zone", "duration"]].groupby("zone").agg("sum") + last = duration_agg.iloc[-1]["duration"] + hotspots["total_duration"] = duration_agg["duration"] + hotspots["total_duration"] = hotspots["total_duration"].shift(-1) + hotspots.loc[max(duration_agg.index) - 1, "total_duration"] = last + hotspots = hotspots[~hotspots["total_duration"].isnull()] + hotspots = hotspots[hotspots["total_duration"] > min_keep_duration] + return hotspots, dwell_rows + + +# Step 5: Build visits from the refined hotspot data + +def filter_visits(dwell_table, hotspots, min_visit_timespan, min_visit_duration): + """ + Method for finding visits from the dwell data. + Group visits to the same place that too close to the previous visit, remove too-short visits + :param dwell_table: pandas dataframe containing dwell data (start, stop, locationID, duration) + :param hotspots: a pandas dataframe containing the hotspots found in extract_hotspots() + Contains a hotspot ID and the centroid coordinates. + :param min_visit_timespan: the minimum number of seconds that must elapse between visits to be considered different + :param min_visit_duration: the minimum number of seconds a dwell must be to be considered a visit + :return: dataframe containing the visits (columns: start, stop, locationID, duration) + """ + df = dwell_table.copy().reset_index(drop=True) + df = df.sort_values("start_time") + + # Columns used to make other calculations; add extra values on edges to account for shift + df["diff_location"] = (df["zone"] != df["zone"].shift(-1)).shift(1) + df.loc[0, "diff_location"] = True + + df["time_since_last_visit"] = ((df["start_time"].shift(-1) - df["stop_time"]).shift(1)) / np.timedelta64(1, 's') + df.loc[0, "time_since_last_visit"] = 0.0 + + # Do gnarly pandas calculations! 
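+    # (the while-loop below repeats the shift-based propagation until start_time no
+    # longer changes, i.e. a fixed point is reached, so arbitrarily long chains of
+    # mergeable dwells collapse onto the start time of the first dwell in the chain)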
+ # Propagate start times "downwards" for visits meeting criteria: + # Propagate if the location hasn't changed and if the time falls below the threshold + last_result = pd.Series([0]) + while not df["start_time"].equals(last_result): + last_result = df["start_time"].copy() + df.loc[~((df["time_since_last_visit"] > min_visit_timespan) | + (df["diff_location"])), "start_time"] = df["start_time"].shift(1) + + # Aggregate result by start time to get records grouped by visit + agg_dwells = df.groupby("start_time") + + # Build the list of visits from the aggregated results + filtered_visits = pd.DataFrame() + for start, frame in agg_dwells: + stop = frame["stop_time"].max() + dur = (stop - start) / np.timedelta64(1, 's') + loc = frame.iloc[0]["zone"] + row = pd.DataFrame([[start, stop, loc, dur]], columns=["start_time", "stop_time", "zone", "duration"]) + filtered_visits = pd.concat([filtered_visits, row], axis=0, ignore_index=True) + + # Remove visits with a duration too short + filtered_visits = filtered_visits[filtered_visits["duration"] > min_visit_duration] + + # Remove visits with a zone not corresponding to a hotspot + filtered_visits = filtered_visits[filtered_visits["zone"].isin(hotspots["hotspot_id"].unique())] + + filtered_visits = filtered_visits.reset_index(drop=True) + return filtered_visits + + +def build_visits(dwell_table, hotspots, min_visit_timespan, min_visit_duration): + """ + Takes the identified dwells and turns them into visits. + :param dwell_table: a pandas dataframe containing the dwells from refine_hotspots() + :param hotspots: a pandas dataframe containing the hotspots found in extract_hotspots() + Contains a hotspot ID and the centroid coordinates. + :param min_visit_timespan: the minimum number of seconds that must elapse between visits to be considered different + :param min_visit_duration: the minimum number of seconds a dwell must be to be considered a visit + :return: a dataframe of visits containing the columns (zone, start_time, end_time, duration) + """ + # filter visits (only need one call unlike original code) + visit_table = filter_visits(dwell_table, hotspots, min_visit_timespan, min_visit_duration) + + # TODO figure out timestamp conversion if needed + # Convert local start and end times for each visit to UTC (or vice versa?) + + visit_table = visit_table.sort_values("start_time") + return visit_table + + +# Step 6: Update coordinates for snapped points + +def update_snapped_points(gps_data, columns, visits, hotspots): + """ + Update the GPS points to have the same coordinates as the hotspot where they fall within a visit. + For all filtered visits, finds the gps points within start and end times for that visit and sets that point's + location to be the hotspot centroid coordinates instead of the point coordinates + :param gps_data: a pandas dataframe containing the GPS data + :param visits: a pandas dataframe containing the visits found in build_visits() + :param hotspots: a pandas dataframe containing the hotspots refined from refine_hotspots() + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. + :return: the updated GPS dataframe with coordinates snapped to hotspot coordinates where applicable. 
+ """ + for i in range(len(visits)): + gps_data.loc[((gps_data[columns["utc_date"]] >= visits.iloc[i]["start_time"]) & + (gps_data[columns["utc_date"]] <= visits.iloc[i]["stop_time"])), columns["utm_e"]] = \ + hotspots.loc[(hotspots["hotspot_id"] == visits.iloc[i]["zone"])].reset_index(drop=True).iloc[0]["centroid_x"] + + gps_data.loc[((gps_data[columns["utc_date"]] >= visits.iloc[i]["start_time"]) & + (gps_data[columns["utc_date"]] <= visits.iloc[i]["stop_time"])), columns["utm_n"]] = \ + hotspots.loc[(hotspots["hotspot_id"] == visits.iloc[i]["zone"])].reset_index(drop=True).iloc[0]["centroid_y"] + + return gps_data + + +# Step 7: Build path bouts from GPS, hotspot and visit data + +def detect_path(gps_data, columns, from_time, to_time, max_drop_time, + min_keep_time, path_df, utm=True): + """ + Helper method for build_paths() below. Given a set of boundary times, detect GPS points which fall into paths and + build a path from their coordinates and times. + :param gps_data: pandas dataframe containing GPS data + :param from_time: lower filter bounds for GPS data, as a timestamp + :param to_time: upper filter bounds for GPS data, as a timestamp + :param max_drop_time: the maximum time in seconds between points, after which to not consider them consecutive + :param min_keep_time: the minimum duration of a path, in seconds, for it to be valid + :param path_df: dataframe containing the path data so far + :param columns: a dictionary containing the names of lat, lon, utm_n and utm_e columns in the dataframe. + May differ between Ethica and Sensedoc data. + :param utm: True if the GPS readings are in UTM format, false otherwise. + :return: the modified path dataframe with any newly discovered paths added + The path contains its start and end times, duration, its start and end point coordinates, its start and end + zones, the number of line segments in the path, and the points in the path. + """ + + if utm: + x_col = columns["utm_e"] + y_col = columns["utm_n"] + else: + x_col = columns["lon"] + y_col = columns["lat"] + + selected = gps_data.copy()[((gps_data[columns["utc_date"]] >= from_time) & + (gps_data[columns["utc_date"]] < to_time))] + + if len(selected) == 0: + # Don't give warning for very short intervals, because obviously nothing was selected. 
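+        # (windows of 10 s or less between stop events are skipped silently rather than warned about)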
+ if (to_time - from_time) / np.timedelta64(1, 's') > 10: + print("No points selected within interval:", from_time, " to ", to_time) + return path_df + + selected = selected.sort_values(columns["utc_date"]) + + # Split the path as needed by temporal gap + selected["between_point_time"] = ((selected.shift(-1)[columns["utc_date"]] - selected[columns["utc_date"]]).shift(1)) \ + / np.timedelta64(1, 's') + selected = selected.reset_index(drop=True) + selected.loc[0, "between_point_time"] = 0 + + # Split into multiple dataframes based on long temporal gaps + selected["path_id"] = (selected["between_point_time"] > max_drop_time).cumsum() + split_groups = selected.groupby("path_id") + + # For each collection of points, build a unique sequence of points for the path + for _, g in split_groups: + points = selected[[x_col, y_col]] + unique_sequence = points[~((points[x_col] == points[x_col].shift(-1)) & + (points[y_col] == points[y_col].shift(-1)))] + selected = selected[~((points[x_col] == points[x_col].shift(-1)) & + (points[y_col] == points[y_col].shift(-1)))].reset_index(drop=True) + segments = [tuple(x) for x in unique_sequence.values] + if len(segments) > 1: + # TODO: Format the dates for the found paths (local -> UTM or vice versa) + duration = (selected[columns["utc_date"]].max() - selected[columns["utc_date"]].min()) / np.timedelta64(1, 's') + if duration > min_keep_time: + new_path = pd.DataFrame([[selected[columns["utc_date"]].min(), selected[columns["utc_date"]].max(), + duration, segments[0][0], segments[0][1], selected.iloc[0]["zone"], + segments[-1][0], segments[-1][1], selected.iloc[-1]["zone"], + len(segments) - 1, segments]], + columns=list(path_df.columns)) + path_df = pd.concat([path_df, new_path], axis=0) + + return path_df + + +def build_paths(gps_data, columns, visits, data_gaps, max_drop_time, min_keep_time): + """ + Build paths from the detected visits and the GPS data. 
+ :param gps_data: pandas dataframe containing the GPS data + :param visits: pandas dataframe containing the visit data + :param data_gaps: pandas dataframe containing the start and stop times of each missing data gap + :param max_drop_time: the maximum time in seconds between points, after which to not consider them consecutive + :param min_keep_time: the minimum duration of a path, in seconds, for it to be valid + :return: a dataframe containing paths with the columns + ["start_time", "end_time", "duration", "path_start_x", "path_start_y", + "path_end_x", "path_end_y", "num_segments", "segments"] + The segments are a list of (x, y) coordinate tuples as a string, which can be further parsed when the + dataframe is returned + """ + # Initialize empty path structure + path_cols = ["start_time", "stop_time", "duration", "start_x", "start_y", "start_location_id", "stop_x", + "stop_y", "stop_location_id", "num_segments", "segments"] + + paths = pd.DataFrame([], columns=path_cols) + + visits.loc[:, "type"] = "visit" + + # Consider significant gaps in GPS data to break trips, as well as visits + data_gaps.columns = ["start_time", "stop_time"] + if not data_gaps.empty: + data_gaps.loc[:, "type"] = "gap" + data_gaps.loc[:, "contained"] = False + + # Fold gaps into visits if they are fully contained within a visit + # Visits were previously merged if they were less than MIN_VISIT_TIMESPAN apart, so gaps detected as significant may + # be contained withing these merged visits, if the SIGNIFICANT_GAP_LENGTH is less than MIN_VISIT_TIMESPAN + for vindex, visit in visits.iterrows(): + data_gaps["contained"] = (((data_gaps["start_time"] > visit["start_time"]) & + (data_gaps["stop_time"] < visit["stop_time"])) | + (data_gaps["contained"])) + + data_gaps = data_gaps[~data_gaps["contained"]] + data_gaps = data_gaps[["start_time", "stop_time"]] + + stop_events = pd.concat([data_gaps[["start_time", "stop_time"]], visits[["start_time", "stop_time"]]], axis=0) + stop_events = stop_events.sort_values("start_time") + + # Loop through stops and find the points in-between stops to be candidate paths. + from_time = gps_data.iloc[0][columns["utc_date"]] + for i in range(len(stop_events)): + to_time = stop_events.iloc[i]["start_time"] + paths = detect_path(gps_data, columns, from_time, to_time, max_drop_time, min_keep_time, paths) + # Update reference time + from_time = stop_events.iloc[i]["stop_time"] + + # After iterating through stops, add the last piece of track which was not previously processed. + to_time = gps_data.iloc[-1][columns["utc_date"]] + pd.Timedelta(1, unit="s") + paths = detect_path(gps_data, columns, from_time, to_time, max_drop_time, min_keep_time, paths) + + # Drop duplicated paths (if any). 
+    paths = paths.drop_duplicates(["start_time", "stop_time"]).reset_index(drop=True)
+    return paths
+
+#############################################
+# Main method for putting it all together:
+#############################################
+
+def make_index(iter, time_series: pd.Series, binary: bool) -> list:
+    # For each timestamp, walk through the sorted event table (trips, visits or gaps)
+    # and record either the 1-based index of the event containing it (binary=False)
+    # or a 0/1 membership flag (binary=True); timestamps outside any event get 0.
+    index_list = []
+    first_time = time_series.iat[0]
+    last_time = time_series.iat[-1]
+    stop_time = first_time - np.timedelta64(1, 's')
+    for _, t in time_series.items():
+        while t > stop_time:
+            try:
+                index, row = next(iter)
+                index += 1
+                start_time = row["start_time"]
+                stop_time = row["stop_time"]
+            except StopIteration:
+                index = 0
+                start_time = first_time
+                stop_time = last_time
+        index_list.append(0 if t < start_time else (int(index > 0) if binary else index))
+    return index_list
+
+def detect_trips(gps_data, pid, columns, params=DEFAULT_PARAMS,
+                 interpolate_helper_func=interpolate_over_period, code=None):
+    """
+    Method for detecting trips from GPS data.
+    Intended for use with only a single participant's data at a time.
+    :param gps_data: data frame containing the GPS data to detect trips on.
+    :param pid: a string containing the participant's INTERACT ID (or other unique identifier);
+        Used when generating temporary files.
+    :param params: dictionary containing the parameters for the trip detection, each as described in the documentation.
+    :param interpolate_helper_func: function to be used to interpolate the GPS track. Function signature as follows:
+        interpolate_helper_func(df, columns, frequency, noise). No interpolation if None
+    :param columns: a dictionary containing the names of utc_date, lat, lon, utm_n and utm_e columns in the dataframe.
+        May differ between Ethica and Sensedoc data.
+    :param code: the EPSG code of the data to be projected, as a string. Leave as None if no conversion is needed.
+    :return: the processed GPS dataframe with trip, visit and gap indices added,
+             the incidents table (trips, visits and gaps), and
+             the grid width, height and final cell size.
+    """
+
+    # Make temporary scratch folder to store intermediate values and prevent thrashing.
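+    # (extract_hotspots() and the interpolation step below write their intermediate
+    # rasters, plots and CSVs under this ./temp directory)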
+ os.makedirs(os.getcwd() + "/temp", exist_ok=True) + + # Load parameters and initialize needed values + cell_size = params["CELL_SIZE"] # meters + kernel_bandwidth = params["KERNEL_BANDWIDTH"] # meters + max_raster_size = params["MAX_RASTER_SIZE"] + downsample_frequency = params["DOWNSAMPLE_FREQUENCY"] # seconds + max_delay = params["INTERPOLATE_MAX_DELAY"] # seconds + max_drop_time = params["INTERPOLATE_MAX_DROP_TIME"] # seconds + max_distance = params["INTERPOLATE_MAX_DISTANCE"] # meters + frequency = downsample_frequency if downsample_frequency else 1 + i_string = "" + if interpolate_helper_func is not None: + i_string = "i" + i_string += str(downsample_frequency) + min_visit_duration = params["MIN_VISIT_DURATION"] + min_visit_timespan = params["MIN_VISIT_TIMESPAN"] + buffer_weight = params["BUFFER_WEIGHT"] + buffer_snap = params["BUFFER_SNAP"] + min_hotspot_keep = params["MIN_HOTSPOT_KEEP_DURATION"] + min_path_keep = params["MIN_PATH_KEEP_DURATION"] + min_loop_duration = params["MIN_LOOP_DURATION"] + kernel_hotspot_threshold = params["KERNEL_HOTSPOT_THRESHOLD"] + utm = True + + if utm: + x_col = columns["utm_e"] + y_col = columns["utm_n"] + else: + x_col = columns["lon"] + y_col = columns["lat"] + + # Currently assumes dataframe contains the following columns: (interact_id, utc_date, lat, lon, utm_e, utm_n) + gps_data = format_gps_data(gps_data, columns, code=code) + + # get grid width/height + width = np.ceil(gps_data[x_col].max() - gps_data[x_col].min()) + height = np.ceil(gps_data[y_col].max() - gps_data[y_col].min()) + print("Width: ", width) + print("Height: ", height) + print(f"Cell size: {cell_size} m") + print("Kernel_bandwidth", kernel_bandwidth) + + cell_size = resize_cells(width, height, cell_size, kernel_bandwidth, + max_raster_size) + + if downsample_frequency != 0: + print(f"[{strftime('%H:%M:%S')}] Downsampling gps data.") + gps_data = downsample_trace(gps_data, columns, downsample_frequency) + + if interpolate_helper_func is not None: + print(f"[{strftime('%H:%M:%S')}] Interpolating over GPS data") + gps_data = interpolate_over_dropped_periods_median(gps_data, columns, + frequency, max_delay, max_drop_time, max_distance, + interpolate_helper_func=interpolate_helper_func) + gps_data.to_csv(r'temp\gps_interpolated.csv') + + points = gps_data[[y_col, x_col]].ffill().values + x_bounds = (gps_data[x_col].min(), gps_data[x_col].max()) + y_bounds = (gps_data[y_col].min(), gps_data[y_col].max()) + + print(f"[{strftime('%H:%M:%S')}] Extracting hotspots.") + kernel, zone_stats, zones, hotspots = extract_hotspots(points, x_bounds, y_bounds, kernel_bandwidth=kernel_bandwidth, pid=pid, + qualifiers=i_string, cell_size=cell_size) + + gps_data["kernel"] = kernel + gps_data["zone"] = zones + + print(f"[{strftime('%H:%M:%S')}] Normalizing the kernel.") + gps_data = get_norm_modified_kernel(gps_data, zone_stats, buffer_weight, pid, kernel_bandwidth, i_string) + print(f"[{strftime('%H:%M:%S')}] Linking the GPS points to hotspots.") + gps_data = link_gps_to_hotspots(gps_data, buffer_snap, kernel_hotspot_threshold) + print(f"[{strftime('%H:%M:%S')}] Characterizing existing gaps in GPS data.") + gaps = get_data_gaps(gps_data, pid, columns) + print(f"[{strftime('%H:%M:%S')}] Refining hotspots and finding dwells.") + hotspots, dwells = refine_hotspots(gps_data, columns, hotspots, gaps, min_hotspot_keep) + print(f"[{strftime('%H:%M:%S')}] Building visits from dwells.") + visit_table = build_visits(dwells, hotspots, min_visit_timespan, min_visit_duration) + print(f"[{strftime('%H:%M:%S')}] 
Updating GPS values for snapped points.") + gps_data = update_snapped_points(gps_data, columns, visit_table, hotspots) + print(f"[{strftime('%H:%M:%S')}] Detecting the final bouts.") + final_bouts = build_paths(gps_data, columns, visit_table, gaps, max_drop_time, min_path_keep) + print(f"[{strftime('%H:%M:%S')}] Building incidents table.") + incidents_table = build_incident_table(final_bouts, hotspots, visit_table, gaps, pid) + + print(f"[{strftime('%H:%M:%S')}] Building trip, visit, and gap look-ups...") + gps_data[CH_TRIP_INDEX] = make_index(final_bouts.iterrows(), gps_data["Timestamp"], False) + gps_data[CH_VISIT_INDEX] = make_index(visit_table.iterrows(), gps_data["Timestamp"], False) + gps_data[CH_GAP] = make_index(gaps.iterrows(), gps_data["Timestamp"], True) + + return gps_data, incidents_table, width, height, cell_size + + +def build_incident_table(paths, hotspots, visits, gaps, pid): + incident_columns = ["start_time", "stop_time", "start_location_id", "stop_location_id", + "start_x", "start_y", "stop_x", "stop_y", "type"] + + paths["type"] = "trip" + visits["type"] = "visit" + + visits["start_location_id"] = visits["zone"] + visits["stop_location_id"] = visits["zone"] + visits["start_x"] = visits["start_location_id"].map(lambda x: hotspots.loc[hotspots["hotspot_id"] == x, + "centroid_x"].iloc[0]) + visits["start_y"] = visits["start_location_id"].map(lambda y: hotspots.loc[hotspots["hotspot_id"] == y, + "centroid_y"].iloc[0]) + visits["stop_x"] = visits["start_x"] + visits["stop_y"] = visits["start_y"] + + # Format gaps to fit in the table + if not gaps.empty: + gaps = gaps[~gaps["contained"]] + gaps = gaps[["start_time", "stop_time"]] + gaps["type"] = "gap" + gaps["start_location_id"] = np.nan + gaps["stop_location_id"] = np.nan + gaps["start_x"] = np.nan + gaps["start_y"] = np.nan + gaps["stop_x"] = np.nan + gaps["stop_y"] = np.nan + incidents = pd.concat([paths[incident_columns], visits[incident_columns], gaps[incident_columns]], axis=0) + else: + incidents = pd.concat([paths[incident_columns], visits[incident_columns]], axis=0) + + incidents.loc[:, "interact_id"] = pid #TODO: change interact_id + incidents = incidents[["interact_id"] + incident_columns] + incidents = incidents.sort_values("start_time") + return incidents + + +def main(): + # testing the code above + PID = 'test' + TEST_FILE_DIR = os.path.join("input", "SampleGPSTrajectory_02-processed.csv") + OUT_FILE_DIR = os.path.join("output", "SampleGPSTrajectory_02_incidents_table.csv") + + # Load GPS track. + try: + gps_data = pd.read_csv(TEST_FILE_DIR, parse_dates=[AI4PH_COLUMNS["utc_date"]]) + except FileNotFoundError: + print("ERROR: unable to locate gps .csv file at ", TEST_FILE_DIR, ". 
Aborting.") + exit() + + gps_data, incidents_table, _, _, _ = detect_trips( + gps_data, PID, AI4PH_COLUMNS, params=DEFAULT_PARAMS, + interpolate_helper_func=None) #interpolate_over_period_test1 + + gps_data.to_csv(os.path.join("output", "SampleGPSTrajectory_02_trip_detection.csv"), + encoding='utf-8', index=False) + incidents_table.to_csv(OUT_FILE_DIR, index=False) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + #import cProfile + #cProfile.run('main()', r'temp\prof_main_gap.file') + main() diff --git a/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/trip_detector/utilities.py b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/trip_detector/utilities.py new file mode 100644 index 00000000000..e200d1dc3b4 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/exposure_calculator/trip_detector/utilities.py @@ -0,0 +1,158 @@ +""" +utilities.py +Author: Janelle Berscheid, June 2018 + +A collection of utility methods for handling GPS data, including conversions +""" + +import numpy as np +import requests +import xml.etree.ElementTree as ET +from pyproj import Transformer, CRS + + +EARTH_RADIUS = 6378137.0 * 2 * np.pi # earth radius from semimajor axis of WGS 84 spheroid + +PROJ_CODES = { + "victoria": "epsg:32610", + "vancouver": "epsg:32610", + "saskatoon": "epsg:32613", + "montreal": "epsg:26918", + "london": "EPSG:27700" +} + + +def meters_to_decimal_degrees(value): + """ + Rough conversion of meters to decimal degrees only. + For proper conversion, use a projection. + :param value: value in meters + :return: value in decimal degrees + """ + return value * 360.0 / EARTH_RADIUS + + +def decimal_degrees_to_meters(value): + """ + Rough conversion of decimal degrees to meters only. + For proper conversion, use a projection. + :param value: value in meters + :return: value in decimal degrees + """ + return value * EARTH_RADIUS / 360.0 + + +def wgs_to_utm_code(lat, lon): + # convert_wgs_to_utm function + # see https://stackoverflow.com/questions/40132542/get-a-cartesian-projection-accurate-around-a-lat-lng-pair + utm_band = str(int((np.floor((lon + 180) / 6) % 60) + 1)) + if len(utm_band) == 1: + utm_band = '0' + utm_band + if lat >= 0: + epsg_code = '326' + utm_band + else: + epsg_code = '327' + utm_band + return int(epsg_code) + + +def get_great_circle_distance(p1, p2): + """ + Great circle distance: using haversine formula for small distances + :param p1: the first lon-lat point, as a tuple of floats + :param p2: the second lon-lat point, as a tuple of floats + :return: distance (in meters?) + """ + # Great circle distance between two geopoints + lat1, lon1 = np.radians(p1[1]), np.radians(p1[0]) # get lat, long in radians + lat2, lon2 = np.radians(p2[1]), np.radians(p2[0]) + d_lat = lat2 - lat1 + d_lon = lon2 - lon1 + + # Haversine formulae + ro = 2 * np.arcsin(np.sqrt(np.power(np.sin(d_lat / 2), 2) + + np.cos(lat1) * np.cos(lat2) * np.power(np.sin(d_lon / 2), 2))) + gc_dist = ro * EARTH_RADIUS # earth radius from semimajor axis of WGS 84 spheroid + + return gc_dist + + +def get_euclidean_distance(p1, p2): + """ + It's Euclidean distance. Use as a distance metric for UTM coords (not WGS). + :param p1: the first x-y point, as a tuple of floats + :param p2: the second x-y point, as a tuple of floats + :return: a single floating-point distance value + """ + return np.sqrt(np.power((p2[0] - p1[0]), 2) + np.power((p2[1] - p1[1]), 2)) + + +def get_address_from_coord(lat, lon, proxy): + """ + Reverse geocoding of some point to get the street address. 
+ :param lat: latitude of coord + :param lon: longitude of coord + :param proxy: proxy for the request, as a string + :return: string address of given coordinate + """ + address = "" + try: + # build url query to gmap reverse geocoding + urlquery = u'http://maps.googleapis.com/maps/api/geocode/xml?latlng=%f,%f&sensor=false' % (lat, lon) + gmap_response = requests.get(urlquery, proxies=proxy) + gmap_xml = ET.fromstring(gmap_response.text.encode('utf-8')) + + # get address (first result) + first_result = gmap_xml.find(u'result/formatted_address') + if not (first_result is None): + address = first_result.text + except Exception as e: + print(str(e)) + pass + finally: + return address + + +def get_vector_magnitude_column(x, y, z, epoch_length=60): + """ + Per Ruben on Slack: + + I have been working on my scripts lately, and I came across a crucial mistake in the calculation of Activity + Intensity when using the vector magnitude. It is not in the scripts of INTERACT, because there is no such + calculation at the moment. The problem comes down to the epoch length at which the vector magnitude is + calculated. For example, I am using a cut-point at 15 seconds for older adults, based on the vm. It is a mistake + to calculate the vm at the 1 second level, and then sum it up to 15 seconds. The three axis should be summed up + to the 15 seconds first and then the vm should be calculated. It is a mistake that is easy to make, but it + introduces a large bias. I suppose that in we will be using vm-cutpoints in the future, so I thought it might be + good to alert this now. + + So calculate with epoch length set accordingly (in seconds) when implementing this function. + :param x: a 1D numpy float array containing the x axis column of a dataset. Must be the same length as y and z. + :param y: a 1D numpy float array containing the y axis column of a dataset. Must be the same length as x and z. + :param z: a 1D numpy float array containing the z axis column of a dataset. Must be the same length as x and y. + :param epoch_length: the length of the epoch in seconds (see above note) + :return: a 1D numpy array containing the vector magnitude column + """ + # TODO: implement utility function + return + + +def get_projection(lon, lat, city="london", code=None): + """ + Given WGS readings, project them into UTM for a given city/code. + :param lon: a 1D numpy float array containing the longitude column of a dataset. Must be the same length as lat. + :param lat: a 1D numpy float array containing the latitude column of a dataset. Must be the same length as lon. + :param city: a string containing the name of the city the WGS readings came from. + Used to determine the correct UTM projection. + :return: the 1D numpy arrays corresponding to the projected UTM easting and northing coordinates + """ + if code is None: + # TODO: Doesn't work if a city is on the boundary of two UTM zones (per Benoit). Figure out solution for future. 
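+        # (wgs_to_utm_code() above derives a UTM EPSG code from a single lat/lon pair,
+        # which could serve as a per-point fallback when no city-wide code fits)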
+ code = PROJ_CODES[city.strip().lower()] + + transformer = Transformer.from_crs(CRS("EPSG:4326"), CRS(code)) + + def transf(x, y): + return transformer.transform(x, y) + + utm_e, utm_n = np.vectorize(transf)(lat, lon) + return utm_e, utm_n \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/agent/flaskapp/__init__.py b/Agents/FenlandTrajectoryAgent/agent/flaskapp/__init__.py index c33b6848585..0aec78dbb2f 100644 --- a/Agents/FenlandTrajectoryAgent/agent/flaskapp/__init__.py +++ b/Agents/FenlandTrajectoryAgent/agent/flaskapp/__init__.py @@ -2,6 +2,7 @@ # Import blueprints from agent.flaskapp.gpstasks.routes import gps_instantiation_bp from agent.flaskapp.home.routes import gps_bp +from agent.flaskapp.exposure.route import exposure_bp # Import utilities for RDF store and database initialization from agent.kgutils.initialise_kg import create_blazegraph_namespace from agent.kgutils.utils import * @@ -20,6 +21,8 @@ def create_app(test_config=None): # Register blueprints for application components app.register_blueprint(gps_instantiation_bp, url_prefix='') app.register_blueprint(gps_bp, url_prefix='') + app.register_blueprint(exposure_bp, url_prefix='') + # Perform initial setup actions within the app context with app.app_context(): diff --git a/Agents/FenlandTrajectoryAgent/agent/flaskapp/exposure/__init__.py b/Agents/FenlandTrajectoryAgent/agent/flaskapp/exposure/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/Agents/FenlandTrajectoryAgent/agent/flaskapp/exposure/config.properties b/Agents/FenlandTrajectoryAgent/agent/flaskapp/exposure/config.properties new file mode 100644 index 00000000000..bb8a1fc6b42 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/flaskapp/exposure/config.properties @@ -0,0 +1,4 @@ +[DEFAULT] +ENV_DATA_ENDPOINT_URL=http://host.docker.internal:5007/ontop/ui/sparql +TRAJECTORY_DB_HOST=localhost +TRAJECTORY_DB_PASSWORD= diff --git a/Agents/FenlandTrajectoryAgent/agent/flaskapp/exposure/route.py b/Agents/FenlandTrajectoryAgent/agent/flaskapp/exposure/route.py new file mode 100644 index 00000000000..b9f69b8ce4f --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/agent/flaskapp/exposure/route.py @@ -0,0 +1,126 @@ +from flask import Blueprint, request, jsonify +from agent.exposure_calculator.exposure_utilities import ExposureUtils +import logging +logger = logging.getLogger("exposure") +exposure_bp = Blueprint('exposure_bp', __name__) +exposure_util = ExposureUtils() + +@exposure_bp.route('/exposure', methods=['POST']) +def calculate_exposure(): + data = request.json + trajectoryIRIs = data.get("trajectoryIRIs", []) + exposure_radius = data.get("exposure_radius", 100) + DataIRIs = data.get("DataIRIs", []) + + final_results = [] + for trajectoryIRI in trajectoryIRIs: + try: + trajectory_df = exposure_util.fetch_trajectory_data_from_db(trajectoryIRI) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + reference_time = None + if not trajectory_df.empty: + reference_time = trajectory_df.iloc[0]['trajectory_start_time'] + + for env_data_iri in DataIRIs: + try: + env_df, domain_label = exposure_util.fetch_env_data(env_data_iri, reference_time=reference_time) + _, feature_type = exposure_util.get_domain_and_featuretype(env_data_iri) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + normalized_columns = [col.lower().replace(" ", "_") for col in trajectory_df.columns] + has_trip_index = "trip_index" in normalized_columns + + if has_trip_index: + trip_index_col = [col for col in 
trajectory_df.columns if col.lower().replace(" ", "_") == "trip_index"][0] + trip_df = trajectory_df[trajectory_df[trip_index_col].fillna(0).astype(int) > 0] + grouped = trip_df.groupby(trip_index_col) + logger.info(f"[TRIP FILTER] Found Trip index column: '{trip_index_col}', total trips: {grouped.ngroups}") + + if grouped.ngroups == 0: + logger.info(f"[TRIP FILTER] No trips detected for {trajectoryIRI}, fallback to full trajectory.") + calc_res = exposure_util.exposure_calculation( + trajectory_df=trajectory_df, + env_df=env_df, + env_data_iri=env_data_iri, + domain_label=domain_label, + feature_type=feature_type, + exposure_radius=exposure_radius, + trajectory_iri=trajectoryIRI, + ) + calc_res["trajectoryIRI"] = trajectoryIRI + final_results.append(calc_res) + else: + logger.info(f"[TRIP FILTER] Detected {grouped.ngroups} trips for {trajectoryIRI}, computing exposure per trip.") + combined_result = { + "envFeatureName": domain_label, + "type": feature_type, + "count": 0, + "trajectoryIRI": trajectoryIRI + } + if feature_type.upper() == "AREA": + combined_result["totalArea"] = 0.0 + + for trip_index, sub_df in grouped: + trip_res = exposure_util.exposure_calculation( + trajectory_df=sub_df, + env_df=env_df, + env_data_iri=env_data_iri, + domain_label=domain_label, + feature_type=feature_type, + exposure_radius=exposure_radius, + trajectory_iri=trajectoryIRI, + ) + combined_result["count"] += trip_res.get("count", 0) + if feature_type.upper() == "AREA" and "totalArea" in trip_res: + combined_result["totalArea"] += trip_res.get("totalArea", 0.0) + + final_results.append(combined_result) + + else: + logger.info(f"[TRIP FILTER] No Trip index column found for {trajectoryIRI}, computing exposure on full trajectory.") + calc_res = exposure_util.exposure_calculation( + trajectory_df=trajectory_df, + env_df=env_df, + env_data_iri=env_data_iri, + domain_label=domain_label, + feature_type=feature_type, + exposure_radius=exposure_radius, + trajectory_iri=trajectoryIRI, + ) + calc_res["trajectoryIRI"] = trajectoryIRI + final_results.append(calc_res) + + return jsonify(final_results) + +@exposure_bp.route('/exposure/simplified', methods=['POST']) +def calculate_exposure_simplified(): + """ + workflow is packaged in ExposureUtils.calculate_exposure_simplified_util() + """ + data = request.json + try: + results = exposure_util.calculate_exposure_simplified_util(data) + except Exception as e: + return jsonify({"error": str(e)}), 500 + return jsonify(results) + +@exposure_bp.route('/trip_detection', methods=['POST']) +def trip_detection(): + """ + Endpoint for trip detection on a trajectory. + The endpoint will: + 1. Detect trips in the trajectory + 2. Add necessary columns to the trajectory table + 3. 
Update the table with detection results + """ + data = request.json + try: + result = exposure_util.detect_trips_util(data.get("trajectoryIRI")) + return jsonify(result) + except ValueError as e: + return jsonify({"error": str(e)}), 400 + except Exception as e: + return jsonify({"error": str(e)}), 500 \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/agent/flaskapp/gpstasks/routes.py b/Agents/FenlandTrajectoryAgent/agent/flaskapp/gpstasks/routes.py index 940fd1bf267..22fe15559a3 100644 --- a/Agents/FenlandTrajectoryAgent/agent/flaskapp/gpstasks/routes.py +++ b/Agents/FenlandTrajectoryAgent/agent/flaskapp/gpstasks/routes.py @@ -2,6 +2,7 @@ import os import sys import glob +import shutil from agent.utils.stack_configs import DB_URL, DB_USER, DB_PASSWORD,SPARQL_QUERY_ENDPOINT, SPARQL_UPDATE_ENDPOINT import agent.datainstantiation.gps_client as gdi ##from agent.datainstantiation.jpsSingletons import jpsBaseLibGW @@ -11,94 +12,101 @@ from agent.kgutils.utils import * from agent.layergenerator.geoserver_gen import create_functions, create_geoserver_layer import logging +from agent.datainstantiation.cleaning_tool import clean_gps_data # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +INTERNAL_DATA_PATH = "/app/agent/raw_data/gps_target_folder" # Blueprint configuration gps_instantiation_bp = Blueprint('gps_instantiation_bp', __name__) # Define the route for instantiating GPS data @gps_instantiation_bp.route('/preprocess', methods=['POST']) -def load_and_preprocess(): +def preprocess_files(): + """ + Preprocess CSV files in the internal data folder by cleaning them. + """ try: - file_path = request.json.get('file_path') - if not file_path: - logger.error("File path is missing in the request.") - return jsonify({"error": "File path is missing in the request."}), 400 - - logger.info(f"Received request to load and preprocess files at: {file_path}") - csv_files = glob.glob(os.path.join(file_path, '*.csv')) + folder_path = INTERNAL_DATA_PATH + csv_files = glob.glob(os.path.join(folder_path, '*.csv')) if not csv_files: - logger.error("No CSV files found at the specified path") - return jsonify({"error": "No CSV files found at the specified path"}), 400 + logger.error("No CSV files found in the data folder of the Stack Manager: %s", folder_path) + return jsonify({"error": "No CSV files found in the data folder of the Stack Manager."}), 400 + temp_output_dir = os.path.join(folder_path, "temp_cleaned") results = [] for csv_file in csv_files: - logger.info(f"Loading and preprocessing file: {csv_file}") - gps_object = gdi.process_gps_csv_file(csv_file) - if not gps_object: - logger.warning(f"Failed to preprocess file: {csv_file}") - results.append({"file": csv_file, "status": "failed", "error": "Failed to preprocess file"}) - else: - results.append({"file": csv_file, "status": "success"}) - - if all(res["status"] == "failed" for res in results): - logger.error("Failed to preprocess all files") - return jsonify({"error": "Failed to preprocess all files", "results": results}), 500 - - logger.info("Files loaded and preprocessed successfully") - return jsonify({"message": "Files loaded and preprocessed", "results": results}), 200 - + logger.info("Cleaning file: %s", csv_file) + try: + cleaned_df, _ = clean_gps_data(csv_file, output_dir=temp_output_dir) + results.append({"file": csv_file, "status": "success", "data_cleaning": True}) + except Exception as e: + logger.error("Failed to clean file: %s", csv_file, exc_info=True) + 
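+ # Record the failure for this file and carry on with the remaining CSVs; per-file statuses are collected in results and returned below rather than aborting the whole batch.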
results.append({"file": csv_file, "status": "failed", "error": str(e)}) + return jsonify({"message": "Files preprocessed successfully", "results": results}), 200 except Exception as e: - logger.error(f"Error in load_and_preprocess: {str(e)}") + logger.error("Error in preprocess_files: %s", e, exc_info=True) return jsonify({"error": "Internal Server Error", "details": str(e)}), 500 @gps_instantiation_bp.route('/instantiate', methods=['POST']) -def process_and_instantiate(): +def instantiate_files(): + """ + Process cleaned files using TSCLIENT. + After processing, temporary cleaned files are removed to keep storage usage minimal. + """ try: - file_path = request.json.get('file_path') - if not file_path: - logger.error("File path is missing in the request.") - return jsonify({"error": "File path is missing in the request."}), 400 - - logger.info(f"Received request to process files at: {file_path}") - csv_files = glob.glob(os.path.join(file_path, '*.csv')) + # Use the constant internal data path instead of reading from the request. + folder_path = INTERNAL_DATA_PATH + logger.info("Using internal data folder: %s", folder_path) + csv_files = glob.glob(os.path.join(folder_path, '*.csv')) if not csv_files: - logger.error("No CSV files found at the specified path") - return jsonify({"error": "No CSV files found at the specified path"}), 400 + logger.error("No CSV files found in the data folder of the Stack Manager: %s", folder_path) + return jsonify({"error": "No CSV files found in the data folder of the Stack Manager"}), 400 results = [] + temp_output_dir = os.path.join(folder_path, "temp_cleaned") + if not os.path.exists(temp_output_dir): + os.makedirs(temp_output_dir) for csv_file in csv_files: - logger.info(f"Processing file: {csv_file}") - gps_object = gdi.process_gps_csv_file(csv_file) - if not gps_object: - logger.warning(f"Failed to process file: {csv_file}") - results.append({"file": csv_file, "status": "failed", "error": "Failed to process file"}) + logger.info("Cleaning file: %s", csv_file) + try: + _, temp_file = clean_gps_data(csv_file, output_dir=temp_output_dir) + except Exception as e: + logger.error("Failed to clean file: %s", csv_file, exc_info=True) + results.append({"file": csv_file, "status": "failed", "error": "Data cleaning failed: " + str(e)}) continue try: + logger.info("Processing cleaned file.") + gps_object = gdi.process_gps_csv_file(temp_file) + if not gps_object: + logger.warning("Failed to process cleaned file for: %s", csv_file) + results.append({"file": csv_file, "status": "failed", "error": "Processing failed"}) + continue + kg_client, ts_client, double_class, point_class = gdi.setup_clients() - gdi.instantiate_gps_data(gps_object, kg_client, ts_client, double_class, point_class) + dataIRIs = gdi.instantiate_gps_data(gps_object, kg_client, ts_client, double_class, point_class) + if dataIRIs: + first_dataIRI = dataIRIs[0] + gdi.update_unix_time(ts_client, first_dataIRI) + else: + logger.warning("No dataIRIs returned from instantiate_gps_data.") results.append({"file": csv_file, "status": "success"}) except Exception as e: - logger.error(f"Error instantiating data for file {csv_file}: {e}") + logger.error("Error instantiating data for file %s: %s", csv_file, e, exc_info=True) results.append({"file": csv_file, "status": "failed", "error": str(e)}) - if all(res["status"] == "failed" for res in results): - logger.error("Failed to process and instantiate all files") - return jsonify({"error": "Failed to process and instantiate all files", "results": results}), 500 + if 
os.path.exists(temp_output_dir): + shutil.rmtree(temp_output_dir) + logger.info("Temporary DataFrames after cleaning deleted: %s", temp_output_dir) - logger.info("GPS data processed and instantiated successfully") - return jsonify({"message": "GPS data processed and instantiated", "results": results}), 200 + return jsonify({"message": "Files processed and instantiated successfully.", "results": results}), 200 - except AssertionError as e: - logger.error(f"Assertion error during processing: {str(e)}") - return jsonify({"error": str(e)}), 400 except Exception as e: - logger.error(f"Error in process_and_instantiate: {str(e)}") + logger.error("Error in instantiate_files: %s", e, exc_info=True) return jsonify({"error": "Internal Server Error", "details": str(e)}), 500 @gps_instantiation_bp.route('/layer_generator', methods=['POST']) diff --git a/Agents/FenlandTrajectoryAgent/docker-compose-build.yml b/Agents/FenlandTrajectoryAgent/docker-compose-build.yml index 9a18e7a7426..b2849ea0736 100644 --- a/Agents/FenlandTrajectoryAgent/docker-compose-build.yml +++ b/Agents/FenlandTrajectoryAgent/docker-compose-build.yml @@ -1,4 +1,3 @@ -version: '3.8' services: fenland-trajectory-agent: build: diff --git a/Agents/FenlandTrajectoryAgent/docker-compose-debug.yml b/Agents/FenlandTrajectoryAgent/docker-compose-debug.yml index 50deb784bfb..ddd7980ecc2 100644 --- a/Agents/FenlandTrajectoryAgent/docker-compose-debug.yml +++ b/Agents/FenlandTrajectoryAgent/docker-compose-debug.yml @@ -1,4 +1,3 @@ -version: '3.8' services: fenland-trajectory-agent: image: ghcr.io/cambridge-cares/fenland-trajectory-agent:1.0.0 diff --git a/Agents/FenlandTrajectoryAgent/docker-compose-stack.yml b/Agents/FenlandTrajectoryAgent/docker-compose-stack.yml index 9202f3244a8..88cfeffaa0d 100644 --- a/Agents/FenlandTrajectoryAgent/docker-compose-stack.yml +++ b/Agents/FenlandTrajectoryAgent/docker-compose-stack.yml @@ -1,4 +1,3 @@ -version: '3.8' services: fenland-trajectory-agent: deploy: diff --git a/Agents/FenlandTrajectoryAgent/docker-compose.yml b/Agents/FenlandTrajectoryAgent/docker-compose.yml index 9bbdab63a56..411ed6bd41e 100644 --- a/Agents/FenlandTrajectoryAgent/docker-compose.yml +++ b/Agents/FenlandTrajectoryAgent/docker-compose.yml @@ -1,5 +1,3 @@ -version: "3.8" - services: fenland-trajectory-agent: image: ghcr.io/cambridge-cares/fenland-trajectory-agent:1.0.0 diff --git a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/exposure_count.http b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/exposure_count.http new file mode 100644 index 00000000000..6a69de50d9d --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/exposure_count.http @@ -0,0 +1,15 @@ +POST http://localhost:3838/fenland-trajectory-agent/exposure +Content-Type: application/json + +{ + "trajectoryIRIs": [ + "https://www.theworldavatar.com/kg/ontotimeseries/Timeseries_a57a5978-0d2f-41b3-ab35-35944166f322", + "https://www.theworldavatar.com/kg/ontotimeseries/Timeseries_5674b1de-a387-433d-a650-f455291d3f73" + ], + "exposure_radius": 100, + "DataIRIs": [ + "http://www.theworldavatar.com/ontology/OntoFHRS/FoodHygieneRating", + "https://www.theworldavatar.com/kg/ontogreenspace/Greenspace" + + ] +} diff --git a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/exposure_count_single_stack.http b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/exposure_count_single_stack.http new file mode 100644 index 00000000000..581f650b91e --- /dev/null +++ 
b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/exposure_count_single_stack.http @@ -0,0 +1,13 @@ +POST http://localhost:3838/fenland-trajectory-agent/exposure/simplified +Content-Type: application/json + +{ + "trajectoryIRIs": [ + "https://www.theworldavatar.com/kg/ontotimeseries/Timeseries_a57a5978-0d2f-41b3-ab35-35944166f322" + ], + "exposure_radius": 100, + "DataIRIs": [ + "http://www.theworldavatar.com/ontology/OntoFHRS/FoodHygieneRating", + "https://www.theworldavatar.com/kg/ontogreenspace/Greenspace" + ] +} diff --git a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/gps_instantiate.http b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/gps_instantiate.http index 3c058463ab5..79520e3bf9b 100644 --- a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/gps_instantiate.http +++ b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/gps_instantiate.http @@ -1,7 +1,3 @@ - -POST http://localhost:3840/fenland-trajectory-agent/instantiate +POST http://localhost:3838/fenland-trajectory-agent/instantiate Content-Type: application/json -{ - "file_path": "/app/agent/raw_data/gps_target_folder" -} diff --git a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/gps_preprocess.http b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/gps_preprocess.http index befb98fb1d3..3cc1cc165f7 100644 --- a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/gps_preprocess.http +++ b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/gps_preprocess.http @@ -1,7 +1,2 @@ - -POST http://localhost:3840/fenland-trajectory-agent/preprocess +POST http://localhost:3838/fenland-trajectory-agent/preprocess Content-Type: application/json - -{ - "file_path": "/app/agent/raw_data/gps_target_folder" -} diff --git a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/layer_generator.http b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/layer_generator.http index b0e836199a9..858868116dd 100644 --- a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/layer_generator.http +++ b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/layer_generator.http @@ -1,4 +1,4 @@ -POST http://localhost:3840/fenland-trajectory-agent/layer_generator +POST http://localhost:3838/fenland-trajectory-agent/layer_generator Content-Type: application/json { diff --git a/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/trip_detection.http b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/trip_detection.http new file mode 100644 index 00000000000..b37c544534f --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/example-requests/SendHTTP/trip_detection.http @@ -0,0 +1,6 @@ +POST http://localhost:5007/fenland-trajectory-agent/trip_detection +Content-Type: application/json + +{ + "trajectoryIRI": "https://www.theworldavatar.com/kg/ontotimeseries/Timeseries_15f6441b-21a8-4614-9eb5-66d70aea0aee" +} diff --git a/Agents/FenlandTrajectoryAgent/example-requests/curl/exposure_count.sh b/Agents/FenlandTrajectoryAgent/example-requests/curl/exposure_count.sh new file mode 100755 index 00000000000..86d5270019a --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/example-requests/curl/exposure_count.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copy all the following commands once, paste them into the terminal together, and then press Enter +curl -X POST http://localhost:3838/fenland-trajectory-agent/exposure \ + -H "Content-Type: application/json" \ + -d '{ + "trajectoryIRIs": [ + "https://www.theworldavatar.com/kg/ontotimeseries/Timeseries_a57a5978-0d2f-41b3-ab35-35944166f322", + 
"https://www.theworldavatar.com/kg/ontotimeseries/Timeseries_5674b1de-a387-433d-a650-f455291d3f73" + ], + "exposure_radius": 100, + "DataIRIs": [ + "http://www.theworldavatar.com/ontology/OntoFHRS/FoodHygieneRating", + "https://www.theworldavatar.com/kg/ontogreenspace/Greenspace" + ] + }' diff --git a/Agents/FenlandTrajectoryAgent/example-requests/curl/exposure_count_single_stack.sh b/Agents/FenlandTrajectoryAgent/example-requests/curl/exposure_count_single_stack.sh new file mode 100755 index 00000000000..1fed5be9766 --- /dev/null +++ b/Agents/FenlandTrajectoryAgent/example-requests/curl/exposure_count_single_stack.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copy all the following commands once, paste them into the terminal together, and then press Enter +curl -X POST http://localhost:3838/fenland-trajectory-agent/exposure/simplified \ + -H "Content-Type: application/json" \ + -d '{ + "trajectoryIRIs": [ + "https://www.theworldavatar.com/kg/ontotimeseries/Timeseries_a57a5978-0d2f-41b3-ab35-35944166f322", + "https://www.theworldavatar.com/kg/ontotimeseries/Timeseries_5674b1de-a387-433d-a650-f455291d3f73" + ], + "exposure_radius": 100, + "DataIRIs": [ + "http://www.theworldavatar.com/ontology/OntoFHRS/FoodHygieneRating", + "https://www.theworldavatar.com/kg/ontogreenspace/Greenspace" + ] + }' diff --git a/Agents/FenlandTrajectoryAgent/example-requests/curl/gps_instantiate.sh b/Agents/FenlandTrajectoryAgent/example-requests/curl/gps_instantiate.sh index de87380a3fb..5fdb1c0e3e0 100755 --- a/Agents/FenlandTrajectoryAgent/example-requests/curl/gps_instantiate.sh +++ b/Agents/FenlandTrajectoryAgent/example-requests/curl/gps_instantiate.sh @@ -1,5 +1,4 @@ #!/bin/bash # Copy all the following commands once, paste them into the terminal together, and then press Enter curl -X POST http://localhost:3838/fenland-trajectory-agent/instantiate \ - -H "Content-Type: application/json" \ - -d '{"file_path":"/app/agent/raw_data/gps_target_folder"}' + -H "Content-Type: application/json" diff --git a/Agents/FenlandTrajectoryAgent/example-requests/curl/gps_preprocess.sh b/Agents/FenlandTrajectoryAgent/example-requests/curl/gps_preprocess.sh index 581088a4f52..828650d1bff 100755 --- a/Agents/FenlandTrajectoryAgent/example-requests/curl/gps_preprocess.sh +++ b/Agents/FenlandTrajectoryAgent/example-requests/curl/gps_preprocess.sh @@ -1,5 +1,4 @@ #!/bin/bash # Copy all the following commands once, paste them into the terminal together, and then press Enter curl -X POST http://localhost:3838/fenland-trajectory-agent/preprocess \ - -H "Content-Type: application/json" \ - -d '{"file_path":"/app/agent/raw_data/gps_target_folder"}' + -H "Content-Type: application/json" diff --git a/Agents/FenlandTrajectoryAgent/resources/ontop.obda b/Agents/FenlandTrajectoryAgent/resources/ontop.obda index 7bfbac3b807..f0f754a565f 100644 --- a/Agents/FenlandTrajectoryAgent/resources/ontop.obda +++ b/Agents/FenlandTrajectoryAgent/resources/ontop.obda @@ -1,45 +1,169 @@ [PrefixDeclaration] -ontodevice: http://www.theworldavatar.com/kb/ontodevice/ -rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# -rdfs: http://www.w3.org/2000/01/rdf-schema# -ontodevice: https://www.theworldavatar.com/kg/ontodevice.owl# -geolit: http://www.bigdata.com/rdf/geospatial/literals/v1 -geo: http://www.bigdata.com/rdf/geospatial# -ts: http://www.theworldavatar.com/kb/ontotimeseries/ -xsd: http://www.w3.org/2001/XMLSchema# -om: http://www.ontology-of-units-of-measure.org/resource/om-2/ +fh: http://www.theworldavatar.com/ontology/OntoFHRS/ +gs: 
https://www.theworldavatar.com/kg/ontogreenspace/ +ies4: http://ies.data.gov.uk/ontology/ies4# +xsd: http://www.w3.org/2001/XMLSchema# +rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# +rdfs: http://www.w3.org/2000/01/rdf-schema# +owl: http://www.w3.org/2002/07/owl# +geo: http://www.opengis.net/ont/geosparql# +time: http://www.w3.org/2006/time/ +wgs: http://www.w3.org/2003/01/geo/wgs84_pos# +ex: http://example.org/ns# +exposure: http://www.theworldavatar.com/ontology/OntoEnvExpo/ [MappingDeclaration] @collection [[ -# Mapping for GPS data including units with the updated "ontodevice:" prefix -mappingId GPSDataWithUnits -target ontodevice:GPS_{filename} a ontodevice:GPSDevice ; - rdfs:label "GPS data for {filename} at {timestamp}"^^xsd:string ; - ontodevice:hasLatitude [ - a ontodevice:Latitude ; - rdf:value "{LATITUDE}"^^xsd:float ; - om:hasUnit om:degree - ] ; - ontodevice:hasLongitude [ - a ontodevice:Longitude ; - rdf:value "{LONGITUDE}"^^xsd:float ; - om:hasUnit om:degree - ] ; - ontodevice:hasSpeed [ - a ontodevice:Speed ; - rdf:value "{SPEED}"^^xsd:float ; - om:hasUnit om:kilometrePerHour" - ] ; - ontodevice:hasDistance [ - a ontodevice:Distance ; - rdf:value "{DISTANCE}"^^xsd:string ; # Assuming DISTANCE might be a string that needs conversion - om:hasUnit om:metre - ] ; - ts:hasTimestamp "{timestamp}"^^xsd:dateTime . -source SELECT 'gps_testcase' AS filename, - "LATITUDE", - "LONGITUDE", - "SPEED", - "DISTANCE", - ("UTC DATE" || 'T' || "UTC TIME" || 'Z') AS timestamp - FROM gps_testcase +mappingId FoodHygieneDomain +target a exposure:Domain ; exposure:hasDomainName "Food hygiene rating"^^xsd:string ; exposure:hasFeatureType "POINT"^^xsd:string ; exposure:hasDataSource "supermarket"^^xsd:string, "takeaways"^^xsd:string . +source SELECT DISTINCT 'Food hygiene rating' AS "DomainName" FROM "fhr_cambridge_2025" + +mappingId Time-BEStart-2023 +target fh:BusinessEstablishmentStart/{FHRSID} a fh:BusinessEstablishmentStart ; ies4:isStartOf fh:BusinessEstablishment/{FHRSID} ; ies4:inPeriod ies4:ParticularPeriod/{FHRSID}_start/{year} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2023" + +mappingId Time-BEStart-2025 +target fh:BusinessEstablishmentStart/{FHRSID} a fh:BusinessEstablishmentStart ; ies4:isStartOf fh:BusinessEstablishment/{FHRSID} ; ies4:inPeriod ies4:ParticularPeriod/{FHRSID}_start/{year} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2025" + +mappingId Time-PeriodStart-2023 +target ies4:ParticularPeriod/{FHRSID}_start/{year} a ies4:ParticularPeriod ; ies4:iso8601PeriodRepresentation {Business_Establishment_Start}^^xsd:dateTimeStamp . +source SELECT "FHRSID", "Business_Establishment_Start", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2023" + +mappingId Time-PeriodStart-2025 +target ies4:ParticularPeriod/{FHRSID}_start/{year} a ies4:ParticularPeriod ; ies4:iso8601PeriodRepresentation {Business_Establishment_Start}^^xsd:dateTimeStamp . +source SELECT "FHRSID", "Business_Establishment_Start", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2025" + +mappingId Time-BEEnd-2023 +target fh:BusinessEstablishmentEnd/{FHRSID} a fh:BusinessEstablishmentEnd ; ies4:isEndOf fh:BusinessEstablishment/{FHRSID} ; ies4:inPeriod ies4:ParticularPeriod/{FHRSID}_end/{year} . 
+source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_End") AS year + FROM "fhr_cambridge_2023" + +mappingId Time-BEEnd-2025 +target fh:BusinessEstablishmentEnd/{FHRSID} a fh:BusinessEstablishmentEnd ; ies4:isEndOf fh:BusinessEstablishment/{FHRSID} ; ies4:inPeriod ies4:ParticularPeriod/{FHRSID}_end/{year} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_End") AS year + FROM "fhr_cambridge_2025" + +mappingId Time-PeriodEnd-2023 +target ies4:ParticularPeriod/{FHRSID}_end/{year} a ies4:ParticularPeriod ; ies4:iso8601PeriodRepresentation {Business_Establishment_End}^^xsd:dateTimeStamp . +source SELECT "FHRSID", "Business_Establishment_End", EXTRACT(YEAR FROM "Business_Establishment_End") AS year + FROM "fhr_cambridge_2023" + +mappingId Time-PeriodEnd-2025 +target ies4:ParticularPeriod/{FHRSID}_end/{year} a ies4:ParticularPeriod ; ies4:iso8601PeriodRepresentation {Business_Establishment_End}^^xsd:dateTimeStamp . +source SELECT "FHRSID", "Business_Establishment_End", EXTRACT(YEAR FROM "Business_Establishment_End") AS year + FROM "fhr_cambridge_2025" + +mappingId Attributes-BE-with-Ratings-2023 +target fh:BusinessEstablishment/{FHRSID} a fh:BusinessEstablishment ; fh:hasRating fh:Rating/{FHRSID}/{year} ; rdf:type fh:{BusinessTypeID} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year, CAST("BusinessTypeID" AS varchar) AS "BusinessTypeID" + FROM "fhr_cambridge_2023" + +mappingId Attributes-BE-with-Ratings-2025 +target fh:BusinessEstablishment/{FHRSID} a fh:BusinessEstablishment ; fh:hasRating fh:Rating/{FHRSID}/{year} ; rdf:type fh:{BusinessTypeID} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year, CAST("BusinessTypeID" AS varchar) AS "BusinessTypeID" + FROM "fhr_cambridge_2025" + +mappingId Attributes-BE-with-Geometry-2023 +target fh:BusinessEstablishment/{FHRSID} a geo:Feature ; geo:hasGeometry geo:Geometry/{FHRSID}/{year} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2023" + +mappingId Attributes-BE-with-Geometry-2025 +target fh:BusinessEstablishment/{FHRSID} a geo:Feature ; geo:hasGeometry geo:Geometry/{FHRSID}/{year} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2025" + +mappingId Geometry_Details-2023 +target geo:Geometry/{FHRSID}/{year} a geo:Point ; geo:asWKT "POINT({Geocode/Longitude} {Geocode/Latitude})"^^geo:wktLiteral ; geo:crs ; wgs:long "{Geocode/Longitude}"^^xsd:double ; wgs:lat "{Geocode/Latitude}"^^xsd:double . +source SELECT "FHRSID", "Geocode/Longitude", "Geocode/Latitude", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2023" + +mappingId Geometry_Details-2025 +target geo:Geometry/{FHRSID}/{year} a geo:Point ; geo:asWKT "POINT({Geocode/Longitude} {Geocode/Latitude})"^^geo:wktLiteral ; geo:crs ; wgs:long "{Geocode/Longitude}"^^xsd:double ; wgs:lat "{Geocode/Latitude}"^^xsd:double . +source SELECT "FHRSID", "Geocode/Longitude", "Geocode/Latitude", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2025" + +mappingId RatingStart-2023 +target fh:RatingStart/{FHRSID}/{year} a fh:RatingStart ; ies4:isStartOf fh:Rating/{FHRSID}/{year} ; ies4:inPeriod ies4:ParticularPeriod/{FHRSID}_rating_start/{year} . 
+source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2023" + +mappingId RatingStart-2025 +target fh:RatingStart/{FHRSID}/{year} a fh:RatingStart ; ies4:isStartOf fh:Rating/{FHRSID}/{year} ; ies4:inPeriod ies4:ParticularPeriod/{FHRSID}_rating_start/{year} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2025" + +mappingId RatingPeriodStart-2023 +target ies4:ParticularPeriod/{FHRSID}_rating_start/{year} a ies4:ParticularPeriod ; ies4:iso8601PeriodRepresentation {Business_Establishment_Start}^^xsd:dateTimeStamp . +source SELECT "FHRSID", "Business_Establishment_Start", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2023" + +mappingId RatingPeriodStart-2025 +target ies4:ParticularPeriod/{FHRSID}_rating_start/{year} a ies4:ParticularPeriod ; ies4:iso8601PeriodRepresentation {Business_Establishment_Start}^^xsd:dateTimeStamp . +source SELECT "FHRSID", "Business_Establishment_Start", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2025" + +mappingId RatingEnd-2023 +target fh:RatingEnd/{FHRSID}/{year} a fh:RatingEnd ; ies4:isEndOf fh:Rating/{FHRSID}/{year} ; ies4:inPeriod ies4:ParticularPeriod/{FHRSID}_rating_end/{year} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_End") AS year + FROM "fhr_cambridge_2023" + +mappingId RatingEnd-2025 +target fh:RatingEnd/{FHRSID}/{year} a fh:RatingEnd ; ies4:isEndOf fh:Rating/{FHRSID}/{year} ; ies4:inPeriod ies4:ParticularPeriod/{FHRSID}_rating_end/{year} . +source SELECT "FHRSID", EXTRACT(YEAR FROM "Business_Establishment_End") AS year + FROM "fhr_cambridge_2025" + +mappingId RatingPeriodEnd-2023 +target ies4:ParticularPeriod/{FHRSID}_rating_end/{year} a ies4:ParticularPeriod ; ies4:iso8601PeriodRepresentation {Business_Establishment_End}^^xsd:dateTimeStamp . +source SELECT "FHRSID", "Business_Establishment_End", EXTRACT(YEAR FROM "Business_Establishment_End") AS year + FROM "fhr_cambridge_2023" + +mappingId RatingPeriodEnd-2025 +target ies4:ParticularPeriod/{FHRSID}_rating_end/{year} a ies4:ParticularPeriod ; ies4:iso8601PeriodRepresentation {Business_Establishment_End}^^xsd:dateTimeStamp . +source SELECT "FHRSID", "Business_Establishment_End", EXTRACT(YEAR FROM "Business_Establishment_End") AS year + FROM "fhr_cambridge_2025" + +mappingId Rating_Details-2023 +target fh:Rating/{FHRSID}/{year} a fh:Rating ; fh:hasValue "{RatingValue}"^^xsd:string . +source SELECT "FHRSID", "RatingValue", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2023" + +mappingId Rating_Details-2025 +target fh:Rating/{FHRSID}/{year} a fh:Rating ; fh:hasValue "{RatingValue}"^^xsd:string . +source SELECT "FHRSID", "RatingValue", EXTRACT(YEAR FROM "Business_Establishment_Start") AS year + FROM "fhr_cambridge_2025" + +mappingId LocalAuthorityMapping +target fh:BusinessEstablishment/{FHRSID} fh:hasLocalAuthority fh:LocalAuthority/{LocalAuthorityCode} . +source SELECT DISTINCT "FHRSID", "LocalAuthorityCode" FROM "fhr_cambridge_2025" + +mappingId LocalAuthorityDetails +target fh:LocalAuthority/{LocalAuthorityCode} a fh:LocalAuthority ; fh:hasName "{LocalAuthorityName}"^^xsd:string ; fh:hasWebsite "{LocalAuthorityWebSite}"^^xsd:anyURI ; fh:hasEmailAddress "{LocalAuthorityEmailAddress}"^^xsd:string . 
+source SELECT DISTINCT "LocalAuthorityCode", "LocalAuthorityName", "LocalAuthorityWebSite", "LocalAuthorityEmailAddress" + FROM "fhr_cambridge_2025" + +mappingId GreenspaceCore +target gs:{id} a gs:Greenspace ; gs:hasFunction "{function}"^^xsd:string ; gs:hasFirstAlternativeName "{distName1}"^^xsd:string ; gs:hasSecondAlternativeName "{distName2}"^^xsd:string ; gs:hasThirdAlternativeName "{distName3}"^^xsd:string . +source SELECT "id", "function", "distName1", "distName2", "distName3" + FROM "GB_GreenspaceSite" + +mappingId GreenspaceGeometry +target gs:{id} a geo:Feature ; geo:hasGeometry geo:Geometry{id} . +source SELECT "id" + FROM "GB_GreenspaceSite" + +mappingId GS-Geometry-details +target geo:Geometry{id} geo:asWKT "{wkt}"^^geo:wktLiteral ; ex:hasWKB "{wkb_hex}"^^xsd:string ; geo:crs . +source SELECT "id", "wkb_geometry" AS "wkb_hex", ST_AsText("wkb_geometry") AS "wkt" + FROM "GB_GreenspaceSite" + +mappingId GreenspaceDomain +target a exposure:Domain ; exposure:hasDomainName "Greenspace"^^xsd:string ; exposure:hasFeatureType "AREA"^^xsd:string ; exposure:hasDataSource "GB_GreenspaceSite"^^xsd:string . +source SELECT DISTINCT 'Greenspace' AS "DomainName" + FROM "GB_GreenspaceSite" + ]] diff --git a/Agents/FenlandTrajectoryAgent/secrets/.gitignore b/Agents/FenlandTrajectoryAgent/secrets/.gitignore deleted file mode 100644 index c96a04f008e..00000000000 --- a/Agents/FenlandTrajectoryAgent/secrets/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore \ No newline at end of file diff --git a/Agents/FenlandTrajectoryAgent/setup.py b/Agents/FenlandTrajectoryAgent/setup.py index c735353eeba..cb5003ebdef 100644 --- a/Agents/FenlandTrajectoryAgent/setup.py +++ b/Agents/FenlandTrajectoryAgent/setup.py @@ -7,17 +7,20 @@ author_email='jc2341@cam.ac.uk', packages=find_packages(), # Automatically find all packages install_requires=[ + 'pyproj==3.4.0', 'Flask==2.2.2', + 'flask-cors==3.0.10', + 'SPARQLWrapper==1.8.5', 'pytz==2024.1', 'configobj==5.0.6', 'APScheduler==3.9.1', - 'numpy==1.21.6', + 'numpy>=1.22', 'tabulate==0.8.10', 'uuid==1.30', 'datetime==4.7', 'openpyxl==3.0.10', 'geomet==1.0.0', - 'geopandas==0.12.2', + 'geopandas==1.0.1', 'shapely==2.0.0', 'netCDF4==1.6.2', 'matplotlib==3.6.2', @@ -29,14 +32,17 @@ 'cryptography==39.0.0', 'python-dotenv==0.21.0', 'docopt==0.6.2', - 'pandas==1.3.5', + 'pandas>=1.4.0', 'JayDeBeApi==1.2.3', 'twa', 'requests==2.28.1', 'gunicorn==20.1.0', 'click>=8.0', 'werkzeug==2.2.2', - 'psycopg2-binary==2.9.5' + 'psycopg2-binary==2.9.5', + 'scikit-learn>=1.0.2', + 'scikit-image>=0.19.0', + 'KDEpy>=1.1.0' ], extras_require={ 'dev': [ diff --git a/Agents/FenlandTrajectoryAgent/stack-manager-input-config-service/fenland-trajectory-agent-debug.json b/Agents/FenlandTrajectoryAgent/stack-manager-input-config-service/fenland-trajectory-agent-debug.json index a618ad511fb..f679b6b36f8 100644 --- a/Agents/FenlandTrajectoryAgent/stack-manager-input-config-service/fenland-trajectory-agent-debug.json +++ b/Agents/FenlandTrajectoryAgent/stack-manager-input-config-service/fenland-trajectory-agent-debug.json @@ -12,6 +12,11 @@ "Type": "bind", "Source": "//TheWorldAvatar/Agents/FenlandTrajectoryAgent", "Target": "/app" + }, + { + "Type": "volume", + "Source": "fta-input", + "Target": "/app/agent/raw_data/gps_target_folder" } ], "Env": [ diff --git a/Agents/FenlandTrajectoryAgent/stack-manager-input-config-service/fenland-trajectory-agent.json b/Agents/FenlandTrajectoryAgent/stack-manager-input-config-service/fenland-trajectory-agent.json index 0d2e8ba5322..706704ca81b 100644 --- 
a/Agents/FenlandTrajectoryAgent/stack-manager-input-config-service/fenland-trajectory-agent.json +++ b/Agents/FenlandTrajectoryAgent/stack-manager-input-config-service/fenland-trajectory-agent.json @@ -11,6 +11,13 @@ "GEOSERVER_WORKSPACE=gps_trajectory", "ONTOP_FILE=/app/resources/ontop.obda" ], + "Mounts": [ + { + "Type": "volume", + "Source": "fta-input", + "Target": "/app/agent/raw_data/gps_target_folder" + } + ], "Configs": [ { "ConfigName": "blazegraph"