From a0fbc2306a45e1bf0360419f539b40724e8c77bd Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:57:24 +0000 Subject: [PATCH 01/74] #1132 raw_segments table definition --- .../create-table-congestion_raw_segments.sql | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 here/traffic/sql/create-table-congestion_raw_segments.sql diff --git a/here/traffic/sql/create-table-congestion_raw_segments.sql b/here/traffic/sql/create-table-congestion_raw_segments.sql new file mode 100644 index 000000000..037275267 --- /dev/null +++ b/here/traffic/sql/create-table-congestion_raw_segments.sql @@ -0,0 +1,42 @@ +-- Table: gwolofs.congestion_raw_segments + +-- DROP TABLE IF EXISTS gwolofs.congestion_raw_segments; + +CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_segments +( + time_grp timestamp without time zone NOT NULL, + segment_id integer NOT NULL, + bin_range tsrange NOT NULL, + dt_start timestamp without time zone, + dt_end timestamp without time zone, + tt numeric, + unadjusted_tt numeric, + total_length numeric, + length_w_data numeric, + num_obs integer, + CONSTRAINT dynamic_bins_unique EXCLUDE USING gist ( + segment_id WITH =, + bin_range WITH &&, + time_grp WITH = + ) +) + +TABLESPACE pg_default; + +ALTER TABLE IF EXISTS gwolofs.congestion_raw_segments +OWNER TO gwolofs; + +REVOKE ALL ON TABLE gwolofs.congestion_raw_segments FROM bdit_humans; + +GRANT SELECT ON TABLE gwolofs.congestion_raw_segments TO bdit_humans; + +GRANT ALL ON TABLE gwolofs.congestion_raw_segments TO gwolofs; +-- Index: dynamic_bin_idx + +-- DROP INDEX IF EXISTS gwolofs.dynamic_bin_idx; + +CREATE INDEX IF NOT EXISTS dynamic_bin_idx + ON gwolofs.congestion_raw_segments USING btree + (segment_id ASC NULLS LAST, time_grp ASC NULLS LAST) + WITH (deduplicate_items=True) + TABLESPACE pg_default; \ No newline at end of file From 48ad3a8f4327c285633322c4acb4f2edafca7285 Mon Sep 17 00:00:00 2001 From: gabrielwol 
<80077912+gabrielwol@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:57:36 +0000 Subject: [PATCH 02/74] #1132 raw_segments aggregation --- .../sql/select-congestion_raw_segments.sql | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 here/traffic/sql/select-congestion_raw_segments.sql diff --git a/here/traffic/sql/select-congestion_raw_segments.sql b/here/traffic/sql/select-congestion_raw_segments.sql new file mode 100644 index 000000000..81241b3d3 --- /dev/null +++ b/here/traffic/sql/select-congestion_raw_segments.sql @@ -0,0 +1,152 @@ +--TRUNCATE gwolofs.congestion_raw_segments; + +--INSERT 0 771478 +--Query returned successfully in 2 min 36 secs. +-- vs 7,756,256 rows in (SELECT COUNT(*) FROM here.ta_path WHERE dt = '2025-01-04') = 1/10 + +WITH segment_5min_bins AS ( + SELECT + segments.segment_id, + date_trunc('hour', ta.tx) AS time_grp, + ta.tx, + RANK() OVER w AS bin_rank, + SUM(links.length) / segments.total_length AS sum_length, + SUM(sample_size) AS num_obs, + ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, + ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, + ARRAY_AGG(links.length ORDER BY link_dir) AS lengths + FROM here.ta_path AS ta + JOIN congestion.network_links_23_4_geom AS links USING (link_dir) + JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) + WHERE ta.dt = '2025-01-04' + --AND tx < '2025-01-04 01:00:00' + --AND segment_id = 1 AND date_trunc('hour', ta.tx) = '2025-01-04 00:00:00' + GROUP BY + segments.segment_id, + ta.tx, + segments.total_length + WINDOW w AS ( + PARTITION BY segments.segment_id, date_trunc('hour', ta.tx) + ORDER BY ta.tx + ) +), + +dynamic_bin_options AS ( + --within each segment/hour, generate all possible forward looking bin combinations + --don't generate options for bins with sufficient length + --also don't generate options past the next bin with 80% length + SELECT + tx, + time_grp, + segment_id, + bin_rank AS start_bin, + --generate all the 
options for the end bin within the group. + generate_series( + CASE + WHEN sum_length >= 0.8 THEN bin_rank + --if length is insufficient, need at least 1 more bin + ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) + END, + CASE + --dont need to generate options when start segment is already sufficient + WHEN sum_length >= 0.8 THEN bin_rank + --generate options until 1 bin has sufficient length, otherwise until last bin in group + ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) + END, + 1 + ) AS end_bin + FROM segment_5min_bins + WINDOW w AS ( + PARTITION BY time_grp, segment_id + ORDER BY tx + --look only forward for end_bin options + RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + ) +), + +unnested_db_options AS ( + SELECT + dbo.time_grp, + dbo.segment_id, + dbo.tx AS dt_start, + --exclusive end bin + MAX(s5b.tx) + interval '5 minutes' AS dt_end, + unnested.link_dir, + unnested.len, + AVG(unnested.tt) AS tt, --avg TT for each link_dir + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir + FROM dynamic_bin_options AS dbo + LEFT JOIN segment_5min_bins AS s5b + ON s5b.time_grp = dbo.time_grp + AND s5b.segment_id = dbo.segment_id + AND s5b.bin_rank >= dbo.start_bin + AND s5b.bin_rank <= dbo.end_bin, + --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin + UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) + GROUP BY + dbo.time_grp, + dbo.segment_id, + dbo.tx, + dbo.end_bin, + unnested.link_dir, + unnested.len +) + +INSERT INTO gwolofs.congestion_raw_segments ( + time_grp, segment_id, dt_start, dt_end, bin_range, tt, + unadjusted_tt, total_length, length_w_data, num_obs +) +--this query contains overlapping values which get eliminated +--via on conflict with the exclusion constraint on congestion_raw_segments table. 
+SELECT DISTINCT ON (udbo.time_grp, udbo.segment_id, udbo.dt_start) + udbo.time_grp, + udbo.segment_id, + udbo.dt_start, + udbo.dt_end, + tsrange(udbo.dt_start, udbo.dt_end, '[)') AS bin_range, + segments.total_length / SUM(udbo.len) * SUM(udbo.tt) AS tt, + SUM(udbo.tt) AS unadjusted_tt, + segments.total_length, + SUM(udbo.len) AS length_w_data, + SUM(udbo.num_obs) AS num_obs --sum of here.ta_path sample_size for each segment +FROM unnested_db_options AS udbo +LEFT JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) +GROUP BY + udbo.time_grp, + udbo.segment_id, + udbo.dt_start, + udbo.dt_end, + segments.total_length +HAVING SUM(udbo.len) >= 0.8 * segments.total_length +ORDER BY + udbo.time_grp, + udbo.segment_id, + udbo.dt_start, + udbo.dt_end +--exclusion constraint + ordered insert to prevent overlapping bins +ON CONFLICT ON CONSTRAINT congestion_raw_segments_unique +DO NOTHING; + +/* +--bins which were not used. Might consider adding these on to bins that already have sufficient data. 
+SELECT * +FROM gwolofs.segment_5min_bins AS s5b +LEFT JOIN gwolofs.congestion_raw_segments AS dyb ON + s5b.time_grp = dyb.time_grp + AND s5b.source_node = dyb.source_node + AND s5b.dest_node = dyb.dest_node + AND s5b.tx <@ dyb.bin_range +WHERE dyb.bin_range IS NULL +*/ + +/* +WITH hourly_obs AS ( + SELECT time_grp, segment_id, AVG(tt) AS avg_hour_tt, COUNT(*) + FROM gwolofs.congestion_raw_segments + GROUP BY time_grp, segment_id +) + +SELECT segment_id, date_part('hour', time_grp), AVG(avg_hour_tt) AS avg_tt, SUM(count) +FROM hourly_obs +GROUP BY 1, 2 ORDER BY 1, 2; +*/ \ No newline at end of file From 073a814a891b4bde0e3c338c601732f2c0d8214e Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 9 Jan 2025 20:06:03 +0000 Subject: [PATCH 03/74] #1132 add time periods in addition to hours --- .../create-table-congestion_raw_segments.sql | 2 +- .../sql/select-congestion_raw_segments.sql | 112 +++++++++++++----- 2 files changed, 86 insertions(+), 28 deletions(-) diff --git a/here/traffic/sql/create-table-congestion_raw_segments.sql b/here/traffic/sql/create-table-congestion_raw_segments.sql index 037275267..9b6d39f5d 100644 --- a/here/traffic/sql/create-table-congestion_raw_segments.sql +++ b/here/traffic/sql/create-table-congestion_raw_segments.sql @@ -4,7 +4,7 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_segments ( - time_grp timestamp without time zone NOT NULL, + time_grp tsrange NOT NULL, segment_id integer NOT NULL, bin_range tsrange NOT NULL, dt_start timestamp without time zone, diff --git a/here/traffic/sql/select-congestion_raw_segments.sql b/here/traffic/sql/select-congestion_raw_segments.sql index 81241b3d3..093de5e1c 100644 --- a/here/traffic/sql/select-congestion_raw_segments.sql +++ b/here/traffic/sql/select-congestion_raw_segments.sql @@ -1,21 +1,52 @@ --TRUNCATE gwolofs.congestion_raw_segments; --INSERT 0 771478 ---Query returned successfully in 2 min 36 secs. 
+--Query returned successfully in 2 min 51 secs. -- vs 7,756,256 rows in (SELECT COUNT(*) FROM here.ta_path WHERE dt = '2025-01-04') = 1/10 +--with addition of am/pm/midday time ranges: +--INSERT 0 1251472 (2025-01-04) +--Query returned successfully in 6 min 29 secs. -WITH segment_5min_bins AS ( +WITH time_bins AS ( + SELECT + start_time, + start_time + '1 hour'::interval AS end_time, + tsrange(start_time, start_time + '1 hour'::interval, '[)') AS time_grp + FROM generate_series( + '2025-01-04'::date, + '2025-01-04'::date + interval '23 hours', + '1 hour'::interval + ) AS hours(start_time) + UNION + SELECT + start_time + '2025-01-04'::date, + end_time + '2025-01-04'::date, + tsrange(start_time + '2025-01-04'::date, end_time + '2025-01-04'::date, '[)') + FROM ( + VALUES + ('07:00'::time, '10:00'::time), + ('10:00', '16:00'), + ('16:00', '19:00') + ) AS time_periods(start_time, end_time) + ORDER BY start_time +), + +segment_5min_bins AS ( SELECT segments.segment_id, - date_trunc('hour', ta.tx) AS time_grp, + tb.time_grp, ta.tx, RANK() OVER w AS bin_rank, + segments.total_length, SUM(links.length) / segments.total_length AS sum_length, + SUM(links.length) AS length_w_data, + SUM(links.length / ta.mean * 3.6) AS unadjusted_tt, SUM(sample_size) AS num_obs, ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, ARRAY_AGG(links.length ORDER BY link_dir) AS lengths FROM here.ta_path AS ta + JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time JOIN congestion.network_links_23_4_geom AS links USING (link_dir) JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) WHERE ta.dt = '2025-01-04' @@ -23,10 +54,11 @@ WITH segment_5min_bins AS ( --AND segment_id = 1 AND date_trunc('hour', ta.tx) = '2025-01-04 00:00:00' GROUP BY segments.segment_id, + tb.time_grp, ta.tx, segments.total_length WINDOW w AS ( - PARTITION BY segments.segment_id, date_trunc('hour', ta.tx) + PARTITION BY 
segments.segment_id, tb.time_grp ORDER BY ta.tx ) ), @@ -68,6 +100,7 @@ unnested_db_options AS ( SELECT dbo.time_grp, dbo.segment_id, + s5b.total_length, dbo.tx AS dt_start, --exclusive end bin MAX(s5b.tx) + interval '5 minutes' AS dt_end, @@ -83,9 +116,12 @@ unnested_db_options AS ( AND s5b.bin_rank <= dbo.end_bin, --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) + --we need to use nested data to determine length for these multi-period bins + WHERE dbo.start_bin != dbo.end_bin GROUP BY dbo.time_grp, dbo.segment_id, + s5b.total_length, dbo.tx, dbo.end_bin, unnested.link_dir, @@ -98,33 +134,55 @@ INSERT INTO gwolofs.congestion_raw_segments ( ) --this query contains overlapping values which get eliminated --via on conflict with the exclusion constraint on congestion_raw_segments table. -SELECT DISTINCT ON (udbo.time_grp, udbo.segment_id, udbo.dt_start) - udbo.time_grp, - udbo.segment_id, - udbo.dt_start, - udbo.dt_end, - tsrange(udbo.dt_start, udbo.dt_end, '[)') AS bin_range, - segments.total_length / SUM(udbo.len) * SUM(udbo.tt) AS tt, - SUM(udbo.tt) AS unadjusted_tt, - segments.total_length, - SUM(udbo.len) AS length_w_data, - SUM(udbo.num_obs) AS num_obs --sum of here.ta_path sample_size for each segment +SELECT DISTINCT ON (time_grp, segment_id, dt_start) + time_grp, + segment_id, + dt_start, + dt_end, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(tt) AS unadjusted_tt, + total_length, + SUM(len) AS length_w_data, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment FROM unnested_db_options AS udbo -LEFT JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) GROUP BY - udbo.time_grp, - udbo.segment_id, - udbo.dt_start, - udbo.dt_end, - segments.total_length -HAVING SUM(udbo.len) >= 0.8 * segments.total_length + time_grp, + segment_id, + dt_start, + 
dt_end, + total_length +HAVING SUM(len) >= 0.8 * total_length +UNION +--these 5 minute bins already have sufficient length +--don't need to use nested data to validate. +SELECT + dbo.time_grp, + dbo.segment_id, + dbo.tx AS dt_start, + dbo.tx + interval '5 minutes' AS dt_end, + tsrange(dbo.tx, dbo.tx + interval '5 minutes', '[)') AS bin_range, + s5b.total_length / s5b.length_w_data * s5b.unadjusted_tt AS tt, + s5b.unadjusted_tt, + s5b.total_length, + s5b.length_w_data, + s5b.num_obs --sum of here.ta_path sample_size for each segment +FROM dynamic_bin_options AS dbo +JOIN segment_5min_bins AS s5b + ON s5b.time_grp = dbo.time_grp + AND s5b.segment_id = dbo.segment_id + AND s5b.bin_rank = dbo.start_bin +--we do not need to use nested data to determine length here. +WHERE + dbo.start_bin = dbo.end_bin + AND s5b.sum_length >= 0.8 ORDER BY - udbo.time_grp, - udbo.segment_id, - udbo.dt_start, - udbo.dt_end + time_grp, + segment_id, + dt_start, + dt_end --exclusion constraint + ordered insert to prevent overlapping bins -ON CONFLICT ON CONSTRAINT congestion_raw_segments_unique +ON CONFLICT ON CONSTRAINT dynamic_bins_unique DO NOTHING; /* From 29b8326b24c9aa2860ea46228cf8965dc85f4978 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Fri, 10 Jan 2025 17:09:40 +0000 Subject: [PATCH 04/74] #1132 dynamic bins should not exceed 1hr in length --- here/traffic/sql/select-congestion_raw_segments.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/here/traffic/sql/select-congestion_raw_segments.sql b/here/traffic/sql/select-congestion_raw_segments.sql index 093de5e1c..7ec8f2adf 100644 --- a/here/traffic/sql/select-congestion_raw_segments.sql +++ b/here/traffic/sql/select-congestion_raw_segments.sql @@ -126,6 +126,8 @@ unnested_db_options AS ( dbo.end_bin, unnested.link_dir, unnested.len + --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) + HAVING MAX(s5b.tx) + interval '5 minutes' <= dbo.tx + interval '1 hour' ) 
INSERT INTO gwolofs.congestion_raw_segments ( From 6d1847e5127ece4277f5b7391ad9a81611303f45 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Fri, 10 Jan 2025 18:47:02 +0000 Subject: [PATCH 05/74] #1132 readme describing dynamic binning query --- .../sql/select-congestion_raw_segments.md | 282 ++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 here/traffic/sql/select-congestion_raw_segments.md diff --git a/here/traffic/sql/select-congestion_raw_segments.md b/here/traffic/sql/select-congestion_raw_segments.md new file mode 100644 index 000000000..35f5a3e02 --- /dev/null +++ b/here/traffic/sql/select-congestion_raw_segments.md @@ -0,0 +1,282 @@ +This is a readme to describe the complex query [here](./select-congestion_raw_segments.sql). +Samples from each of the CTEs are shown for one segment/time_grp. Not all columns are shown from each CTE result. + +### time_bins +Contains hourly and period definitions, known as `time_grp`s. These define the extents within which to evaluate dynamic bin options. A dynamic bin must be fully within the time_grp. 
+ +```sql +WITH time_bins AS ( + SELECT + start_time, + start_time + '1 hour'::interval AS end_time, + tsrange(start_time, start_time + '1 hour'::interval, '[)') AS time_grp + FROM generate_series( + '2025-01-04'::date, + '2025-01-04'::date + interval '23 hours', + '1 hour'::interval + ) AS hours(start_time) + UNION + SELECT + start_time + '2025-01-04'::date, + end_time + '2025-01-04'::date, + tsrange(start_time + '2025-01-04'::date, end_time + '2025-01-04'::date, '[)') + FROM ( + VALUES + ('07:00'::time, '10:00'::time), + ('10:00', '16:00'), + ('16:00', '19:00') + ) AS time_periods(start_time, end_time) + ORDER BY start_time +), +``` + +| "start_time" | "end_time" | "time_grp" | +|-----------------------|-----------------------|-----------------------------------------------------| +| "2025-01-04 00:00:00" | "2025-01-04 01:00:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | +| "2025-01-04 01:00:00" | "2025-01-04 02:00:00" | "[""2025-01-04 01:00:00"",""2025-01-04 02:00:00"")" | +| "2025-01-04 02:00:00" | "2025-01-04 03:00:00" | "[""2025-01-04 02:00:00"",""2025-01-04 03:00:00"")" | +| "2025-01-04 03:00:00" | "2025-01-04 04:00:00" | "[""2025-01-04 03:00:00"",""2025-01-04 04:00:00"")" | +| "2025-01-04 04:00:00" | "2025-01-04 05:00:00" | "[""2025-01-04 04:00:00"",""2025-01-04 05:00:00"")" | +| "2025-01-04 05:00:00" | "2025-01-04 06:00:00" | "[""2025-01-04 05:00:00"",""2025-01-04 06:00:00"")" | +| "2025-01-04 06:00:00" | "2025-01-04 07:00:00" | "[""2025-01-04 06:00:00"",""2025-01-04 07:00:00"")" | +| "2025-01-04 07:00:00" | "2025-01-04 08:00:00" | "[""2025-01-04 07:00:00"",""2025-01-04 08:00:00"")" | +| "2025-01-04 07:00:00" | "2025-01-04 10:00:00" | "[""2025-01-04 07:00:00"",""2025-01-04 10:00:00"")" | +| "2025-01-04 08:00:00" | "2025-01-04 09:00:00" | "[""2025-01-04 08:00:00"",""2025-01-04 09:00:00"")" | + +### segment_5min_bins +In this step we pull the relevant data from `here.ta_path` for each segment / time_grp. 
We save the disaggregate travel time data by link in 3 arrays (link_dirs, tts, lengths), so that in future steps we can reaggregate average segment travel time and distinct length over different ranges without referring back to the here.ta_path table. The time bins (`tx`) are also ranked to make it easier to enumerate possible bin extents using generate_series in the next step. + +```sql +segment_5min_bins AS ( + SELECT + segments.segment_id, + tb.time_grp, + ta.tx, + RANK() OVER w AS bin_rank, + segments.total_length, + SUM(links.length) / segments.total_length AS sum_length, + SUM(links.length) AS length_w_data, + SUM(links.length / ta.mean * 3.6) AS unadjusted_tt, + SUM(sample_size) AS num_obs, + ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, + ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, + ARRAY_AGG(links.length ORDER BY link_dir) AS lengths + FROM here.ta_path AS ta + JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time + JOIN congestion.network_links_23_4_geom AS links USING (link_dir) + JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) + WHERE ta.dt = '2025-01-04' + --AND tx < '2025-01-04 01:00:00' + AND segment_id = 29 AND date_trunc('hour', ta.tx) = '2025-01-04 00:00:00' + GROUP BY + segments.segment_id, + tb.time_grp, + ta.tx, + segments.total_length + WINDOW w AS ( + PARTITION BY segments.segment_id, tb.time_grp + ORDER BY ta.tx + ) +), +``` + +`SELECT bin_rank, tx, round(sum_length, 2) AS sum_length, link_dirs, tts FROM segment_5min_bins;` + +| "bin_rank" | "tx" | "sum_length" | "link_dirs" | "tts" | +|------------|-----------------------|--------------|----------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| +| 1 | "2025-01-04 00:00:00" | 1.01 | "{1000589822T,1000589823T,1280577167T,792343539T, 792343541T, 
836248875T,836248876T,845737718T,845737719T}" | {4.59624489795918360,1.274693877551020408164, 4.96575000000000000,6.68329411764705876, 1.101306122448979591836, 1.526693877551020408164,1.196816326530612244884,4.79172413793103452,9.12626086956521724} | +| 2 | "2025-01-04 00:05:00" | 0.12 | "{845737718T}" | {19.85142857142857148} | +| 3 | "2025-01-04 00:15:00" | 0.07 | "{1280577167T}" | {2.787789473684210526300} | +| 4 | "2025-01-04 00:50:00" | 0.39 | "{845737718T,845737719T}" | {34.74000000000000000,28.62327272727272724} | +| 5 | "2025-01-04 00:55:00" | 1.01 | "{1000589822T,1000589823T,1280577167T,792343539T,792343541T, 836248875T,836248876T,845737718T,845737719T}" | {5.17737931034482764,1.435862068965517241376,1.826482758620689655172,3.91779310344827580,1.240551724137931034496,1.719724137931034482752, 1.348137931034482758612,2.459469026548672566372,6.42563265306122460} | + +### dynamic_bin_options +Here we enumerate all the possible dynamic bin options for each starting point. The number of combinations is cut down significantly with the `CASE` statements inside the `generate_series`: +- Don't enumerate options for 5min bins with sufficient length. +- Only look forward until the next 5min bin with sufficient length. + +```sql +dynamic_bin_options AS ( + --within each segment/hour, generate all possible forward looking bin combinations + --don't generate options for bins with sufficient length + --also don't generate options past the next bin with 80% length + SELECT + tx, + time_grp, + segment_id, + bin_rank AS start_bin, + --generate all the options for the end bin within the group. 
+ generate_series( + CASE + WHEN sum_length >= 0.8 THEN bin_rank + --if length is insufficient, need at least 1 more bin + ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) + END, + CASE + --dont need to generate options when start segment is already sufficient + WHEN sum_length >= 0.8 THEN bin_rank + --generate options until 1 bin has sufficient length, otherwise until last bin in group + ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) + END, + 1 + ) AS end_bin + FROM segment_5min_bins + WINDOW w AS ( + PARTITION BY time_grp, segment_id + ORDER BY tx + --look only forward for end_bin options + RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + ) +), +``` + +In this case we find 8 dynamic bin options with the pruning conditions, down from max of 5+4+3+2+1 = 15. + +| "tx" | "time_grp" | "segment_id" | "start_bin" | "end_bin" | +|-----------------------|-----------------------------------------------------|--------------|-------------|-----------| +| "2025-01-04 00:00:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 1 | 1 | +| "2025-01-04 00:05:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 2 | 3 | +| "2025-01-04 00:05:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 2 | 4 | +| "2025-01-04 00:05:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 2 | 5 | +| "2025-01-04 00:15:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 3 | 4 | +| "2025-01-04 00:15:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 3 | 5 | +| "2025-01-04 00:50:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 4 | 5 | +| "2025-01-04 00:55:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 5 | 5 | + +### unnested_db_options +Combining the previous two steps, we have enumerated all the possible bin start/end ranges (`dynamic_bin_options`), now we can unnest the disaggregate data (`segment_5min_bins`) and evaluate them. 
+Note the multiple arrays unnested at once into rows, see `unnest ( anyarray, anyarray [, ... ] ) ` [here](https://www.postgresql.org/docs/current/functions-array.html#id-1.5.8.25.6.2.2.19.1.1.1). +We then group the results by bin / link_dir so we only have the unique length within each bin. + +```sql +unnested_db_options AS ( + SELECT + dbo.time_grp, + dbo.segment_id, + s5b.total_length, + dbo.tx AS dt_start, + --exclusive end bin + MAX(s5b.tx) + interval '5 minutes' AS dt_end, + unnested.link_dir, + unnested.len, + AVG(unnested.tt) AS tt, --avg TT for each link_dir + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir + FROM dynamic_bin_options AS dbo + LEFT JOIN segment_5min_bins AS s5b + ON s5b.time_grp = dbo.time_grp + AND s5b.segment_id = dbo.segment_id + AND s5b.bin_rank >= dbo.start_bin + AND s5b.bin_rank <= dbo.end_bin, + --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin + UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) + --we need to use nested data to determine length for these multi-period bins + WHERE dbo.start_bin != dbo.end_bin + GROUP BY + dbo.time_grp, + dbo.segment_id, + s5b.total_length, + dbo.tx, + dbo.end_bin, + unnested.link_dir, + unnested.len +) +``` + +`SELECT dt_start, dt_end, link_dir, len, tt, num_obs FROM unnested_db_options WHERE dt_start = '2025-01-04 00:05:00' AND dt_end = '2025-01-04 01:00:00'` + +| "dt_start" | "dt_end" | "link_dir" | "len" | "tt" | "num_obs" | +|-----------------------|-----------------------|---------------|-------|--------------------------|-----------| +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "1000589822T" | 62.56 | 5.17737931034482764 | 18 | +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "1000589823T" | 17.35 | 1.435862068965517241376 | 18 | +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "1280577167T" | 22.07 | 2.307136116152450090736 | 20 | +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | 
"792343539T" | 47.34 | 3.91779310344827580 | 18 | +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "792343541T" | 14.99 | 1.240551724137931034496 | 18 | +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "836248875T" | 20.78 | 1.719724137931034482752 | 18 | +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "836248876T" | 16.29 | 1.348137931034482758612 | 18 | +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "845737718T" | 38.60 | 19.016965865992414682124 | 21 | +| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "845737719T" | 87.46 | 17.52445269016697592 | 20 | + +### Insert statement +Here we find bins with sufficient length, for the two cases: +- Multiple 5min bins assembled: need to check sufficient length from last step. +- An original 5min bin, no group by needed to check length. + +```sql +INSERT INTO gwolofs.congestion_raw_segments ( + time_grp, segment_id, dt_start, dt_end, bin_range, tt, + unadjusted_tt, total_length, length_w_data, num_obs +) +--this query contains overlapping values which get eliminated +--via on conflict with the exclusion constraint on congestion_raw_segments table. +SELECT DISTINCT ON (time_grp, segment_id, dt_start) + time_grp, + segment_id, + dt_start, + dt_end, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(tt) AS unadjusted_tt, + total_length, + SUM(len) AS length_w_data, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment +FROM unnested_db_options AS udbo +GROUP BY + time_grp, + segment_id, + dt_start, + dt_end, + total_length +HAVING SUM(len) >= 0.8 * total_length +UNION +--these 5 minute bins already have sufficient length +--don't need to use nested data to validate. 
+SELECT + time_grp, + segment_id, + tx AS dt_start, + tx + interval '5 minutes' AS dt_end, + tsrange(tx, tx + interval '5 minutes', '[)') AS bin_range, + total_length / length_w_data * unadjusted_tt AS tt, + unadjusted_tt, + total_length, + length_w_data, + num_obs --sum of here.ta_path sample_size for each segment +FROM segment_5min_bins +--we do not need to use nested data to determine length here. +WHERE sum_length >= 0.8 +ORDER BY + time_grp, + segment_id, + dt_start, + dt_end +--exclusion constraint + ordered insert to prevent overlapping bins +ON CONFLICT ON CONSTRAINT dynamic_bins_unique +DO NOTHING; +``` + +`SELECT segment_id, bin_range, round(tt, 2) AS tt, total_length, length_w_data FROM inserted;` + +| "segment_id" | "bin_range" | "tt" | "total_length" | "length_w_data" | +|--------------|-----------------------------------------------------|-------|----------------|-----------------| +| 29 | "[""2025-01-04 00:00:00"",""2025-01-04 00:05:00"")" | 34.93 | 324.33 | 327.44 | +| 29 | "[""2025-01-04 00:05:00"",""2025-01-04 01:00:00"")" | 53.18 | 324.33 | 327.44 | +| 29 | "[""2025-01-04 00:15:00"",""2025-01-04 01:00:00"")" | 52.76 | 324.33 | 327.44 | +| 29 | "[""2025-01-04 00:50:00"",""2025-01-04 01:00:00"")" | 52.29 | 324.33 | 327.44 | +| 29 | "[""2025-01-04 00:55:00"",""2025-01-04 01:00:00"")" | 25.31 | 324.33 | 327.44 | + +After insert against exclusion constraint, only 2 remain, since records 3,4,5 overlap with record 2 above. 
+`SELECT segment_id, bin_range, round(tt, 2) AS tt, total_length, length_w_data FROM gwolofs.congestion_raw_segments WHERE segment_id = 29 AND time_grp = '["2025-01-04 00:00:00","2025-01-04 01:00:00")'::tsrange` + +Constraint: +```sql + CONSTRAINT dynamic_bins_unique EXCLUDE USING gist ( + segment_id WITH =, + bin_range WITH &&, + time_grp WITH = + ) +``` + +| "segment_id" | "bin_range" | "tt" | "total_length" | "length_w_data" | +|--------------|-----------------------------------------------------|-------|----------------|-----------------| +| 29 | "[""2025-01-04 00:00:00"",""2025-01-04 00:05:00"")" | 34.93 | 324.33 | 327.44 | +| 29 | "[""2025-01-04 00:05:00"",""2025-01-04 01:00:00"")" | 53.18 | 324.33 | 327.44 | \ No newline at end of file From 696de6eee38a02a365a58b05bb1f2ad97c862058 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Fri, 10 Jan 2025 18:47:23 +0000 Subject: [PATCH 06/74] #1132 remove unnecessary join from insert subquery --- .../sql/select-congestion_raw_segments.sql | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/here/traffic/sql/select-congestion_raw_segments.sql b/here/traffic/sql/select-congestion_raw_segments.sql index 7ec8f2adf..ddf922fd1 100644 --- a/here/traffic/sql/select-congestion_raw_segments.sql +++ b/here/traffic/sql/select-congestion_raw_segments.sql @@ -147,7 +147,7 @@ SELECT DISTINCT ON (time_grp, segment_id, dt_start) total_length, SUM(len) AS length_w_data, SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment -FROM unnested_db_options AS udbo +FROM unnested_db_options GROUP BY time_grp, segment_id, @@ -159,25 +159,19 @@ UNION --these 5 minute bins already have sufficient length --don't need to use nested data to validate. 
SELECT - dbo.time_grp, - dbo.segment_id, - dbo.tx AS dt_start, - dbo.tx + interval '5 minutes' AS dt_end, - tsrange(dbo.tx, dbo.tx + interval '5 minutes', '[)') AS bin_range, - s5b.total_length / s5b.length_w_data * s5b.unadjusted_tt AS tt, - s5b.unadjusted_tt, - s5b.total_length, - s5b.length_w_data, - s5b.num_obs --sum of here.ta_path sample_size for each segment -FROM dynamic_bin_options AS dbo -JOIN segment_5min_bins AS s5b - ON s5b.time_grp = dbo.time_grp - AND s5b.segment_id = dbo.segment_id - AND s5b.bin_rank = dbo.start_bin + time_grp, + segment_id, + tx AS dt_start, + tx + interval '5 minutes' AS dt_end, + tsrange(tx, tx + interval '5 minutes', '[)') AS bin_range, + total_length / length_w_data * unadjusted_tt AS tt, + unadjusted_tt, + total_length, + length_w_data, + num_obs --sum of here.ta_path sample_size for each segment +FROM segment_5min_bins --we do not need to use nested data to determine length here. -WHERE - dbo.start_bin = dbo.end_bin - AND s5b.sum_length >= 0.8 +WHERE sum_length >= 0.8 ORDER BY time_grp, segment_id, From 8edb8c8b604edee000291ee1befeeaf8dd535c6a Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Fri, 17 Jan 2025 18:02:29 +0000 Subject: [PATCH 07/74] #1132 here dynamic binning: function --- .../create-function-here_dynamic_bin_avg.sql | 81 ++++++++ .../create-table-dynamic_binning_results.sql | 44 ++++ here/traffic/sql/create-table-tt_segments.sql | 23 +++ .../sql/procedure-cache_tt_results.sql | 195 ++++++++++++++++++ .../sql/procedure-cache_tt_segment.sql | 25 +++ 5 files changed, 368 insertions(+) create mode 100644 here/traffic/sql/create-function-here_dynamic_bin_avg.sql create mode 100644 here/traffic/sql/create-table-dynamic_binning_results.sql create mode 100644 here/traffic/sql/create-table-tt_segments.sql create mode 100644 here/traffic/sql/procedure-cache_tt_results.sql create mode 100644 here/traffic/sql/procedure-cache_tt_segment.sql diff --git 
a/here/traffic/sql/create-function-here_dynamic_bin_avg.sql b/here/traffic/sql/create-function-here_dynamic_bin_avg.sql new file mode 100644 index 000000000..4c344020c --- /dev/null +++ b/here/traffic/sql/create-function-here_dynamic_bin_avg.sql @@ -0,0 +1,81 @@ +-- FUNCTION: gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], text[], boolean) + +-- DROP FUNCTION IF EXISTS gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], text[], boolean); + +CREATE OR REPLACE FUNCTION gwolofs.here_dynamic_bin_avg( + start_date date, + end_date date, + start_tod time without time zone, + end_tod time without time zone, + dow_list integer[], + link_dirs text[], + holidays boolean) + RETURNS numeric + LANGUAGE 'sql' + COST 100 + VOLATILE PARALLEL UNSAFE +AS $BODY$ + +CALL gwolofs.cache_tt_segment(here_dynamic_bin_avg.link_dirs); + +CALL gwolofs.cache_tt_results( + start_date := here_dynamic_bin_avg.start_date, + end_date := here_dynamic_bin_avg.end_date, + start_tod := here_dynamic_bin_avg.start_tod, + end_tod := here_dynamic_bin_avg.end_tod, + dow_list := here_dynamic_bin_avg.dow_list, + link_dirs := here_dynamic_bin_avg.link_dirs, + holidays := here_dynamic_bin_avg.holidays +); + +WITH time_grps AS ( + SELECT tsrange( + (days.dt + here_dynamic_bin_avg.start_tod)::timestamp, + (days.dt + here_dynamic_bin_avg.end_tod)::timestamp, '[)') AS time_grp + FROM generate_series( + here_dynamic_bin_avg.start_date::date, + here_dynamic_bin_avg.end_date::date - '1 day'::interval, '1 day'::interval) AS days(dt) + WHERE date_part('isodow', dt) = ANY(here_dynamic_bin_avg.dow_list) +) + +SELECT AVG(tt) +FROM gwolofs.dynamic_binning_results AS res +JOIN time_grps USING (time_grp) +JOIN gwolofs.tt_segments AS segs ON res.segment_uid = segs.uid +WHERE segs.link_dirs = here_dynamic_bin_avg.link_dirs + +$BODY$; + +ALTER FUNCTION gwolofs.here_dynamic_bin_avg( + date, date, time without time zone, time 
without time zone, integer[], text[], boolean +) +OWNER TO gwolofs; + + +/*example of use: + +SELECT + start_date, + end_date, + start_tod, + end_tod, + dow_list, + link_dirs, + gwolofs.here_dynamic_bin_avg( + start_date := l.start_date, + end_date := l.end_date, + start_tod := l.start_tod, + end_tod := l.end_tod, + dow_list := l.dow_list, + link_dirs := l.link_dirs, + holidays := TRUE + ) +FROM +(VALUES +('2025-01-02'::date, '2025-01-10'::date, '07:00'::time, '10:00'::time, '{1,2,3,4,5}'::int[], '{1258924853F,1258924867F,1258924868F,1258924894F}'::text[]), +('2025-01-02'::date, '2025-01-10'::date, '11:00'::time, '15:00'::time, '{1,2,3,4,5}'::int[], '{1258924852F,1258924867F,1258924868F,1258924894F}'::text[]), +('2025-01-02'::date, '2025-01-10'::date, '07:00'::time, '10:00'::time, '{1,3,5}'::int[], '{1258924852F,1258924853F,1258924868F,1258924894F}'::text[]), +('2024-01-02'::date, '2025-01-10'::date, '07:00'::time, '10:00'::time, '{1,2,3,4,5}'::int[], '{1258924852F,1258924853F,1258924867F,1258924894F}'::text[]) +) AS l(start_date, end_date, start_tod, end_tod, dow_list, link_dirs); + +*/ \ No newline at end of file diff --git a/here/traffic/sql/create-table-dynamic_binning_results.sql b/here/traffic/sql/create-table-dynamic_binning_results.sql new file mode 100644 index 000000000..c83618ed3 --- /dev/null +++ b/here/traffic/sql/create-table-dynamic_binning_results.sql @@ -0,0 +1,44 @@ +-- Table: gwolofs.dynamic_binning_results + +-- DROP TABLE IF EXISTS gwolofs.dynamic_binning_results; + +CREATE TABLE IF NOT EXISTS gwolofs.dynamic_binning_results ( + time_grp tsrange NOT NULL, + bin_range tsrange NOT NULL, + dt_start timestamp without time zone, + dt_end timestamp without time zone, + tt numeric, + unadjusted_tt numeric, + total_length numeric, + length_w_data numeric, + num_obs integer, + segment_uid smallint, + CONSTRAINT dynamic_bins_unique_temp EXCLUDE USING gist ( + bin_range WITH &&, + time_grp WITH =, + segment_uid WITH = + ) +) + +TABLESPACE pg_default; + 
+ALTER TABLE IF EXISTS gwolofs.dynamic_binning_results +OWNER TO gwolofs; + +REVOKE ALL ON TABLE gwolofs.dynamic_binning_results FROM bdit_humans; + +GRANT SELECT ON TABLE gwolofs.dynamic_binning_results TO bdit_humans; + +GRANT ALL ON TABLE gwolofs.dynamic_binning_results TO gwolofs; +-- Index: dynamic_binning_results_time_grp_segment_uid_idx + +-- DROP INDEX IF EXISTS gwolofs.dynamic_binning_results_time_grp_segment_uid_idx; + +CREATE INDEX IF NOT EXISTS dynamic_binning_results_time_grp_segment_uid_idx +ON gwolofs.dynamic_binning_results USING btree +( + time_grp ASC NULLS LAST, + segment_uid ASC NULLS LAST +) +WITH (deduplicate_items=True) +TABLESPACE pg_default; \ No newline at end of file diff --git a/here/traffic/sql/create-table-tt_segments.sql b/here/traffic/sql/create-table-tt_segments.sql new file mode 100644 index 000000000..677c8641e --- /dev/null +++ b/here/traffic/sql/create-table-tt_segments.sql @@ -0,0 +1,23 @@ +-- Table: gwolofs.tt_segments + +-- DROP TABLE IF EXISTS gwolofs.tt_segments; + +CREATE TABLE IF NOT EXISTS gwolofs.tt_segments +( + link_dirs text[] COLLATE pg_catalog."default", + lengths numeric[], + geom geometry, + total_length numeric, + uid smallint NOT NULL DEFAULT nextval('tt_segments_uid_seq'::regclass), + CONSTRAINT unique_link_dirs UNIQUE (link_dirs) +) + +TABLESPACE pg_default; + +ALTER TABLE IF EXISTS gwolofs.tt_segments OWNER TO gwolofs; + +REVOKE ALL ON TABLE gwolofs.tt_segments FROM bdit_humans; + +GRANT SELECT ON TABLE gwolofs.tt_segments TO bdit_humans; + +GRANT ALL ON TABLE gwolofs.tt_segments TO gwolofs; \ No newline at end of file diff --git a/here/traffic/sql/procedure-cache_tt_results.sql b/here/traffic/sql/procedure-cache_tt_results.sql new file mode 100644 index 000000000..d1f5e3d0e --- /dev/null +++ b/here/traffic/sql/procedure-cache_tt_results.sql @@ -0,0 +1,195 @@ +-- PROCEDURE: gwolofs.cache_tt_results(date, date, time without time zone, time without time zone, integer[], text[], boolean) + +-- DROP PROCEDURE IF 
EXISTS gwolofs.cache_tt_results(date, date, time without time zone, time without time zone, integer[], text[], boolean); + +CREATE OR REPLACE PROCEDURE gwolofs.cache_tt_results( + IN start_date date, + IN end_date date, + IN start_tod time without time zone, + IN end_tod time without time zone, + IN dow_list integer[], + IN link_dirs text[], + IN holidays boolean +) +LANGUAGE 'plpgsql' +AS $BODY$ + +BEGIN +EXECUTE format( + $$ + WITH segment AS ( + SELECT + uid AS segment_uid, + unnested.link_dir, + unnested.length, + tt_segments.total_length + FROM gwolofs.tt_segments, + UNNEST(tt_segments.link_dirs, tt_segments.lengths) AS unnested(link_dir, length) + WHERE link_dirs = %L + ), + + segment_5min_bins AS ( + SELECT + seg.segment_uid, + ta.tx, + seg.total_length, + tsrange( + ta.dt + %L::time, + ta.dt + %L::time, '[)') AS time_grp, + RANK() OVER w AS bin_rank, + SUM(seg.length) / seg.total_length AS sum_length, + SUM(seg.length) AS length_w_data, + SUM(seg.length / ta.mean * 3.6) AS unadjusted_tt, + SUM(sample_size) AS num_obs, + ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, + ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, + ARRAY_AGG(seg.length ORDER BY link_dir) AS lengths + FROM here.ta_path AS ta + JOIN segment AS seg USING (link_dir) + WHERE + ( + tod >= %L + AND --{ToD_and_or} + tod < %L + ) + AND date_part('isodow', dt) = ANY(%L) + AND dt >= %L + AND dt < %L + /*--{holiday_clause} + AND NOT EXISTS ( + SELECT 1 FROM ref.holiday WHERE ta.dt = holiday.dt + )*/ + GROUP BY + ta.tx, + ta.dt, + seg.total_length, + segment_uid + WINDOW w AS ( + PARTITION BY seg.segment_uid, ta.dt + ORDER BY ta.tx + ) + ), + + dynamic_bin_options AS ( + --within each segment/hour, generate all possible forward looking bin combinations + --don't generate options for bins with sufficient length + --also don't generate options past the next bin with 80%% length + SELECT + tx, + time_grp, + bin_rank AS start_bin, + --generate all the options for the end bin within 
the group. + generate_series( + CASE + WHEN sum_length >= 0.8 THEN bin_rank + --if length is insufficient, need at least 1 more bin + ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) + END, + CASE + --dont need to generate options when start segment is already sufficient + WHEN sum_length >= 0.8 THEN bin_rank + --generate options until 1 bin has sufficient length, otherwise until last bin in group + ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) + END, + 1 + ) AS end_bin + FROM segment_5min_bins + WINDOW w AS ( + PARTITION BY time_grp + ORDER BY tx + --look only forward for end_bin options + RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + ) + ), + + unnested_db_options AS ( + SELECT + s5b.segment_uid, + dbo.time_grp, + s5b.total_length, + dbo.tx AS dt_start, + --exclusive end bin + MAX(s5b.tx) + interval '5 minutes' AS dt_end, + unnested.link_dir, + unnested.len, + AVG(unnested.tt) AS tt, --avg TT for each link_dir + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir + FROM dynamic_bin_options AS dbo + LEFT JOIN segment_5min_bins AS s5b + ON s5b.time_grp = dbo.time_grp + AND s5b.bin_rank >= dbo.start_bin + AND s5b.bin_rank <= dbo.end_bin, + --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin + UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) + --we need to use nested data to determine length for these multi-period bins + WHERE dbo.start_bin != dbo.end_bin + GROUP BY + s5b.segment_uid, + dbo.time_grp, + s5b.total_length, + dbo.tx, + dbo.end_bin, + unnested.link_dir, + unnested.len + --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) + HAVING MAX(s5b.tx) + interval '5 minutes' <= dbo.tx + interval '1 hour' + ) + + INSERT INTO gwolofs.dynamic_binning_results ( + time_grp, segment_uid, dt_start, dt_end, bin_range, tt, + unadjusted_tt, total_length, length_w_data, num_obs + ) + --this query contains 
overlapping values which get eliminated + --via on conflict with the exclusion constraint on congestion_raw_segments table. + SELECT DISTINCT ON (dt_start) + time_grp, + segment_uid, + dt_start, + dt_end, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(tt) AS unadjusted_tt, + total_length, + SUM(len) AS length_w_data, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment + FROM unnested_db_options + GROUP BY + time_grp, + segment_uid, + dt_start, + dt_end, + total_length + HAVING SUM(len) >= 0.8 * total_length + UNION + --these 5 minute bins already have sufficient length + --don't need to use nested data to validate. + SELECT + time_grp, + segment_uid, + tx AS dt_start, + tx + interval '5 minutes' AS dt_end, + tsrange(tx, tx + interval '5 minutes', '[)') AS bin_range, + total_length / length_w_data * unadjusted_tt AS tt, + unadjusted_tt, + total_length, + length_w_data, + num_obs --sum of here.ta_path sample_size for each segment + FROM segment_5min_bins + --we do not need to use nested data to determine length here. 
+ WHERE sum_length >= 0.8 + ORDER BY + dt_start, + dt_end + --exclusion constraint + ordered insert to prevent overlapping bins + ON CONFLICT ON CONSTRAINT dynamic_bins_unique_temp + DO NOTHING; + $$, + link_dirs, start_tod, end_tod, start_tod, end_tod, dow_list, start_date, end_date +); + +END; +$BODY$; +ALTER PROCEDURE gwolofs.cache_tt_results( + date, date, time without time zone, time without time zone, integer[], text[], boolean +) +OWNER TO gwolofs; diff --git a/here/traffic/sql/procedure-cache_tt_segment.sql b/here/traffic/sql/procedure-cache_tt_segment.sql new file mode 100644 index 000000000..f88d011e9 --- /dev/null +++ b/here/traffic/sql/procedure-cache_tt_segment.sql @@ -0,0 +1,25 @@ +-- PROCEDURE: gwolofs.cache_tt_segment(text[]) + +-- DROP PROCEDURE IF EXISTS gwolofs.cache_tt_segment(text[]); + +CREATE OR REPLACE PROCEDURE gwolofs.cache_tt_segment( + IN link_dirs text[] +) +LANGUAGE 'sql' +AS $BODY$ + +INSERT INTO gwolofs.tt_segments (link_dirs, lengths, geom, total_length) +SELECT + ARRAY_AGG(link_dir ORDER BY link_dir) AS link_dirs, + ARRAY_AGG(length ORDER BY link_dir) AS lengths, + st_union(st_linemerge(geom)) AS geom, + SUM(length) AS total_length +FROM congestion.network_links_23_4_geom +WHERE link_dir = ANY (cache_tt_segment.link_dirs) +ON CONFLICT (link_dirs) +DO NOTHING; + +$BODY$; + +ALTER PROCEDURE gwolofs.cache_tt_segment(text[]) +OWNER TO gwolofs; From 0128961e6181d05fb070da5c638cc41399d72c03 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 30 Jan 2025 23:02:01 +0000 Subject: [PATCH 08/74] #1132 update cache_tt_segment to return segment details + Check if exists before routing --- .../sql/procedure-cache_tt_segment.sql | 93 +++++++++++++++---- 1 file changed, 75 insertions(+), 18 deletions(-) diff --git a/here/traffic/sql/procedure-cache_tt_segment.sql b/here/traffic/sql/procedure-cache_tt_segment.sql index f88d011e9..18f39eb4e 100644 --- a/here/traffic/sql/procedure-cache_tt_segment.sql 
+++ b/here/traffic/sql/procedure-cache_tt_segment.sql @@ -1,25 +1,82 @@ --- PROCEDURE: gwolofs.cache_tt_segment(text[]) +-- FUNCTION: gwolofs.cache_tt_segment(bigint, bigint, text) --- DROP PROCEDURE IF EXISTS gwolofs.cache_tt_segment(text[]); +DROP FUNCTION IF EXISTS gwolofs.cache_tt_segment(bigint, bigint, text); + +CREATE OR REPLACE FUNCTION gwolofs.cache_tt_segment( + IN node_start bigint, + IN node_end bigint, + IN map_version text, + OUT uid smallint, + OUT link_dirs text[], + OUT lengths numeric[], + OUT total_length numeric) + LANGUAGE 'plpgsql' + COST 100 + VOLATILE PARALLEL SAFE -CREATE OR REPLACE PROCEDURE gwolofs.cache_tt_segment( - IN link_dirs text[] -) -LANGUAGE 'sql' AS $BODY$ -INSERT INTO gwolofs.tt_segments (link_dirs, lengths, geom, total_length) -SELECT - ARRAY_AGG(link_dir ORDER BY link_dir) AS link_dirs, - ARRAY_AGG(length ORDER BY link_dir) AS lengths, - st_union(st_linemerge(geom)) AS geom, - SUM(length) AS total_length -FROM congestion.network_links_23_4_geom -WHERE link_dir = ANY (cache_tt_segment.link_dirs) -ON CONFLICT (link_dirs) -DO NOTHING; +DECLARE + routing_function text := 'get_links_btwn_nodes_' || map_version; + street_geoms_table text := 'routing_streets_' || map_version; + +BEGIN + --check if the node pair and map_version have already been routed + --and if so, return values + SELECT + tt.uid, + tt.link_dirs, + tt.lengths, + tt.total_length + INTO uid, link_dirs, lengths, total_length + FROM gwolofs.tt_segments AS tt + WHERE + tt.node_start = cache_tt_segment.node_start + AND tt.node_end = cache_tt_segment.node_end + AND tt.map_version = cache_tt_segment.map_version; + IF FOUND THEN + RETURN; + END IF; + +EXECUTE format ( + $$ + WITH routed_links AS ( + SELECT link_dir, seq + FROM here_gis.%1$I(%2$L, %3$L), + UNNEST (links) WITH ORDINALITY AS unnested (link_dir, seq) + ) + + INSERT INTO gwolofs.tt_segments ( + node_start, node_end, map_version, link_dirs, lengths, geom, total_length + ) + SELECT + %2$L AS node_start, + %3$L AS 
node_end, + %4$L AS map_version, + ARRAY_AGG(rl.link_dir ORDER BY rl.seq) AS link_dirs, + --lengths in m + ARRAY_AGG(ST_Length(ST_Transform(streets.geom,2952)) ORDER BY rl.seq) AS lengths, + st_union(st_linemerge(streets.geom)) AS geom, + SUM(ST_Length(ST_Transform(streets.geom,2952))) AS total_length + FROM routed_links AS rl + JOIN here.%5$I AS streets USING (link_dir) + --conflict would occur because of null values + ON CONFLICT (node_start, node_end, map_version) + DO UPDATE + SET + link_dirs = excluded.link_dirs, + lengths = excluded.lengths, + total_length = excluded.total_length + RETURNING uid, link_dirs, lengths, total_length + $$, + routing_function, node_start, node_end, -- For routed_links + map_version, -- For INSERT SELECT values + street_geoms_table -- For JOIN table +) INTO uid, link_dirs, lengths, total_length; +RETURN; +END; $BODY$; -ALTER PROCEDURE gwolofs.cache_tt_segment(text[]) -OWNER TO gwolofs; +ALTER FUNCTION gwolofs.cache_tt_segment(bigint, bigint, text) + OWNER TO gwolofs; From 330668c630e4f995a85bd7302f1970f8e464fb73 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 30 Jan 2025 23:03:04 +0000 Subject: [PATCH 09/74] #1132 cache_tt_segment procedure->function --- ...ocedure-cache_tt_segment.sql => function-cache_tt_segment.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename here/traffic/sql/{procedure-cache_tt_segment.sql => function-cache_tt_segment.sql} (100%) diff --git a/here/traffic/sql/procedure-cache_tt_segment.sql b/here/traffic/sql/function-cache_tt_segment.sql similarity index 100% rename from here/traffic/sql/procedure-cache_tt_segment.sql rename to here/traffic/sql/function-cache_tt_segment.sql From 521eceff2261d4faaa78f0e04f2864063eab6cd6 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 30 Jan 2025 23:10:50 +0000 Subject: [PATCH 10/74] #1132 cache_tt_results procedure->function --- ...ocedure-cache_tt_results.sql => 
function-cache_tt_results.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename here/traffic/sql/{procedure-cache_tt_results.sql => function-cache_tt_results.sql} (100%) diff --git a/here/traffic/sql/procedure-cache_tt_results.sql b/here/traffic/sql/function-cache_tt_results.sql similarity index 100% rename from here/traffic/sql/procedure-cache_tt_results.sql rename to here/traffic/sql/function-cache_tt_results.sql From 154ea44982fc4e7cc8cba08aa399c8b748a02628 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 30 Jan 2025 23:10:59 +0000 Subject: [PATCH 11/74] #1132 select map version func --- .../sql/function-select_map_version.sql | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 here/traffic/sql/function-select_map_version.sql diff --git a/here/traffic/sql/function-select_map_version.sql b/here/traffic/sql/function-select_map_version.sql new file mode 100644 index 000000000..1bbf56064 --- /dev/null +++ b/here/traffic/sql/function-select_map_version.sql @@ -0,0 +1,26 @@ +-- FUNCTION: gwolofs.select_map_version(date, date) + +-- DROP FUNCTION IF EXISTS gwolofs.select_map_version(date, date); + +CREATE OR REPLACE FUNCTION gwolofs.select_map_version( + start_date date, + end_date date) + RETURNS text + LANGUAGE 'sql' + COST 100 + STABLE PARALLEL SAFE +AS $BODY$ + +SELECT street_version +FROM here.street_valid_range AS svr, +LATERAL ( + SELECT svr.valid_range * daterange(select_map_version.start_date, select_map_version.end_date, '[)') AS overlap +) AS lat +WHERE UPPER(lat.overlap) - LOWER(lat.overlap) IS NOT NULL +ORDER BY UPPER(lat.overlap) - LOWER(lat.overlap) DESC NULLS LAST +LIMIT 1; + +$BODY$; + +ALTER FUNCTION gwolofs.select_map_version(date, date) + OWNER TO gwolofs; From ab45c51b41564b125fb22949563e1de2de5e38e9 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 30 Jan 2025 23:13:03 +0000 Subject: [PATCH 12/74] #1132 
cache_tt_results updates; use map version, fix end_bin bug, save uri_string for result lookup --- .../traffic/sql/function-cache_tt_results.sql | 75 ++++++++++++------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/here/traffic/sql/function-cache_tt_results.sql b/here/traffic/sql/function-cache_tt_results.sql index d1f5e3d0e..2cc372e0a 100644 --- a/here/traffic/sql/function-cache_tt_results.sql +++ b/here/traffic/sql/function-cache_tt_results.sql @@ -1,20 +1,29 @@ --- PROCEDURE: gwolofs.cache_tt_results(date, date, time without time zone, time without time zone, integer[], text[], boolean) +-- FUNCTION: gwolofs.cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) --- DROP PROCEDURE IF EXISTS gwolofs.cache_tt_results(date, date, time without time zone, time without time zone, integer[], text[], boolean); +-- DROP FUNCTION IF EXISTS gwolofs.cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); -CREATE OR REPLACE PROCEDURE gwolofs.cache_tt_results( - IN start_date date, - IN end_date date, - IN start_tod time without time zone, - IN end_tod time without time zone, - IN dow_list integer[], - IN link_dirs text[], - IN holidays boolean -) -LANGUAGE 'plpgsql' +CREATE OR REPLACE FUNCTION gwolofs.cache_tt_results( + uri_string text, + start_date date, + end_date date, + start_tod time without time zone, + end_tod time without time zone, + dow_list integer[], + node_start bigint, + node_end bigint, + holidays boolean) + RETURNS void + LANGUAGE 'plpgsql' + COST 100 + VOLATILE PARALLEL UNSAFE AS $BODY$ +DECLARE map_version text; + BEGIN + +SELECT gwolofs.select_map_version(cache_tt_results.start_date, cache_tt_results.end_date) INTO map_version; + EXECUTE format( $$ WITH segment AS ( @@ -22,10 +31,9 @@ EXECUTE format( uid AS segment_uid, unnested.link_dir, unnested.length, - tt_segments.total_length - FROM gwolofs.tt_segments, - 
UNNEST(tt_segments.link_dirs, tt_segments.lengths) AS unnested(link_dir, length) - WHERE link_dirs = %L + total_length + FROM gwolofs.cache_tt_segment(%L, %L, %L), + UNNEST(cache_tt_segment.link_dirs, cache_tt_segment.lengths) AS unnested(link_dir, length) ), segment_5min_bins AS ( @@ -44,7 +52,7 @@ EXECUTE format( ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, ARRAY_AGG(seg.length ORDER BY link_dir) AS lengths - FROM here.ta_path AS ta + FROM here.ta AS ta JOIN segment AS seg USING (link_dir) WHERE ( @@ -52,7 +60,7 @@ EXECUTE format( AND --{ToD_and_or} tod < %L ) - AND date_part('isodow', dt) = ANY(%L) + AND date_part('isodow', dt) = ANY(%L::int[]) AND dt >= %L AND dt < %L /*--{holiday_clause} @@ -109,16 +117,20 @@ EXECUTE format( s5b.total_length, dbo.tx AS dt_start, --exclusive end bin - MAX(s5b.tx) + interval '5 minutes' AS dt_end, + s5b_end.tx + interval '5 minutes' AS dt_end, unnested.link_dir, unnested.len, AVG(unnested.tt) AS tt, --avg TT for each link_dir - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir + SUM(s5b.num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir FROM dynamic_bin_options AS dbo LEFT JOIN segment_5min_bins AS s5b ON s5b.time_grp = dbo.time_grp AND s5b.bin_rank >= dbo.start_bin - AND s5b.bin_rank <= dbo.end_bin, + AND s5b.bin_rank <= dbo.end_bin + --this join is used to get the tx info about the last bin only + LEFT JOIN segment_5min_bins AS s5b_end + ON s5b_end.time_grp = dbo.time_grp + AND s5b_end.bin_rank = dbo.end_bin, --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) --we need to use nested data to determine length for these multi-period bins @@ -127,21 +139,23 @@ EXECUTE format( s5b.segment_uid, dbo.time_grp, s5b.total_length, - dbo.tx, - dbo.end_bin, + dbo.tx, --stard_bin + s5b_end.tx, 
--end_bin unnested.link_dir, unnested.len --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - HAVING MAX(s5b.tx) + interval '5 minutes' <= dbo.tx + interval '1 hour' + --HAVING MAX(s5b.tx) + interval '5 minutes' <= dbo.tx + interval '1 hour' ) INSERT INTO gwolofs.dynamic_binning_results ( + uri_string, time_grp, segment_uid, dt_start, dt_end, bin_range, tt, unadjusted_tt, total_length, length_w_data, num_obs ) --this query contains overlapping values which get eliminated --via on conflict with the exclusion constraint on congestion_raw_segments table. SELECT DISTINCT ON (dt_start) + %L, time_grp, segment_uid, dt_start, @@ -164,6 +178,7 @@ EXECUTE format( --these 5 minute bins already have sufficient length --don't need to use nested data to validate. SELECT + %L, time_grp, segment_uid, tx AS dt_start, @@ -184,12 +199,14 @@ EXECUTE format( ON CONFLICT ON CONSTRAINT dynamic_bins_unique_temp DO NOTHING; $$, - link_dirs, start_tod, end_tod, start_tod, end_tod, dow_list, start_date, end_date + node_start, node_end, map_version, --segment CTE + start_tod, end_tod, --segment_5min_bins CTE SELECT + start_tod, end_tod, dow_list, start_date, end_date, --segment_5min_bins CTE WHERE + cache_tt_results.uri_string, cache_tt_results.uri_string --INSERT ); END; $BODY$; -ALTER PROCEDURE gwolofs.cache_tt_results( - date, date, time without time zone, time without time zone, integer[], text[], boolean -) -OWNER TO gwolofs; + +ALTER FUNCTION gwolofs.cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) + OWNER TO gwolofs; From e51d6ed8c52c624ad401b442cd1f62efbe6d7856 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 30 Jan 2025 23:14:02 +0000 Subject: [PATCH 13/74] #1132 tt_segments table; save map_version, nodes, remove link_dir constraint --- here/traffic/sql/create-table-tt_segments.sql | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) 
diff --git a/here/traffic/sql/create-table-tt_segments.sql b/here/traffic/sql/create-table-tt_segments.sql index 677c8641e..7426b9e66 100644 --- a/here/traffic/sql/create-table-tt_segments.sql +++ b/here/traffic/sql/create-table-tt_segments.sql @@ -9,12 +9,16 @@ CREATE TABLE IF NOT EXISTS gwolofs.tt_segments geom geometry, total_length numeric, uid smallint NOT NULL DEFAULT nextval('tt_segments_uid_seq'::regclass), - CONSTRAINT unique_link_dirs UNIQUE (link_dirs) + node_start bigint NOT NULL, + node_end bigint NOT NULL, + map_version text COLLATE pg_catalog."default" NOT NULL, + CONSTRAINT tt_segments_pkey PRIMARY KEY (node_start, node_end, map_version) ) TABLESPACE pg_default; -ALTER TABLE IF EXISTS gwolofs.tt_segments OWNER TO gwolofs; +ALTER TABLE IF EXISTS gwolofs.tt_segments + OWNER to gwolofs; REVOKE ALL ON TABLE gwolofs.tt_segments FROM bdit_humans; From 71468b60ede060bfa4e73ca5aa9a4fd3aedb8b9f Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 30 Jan 2025 23:15:49 +0000 Subject: [PATCH 14/74] #1132 update dynamic bin results table; add uri --- .../create-table-dynamic_binning_results.sql | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/here/traffic/sql/create-table-dynamic_binning_results.sql b/here/traffic/sql/create-table-dynamic_binning_results.sql index c83618ed3..626a3dcec 100644 --- a/here/traffic/sql/create-table-dynamic_binning_results.sql +++ b/here/traffic/sql/create-table-dynamic_binning_results.sql @@ -2,7 +2,8 @@ -- DROP TABLE IF EXISTS gwolofs.dynamic_binning_results; -CREATE TABLE IF NOT EXISTS gwolofs.dynamic_binning_results ( +CREATE TABLE IF NOT EXISTS gwolofs.dynamic_binning_results +( time_grp tsrange NOT NULL, bin_range tsrange NOT NULL, dt_start timestamp without time zone, @@ -13,17 +14,20 @@ CREATE TABLE IF NOT EXISTS gwolofs.dynamic_binning_results ( length_w_data numeric, num_obs integer, segment_uid smallint, + uri_string text COLLATE 
pg_catalog."default", CONSTRAINT dynamic_bins_unique_temp EXCLUDE USING gist ( bin_range WITH &&, time_grp WITH =, - segment_uid WITH = + segment_uid WITH =, + uri_string WITH = ) + ) TABLESPACE pg_default; ALTER TABLE IF EXISTS gwolofs.dynamic_binning_results -OWNER TO gwolofs; + OWNER TO gwolofs; REVOKE ALL ON TABLE gwolofs.dynamic_binning_results FROM bdit_humans; @@ -35,10 +39,10 @@ GRANT ALL ON TABLE gwolofs.dynamic_binning_results TO gwolofs; -- DROP INDEX IF EXISTS gwolofs.dynamic_binning_results_time_grp_segment_uid_idx; CREATE INDEX IF NOT EXISTS dynamic_binning_results_time_grp_segment_uid_idx -ON gwolofs.dynamic_binning_results USING btree -( - time_grp ASC NULLS LAST, - segment_uid ASC NULLS LAST -) -WITH (deduplicate_items=True) -TABLESPACE pg_default; \ No newline at end of file + ON gwolofs.dynamic_binning_results USING btree + ( + time_grp ASC NULLS LAST, + segment_uid ASC NULLS LAST + ) + WITH (deduplicate_items=True) + TABLESPACE pg_default; \ No newline at end of file From dcd95840a43d1136d8d3fbffd13c299cb7e7d134 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 30 Jan 2025 23:18:29 +0000 Subject: [PATCH 15/74] #1132 update here_dynamic_bin_avg; use daily avgs -> avg method, use uri_string to locate results --- .../create-function-here_dynamic_bin_avg.sql | 99 ++++++++----------- 1 file changed, 41 insertions(+), 58 deletions(-) diff --git a/here/traffic/sql/create-function-here_dynamic_bin_avg.sql b/here/traffic/sql/create-function-here_dynamic_bin_avg.sql index 4c344020c..6f5bf82d5 100644 --- a/here/traffic/sql/create-function-here_dynamic_bin_avg.sql +++ b/here/traffic/sql/create-function-here_dynamic_bin_avg.sql @@ -1,81 +1,64 @@ --- FUNCTION: gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], text[], boolean) +-- FUNCTION: gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, 
boolean) --- DROP FUNCTION IF EXISTS gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], text[], boolean); +-- DROP FUNCTION IF EXISTS gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); CREATE OR REPLACE FUNCTION gwolofs.here_dynamic_bin_avg( - start_date date, - end_date date, - start_tod time without time zone, - end_tod time without time zone, - dow_list integer[], - link_dirs text[], - holidays boolean) + start_date date, + end_date date, + start_tod time without time zone, + end_tod time without time zone, + dow_list integer[], + node_start bigint, + node_end bigint, + holidays boolean) RETURNS numeric - LANGUAGE 'sql' + LANGUAGE 'plpgsql' COST 100 VOLATILE PARALLEL UNSAFE AS $BODY$ -CALL gwolofs.cache_tt_segment(here_dynamic_bin_avg.link_dirs); +DECLARE uri_string_func text := + here_dynamic_bin_avg.node_start::text || '/' || + here_dynamic_bin_avg.node_end::text || '/' || + here_dynamic_bin_avg.start_tod::text || '/' || + here_dynamic_bin_avg.end_tod::text || '/' || + here_dynamic_bin_avg.start_date::text || '/' || + here_dynamic_bin_avg.end_date::text || '/' || + here_dynamic_bin_avg.holidays::text || '/' || + here_dynamic_bin_avg.dow_list::text; + res numeric; -CALL gwolofs.cache_tt_results( +BEGIN + +PERFORM gwolofs.cache_tt_results( + uri_string := uri_string_func, start_date := here_dynamic_bin_avg.start_date, end_date := here_dynamic_bin_avg.end_date, start_tod := here_dynamic_bin_avg.start_tod, end_tod := here_dynamic_bin_avg.end_tod, dow_list := here_dynamic_bin_avg.dow_list, - link_dirs := here_dynamic_bin_avg.link_dirs, + node_start := here_dynamic_bin_avg.node_start, + node_end := here_dynamic_bin_avg.node_end, holidays := here_dynamic_bin_avg.holidays ); -WITH time_grps AS ( - SELECT tsrange( - (days.dt + here_dynamic_bin_avg.start_tod)::timestamp, - (days.dt + here_dynamic_bin_avg.end_tod)::timestamp, '[)') AS time_grp - 
FROM generate_series( - here_dynamic_bin_avg.start_date::date, - here_dynamic_bin_avg.end_date::date - '1 day'::interval, '1 day'::interval) AS days(dt) - WHERE date_part('isodow', dt) = ANY(here_dynamic_bin_avg.dow_list) +WITH daily_means AS ( + SELECT + dt_start::date, + AVG(tt) AS daily_mean + FROM gwolofs.dynamic_binning_results + WHERE uri_string = uri_string_func + GROUP BY dt_start::date ) -SELECT AVG(tt) -FROM gwolofs.dynamic_binning_results AS res -JOIN time_grps USING (time_grp) -JOIN gwolofs.tt_segments AS segs ON res.segment_uid = segs.uid -WHERE segs.link_dirs = here_dynamic_bin_avg.link_dirs - -$BODY$; - -ALTER FUNCTION gwolofs.here_dynamic_bin_avg( - date, date, time without time zone, time without time zone, integer[], text[], boolean -) -OWNER TO gwolofs; +SELECT AVG(daily_mean) INTO res +FROM daily_means; +RETURN res; -/*example of use: +END; -SELECT - start_date, - end_date, - start_tod, - end_tod, - dow_list, - link_dirs, - gwolofs.here_dynamic_bin_avg( - start_date := l.start_date, - end_date := l.end_date, - start_tod := l.start_tod, - end_tod := l.end_tod, - dow_list := l.dow_list, - link_dirs := l.link_dirs, - holidays := TRUE - ) -FROM -(VALUES -('2025-01-02'::date, '2025-01-10'::date, '07:00'::time, '10:00'::time, '{1,2,3,4,5}'::int[], '{1258924853F,1258924867F,1258924868F,1258924894F}'::text[]), -('2025-01-02'::date, '2025-01-10'::date, '11:00'::time, '15:00'::time, '{1,2,3,4,5}'::int[], '{1258924852F,1258924867F,1258924868F,1258924894F}'::text[]), -('2025-01-02'::date, '2025-01-10'::date, '07:00'::time, '10:00'::time, '{1,3,5}'::int[], '{1258924852F,1258924853F,1258924868F,1258924894F}'::text[]), -('2024-01-02'::date, '2025-01-10'::date, '07:00'::time, '10:00'::time, '{1,2,3,4,5}'::int[], '{1258924852F,1258924853F,1258924867F,1258924894F}'::text[]) -) AS l(start_date, end_date, start_tod, end_tod, dow_list, link_dirs); +$BODY$; -*/ \ No newline at end of file +ALTER FUNCTION gwolofs.here_dynamic_bin_avg(date, date, time without time 
zone, time without time zone, integer[], bigint, bigint, boolean) + OWNER TO gwolofs; From ad3c31ecb258cd6b4c41fb2898f2e36763c4c7f5 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 4 Feb 2025 15:03:50 +0000 Subject: [PATCH 16/74] #1132 apply end_bin fix to congestion network query --- .../traffic/sql/function-cache_tt_results.sql | 2 +- .../sql/select-congestion_raw_segments.sql | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/here/traffic/sql/function-cache_tt_results.sql b/here/traffic/sql/function-cache_tt_results.sql index 2cc372e0a..000164f49 100644 --- a/here/traffic/sql/function-cache_tt_results.sql +++ b/here/traffic/sql/function-cache_tt_results.sql @@ -144,7 +144,7 @@ EXECUTE format( unnested.link_dir, unnested.len --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - --HAVING MAX(s5b.tx) + interval '5 minutes' <= dbo.tx + interval '1 hour' + --HAVING s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' ) INSERT INTO gwolofs.dynamic_binning_results ( diff --git a/here/traffic/sql/select-congestion_raw_segments.sql b/here/traffic/sql/select-congestion_raw_segments.sql index ddf922fd1..30c60872d 100644 --- a/here/traffic/sql/select-congestion_raw_segments.sql +++ b/here/traffic/sql/select-congestion_raw_segments.sql @@ -103,7 +103,7 @@ unnested_db_options AS ( s5b.total_length, dbo.tx AS dt_start, --exclusive end bin - MAX(s5b.tx) + interval '5 minutes' AS dt_end, + s5b_end.tx + interval '5 minutes' AS dt_end, unnested.link_dir, unnested.len, AVG(unnested.tt) AS tt, --avg TT for each link_dir @@ -113,21 +113,26 @@ unnested_db_options AS ( ON s5b.time_grp = dbo.time_grp AND s5b.segment_id = dbo.segment_id AND s5b.bin_rank >= dbo.start_bin - AND s5b.bin_rank <= dbo.end_bin, + AND s5b.bin_rank <= dbo.end_bin + --this join is used to get the tx info about the last bin only + LEFT JOIN segment_5min_bins AS s5b_end + ON s5b_end.time_grp = 
dbo.time_grp + AND s5b_end.bin_rank = dbo.end_bin, --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) - --we need to use nested data to determine length for these multi-period bins - WHERE dbo.start_bin != dbo.end_bin + WHERE + --we need to use nested data to determine length for these multi-period bins + dbo.start_bin != dbo.end_bin + --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) + AND s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' GROUP BY dbo.time_grp, dbo.segment_id, s5b.total_length, - dbo.tx, - dbo.end_bin, + dbo.tx, --stard_bin + s5b_end.tx, --end_bin unnested.link_dir, unnested.len - --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - HAVING MAX(s5b.tx) + interval '5 minutes' <= dbo.tx + interval '1 hour' ) INSERT INTO gwolofs.congestion_raw_segments ( From b1befb71397599e67a5f3e7ef069fd88137dae7c Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 11 Feb 2025 19:34:28 +0000 Subject: [PATCH 17/74] #1132 rename files/functions as per proposed dictionary --- .../create-function-here_dynamic_bin_avg.sql | 14 +- .../create-table-congestion_raw_segments.sql | 22 +- .../create-table-dynamic_binning_results.sql | 34 ++- here/traffic/sql/create-table-tt_segments.sql | 20 +- .../traffic/sql/function-cache_tt_results.sql | 16 +- .../traffic/sql/function-cache_tt_segment.sql | 30 +-- .../sql/function-select_map_version.sql | 8 +- .../sql/select-congestion_raw_segments.sql | 211 ------------------ 8 files changed, 74 insertions(+), 281 deletions(-) delete mode 100644 here/traffic/sql/select-congestion_raw_segments.sql diff --git a/here/traffic/sql/create-function-here_dynamic_bin_avg.sql b/here/traffic/sql/create-function-here_dynamic_bin_avg.sql index 6f5bf82d5..d51b7f0f0 100644 --- 
a/here/traffic/sql/create-function-here_dynamic_bin_avg.sql +++ b/here/traffic/sql/create-function-here_dynamic_bin_avg.sql @@ -1,8 +1,8 @@ --- FUNCTION: gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) +-- FUNCTION: gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) --- DROP FUNCTION IF EXISTS gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); +-- DROP FUNCTION IF EXISTS gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); -CREATE OR REPLACE FUNCTION gwolofs.here_dynamic_bin_avg( +CREATE OR REPLACE FUNCTION gwolofs.congestion_dynamic_bin_avg( start_date date, end_date date, start_tod time without time zone, @@ -30,7 +30,7 @@ DECLARE uri_string_func text := BEGIN -PERFORM gwolofs.cache_tt_results( +PERFORM gwolofs.congestion_cache_tt_results( uri_string := uri_string_func, start_date := here_dynamic_bin_avg.start_date, end_date := here_dynamic_bin_avg.end_date, @@ -46,7 +46,7 @@ WITH daily_means AS ( SELECT dt_start::date, AVG(tt) AS daily_mean - FROM gwolofs.dynamic_binning_results + FROM gwolofs.congestion_raw_corridors WHERE uri_string = uri_string_func GROUP BY dt_start::date ) @@ -60,5 +60,7 @@ END; $BODY$; -ALTER FUNCTION gwolofs.here_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) +ALTER FUNCTION gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) OWNER TO gwolofs; + +COMMENT ON FUNCTION gwolofs.congestion_dynamic_bin_avg IS 'Previously gwolofs.here_dynamic_bin_avg.'; diff --git a/here/traffic/sql/create-table-congestion_raw_segments.sql b/here/traffic/sql/create-table-congestion_raw_segments.sql index 
9b6d39f5d..7a0ec772d 100644 --- a/here/traffic/sql/create-table-congestion_raw_segments.sql +++ b/here/traffic/sql/create-table-congestion_raw_segments.sql @@ -4,21 +4,18 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_segments ( - time_grp tsrange NOT NULL, segment_id integer NOT NULL, bin_range tsrange NOT NULL, - dt_start timestamp without time zone, - dt_end timestamp without time zone, tt numeric, unadjusted_tt numeric, total_length numeric, length_w_data numeric, num_obs integer, - CONSTRAINT dynamic_bins_unique EXCLUDE USING gist ( - segment_id WITH =, + hr timestamp without time zone, + CONSTRAINT congestion_raw_segments_exclude EXCLUDE USING gist ( + hr WITH =, bin_range WITH &&, - time_grp WITH = - ) + segment_id WITH =) ) TABLESPACE pg_default; @@ -31,12 +28,21 @@ REVOKE ALL ON TABLE gwolofs.congestion_raw_segments FROM bdit_humans; GRANT SELECT ON TABLE gwolofs.congestion_raw_segments TO bdit_humans; GRANT ALL ON TABLE gwolofs.congestion_raw_segments TO gwolofs; +-- Index: dynamic_bin_hr_idx + +-- DROP INDEX IF EXISTS gwolofs.dynamic_bin_hr_idx; + +CREATE INDEX IF NOT EXISTS dynamic_bin_hr_idx + ON gwolofs.congestion_raw_segments USING btree + (hr ASC NULLS LAST) + WITH (deduplicate_items=True) + TABLESPACE pg_default; -- Index: dynamic_bin_idx -- DROP INDEX IF EXISTS gwolofs.dynamic_bin_idx; CREATE INDEX IF NOT EXISTS dynamic_bin_idx ON gwolofs.congestion_raw_segments USING btree - (segment_id ASC NULLS LAST, time_grp ASC NULLS LAST) + (segment_id ASC NULLS LAST, hr ASC NULLS LAST) WITH (deduplicate_items=True) TABLESPACE pg_default; \ No newline at end of file diff --git a/here/traffic/sql/create-table-dynamic_binning_results.sql b/here/traffic/sql/create-table-dynamic_binning_results.sql index 626a3dcec..c0c101d1b 100644 --- a/here/traffic/sql/create-table-dynamic_binning_results.sql +++ b/here/traffic/sql/create-table-dynamic_binning_results.sql @@ -1,8 +1,8 @@ --- Table: gwolofs.dynamic_binning_results +-- Table: 
gwolofs.congestion_raw_corridors --- DROP TABLE IF EXISTS gwolofs.dynamic_binning_results; +-- DROP TABLE IF EXISTS gwolofs.congestion_raw_corridors; -CREATE TABLE IF NOT EXISTS gwolofs.dynamic_binning_results +CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_corridors ( time_grp tsrange NOT NULL, bin_range tsrange NOT NULL, @@ -15,34 +15,30 @@ CREATE TABLE IF NOT EXISTS gwolofs.dynamic_binning_results num_obs integer, segment_uid smallint, uri_string text COLLATE pg_catalog."default", - CONSTRAINT dynamic_bins_unique_temp EXCLUDE USING gist ( + CONSTRAINT congestion_raw_corridors_exclude EXCLUDE USING gist ( bin_range WITH &&, - time_grp WITH =, segment_uid WITH =, - uri_string WITH = - ) - + time_grp WITH =, + uri_string WITH =) ) TABLESPACE pg_default; -ALTER TABLE IF EXISTS gwolofs.dynamic_binning_results - OWNER TO gwolofs; +ALTER TABLE IF EXISTS gwolofs.congestion_raw_corridors + OWNER to gwolofs; + +REVOKE ALL ON TABLE gwolofs.congestion_raw_corridors FROM bdit_humans; -REVOKE ALL ON TABLE gwolofs.dynamic_binning_results FROM bdit_humans; +GRANT SELECT ON TABLE gwolofs.congestion_raw_corridors TO bdit_humans; -GRANT SELECT ON TABLE gwolofs.dynamic_binning_results TO bdit_humans; +GRANT ALL ON TABLE gwolofs.congestion_raw_corridors TO gwolofs; -GRANT ALL ON TABLE gwolofs.dynamic_binning_results TO gwolofs; -- Index: dynamic_binning_results_time_grp_segment_uid_idx --- DROP INDEX IF EXISTS gwolofs.dynamic_binning_results_time_grp_segment_uid_idx; +-- DROP INDEX IF EXISTS gwolofs.congestion_raw_corridors_time_grp_segment_uid_idx; CREATE INDEX IF NOT EXISTS dynamic_binning_results_time_grp_segment_uid_idx - ON gwolofs.dynamic_binning_results USING btree - ( - time_grp ASC NULLS LAST, - segment_uid ASC NULLS LAST - ) + ON gwolofs.congestion_raw_corridors USING btree + (time_grp ASC NULLS LAST, segment_uid ASC NULLS LAST) WITH (deduplicate_items=True) TABLESPACE pg_default; \ No newline at end of file diff --git a/here/traffic/sql/create-table-tt_segments.sql 
b/here/traffic/sql/create-table-tt_segments.sql index 7426b9e66..327c5cb8c 100644 --- a/here/traffic/sql/create-table-tt_segments.sql +++ b/here/traffic/sql/create-table-tt_segments.sql @@ -1,27 +1,27 @@ --- Table: gwolofs.tt_segments +-- Table: gwolofs.congestion_corridors --- DROP TABLE IF EXISTS gwolofs.tt_segments; +-- DROP TABLE IF EXISTS gwolofs.congestion_corridors; -CREATE TABLE IF NOT EXISTS gwolofs.tt_segments +CREATE TABLE IF NOT EXISTS gwolofs.congestion_corridors ( link_dirs text[] COLLATE pg_catalog."default", lengths numeric[], geom geometry, total_length numeric, - uid smallint NOT NULL DEFAULT nextval('tt_segments_uid_seq'::regclass), + corridor_id smallint NOT NULL DEFAULT nextval('congestion_corridors_uid_seq'::regclass), node_start bigint NOT NULL, node_end bigint NOT NULL, map_version text COLLATE pg_catalog."default" NOT NULL, - CONSTRAINT tt_segments_pkey PRIMARY KEY (node_start, node_end, map_version) + CONSTRAINT congestion_corridors_pkey PRIMARY KEY (node_start, node_end, map_version) ) TABLESPACE pg_default; -ALTER TABLE IF EXISTS gwolofs.tt_segments - OWNER to gwolofs; +ALTER TABLE IF EXISTS gwolofs.congestion_corridors +OWNER TO gwolofs; -REVOKE ALL ON TABLE gwolofs.tt_segments FROM bdit_humans; +REVOKE ALL ON TABLE gwolofs.congestion_corridors FROM bdit_humans; -GRANT SELECT ON TABLE gwolofs.tt_segments TO bdit_humans; +GRANT SELECT ON TABLE gwolofs.congestion_corridors TO bdit_humans; -GRANT ALL ON TABLE gwolofs.tt_segments TO gwolofs; \ No newline at end of file +GRANT ALL ON TABLE gwolofs.congestion_corridors TO gwolofs; diff --git a/here/traffic/sql/function-cache_tt_results.sql b/here/traffic/sql/function-cache_tt_results.sql index 000164f49..57d5b7831 100644 --- a/here/traffic/sql/function-cache_tt_results.sql +++ b/here/traffic/sql/function-cache_tt_results.sql @@ -1,8 +1,8 @@ --- FUNCTION: gwolofs.cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) +-- FUNCTION: 
gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) --- DROP FUNCTION IF EXISTS gwolofs.cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); +-- DROP FUNCTION IF EXISTS gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); -CREATE OR REPLACE FUNCTION gwolofs.cache_tt_results( +CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_tt_results( uri_string text, start_date date, end_date date, @@ -22,7 +22,7 @@ DECLARE map_version text; BEGIN -SELECT gwolofs.select_map_version(cache_tt_results.start_date, cache_tt_results.end_date) INTO map_version; +SELECT gwolofs.congestion_select_map_version(cache_tt_results.start_date, cache_tt_results.end_date) INTO map_version; EXECUTE format( $$ @@ -32,7 +32,7 @@ EXECUTE format( unnested.link_dir, unnested.length, total_length - FROM gwolofs.cache_tt_segment(%L, %L, %L), + FROM gwolofs.congestion_cache_corridor(%L, %L, %L), UNNEST(cache_tt_segment.link_dirs, cache_tt_segment.lengths) AS unnested(link_dir, length) ), @@ -144,10 +144,10 @@ EXECUTE format( unnested.link_dir, unnested.len --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - --HAVING s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' + --HAVING MAX(s5b.tx) + interval '5 minutes' <= dbo.tx + interval '1 hour' ) - INSERT INTO gwolofs.dynamic_binning_results ( + INSERT INTO gwolofs.congestion_raw_corridors ( uri_string, time_grp, segment_uid, dt_start, dt_end, bin_range, tt, unadjusted_tt, total_length, length_w_data, num_obs @@ -208,5 +208,5 @@ EXECUTE format( END; $BODY$; -ALTER FUNCTION gwolofs.cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) +ALTER FUNCTION gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, 
time without time zone, integer[], bigint, bigint, boolean) OWNER TO gwolofs; diff --git a/here/traffic/sql/function-cache_tt_segment.sql b/here/traffic/sql/function-cache_tt_segment.sql index 18f39eb4e..354f63426 100644 --- a/here/traffic/sql/function-cache_tt_segment.sql +++ b/here/traffic/sql/function-cache_tt_segment.sql @@ -1,19 +1,19 @@ --- FUNCTION: gwolofs.cache_tt_segment(bigint, bigint, text) +-- FUNCTION: gwolofs.congestion_cache_corridor(bigint, bigint, text) -DROP FUNCTION IF EXISTS gwolofs.cache_tt_segment(bigint, bigint, text); +-- DROP FUNCTION IF EXISTS gwolofs.congestion_cache_corridor(bigint, bigint, text); -CREATE OR REPLACE FUNCTION gwolofs.cache_tt_segment( - IN node_start bigint, - IN node_end bigint, - IN map_version text, - OUT uid smallint, - OUT link_dirs text[], - OUT lengths numeric[], - OUT total_length numeric) +CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_corridor( + node_start bigint, + node_end bigint, + map_version text, + OUT uid smallint, + OUT link_dirs text[], + OUT lengths numeric[], + OUT total_length numeric) + RETURNS record LANGUAGE 'plpgsql' COST 100 - VOLATILE PARALLEL SAFE - + VOLATILE PARALLEL SAFE AS $BODY$ DECLARE @@ -30,7 +30,7 @@ BEGIN tt.lengths, tt.total_length INTO uid, link_dirs, lengths, total_length - FROM gwolofs.tt_segments AS tt + FROM gwolofs.congestion_corridors AS tt WHERE tt.node_start = cache_tt_segment.node_start AND tt.node_end = cache_tt_segment.node_end @@ -47,7 +47,7 @@ EXECUTE format ( UNNEST (links) WITH ORDINALITY AS unnested (link_dir, seq) ) - INSERT INTO gwolofs.tt_segments ( + INSERT INTO gwolofs.congestion_corridors ( node_start, node_end, map_version, link_dirs, lengths, geom, total_length ) SELECT @@ -78,5 +78,5 @@ RETURN; END; $BODY$; -ALTER FUNCTION gwolofs.cache_tt_segment(bigint, bigint, text) +ALTER FUNCTION gwolofs.congestion_cache_corridor(bigint, bigint, text) OWNER TO gwolofs; diff --git a/here/traffic/sql/function-select_map_version.sql 
b/here/traffic/sql/function-select_map_version.sql index 1bbf56064..ab4ca1010 100644 --- a/here/traffic/sql/function-select_map_version.sql +++ b/here/traffic/sql/function-select_map_version.sql @@ -1,8 +1,8 @@ --- FUNCTION: gwolofs.select_map_version(date, date) +-- FUNCTION: gwolofs.congestion_select_map_version(date, date) --- DROP FUNCTION IF EXISTS gwolofs.select_map_version(date, date); +-- DROP FUNCTION IF EXISTS gwolofs.congestion_select_map_version(date, date); -CREATE OR REPLACE FUNCTION gwolofs.select_map_version( +CREATE OR REPLACE FUNCTION gwolofs.congestion_select_map_version( start_date date, end_date date) RETURNS text @@ -22,5 +22,5 @@ LIMIT 1; $BODY$; -ALTER FUNCTION gwolofs.select_map_version(date, date) +ALTER FUNCTION gwolofs.congestion_select_map_version(date, date) OWNER TO gwolofs; diff --git a/here/traffic/sql/select-congestion_raw_segments.sql b/here/traffic/sql/select-congestion_raw_segments.sql deleted file mode 100644 index 30c60872d..000000000 --- a/here/traffic/sql/select-congestion_raw_segments.sql +++ /dev/null @@ -1,211 +0,0 @@ ---TRUNCATE gwolofs.congestion_raw_segments; - ---INSERT 0 771478 ---Query returned successfully in 2 min 51 secs. --- vs 7,756,256 rows in (SELECT COUNT(*) FROM here.ta_path WHERE dt = '2025-01-04') = 1/10 ---with addition of am/pm/midday time ranges: ---INSERT 0 1251472 (2024-01-04) ---Query returned successfully in 6 min 29 secs. 
- -WITH time_bins AS ( - SELECT - start_time, - start_time + '1 hour'::interval AS end_time, - tsrange(start_time, start_time + '1 hour'::interval, '[)') AS time_grp - FROM generate_series( - '2025-01-04'::date, - '2025-01-04'::date + interval '23 hours', - '1 hour'::interval - ) AS hours(start_time) - UNION - SELECT - start_time + '2025-01-04'::date, - end_time + '2025-01-04'::date, - tsrange(start_time + '2025-01-04'::date, end_time + '2025-01-04'::date, '[)') - FROM ( - VALUES - ('07:00'::time, '10:00'::time), - ('10:00', '16:00'), - ('16:00', '19:00') - ) AS time_periods(start_time, end_time) - ORDER BY start_time -), - -segment_5min_bins AS ( - SELECT - segments.segment_id, - tb.time_grp, - ta.tx, - RANK() OVER w AS bin_rank, - segments.total_length, - SUM(links.length) / segments.total_length AS sum_length, - SUM(links.length) AS length_w_data, - SUM(links.length / ta.mean * 3.6) AS unadjusted_tt, - SUM(sample_size) AS num_obs, - ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, - ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, - ARRAY_AGG(links.length ORDER BY link_dir) AS lengths - FROM here.ta_path AS ta - JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time - JOIN congestion.network_links_23_4_geom AS links USING (link_dir) - JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) - WHERE ta.dt = '2025-01-04' - --AND tx < '2025-01-04 01:00:00' - --AND segment_id = 1 AND date_trunc('hour', ta.tx) = '2025-01-04 00:00:00' - GROUP BY - segments.segment_id, - tb.time_grp, - ta.tx, - segments.total_length - WINDOW w AS ( - PARTITION BY segments.segment_id, tb.time_grp - ORDER BY ta.tx - ) -), - -dynamic_bin_options AS ( - --within each segment/hour, generate all possible forward looking bin combinations - --don't generate options for bins with sufficient length - --also don't generate options past the next bin with 80% length - SELECT - tx, - time_grp, - segment_id, - bin_rank AS start_bin, - --generate 
all the options for the end bin within the group. - generate_series( - CASE - WHEN sum_length >= 0.8 THEN bin_rank - --if length is insufficient, need at least 1 more bin - ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) - END, - CASE - --dont need to generate options when start segment is already sufficient - WHEN sum_length >= 0.8 THEN bin_rank - --generate options until 1 bin has sufficient length, otherwise until last bin in group - ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) - END, - 1 - ) AS end_bin - FROM segment_5min_bins - WINDOW w AS ( - PARTITION BY time_grp, segment_id - ORDER BY tx - --look only forward for end_bin options - RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING - ) -), - -unnested_db_options AS ( - SELECT - dbo.time_grp, - dbo.segment_id, - s5b.total_length, - dbo.tx AS dt_start, - --exclusive end bin - s5b_end.tx + interval '5 minutes' AS dt_end, - unnested.link_dir, - unnested.len, - AVG(unnested.tt) AS tt, --avg TT for each link_dir - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir - FROM dynamic_bin_options AS dbo - LEFT JOIN segment_5min_bins AS s5b - ON s5b.time_grp = dbo.time_grp - AND s5b.segment_id = dbo.segment_id - AND s5b.bin_rank >= dbo.start_bin - AND s5b.bin_rank <= dbo.end_bin - --this join is used to get the tx info about the last bin only - LEFT JOIN segment_5min_bins AS s5b_end - ON s5b_end.time_grp = dbo.time_grp - AND s5b_end.bin_rank = dbo.end_bin, - --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin - UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) - WHERE - --we need to use nested data to determine length for these multi-period bins - dbo.start_bin != dbo.end_bin - --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - AND s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' - GROUP BY - dbo.time_grp, - dbo.segment_id, - s5b.total_length, - 
dbo.tx, --stard_bin - s5b_end.tx, --end_bin - unnested.link_dir, - unnested.len -) - -INSERT INTO gwolofs.congestion_raw_segments ( - time_grp, segment_id, dt_start, dt_end, bin_range, tt, - unadjusted_tt, total_length, length_w_data, num_obs -) ---this query contains overlapping values which get eliminated ---via on conflict with the exclusion constraint on congestion_raw_segments table. -SELECT DISTINCT ON (time_grp, segment_id, dt_start) - time_grp, - segment_id, - dt_start, - dt_end, - tsrange(dt_start, dt_end, '[)') AS bin_range, - total_length / SUM(len) * SUM(tt) AS tt, - SUM(tt) AS unadjusted_tt, - total_length, - SUM(len) AS length_w_data, - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment -FROM unnested_db_options -GROUP BY - time_grp, - segment_id, - dt_start, - dt_end, - total_length -HAVING SUM(len) >= 0.8 * total_length -UNION ---these 5 minute bins already have sufficient length ---don't need to use nested data to validate. -SELECT - time_grp, - segment_id, - tx AS dt_start, - tx + interval '5 minutes' AS dt_end, - tsrange(tx, tx + interval '5 minutes', '[)') AS bin_range, - total_length / length_w_data * unadjusted_tt AS tt, - unadjusted_tt, - total_length, - length_w_data, - num_obs --sum of here.ta_path sample_size for each segment -FROM segment_5min_bins ---we do not need to use nested data to determine length here. -WHERE sum_length >= 0.8 -ORDER BY - time_grp, - segment_id, - dt_start, - dt_end ---exclusion constraint + ordered insert to prevent overlapping bins -ON CONFLICT ON CONSTRAINT dynamic_bins_unique -DO NOTHING; - -/* ---bins which were not used. Might consider adding these on to bins that already have sufficient data. 
-SELECT * -FROM gwolofs.segment_5min_bins AS s5b -LEFT JOIN gwolofs.congestion_raw_segments AS dyb ON - s5b.time_grp = dyb.time_grp - AND s5b.source_node = dyb.source_node - AND s5b.dest_node = dyb.dest_node - AND s5b.tx <@ dyb.bin_range -WHERE dyb.bin_range IS NULL -*/ - -/* -WITH hourly_obs AS ( - SELECT time_grp, segment_id, AVG(tt) AS avg_hour_tt, COUNT(*) - FROM gwolofs.congestion_raw_segments - GROUP BY time_grp, segment_id -) - -SELECT segment_id, date_part('hour', time_grp), AVG(avg_hour_tt) AS avg_tt, SUM(count) -FROM hourly_obs -GROUP BY 1, 2 ORDER BY 1, 2; -*/ \ No newline at end of file From c65985ccf9c0144c2b9c8188ab481deab7523484 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 11 Feb 2025 20:55:57 +0000 Subject: [PATCH 18/74] #1132 rename files/functions continued --- .../create-table-congestion_corridors.sql} | 0 ...create-table-congestion_raw_corridors.sql} | 23 +-- .../create-table-congestion_raw_segments.sql | 20 +- .../function-congestion_cache_corridor.sql} | 16 +- .../function-congestion_cache_tt_results.sql} | 55 ++---- ...function-congestion_day_hr_segment_agg.sql | 184 ++++++++++++++++++ .../function-congestion_dynamic_bin_avg.sql} | 34 ++-- ...unction-congestion_select_map_version.sql} | 4 +- 8 files changed, 246 insertions(+), 90 deletions(-) rename here/traffic/sql/{create-table-tt_segments.sql => dynamic_bins/create-table-congestion_corridors.sql} (100%) rename here/traffic/sql/{create-table-dynamic_binning_results.sql => dynamic_bins/create-table-congestion_raw_corridors.sql} (65%) rename here/traffic/sql/{ => dynamic_bins}/create-table-congestion_raw_segments.sql (77%) rename here/traffic/sql/{function-cache_tt_segment.sql => dynamic_bins/function-congestion_cache_corridor.sql} (84%) rename here/traffic/sql/{function-cache_tt_results.sql => dynamic_bins/function-congestion_cache_tt_results.sql} (78%) create mode 100644 
here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql rename here/traffic/sql/{create-function-here_dynamic_bin_avg.sql => dynamic_bins/function-congestion_dynamic_bin_avg.sql} (58%) rename here/traffic/sql/{function-select_map_version.sql => dynamic_bins/function-congestion_select_map_version.sql} (80%) diff --git a/here/traffic/sql/create-table-tt_segments.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql similarity index 100% rename from here/traffic/sql/create-table-tt_segments.sql rename to here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql diff --git a/here/traffic/sql/create-table-dynamic_binning_results.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql similarity index 65% rename from here/traffic/sql/create-table-dynamic_binning_results.sql rename to here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql index c0c101d1b..cf250ea1d 100644 --- a/here/traffic/sql/create-table-dynamic_binning_results.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql @@ -4,28 +4,25 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_corridors ( + corridor_id smallint, + dt date, time_grp tsrange NOT NULL, bin_range tsrange NOT NULL, - dt_start timestamp without time zone, - dt_end timestamp without time zone, tt numeric, - unadjusted_tt numeric, - total_length numeric, - length_w_data numeric, num_obs integer, - segment_uid smallint, uri_string text COLLATE pg_catalog."default", CONSTRAINT congestion_raw_corridors_exclude EXCLUDE USING gist ( bin_range WITH &&, - segment_uid WITH =, + corridor_id WITH =, time_grp WITH =, - uri_string WITH =) + uri_string WITH = + ) ) TABLESPACE pg_default; ALTER TABLE IF EXISTS gwolofs.congestion_raw_corridors - OWNER to gwolofs; +OWNER TO gwolofs; REVOKE ALL ON TABLE gwolofs.congestion_raw_corridors FROM bdit_humans; @@ -33,12 +30,12 @@ GRANT SELECT ON TABLE gwolofs.congestion_raw_corridors TO bdit_humans; 
GRANT ALL ON TABLE gwolofs.congestion_raw_corridors TO gwolofs; --- Index: dynamic_binning_results_time_grp_segment_uid_idx +-- Index: dynamic_binning_results_time_grp_corridor_id_idx --- DROP INDEX IF EXISTS gwolofs.congestion_raw_corridors_time_grp_segment_uid_idx; +-- DROP INDEX IF EXISTS gwolofs.congestion_raw_corridors_time_grp_corridor_id_idx; -CREATE INDEX IF NOT EXISTS dynamic_binning_results_time_grp_segment_uid_idx +CREATE INDEX IF NOT EXISTS dynamic_binning_results_time_grp_corridor_id_idx ON gwolofs.congestion_raw_corridors USING btree - (time_grp ASC NULLS LAST, segment_uid ASC NULLS LAST) + (time_grp ASC NULLS LAST, corridor_id ASC NULLS LAST) WITH (deduplicate_items=True) TABLESPACE pg_default; \ No newline at end of file diff --git a/here/traffic/sql/create-table-congestion_raw_segments.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql similarity index 77% rename from here/traffic/sql/create-table-congestion_raw_segments.sql rename to here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql index 7a0ec772d..4104f9246 100644 --- a/here/traffic/sql/create-table-congestion_raw_segments.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql @@ -5,17 +5,17 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_segments ( segment_id integer NOT NULL, + dt date NOT NULL, + time_grp tsrange NOT NULL, bin_range tsrange NOT NULL, tt numeric, - unadjusted_tt numeric, - total_length numeric, - length_w_data numeric, num_obs integer, - hr timestamp without time zone, CONSTRAINT congestion_raw_segments_exclude EXCLUDE USING gist ( - hr WITH =, - bin_range WITH &&, - segment_id WITH =) + segment_id WITH =, + dt WITH =, + time_grp WITH =, + bin_range WITH && + ) ) TABLESPACE pg_default; @@ -32,9 +32,9 @@ GRANT ALL ON TABLE gwolofs.congestion_raw_segments TO gwolofs; -- DROP INDEX IF EXISTS gwolofs.dynamic_bin_hr_idx; -CREATE INDEX IF NOT EXISTS dynamic_bin_hr_idx +CREATE INDEX IF NOT EXISTS 
dynamic_bin_dt_idx ON gwolofs.congestion_raw_segments USING btree - (hr ASC NULLS LAST) + (dt ASC NULLS LAST) WITH (deduplicate_items=True) TABLESPACE pg_default; -- Index: dynamic_bin_idx @@ -43,6 +43,6 @@ CREATE INDEX IF NOT EXISTS dynamic_bin_hr_idx CREATE INDEX IF NOT EXISTS dynamic_bin_idx ON gwolofs.congestion_raw_segments USING btree - (segment_id ASC NULLS LAST, hr ASC NULLS LAST) + (segment_id ASC NULLS LAST, dt ASC NULLS LAST) WITH (deduplicate_items=True) TABLESPACE pg_default; \ No newline at end of file diff --git a/here/traffic/sql/function-cache_tt_segment.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql similarity index 84% rename from here/traffic/sql/function-cache_tt_segment.sql rename to here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql index 354f63426..7f15f8183 100644 --- a/here/traffic/sql/function-cache_tt_segment.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql @@ -6,7 +6,7 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_corridor( node_start bigint, node_end bigint, map_version text, - OUT uid smallint, + OUT corridor_id smallint, OUT link_dirs text[], OUT lengths numeric[], OUT total_length numeric) @@ -25,16 +25,16 @@ BEGIN --check if the node pair and map_version have already been routed --and if so, return values SELECT - tt.uid, + tt.corridor_id, tt.link_dirs, tt.lengths, tt.total_length - INTO uid, link_dirs, lengths, total_length + INTO corridor_id, link_dirs, lengths, total_length FROM gwolofs.congestion_corridors AS tt WHERE - tt.node_start = cache_tt_segment.node_start - AND tt.node_end = cache_tt_segment.node_end - AND tt.map_version = cache_tt_segment.map_version; + tt.node_start = congestion_cache_corridor.node_start + AND tt.node_end = congestion_cache_corridor.node_end + AND tt.map_version = congestion_cache_corridor.map_version; IF FOUND THEN RETURN; END IF; @@ -68,12 +68,12 @@ EXECUTE format ( link_dirs = excluded.link_dirs, lengths = 
excluded.lengths, total_length = excluded.total_length - RETURNING uid, link_dirs, lengths, total_length + RETURNING corridor_id, link_dirs, lengths, total_length $$, routing_function, node_start, node_end, -- For routed_links map_version, -- For INSERT SELECT values street_geoms_table -- For JOIN table -) INTO uid, link_dirs, lengths, total_length; +) INTO corridor_id, link_dirs, lengths, total_length; RETURN; END; $BODY$; diff --git a/here/traffic/sql/function-cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql similarity index 78% rename from here/traffic/sql/function-cache_tt_results.sql rename to here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 57d5b7831..fbe8f64f1 100644 --- a/here/traffic/sql/function-cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -22,23 +22,23 @@ DECLARE map_version text; BEGIN -SELECT gwolofs.congestion_select_map_version(cache_tt_results.start_date, cache_tt_results.end_date) INTO map_version; +SELECT gwolofs.congestion_select_map_version(congestion_cache_tt_results.start_date, congestion_cache_tt_results.end_date) INTO map_version; EXECUTE format( $$ WITH segment AS ( SELECT - uid AS segment_uid, + corridor_id, unnested.link_dir, unnested.length, total_length FROM gwolofs.congestion_cache_corridor(%L, %L, %L), - UNNEST(cache_tt_segment.link_dirs, cache_tt_segment.lengths) AS unnested(link_dir, length) + UNNEST(congestion_cache_corridor.link_dirs, congestion_cache_corridor.lengths) AS unnested(link_dir, length) ), segment_5min_bins AS ( SELECT - seg.segment_uid, +seg.corridor_id, ta.tx, seg.total_length, tsrange( @@ -71,9 +71,9 @@ EXECUTE format( ta.tx, ta.dt, seg.total_length, - segment_uid + corridor_id WINDOW w AS ( - PARTITION BY seg.segment_uid, ta.dt + PARTITION BY seg.corridor_id, ta.dt ORDER BY ta.tx ) ), @@ -112,7 +112,7 @@ EXECUTE format( unnested_db_options AS ( SELECT - s5b.segment_uid, + s5b.corridor_id, 
dbo.time_grp, s5b.total_length, dbo.tx AS dt_start, @@ -133,10 +133,8 @@ EXECUTE format( AND s5b_end.bin_rank = dbo.end_bin, --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) - --we need to use nested data to determine length for these multi-period bins - WHERE dbo.start_bin != dbo.end_bin GROUP BY - s5b.segment_uid, + s5b.corridor_id, dbo.time_grp, s5b.total_length, dbo.tx, --stard_bin @@ -148,61 +146,36 @@ EXECUTE format( ) INSERT INTO gwolofs.congestion_raw_corridors ( - uri_string, - time_grp, segment_uid, dt_start, dt_end, bin_range, tt, - unadjusted_tt, total_length, length_w_data, num_obs + uri_string, time_grp, corridor_id, bin_range, tt, num_obs ) --this query contains overlapping values which get eliminated --via on conflict with the exclusion constraint on congestion_raw_segments table. - SELECT DISTINCT ON (dt_start) + SELECT DISTINCT ON (dt_start) --distinct on ensures only the shortest option gets proposed for insert %L, time_grp, - segment_uid, - dt_start, - dt_end, + corridor_id, tsrange(dt_start, dt_end, '[)') AS bin_range, total_length / SUM(len) * SUM(tt) AS tt, - SUM(tt) AS unadjusted_tt, - total_length, - SUM(len) AS length_w_data, SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment FROM unnested_db_options GROUP BY time_grp, - segment_uid, + corridor_id, dt_start, dt_end, total_length HAVING SUM(len) >= 0.8 * total_length - UNION - --these 5 minute bins already have sufficient length - --don't need to use nested data to validate. 
- SELECT - %L, - time_grp, - segment_uid, - tx AS dt_start, - tx + interval '5 minutes' AS dt_end, - tsrange(tx, tx + interval '5 minutes', '[)') AS bin_range, - total_length / length_w_data * unadjusted_tt AS tt, - unadjusted_tt, - total_length, - length_w_data, - num_obs --sum of here.ta_path sample_size for each segment - FROM segment_5min_bins - --we do not need to use nested data to determine length here. - WHERE sum_length >= 0.8 ORDER BY dt_start, dt_end --exclusion constraint + ordered insert to prevent overlapping bins - ON CONFLICT ON CONSTRAINT dynamic_bins_unique_temp + ON CONFLICT ON CONSTRAINT congestion_raw_corridors_exclude DO NOTHING; $$, node_start, node_end, map_version, --segment CTE start_tod, end_tod, --segment_5min_bins CTE SELECT start_tod, end_tod, dow_list, start_date, end_date, --segment_5min_bins CTE WHERE - cache_tt_results.uri_string, cache_tt_results.uri_string --INSERT + congestion_cache_tt_results.uri_string, congestion_cache_tt_results.uri_string --INSERT ); END; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql new file mode 100644 index 000000000..6f27989d3 --- /dev/null +++ b/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql @@ -0,0 +1,184 @@ +-- FUNCTION: gwolofs.congestion_day_hr_segment_agg(date) + +-- DROP FUNCTION IF EXISTS gwolofs.congestion_day_hr_segment_agg(date); + +CREATE OR REPLACE FUNCTION gwolofs.congestion_day_hr_segment_agg( + start_date date) + RETURNS void + LANGUAGE 'plpgsql' + COST 100 + VOLATILE PARALLEL UNSAFE +AS $BODY$ + +DECLARE + map_version text := gwolofs.congestion_select_map_version( + dynamic_bin_congestion_ntwrk_hrly.start_date, dynamic_bin_congestion_ntwrk_hrly.end_date); + congestion_network_table text := 'network_links_' || map_version; + +BEGIN + +EXECUTE FORMAT( + $$ + WITH time_bins AS ( + SELECT + start_time, + start_time + '1 hour'::interval AS end_time, + 
tsrange(start_time, start_time + '1 hour'::interval, '[)') AS time_grp + FROM generate_series( + %1$L::date + '00:00'::time, + %1$L::date + '23 hour'::interval, '1 hour'::interval) AS hours(start_time) + ), + + segments AS ( + SELECT + segment_id, + link_dir, + length, + SUM(length) OVER (PARTITION BY segment_id) AS total_length + FROM congestion.%2$I + ), + + segment_5min_bins AS ( + SELECT + links.segment_id, + tb.time_grp, + ta.tx, + RANK() OVER w AS bin_rank, + links.total_length, + SUM(links.length) / links.total_length AS sum_length, + SUM(links.length) AS length_w_data, + SUM(links.length / ta.mean * 3.6) AS unadjusted_tt, + SUM(sample_size) AS num_obs, + ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, + ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, + ARRAY_AGG(links.length ORDER BY link_dir) AS lengths + FROM here.ta_path AS ta + JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time +-- JOIN congestion.network_links_23_4_geom AS links USING (link_dir) + JOIN segments AS links USING (link_dir) +-- JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) + WHERE + ta.dt >= %1$L + AND ta.dt < %1$L + interval '1 day' + GROUP BY + links.segment_id, + tb.time_grp, + ta.tx, + links.total_length + WINDOW w AS ( + PARTITION BY links.segment_id, tb.time_grp + ORDER BY ta.tx + ) + ), + + dynamic_bin_options AS ( + --within each segment/hour, generate all possible forward looking bin combinations + --don't generate options for bins with sufficient length + --also don't generate options past the next bin with 80%% length + SELECT + tx, + time_grp, + segment_id, + bin_rank AS start_bin, + --generate all the options for the end bin within the group. 
+ generate_series( + CASE + WHEN sum_length >= 0.8 THEN bin_rank + --if length is insufficient, need at least 1 more bin + ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) + END, + CASE + --dont need to generate options when start segment is already sufficient + WHEN sum_length >= 0.8 THEN bin_rank + --generate options until 1 bin has sufficient length, otherwise until last bin in group + ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) + END, + 1 + ) AS end_bin + FROM segment_5min_bins + WINDOW w AS ( + PARTITION BY time_grp, segment_id + ORDER BY tx + --look only forward for end_bin options + RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + ) + ), + + unnested_db_options AS ( + SELECT + dbo.time_grp, + dbo.segment_id, + s5b.total_length, + dbo.tx AS dt_start, + --exclusive end bin + s5b_end.tx + interval '5 minutes' AS dt_end, + unnested.link_dir, + unnested.len, + AVG(unnested.tt) AS tt, --avg TT for each link_dir + SUM(s5b.num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir + FROM dynamic_bin_options AS dbo + LEFT JOIN segment_5min_bins AS s5b + ON s5b.time_grp = dbo.time_grp + AND s5b.segment_id = dbo.segment_id + AND s5b.bin_rank >= dbo.start_bin + AND s5b.bin_rank <= dbo.end_bin + --this join is used to get the tx info about the last bin only + LEFT JOIN segment_5min_bins AS s5b_end + ON s5b_end.time_grp = dbo.time_grp + AND s5b_end.segment_id = dbo.segment_id + AND s5b_end.bin_rank = dbo.end_bin, + --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin + UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) + --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) + WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' + GROUP BY + dbo.time_grp, + dbo.segment_id, + s5b.total_length, + dbo.tx, --stard_bin + s5b_end.tx, --end_bin + unnested.link_dir, + unnested.len + ) + + --this query contains 
overlapping values which get eliminated + --via on conflict with the exclusion constraint on congestion_raw_segments table. + INSERT INTO gwolofs.congestion_raw_segments ( + dt, time_grp, segment_id, bin_range, tt, num_obs + ) + --distinct on ensures only the shortest option gets proposed for insert + SELECT DISTINCT ON (time_grp, segment_id, dt_start) + lower(time_grp)::date AS dt, + time_grp, + segment_id, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment + FROM unnested_db_options + GROUP BY + time_grp, + segment_id, + dt_start, + dt_end, + total_length + HAVING SUM(len) >= 0.8 * total_length + ORDER BY + time_grp, + segment_id, + bin_range --uses the option that ends first + --exclusion constraint + ordered insert to prevent overlapping bins + ON CONFLICT ON CONSTRAINT dynamic_bins_unique + DO NOTHING; + $$, + dynamic_bin_congestion_ntwrk_hrly.start_date, + congestion_network_table + ); + +END; +$BODY$; + +ALTER FUNCTION gwolofs.congestion_day_hr_segment_agg(date) + OWNER TO gwolofs; + +COMMENT ON FUNCTION gwolofs.congestion_day_hr_segment_agg(date) + IS 'Previously dynamic_bin_congestion_ntwrk_hrly'; diff --git a/here/traffic/sql/create-function-here_dynamic_bin_avg.sql b/here/traffic/sql/dynamic_bins/function-congestion_dynamic_bin_avg.sql similarity index 58% rename from here/traffic/sql/create-function-here_dynamic_bin_avg.sql rename to here/traffic/sql/dynamic_bins/function-congestion_dynamic_bin_avg.sql index d51b7f0f0..a87412ec0 100644 --- a/here/traffic/sql/create-function-here_dynamic_bin_avg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_dynamic_bin_avg.sql @@ -18,28 +18,28 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_dynamic_bin_avg( AS $BODY$ DECLARE uri_string_func text := - here_dynamic_bin_avg.node_start::text || '/' || - here_dynamic_bin_avg.node_end::text || '/' || - here_dynamic_bin_avg.start_tod::text || '/' 
|| - here_dynamic_bin_avg.end_tod::text || '/' || - here_dynamic_bin_avg.start_date::text || '/' || - here_dynamic_bin_avg.end_date::text || '/' || - here_dynamic_bin_avg.holidays::text || '/' || - here_dynamic_bin_avg.dow_list::text; + congestion_dynamic_bin_avg.node_start::text || '/' || + congestion_dynamic_bin_avg.node_end::text || '/' || + congestion_dynamic_bin_avg.start_tod::text || '/' || + congestion_dynamic_bin_avg.end_tod::text || '/' || + congestion_dynamic_bin_avg.start_date::text || '/' || + congestion_dynamic_bin_avg.end_date::text || '/' || + congestion_dynamic_bin_avg.holidays::text || '/' || + congestion_dynamic_bin_avg.dow_list::text; res numeric; BEGIN PERFORM gwolofs.congestion_cache_tt_results( uri_string := uri_string_func, - start_date := here_dynamic_bin_avg.start_date, - end_date := here_dynamic_bin_avg.end_date, - start_tod := here_dynamic_bin_avg.start_tod, - end_tod := here_dynamic_bin_avg.end_tod, - dow_list := here_dynamic_bin_avg.dow_list, - node_start := here_dynamic_bin_avg.node_start, - node_end := here_dynamic_bin_avg.node_end, - holidays := here_dynamic_bin_avg.holidays + start_date := congestion_dynamic_bin_avg.start_date, + end_date := congestion_dynamic_bin_avg.end_date, + start_tod := congestion_dynamic_bin_avg.start_tod, + end_tod := congestion_dynamic_bin_avg.end_tod, + dow_list := congestion_dynamic_bin_avg.dow_list, + node_start := congestion_dynamic_bin_avg.node_start, + node_end := congestion_dynamic_bin_avg.node_end, + holidays := congestion_dynamic_bin_avg.holidays ); WITH daily_means AS ( @@ -63,4 +63,4 @@ $BODY$; ALTER FUNCTION gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) OWNER TO gwolofs; -COMMENT ON FUNCTION gwolofs.congestion_dynamic_bin_avg IS 'Previously gwolofs.here_dynamic_bin_avg.'; +COMMENT ON FUNCTION gwolofs.congestion_dynamic_bin_avg IS 'Previously gwolofs.congestion_dynamic_bin_avg.'; diff --git 
a/here/traffic/sql/function-select_map_version.sql b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql similarity index 80% rename from here/traffic/sql/function-select_map_version.sql rename to here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql index ab4ca1010..b1d3c3343 100644 --- a/here/traffic/sql/function-select_map_version.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql @@ -14,7 +14,9 @@ AS $BODY$ SELECT street_version FROM here.street_valid_range AS svr, LATERAL ( - SELECT svr.valid_range * daterange(select_map_version.start_date, select_map_version.end_date, '[)') AS overlap + SELECT svr.valid_range * daterange( + congestion_select_map_version.start_date, + congestion_select_map_version.end_date, '[)') AS overlap ) AS lat WHERE UPPER(lat.overlap) - LOWER(lat.overlap) IS NOT NULL ORDER BY UPPER(lat.overlap) - LOWER(lat.overlap) DESC NULLS LAST From a59f2fd56ba87467e9f0c4b6d5b88c4cccaecba9 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 11 Feb 2025 21:01:07 +0000 Subject: [PATCH 19/74] #1132 fluff --- .../function-congestion_cache_corridor.sql | 14 +++++++------- .../function-congestion_cache_tt_results.sql | 18 +++++++++--------- .../function-congestion_day_hr_segment_agg.sql | 2 +- .../function-congestion_select_map_version.sql | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql index 7f15f8183..5433dd398 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql @@ -3,13 +3,13 @@ -- DROP FUNCTION IF EXISTS gwolofs.congestion_cache_corridor(bigint, bigint, text); CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_corridor( - node_start bigint, - node_end bigint, - 
map_version text, - OUT corridor_id smallint, - OUT link_dirs text[], - OUT lengths numeric[], - OUT total_length numeric) + node_start bigint, + node_end bigint, + map_version text, + OUT corridor_id smallint, + OUT link_dirs text[], + OUT lengths numeric[], + OUT total_length numeric) RETURNS record LANGUAGE 'plpgsql' COST 100 diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index fbe8f64f1..cd66fda48 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -3,15 +3,15 @@ -- DROP FUNCTION IF EXISTS gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_tt_results( - uri_string text, - start_date date, - end_date date, - start_tod time without time zone, - end_tod time without time zone, - dow_list integer[], - node_start bigint, - node_end bigint, - holidays boolean) + uri_string text, + start_date date, + end_date date, + start_tod time without time zone, + end_tod time without time zone, + dow_list integer[], + node_start bigint, + node_end bigint, + holidays boolean) RETURNS void LANGUAGE 'plpgsql' COST 100 diff --git a/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql index 6f27989d3..83009ac12 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql @@ -3,7 +3,7 @@ -- DROP FUNCTION IF EXISTS gwolofs.congestion_day_hr_segment_agg(date); CREATE OR REPLACE FUNCTION gwolofs.congestion_day_hr_segment_agg( - start_date date) + start_date date) RETURNS void LANGUAGE 'plpgsql' COST 100 diff --git 
a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql index b1d3c3343..1127a5249 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql @@ -3,8 +3,8 @@ -- DROP FUNCTION IF EXISTS gwolofs.congestion_select_map_version(date, date); CREATE OR REPLACE FUNCTION gwolofs.congestion_select_map_version( - start_date date, - end_date date) + start_date date, + end_date date) RETURNS text LANGUAGE 'sql' COST 100 From b23d61115cf2bd205244b86d5d0730aa8968eda8 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 13 Feb 2025 15:40:03 +0000 Subject: [PATCH 20/74] tsrange -> timerange --- .../create-table-congestion_raw_corridors.sql | 2 +- .../create-table-congestion_raw_segments.sql | 2 +- .../function-congestion_cache_tt_results.sql | 26 +++++++++++++------ ...function-congestion_day_hr_segment_agg.sql | 7 ++--- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql index cf250ea1d..866fcea1f 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql @@ -6,7 +6,7 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_corridors ( corridor_id smallint, dt date, - time_grp tsrange NOT NULL, + time_grp timerange NOT NULL, bin_range tsrange NOT NULL, tt numeric, num_obs integer, diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql index 4104f9246..3081bed57 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql +++ 
b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql @@ -6,7 +6,7 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_segments ( segment_id integer NOT NULL, dt date NOT NULL, - time_grp tsrange NOT NULL, + time_grp timerange NOT NULL, bin_range tsrange NOT NULL, tt numeric, num_obs integer, diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index cd66fda48..4e76bea4a 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -33,12 +33,15 @@ EXECUTE format( unnested.length, total_length FROM gwolofs.congestion_cache_corridor(%L, %L, %L), - UNNEST(congestion_cache_corridor.link_dirs, congestion_cache_corridor.lengths) AS unnested(link_dir, length) + UNNEST( + congestion_cache_corridor.link_dirs, + congestion_cache_corridor.lengths + ) AS unnested(link_dir, length) ), segment_5min_bins AS ( SELECT -seg.corridor_id, + seg.corridor_id, ta.tx, seg.total_length, tsrange( @@ -52,7 +55,7 @@ seg.corridor_id, ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, ARRAY_AGG(seg.length ORDER BY link_dir) AS lengths - FROM here.ta AS ta + FROM here.ta_path AS ta JOIN segment AS seg USING (link_dir) WHERE ( @@ -97,7 +100,10 @@ seg.corridor_id, --dont need to generate options when start segment is already sufficient WHEN sum_length >= 0.8 THEN bin_rank --generate options until 1 bin has sufficient length, otherwise until last bin in group - ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) + ELSE COALESCE( + MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, + MAX(bin_rank) OVER w + ) END, 1 ) AS end_bin @@ -146,13 +152,14 @@ seg.corridor_id, ) INSERT INTO gwolofs.congestion_raw_corridors ( - uri_string, time_grp, corridor_id, bin_range, tt, 
num_obs + uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs ) --this query contains overlapping values which get eliminated --via on conflict with the exclusion constraint on congestion_raw_segments table. SELECT DISTINCT ON (dt_start) --distinct on ensures only the shortest option gets proposed for insert %L, - time_grp, + dt_start::date AS dt, + timerange(lower(time_grp)::time, upper(time_grp)::time, '[)') AS time_grp, corridor_id, tsrange(dt_start, dt_end, '[)') AS bin_range, total_length / SUM(len) * SUM(tt) AS tt, @@ -181,5 +188,8 @@ seg.corridor_id, END; $BODY$; -ALTER FUNCTION gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) - OWNER TO gwolofs; +ALTER FUNCTION gwolofs.congestion_cache_tt_results( + text, date, date, time without time zone, + time without time zone, integer[], bigint, bigint, boolean +) +OWNER TO gwolofs; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql index 83009ac12..bd5612a5c 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql @@ -23,7 +23,10 @@ EXECUTE FORMAT( SELECT start_time, start_time + '1 hour'::interval AS end_time, - tsrange(start_time, start_time + '1 hour'::interval, '[)') AS time_grp + timerange( + start_time::time, + CASE start_time::time WHEN '23:00' THEN '24:00' ELSE start_time::time + '1 hour'::interval END, + '[)') AS time_grp FROM generate_series( %1$L::date + '00:00'::time, %1$L::date + '23 hour'::interval, '1 hour'::interval) AS hours(start_time) @@ -54,9 +57,7 @@ EXECUTE FORMAT( ARRAY_AGG(links.length ORDER BY link_dir) AS lengths FROM here.ta_path AS ta JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time --- JOIN congestion.network_links_23_4_geom AS links USING (link_dir) JOIN 
segments AS links USING (link_dir) --- JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) WHERE ta.dt >= %1$L AND ta.dt < %1$L + interval '1 day' From b7a6f70536b401a5f02cc4b19ce57ad98822a906 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 13 Feb 2025 19:39:32 +0000 Subject: [PATCH 21/74] #1132 congestion ntwrk hrly and period agg updates --- ...ion-congestion_network_hr_segment_agg.sql} | 20 +- ...-congestion_network_period_segment_agg.sql | 193 ++++++++++++++++++ 2 files changed, 203 insertions(+), 10 deletions(-) rename here/traffic/sql/dynamic_bins/{function-congestion_day_hr_segment_agg.sql => function-congestion_network_hr_segment_agg.sql} (92%) create mode 100644 here/traffic/sql/dynamic_bins/function-congestion_network_period_segment_agg.sql diff --git a/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql similarity index 92% rename from here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql rename to here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql index bd5612a5c..0eb08624a 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_day_hr_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql @@ -11,8 +11,7 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_day_hr_segment_agg( AS $BODY$ DECLARE - map_version text := gwolofs.congestion_select_map_version( - dynamic_bin_congestion_ntwrk_hrly.start_date, dynamic_bin_congestion_ntwrk_hrly.end_date); + map_version text := gwolofs.congestion_select_map_version(start_date, start_date + 1); congestion_network_table text := 'network_links_' || map_version; BEGIN @@ -59,8 +58,8 @@ EXECUTE FORMAT( JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time JOIN segments AS links USING (link_dir) WHERE - ta.dt >= %1$L - AND ta.dt < %1$L + 
interval '1 day' + ta.dt >= %1$L::date + AND ta.dt < %1$L::date + interval '1 day' GROUP BY links.segment_id, tb.time_grp, @@ -149,7 +148,7 @@ EXECUTE FORMAT( ) --distinct on ensures only the shortest option gets proposed for insert SELECT DISTINCT ON (time_grp, segment_id, dt_start) - lower(time_grp)::date AS dt, + dt_start::date AS dt, time_grp, segment_id, tsrange(dt_start, dt_end, '[)') AS bin_range, @@ -166,12 +165,13 @@ EXECUTE FORMAT( ORDER BY time_grp, segment_id, - bin_range --uses the option that ends first + dt_start, + dt_end --uses the option that ends first --exclusion constraint + ordered insert to prevent overlapping bins - ON CONFLICT ON CONSTRAINT dynamic_bins_unique + ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude DO NOTHING; $$, - dynamic_bin_congestion_ntwrk_hrly.start_date, + start_date, congestion_network_table ); @@ -179,7 +179,7 @@ END; $BODY$; ALTER FUNCTION gwolofs.congestion_day_hr_segment_agg(date) - OWNER TO gwolofs; +OWNER TO gwolofs; COMMENT ON FUNCTION gwolofs.congestion_day_hr_segment_agg(date) - IS 'Previously dynamic_bin_congestion_ntwrk_hrly'; +IS 'Dynamic bin aggregation of the congestion network by hourly periods.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_period_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_period_segment_agg.sql new file mode 100644 index 000000000..8091e55fc --- /dev/null +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_period_segment_agg.sql @@ -0,0 +1,193 @@ +-- FUNCTION: gwolofs.congestion_network_period_segment_agg(date) + +-- DROP FUNCTION IF EXISTS gwolofs.congestion_network_period_segment_agg(date); + +CREATE OR REPLACE FUNCTION gwolofs.congestion_network_period_segment_agg( + start_date date) + RETURNS void + LANGUAGE 'plpgsql' + COST 100 + VOLATILE PARALLEL UNSAFE +AS $BODY$ + +DECLARE + map_version text := gwolofs.congestion_select_map_version(start_date, start_date + 1); + congestion_network_table text := 
'network_links_' || map_version; + +BEGIN + +EXECUTE FORMAT( + $$ + WITH time_bins AS ( + SELECT + %1$L::date + start_tod AS start_time, --start_date + %1$L::date + end_tod AS end_time, --start_date + timerange(start_tod, end_tod, '[)') AS time_grp + FROM + (VALUES + ('00:00:00'::time, '06:00:00'::time), + ('06:00:00'::time, '10:00:00'::time), + ('10:00:00'::time, '15:00:00'::time), + ('15:00:00'::time, '19:00:00'::time), + ('19:00:00'::time, '24:00:00'::time) + ) AS times(start_tod, end_tod) + ), + + segments AS ( + SELECT + segment_id, + link_dir, + length, + SUM(length) OVER (PARTITION BY segment_id) AS total_length + FROM congestion.%2$I + ), + + segment_5min_bins AS ( + SELECT + links.segment_id, + tb.time_grp, + ta.tx, + RANK() OVER w AS bin_rank, + links.total_length, + SUM(links.length) / links.total_length AS sum_length, + SUM(links.length) AS length_w_data, + SUM(links.length / ta.mean * 3.6) AS unadjusted_tt, + SUM(sample_size) AS num_obs, + ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, + ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, + ARRAY_AGG(links.length ORDER BY link_dir) AS lengths + FROM here.ta_path AS ta + JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time + JOIN segments AS links USING (link_dir) + WHERE + ta.dt >= %1$L::date + AND ta.dt < %1$L::date + interval '1 day' + GROUP BY + links.segment_id, + tb.time_grp, + ta.tx, + links.total_length + WINDOW w AS ( + PARTITION BY links.segment_id, tb.time_grp + ORDER BY ta.tx + ) + ), + + dynamic_bin_options AS ( + --within each segment/hour, generate all possible forward looking bin combinations + --don't generate options for bins with sufficient length + --also don't generate options past the next bin with 80%% length + SELECT + tx, + time_grp, + segment_id, + bin_rank AS start_bin, + --generate all the options for the end bin within the group. 
+ generate_series( + CASE + WHEN sum_length >= 0.8 THEN bin_rank + --if length is insufficient, need at least 1 more bin + ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) + END, + CASE + --dont need to generate options when start segment is already sufficient + WHEN sum_length >= 0.8 THEN bin_rank + --generate options until 1 bin has sufficient length, otherwise until last bin in group + ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) + END, + 1 + ) AS end_bin + FROM segment_5min_bins + WINDOW w AS ( + PARTITION BY time_grp, segment_id + ORDER BY tx + --look only forward for end_bin options + RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + ) + ), + + unnested_db_options AS ( + SELECT + dbo.time_grp, + dbo.segment_id, + s5b.total_length, + dbo.tx AS dt_start, + --exclusive end bin + s5b_end.tx + interval '5 minutes' AS dt_end, + unnested.link_dir, + unnested.len, + AVG(unnested.tt) AS tt, --avg TT for each link_dir + SUM(s5b.num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir + FROM dynamic_bin_options AS dbo + LEFT JOIN segment_5min_bins AS s5b + ON s5b.time_grp = dbo.time_grp + AND s5b.segment_id = dbo.segment_id + AND s5b.bin_rank >= dbo.start_bin + AND s5b.bin_rank <= dbo.end_bin + --this join is used to get the tx info about the last bin only + LEFT JOIN segment_5min_bins AS s5b_end + ON s5b_end.time_grp = dbo.time_grp + AND s5b_end.segment_id = dbo.segment_id + AND s5b_end.bin_rank = dbo.end_bin, + --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin + UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) + --makes sense to relax this constraint for periods + --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) + --WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' + GROUP BY + dbo.time_grp, + dbo.segment_id, + s5b.total_length, + dbo.tx, --stard_bin + s5b_end.tx, --end_bin + unnested.link_dir, 
+ unnested.len + ) + + --this query contains overlapping values which get eliminated + --via on conflict with the exclusion constraint on congestion_raw_segments table. + INSERT INTO gwolofs.congestion_raw_segments ( + dt, time_grp, segment_id, bin_range, tt, num_obs + ) + --distinct on ensures only the shortest option gets proposed for insert + SELECT DISTINCT ON (time_grp, segment_id, dt_start) + dt_start::date AS dt, + time_grp, + segment_id, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment + FROM unnested_db_options + GROUP BY + time_grp, + segment_id, + dt_start, + dt_end, + total_length + HAVING SUM(len) >= 0.8 * total_length + ORDER BY + time_grp, + segment_id, + dt_start, + dt_end --uses the option that ends first + --exclusion constraint + ordered insert to prevent overlapping bins + ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude + DO NOTHING; + $$, + start_date, + congestion_network_table + ); + +END; +$BODY$; + +ALTER FUNCTION gwolofs.congestion_network_period_segment_agg(date) + OWNER TO gwolofs; + +COMMENT ON FUNCTION gwolofs.congestion_network_period_segment_agg(date) + IS '''Dynamic bin aggregation of the congestion network by periods: + (''00:00:00''::time, ''06:00:00''::time), + (''06:00:00''::time, ''10:00:00''::time), + (''10:00:00''::time, ''15:00:00''::time), + (''15:00:00''::time, ''19:00:00''::time), + (''19:00:00''::time, ''24:00:00''::time)'''; From 674b0c1db503af59a8bf2fea203e494f649b9289 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Fri, 14 Feb 2025 17:17:42 +0000 Subject: [PATCH 22/74] #1132 rename function for consistency --- .../function-congestion_network_hr_segment_agg.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql 
b/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql index 0eb08624a..78d69b198 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql @@ -1,8 +1,8 @@ --- FUNCTION: gwolofs.congestion_day_hr_segment_agg(date) +-- FUNCTION: gwolofs.congestion_network_hr_segment_agg(date) --- DROP FUNCTION IF EXISTS gwolofs.congestion_day_hr_segment_agg(date); +-- DROP FUNCTION IF EXISTS gwolofs.congestion_network_hr_segment_agg(date); -CREATE OR REPLACE FUNCTION gwolofs.congestion_day_hr_segment_agg( +CREATE OR REPLACE FUNCTION gwolofs.congestion_network_hr_segment_agg( start_date date) RETURNS void LANGUAGE 'plpgsql' @@ -178,8 +178,8 @@ EXECUTE FORMAT( END; $BODY$; -ALTER FUNCTION gwolofs.congestion_day_hr_segment_agg(date) +ALTER FUNCTION gwolofs.congestion_network_hr_segment_agg(date) OWNER TO gwolofs; -COMMENT ON FUNCTION gwolofs.congestion_day_hr_segment_agg(date) +COMMENT ON FUNCTION gwolofs.congestion_network_hr_segment_agg(date) IS 'Dynamic bin aggregation of the congestion network by hourly periods.'; From 61fdfdc0e2887dfff6c284fdb89b68bebd41e90e Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:58:54 +0000 Subject: [PATCH 23/74] #1132 move time_grps to view, combine hourly and period agg --- .../create-view-congestion_time_grps.sql | 32 +++ ...tion-congestion_network_hr_segment_agg.sql | 41 ++-- ...-congestion_network_period_segment_agg.sql | 193 ------------------ 3 files changed, 48 insertions(+), 218 deletions(-) create mode 100644 here/traffic/sql/dynamic_bins/create-view-congestion_time_grps.sql delete mode 100644 here/traffic/sql/dynamic_bins/function-congestion_network_period_segment_agg.sql diff --git a/here/traffic/sql/dynamic_bins/create-view-congestion_time_grps.sql b/here/traffic/sql/dynamic_bins/create-view-congestion_time_grps.sql new file mode 100644 
index 000000000..1871889d0 --- /dev/null +++ b/here/traffic/sql/dynamic_bins/create-view-congestion_time_grps.sql @@ -0,0 +1,32 @@ +CREATE VIEW gwolofs.congestion_time_grps AS + +SELECT + start_tod, + end_tod, + 1 AS table_order +FROM ( + VALUES + ('00:00:00'::time, '06:00:00'::time), + ('06:00:00'::time, '10:00:00'::time), + ('10:00:00'::time, '15:00:00'::time), + ('15:00:00'::time, '19:00:00'::time), + ('19:00:00'::time, '24:00:00'::time) +) AS times(start_tod, end_tod) +UNION +SELECT + (start_hour || ':00')::time AS start_tod, + (start_hour + 1 || ':00')::time AS end_tod, + 2 AS table_order +FROM generate_series(0, 23, 1) AS start_hour +ORDER BY + table_order, + start_tod, + end_tod; + +COMMENT ON VIEW gwolofs.congestion_time_grps +IS 'Hours and time periods for congestion aggregation.'; + +ALTER VIEW gwolofs.congestion_time_grps OWNER TO gwolofs; + +GRANT SELECT ON TABLE gwolofs.congestion_time_grps TO bdit_humans; +GRANT ALL ON TABLE gwolofs.congestion_time_grps TO gwolofs; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql index 78d69b198..e68ead7eb 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql @@ -1,8 +1,8 @@ --- FUNCTION: gwolofs.congestion_network_hr_segment_agg(date) +-- FUNCTION: gwolofs.congestion_network_segment_agg(date) --- DROP FUNCTION IF EXISTS gwolofs.congestion_network_hr_segment_agg(date); +-- DROP FUNCTION IF EXISTS gwolofs.congestion_network_segment_agg(date); -CREATE OR REPLACE FUNCTION gwolofs.congestion_network_hr_segment_agg( +CREATE OR REPLACE FUNCTION gwolofs.congestion_network_segment_agg( start_date date) RETURNS void LANGUAGE 'plpgsql' @@ -12,26 +12,14 @@ AS $BODY$ DECLARE map_version text := gwolofs.congestion_select_map_version(start_date, start_date + 1); - congestion_network_table text := 
'network_links_' || map_version; + congestion_network_table text := 'network_links_' || map_version + || CASE map_version WHEN '23_4' THEN '_geom' ELSE '' END; --temp fix version BEGIN EXECUTE FORMAT( $$ - WITH time_bins AS ( - SELECT - start_time, - start_time + '1 hour'::interval AS end_time, - timerange( - start_time::time, - CASE start_time::time WHEN '23:00' THEN '24:00' ELSE start_time::time + '1 hour'::interval END, - '[)') AS time_grp - FROM generate_series( - %1$L::date + '00:00'::time, - %1$L::date + '23 hour'::interval, '1 hour'::interval) AS hours(start_time) - ), - - segments AS ( + WITH segments AS ( SELECT segment_id, link_dir, @@ -43,7 +31,7 @@ EXECUTE FORMAT( segment_5min_bins AS ( SELECT links.segment_id, - tb.time_grp, + timerange(tg.start_tod, tg.end_tod, '[)') AS time_grp, ta.tx, RANK() OVER w AS bin_rank, links.total_length, @@ -55,18 +43,21 @@ EXECUTE FORMAT( ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, ARRAY_AGG(links.length ORDER BY link_dir) AS lengths FROM here.ta_path AS ta - JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time + JOIN gwolofs.congestion_time_grps AS tg ON + ta.tx >= %1$L::date + tg.start_tod + AND ta.tx < %1$L::date + tg.end_tod JOIN segments AS links USING (link_dir) WHERE ta.dt >= %1$L::date AND ta.dt < %1$L::date + interval '1 day' GROUP BY links.segment_id, - tb.time_grp, + tg.start_tod, + tg.end_tod, ta.tx, links.total_length WINDOW w AS ( - PARTITION BY links.segment_id, tb.time_grp + PARTITION BY links.segment_id, tg.start_tod, tg.end_tod ORDER BY ta.tx ) ), @@ -178,8 +169,8 @@ EXECUTE FORMAT( END; $BODY$; -ALTER FUNCTION gwolofs.congestion_network_hr_segment_agg(date) +ALTER FUNCTION gwolofs.congestion_network_segment_agg(date) OWNER TO gwolofs; -COMMENT ON FUNCTION gwolofs.congestion_network_hr_segment_agg(date) -IS 'Dynamic bin aggregation of the congestion network by hourly periods.'; +COMMENT ON FUNCTION gwolofs.congestion_network_segment_agg(date) +IS 'Dynamic bin 
aggregation of the congestion network by hour and time periods.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_period_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_period_segment_agg.sql deleted file mode 100644 index 8091e55fc..000000000 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_period_segment_agg.sql +++ /dev/null @@ -1,193 +0,0 @@ --- FUNCTION: gwolofs.congestion_network_period_segment_agg(date) - --- DROP FUNCTION IF EXISTS gwolofs.congestion_network_period_segment_agg(date); - -CREATE OR REPLACE FUNCTION gwolofs.congestion_network_period_segment_agg( - start_date date) - RETURNS void - LANGUAGE 'plpgsql' - COST 100 - VOLATILE PARALLEL UNSAFE -AS $BODY$ - -DECLARE - map_version text := gwolofs.congestion_select_map_version(start_date, start_date + 1); - congestion_network_table text := 'network_links_' || map_version; - -BEGIN - -EXECUTE FORMAT( - $$ - WITH time_bins AS ( - SELECT - %1$L::date + start_tod AS start_time, --start_date - %1$L::date + end_tod AS end_time, --start_date - timerange(start_tod, end_tod, '[)') AS time_grp - FROM - (VALUES - ('00:00:00'::time, '06:00:00'::time), - ('06:00:00'::time, '10:00:00'::time), - ('10:00:00'::time, '15:00:00'::time), - ('15:00:00'::time, '19:00:00'::time), - ('19:00:00'::time, '24:00:00'::time) - ) AS times(start_tod, end_tod) - ), - - segments AS ( - SELECT - segment_id, - link_dir, - length, - SUM(length) OVER (PARTITION BY segment_id) AS total_length - FROM congestion.%2$I - ), - - segment_5min_bins AS ( - SELECT - links.segment_id, - tb.time_grp, - ta.tx, - RANK() OVER w AS bin_rank, - links.total_length, - SUM(links.length) / links.total_length AS sum_length, - SUM(links.length) AS length_w_data, - SUM(links.length / ta.mean * 3.6) AS unadjusted_tt, - SUM(sample_size) AS num_obs, - ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, - ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, - ARRAY_AGG(links.length ORDER 
BY link_dir) AS lengths - FROM here.ta_path AS ta - JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time - JOIN segments AS links USING (link_dir) - WHERE - ta.dt >= %1$L::date - AND ta.dt < %1$L::date + interval '1 day' - GROUP BY - links.segment_id, - tb.time_grp, - ta.tx, - links.total_length - WINDOW w AS ( - PARTITION BY links.segment_id, tb.time_grp - ORDER BY ta.tx - ) - ), - - dynamic_bin_options AS ( - --within each segment/hour, generate all possible forward looking bin combinations - --don't generate options for bins with sufficient length - --also don't generate options past the next bin with 80%% length - SELECT - tx, - time_grp, - segment_id, - bin_rank AS start_bin, - --generate all the options for the end bin within the group. - generate_series( - CASE - WHEN sum_length >= 0.8 THEN bin_rank - --if length is insufficient, need at least 1 more bin - ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) - END, - CASE - --dont need to generate options when start segment is already sufficient - WHEN sum_length >= 0.8 THEN bin_rank - --generate options until 1 bin has sufficient length, otherwise until last bin in group - ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) - END, - 1 - ) AS end_bin - FROM segment_5min_bins - WINDOW w AS ( - PARTITION BY time_grp, segment_id - ORDER BY tx - --look only forward for end_bin options - RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING - ) - ), - - unnested_db_options AS ( - SELECT - dbo.time_grp, - dbo.segment_id, - s5b.total_length, - dbo.tx AS dt_start, - --exclusive end bin - s5b_end.tx + interval '5 minutes' AS dt_end, - unnested.link_dir, - unnested.len, - AVG(unnested.tt) AS tt, --avg TT for each link_dir - SUM(s5b.num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir - FROM dynamic_bin_options AS dbo - LEFT JOIN segment_5min_bins AS s5b - ON s5b.time_grp = dbo.time_grp - AND s5b.segment_id = dbo.segment_id - AND s5b.bin_rank >= 
dbo.start_bin - AND s5b.bin_rank <= dbo.end_bin - --this join is used to get the tx info about the last bin only - LEFT JOIN segment_5min_bins AS s5b_end - ON s5b_end.time_grp = dbo.time_grp - AND s5b_end.segment_id = dbo.segment_id - AND s5b_end.bin_rank = dbo.end_bin, - --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin - UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) - --makes sense to relax this constraint for periods - --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - --WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' - GROUP BY - dbo.time_grp, - dbo.segment_id, - s5b.total_length, - dbo.tx, --stard_bin - s5b_end.tx, --end_bin - unnested.link_dir, - unnested.len - ) - - --this query contains overlapping values which get eliminated - --via on conflict with the exclusion constraint on congestion_raw_segments table. - INSERT INTO gwolofs.congestion_raw_segments ( - dt, time_grp, segment_id, bin_range, tt, num_obs - ) - --distinct on ensures only the shortest option gets proposed for insert - SELECT DISTINCT ON (time_grp, segment_id, dt_start) - dt_start::date AS dt, - time_grp, - segment_id, - tsrange(dt_start, dt_end, '[)') AS bin_range, - total_length / SUM(len) * SUM(tt) AS tt, - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment - FROM unnested_db_options - GROUP BY - time_grp, - segment_id, - dt_start, - dt_end, - total_length - HAVING SUM(len) >= 0.8 * total_length - ORDER BY - time_grp, - segment_id, - dt_start, - dt_end --uses the option that ends first - --exclusion constraint + ordered insert to prevent overlapping bins - ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude - DO NOTHING; - $$, - start_date, - congestion_network_table - ); - -END; -$BODY$; - -ALTER FUNCTION gwolofs.congestion_network_period_segment_agg(date) - OWNER TO gwolofs; - -COMMENT ON FUNCTION 
gwolofs.congestion_network_period_segment_agg(date) - IS '''Dynamic bin aggregation of the congestion network by periods: - (''00:00:00''::time, ''06:00:00''::time), - (''06:00:00''::time, ''10:00:00''::time), - (''10:00:00''::time, ''15:00:00''::time), - (''15:00:00''::time, ''19:00:00''::time), - (''19:00:00''::time, ''24:00:00''::time)'''; From e9049a4fb3e0f88685746d19ae20cb060d34401a Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:59:27 +0000 Subject: [PATCH 24/74] #1132 rename congestion_network_segment_agg --- ...egment_agg.sql => function-congestion_network_segment_agg.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename here/traffic/sql/dynamic_bins/{function-congestion_network_hr_segment_agg.sql => function-congestion_network_segment_agg.sql} (100%) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql similarity index 100% rename from here/traffic/sql/dynamic_bins/function-congestion_network_hr_segment_agg.sql rename to here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql From 4e93259b34c8bd34c32d9d25585a63670c038878 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 3 Mar 2025 17:30:15 +0000 Subject: [PATCH 25/74] #1132 fluff and add comments --- .../create-table-congestion_corridors.sql | 7 ++- .../create-table-congestion_raw_corridors.sql | 30 ++++++++++--- .../create-table-congestion_raw_segments.sql | 30 +++++++------ .../create-view-congestion_time_grps.sql | 17 ++++--- .../function-congestion_cache_corridor.sql | 29 +++++++----- .../function-congestion_cache_tt_results.sql | 45 +++++++++++-------- .../function-congestion_dynamic_bin_avg.sql | 29 +++++++----- ...unction-congestion_network_segment_agg.sql | 11 ++--- ...function-congestion_select_map_version.sql | 16 ++++--- 9 files changed, 134 
insertions(+), 80 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql index 327c5cb8c..529d26905 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql @@ -4,8 +4,8 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_corridors ( - link_dirs text[] COLLATE pg_catalog."default", - lengths numeric[], + link_dirs text [] COLLATE pg_catalog."default", + lengths numeric [], geom geometry, total_length numeric, corridor_id smallint NOT NULL DEFAULT nextval('congestion_corridors_uid_seq'::regclass), @@ -25,3 +25,6 @@ REVOKE ALL ON TABLE gwolofs.congestion_corridors FROM bdit_humans; GRANT SELECT ON TABLE gwolofs.congestion_corridors TO bdit_humans; GRANT ALL ON TABLE gwolofs.congestion_corridors TO gwolofs; + +COMMENT ON TABLE gwolofs.congestion_corridors IS +'Stores cached travel time corridors to reduce routing time.'; diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql index 866fcea1f..9a823249c 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql @@ -30,12 +30,32 @@ GRANT SELECT ON TABLE gwolofs.congestion_raw_corridors TO bdit_humans; GRANT ALL ON TABLE gwolofs.congestion_raw_corridors TO gwolofs; +-- Index: congestion_raw_corridors_dt_idx + +-- DROP INDEX IF EXISTS gwolofs.congestion_raw_corridors_dt_idx; + +CREATE INDEX IF NOT EXISTS congestion_raw_corridors_dt_idx +ON gwolofs.congestion_raw_corridors USING brin +(dt) +TABLESPACE pg_default; +-- Index: congestion_raw_corridors_uri_string + +-- DROP INDEX IF EXISTS gwolofs.congestion_raw_corridors_uri_string; + +CREATE INDEX IF NOT EXISTS congestion_raw_corridors_uri_string +ON 
gwolofs.congestion_raw_corridors USING btree +(uri_string COLLATE pg_catalog."default" ASC NULLS LAST) +WITH (deduplicate_items = TRUE) +TABLESPACE pg_default; -- Index: dynamic_binning_results_time_grp_corridor_id_idx --- DROP INDEX IF EXISTS gwolofs.congestion_raw_corridors_time_grp_corridor_id_idx; +-- DROP INDEX IF EXISTS gwolofs.dynamic_binning_results_time_grp_corridor_id_idx; CREATE INDEX IF NOT EXISTS dynamic_binning_results_time_grp_corridor_id_idx - ON gwolofs.congestion_raw_corridors USING btree - (time_grp ASC NULLS LAST, corridor_id ASC NULLS LAST) - WITH (deduplicate_items=True) - TABLESPACE pg_default; \ No newline at end of file +ON gwolofs.congestion_raw_corridors USING btree +(time_grp ASC NULLS LAST, corridor_id ASC NULLS LAST, dt ASC NULLS LAST) +WITH (deduplicate_items = TRUE) +TABLESPACE pg_default; + +COMMENT ON TABLE gwolofs.congestion_raw_corridors IS +'Stores dynamic binning results for custom corridor based travel time requests.'; diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql index 3081bed57..f73ecb566 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql @@ -28,21 +28,23 @@ REVOKE ALL ON TABLE gwolofs.congestion_raw_segments FROM bdit_humans; GRANT SELECT ON TABLE gwolofs.congestion_raw_segments TO bdit_humans; GRANT ALL ON TABLE gwolofs.congestion_raw_segments TO gwolofs; --- Index: dynamic_bin_hr_idx --- DROP INDEX IF EXISTS gwolofs.dynamic_bin_hr_idx; +-- Index: congestion_raw_segments_dt_idx -CREATE INDEX IF NOT EXISTS dynamic_bin_dt_idx - ON gwolofs.congestion_raw_segments USING btree - (dt ASC NULLS LAST) - WITH (deduplicate_items=True) - TABLESPACE pg_default; --- Index: dynamic_bin_idx +-- DROP INDEX IF EXISTS gwolofs.congestion_raw_segments_dt_idx; --- DROP INDEX IF EXISTS gwolofs.dynamic_bin_idx; +CREATE INDEX IF 
NOT EXISTS congestion_raw_segments_dt_idx +ON gwolofs.congestion_raw_segments USING brin +(dt) +TABLESPACE pg_default; +-- Index: congestion_raw_segments_segment_dt_idx + +-- DROP INDEX IF EXISTS gwolofs.congestion_raw_segments_segment_dt_idx; + +CREATE INDEX IF NOT EXISTS congestion_raw_segments_segment_dt_idx +ON gwolofs.congestion_raw_segments USING btree +(segment_id ASC NULLS LAST, dt ASC NULLS LAST) +TABLESPACE pg_default; -CREATE INDEX IF NOT EXISTS dynamic_bin_idx - ON gwolofs.congestion_raw_segments USING btree - (segment_id ASC NULLS LAST, dt ASC NULLS LAST) - WITH (deduplicate_items=True) - TABLESPACE pg_default; \ No newline at end of file +COMMENT ON TABLE gwolofs.congestion_raw_segments IS +'Stores dynamic binning results from standard HERE congestion network travel time aggregations.'; diff --git a/here/traffic/sql/dynamic_bins/create-view-congestion_time_grps.sql b/here/traffic/sql/dynamic_bins/create-view-congestion_time_grps.sql index 1871889d0..98d828139 100644 --- a/here/traffic/sql/dynamic_bins/create-view-congestion_time_grps.sql +++ b/here/traffic/sql/dynamic_bins/create-view-congestion_time_grps.sql @@ -1,3 +1,6 @@ +--these time periods should be scrutinized more. +--structure may also need changes if we want different weekday and weekend time periods.
+ CREATE VIEW gwolofs.congestion_time_grps AS SELECT @@ -6,12 +9,12 @@ SELECT 1 AS table_order FROM ( VALUES - ('00:00:00'::time, '06:00:00'::time), - ('06:00:00'::time, '10:00:00'::time), - ('10:00:00'::time, '15:00:00'::time), - ('15:00:00'::time, '19:00:00'::time), - ('19:00:00'::time, '24:00:00'::time) -) AS times(start_tod, end_tod) + ('00:00:00'::time, '06:00:00'::time), + ('06:00:00'::time, '10:00:00'::time), + ('10:00:00'::time, '15:00:00'::time), + ('15:00:00'::time, '19:00:00'::time), + ('19:00:00'::time, '24:00:00'::time) +) AS times (start_tod, end_tod) UNION SELECT (start_hour || ':00')::time AS start_tod, @@ -22,7 +25,7 @@ ORDER BY table_order, start_tod, end_tod; - + COMMENT ON VIEW gwolofs.congestion_time_grps IS 'Hours and time periods for congestion aggregation.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql index 5433dd398..9840dfd7b 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql @@ -7,13 +7,14 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_corridor( node_end bigint, map_version text, OUT corridor_id smallint, - OUT link_dirs text[], - OUT lengths numeric[], - OUT total_length numeric) - RETURNS record - LANGUAGE 'plpgsql' - COST 100 - VOLATILE PARALLEL SAFE + OUT link_dirs text [], + OUT lengths numeric [], + OUT total_length numeric +) +RETURNS record +LANGUAGE plpgsql +COST 100 +VOLATILE PARALLEL SAFE AS $BODY$ DECLARE @@ -23,7 +24,7 @@ DECLARE BEGIN --check if the node pair and map_version have already been routed - --and if so, return values + --and if so, return values, saving routing time SELECT tt.corridor_id, tt.link_dirs, @@ -56,9 +57,9 @@ EXECUTE format ( %4$L AS map_version, ARRAY_AGG(rl.link_dir ORDER BY rl.seq) AS link_dirs, --lengths in m - ARRAY_AGG(ST_Length(ST_Transform(streets.geom,2952)) ORDER BY rl.seq) 
AS lengths, + ARRAY_AGG(st_length(st_transform(streets.geom, 2952)) ORDER BY rl.seq) AS lengths, st_union(st_linemerge(streets.geom)) AS geom, - SUM(ST_Length(ST_Transform(streets.geom,2952))) AS total_length + SUM(ST_Length(ST_Transform(streets.geom, 2952))) AS total_length FROM routed_links AS rl JOIN here.%5$I AS streets USING (link_dir) --conflict would occur because of null values @@ -79,4 +80,10 @@ END; $BODY$; ALTER FUNCTION gwolofs.congestion_cache_corridor(bigint, bigint, text) - OWNER TO gwolofs; +OWNER TO gwolofs; + +COMMENT ON FUNCTION gwolofs.congestion_cache_corridor IS +'Returns definition of a HERE corridor, given input nodes and map_version. +First checks if corridor has already been cached and if so retrieves the +cached values. If not, a new entry is added to gwolofs.congestion_corridors +table and returned.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 4e76bea4a..4529296df 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -1,6 +1,6 @@ --- FUNCTION: gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) +-- FUNCTION: gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) --noqa: LT05 --- DROP FUNCTION IF EXISTS gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); +-- DROP FUNCTION IF EXISTS gwolofs.congestion_cache_tt_results(text, date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); --noqa: LT05 CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_tt_results( uri_string text, @@ -8,21 +8,25 @@ CREATE OR REPLACE FUNCTION 
gwolofs.congestion_cache_tt_results( end_date date, start_tod time without time zone, end_tod time without time zone, - dow_list integer[], + dow_list integer [], node_start bigint, node_end bigint, - holidays boolean) - RETURNS void - LANGUAGE 'plpgsql' - COST 100 - VOLATILE PARALLEL UNSAFE + holidays boolean +) +RETURNS void +LANGUAGE plpgsql +COST 100 +VOLATILE PARALLEL UNSAFE AS $BODY$ DECLARE map_version text; BEGIN -SELECT gwolofs.congestion_select_map_version(congestion_cache_tt_results.start_date, congestion_cache_tt_results.end_date) INTO map_version; +SELECT gwolofs.congestion_select_map_version( + congestion_cache_tt_results.start_date, + congestion_cache_tt_results.end_date +) INTO map_version; EXECUTE format( $$ @@ -52,20 +56,20 @@ EXECUTE format( SUM(seg.length) AS length_w_data, SUM(seg.length / ta.mean * 3.6) AS unadjusted_tt, SUM(sample_size) AS num_obs, - ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, - ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, - ARRAY_AGG(seg.length ORDER BY link_dir) AS lengths + ARRAY_AGG(ta.link_dir ORDER BY ta.link_dir) AS link_dirs, + ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY ta.link_dir) AS tts, + ARRAY_AGG(seg.length ORDER BY ta.link_dir) AS lengths FROM here.ta_path AS ta JOIN segment AS seg USING (link_dir) WHERE ( - tod >= %L + ta.tod >= %L AND --{ToD_and_or} - tod < %L + ta.tod < %L ) - AND date_part('isodow', dt) = ANY(%L::int[]) - AND dt >= %L - AND dt < %L + AND date_part('isodow', ta.dt) = ANY(%L::int[]) + AND ta.dt >= %L + AND ta.dt < %L /*--{holiday_clause} AND NOT EXISTS ( SELECT 1 FROM ref.holiday WHERE ta.dt = holiday.dt @@ -74,7 +78,7 @@ EXECUTE format( ta.tx, ta.dt, seg.total_length, - corridor_id + seg.corridor_id WINDOW w AS ( PARTITION BY seg.corridor_id, ta.dt ORDER BY ta.tx @@ -190,6 +194,9 @@ $BODY$; ALTER FUNCTION gwolofs.congestion_cache_tt_results( text, date, date, time without time zone, - time without time zone, integer[], bigint, bigint, boolean + time without 
time zone, integer [], bigint, bigint, boolean ) OWNER TO gwolofs; + +COMMENT ON FUNCTION gwolofs.congestion_cache_tt_results IS +'Caches the dynamic binning results for a request.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_dynamic_bin_avg.sql b/here/traffic/sql/dynamic_bins/function-congestion_dynamic_bin_avg.sql index a87412ec0..338076d44 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_dynamic_bin_avg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_dynamic_bin_avg.sql @@ -1,20 +1,21 @@ --- FUNCTION: gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) +-- FUNCTION: gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) --noqa: LT05 --- DROP FUNCTION IF EXISTS gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); +-- DROP FUNCTION IF EXISTS gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); --noqa: LT05 CREATE OR REPLACE FUNCTION gwolofs.congestion_dynamic_bin_avg( start_date date, end_date date, start_tod time without time zone, end_tod time without time zone, - dow_list integer[], + dow_list integer [], node_start bigint, node_end bigint, - holidays boolean) - RETURNS numeric - LANGUAGE 'plpgsql' - COST 100 - VOLATILE PARALLEL UNSAFE + holidays boolean +) +RETURNS numeric +LANGUAGE plpgsql +COST 100 +VOLATILE PARALLEL UNSAFE AS $BODY$ DECLARE uri_string_func text := @@ -30,6 +31,7 @@ DECLARE uri_string_func text := BEGIN +--caches the dynamic binning results for this query PERFORM gwolofs.congestion_cache_tt_results( uri_string := uri_string_func, start_date := congestion_dynamic_bin_avg.start_date, @@ -42,6 +44,7 @@ PERFORM gwolofs.congestion_cache_tt_results( holidays := 
congestion_dynamic_bin_avg.holidays ); +--the way we currently do it; find daily averages and then average. WITH daily_means AS ( SELECT dt_start::date, @@ -60,7 +63,11 @@ END; $BODY$; -ALTER FUNCTION gwolofs.congestion_dynamic_bin_avg(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) - OWNER TO gwolofs; +ALTER FUNCTION gwolofs.congestion_dynamic_bin_avg( + date, date, time without time zone, time without time zone, integer [], bigint, bigint, boolean +) +OWNER TO gwolofs; -COMMENT ON FUNCTION gwolofs.congestion_dynamic_bin_avg IS 'Previously gwolofs.congestion_dynamic_bin_avg.'; +COMMENT ON FUNCTION gwolofs.congestion_dynamic_bin_avg IS +'Meant to mimic the TT app process; caches results for a specific request and +then returns average TT.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index e68ead7eb..0713b6c8f 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -3,11 +3,12 @@ -- DROP FUNCTION IF EXISTS gwolofs.congestion_network_segment_agg(date); CREATE OR REPLACE FUNCTION gwolofs.congestion_network_segment_agg( - start_date date) - RETURNS void - LANGUAGE 'plpgsql' - COST 100 - VOLATILE PARALLEL UNSAFE + start_date date +) +RETURNS void +LANGUAGE plpgsql +COST 100 +VOLATILE PARALLEL UNSAFE AS $BODY$ DECLARE diff --git a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql index 1127a5249..5cdcb6ac7 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql @@ -4,11 +4,12 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_select_map_version( start_date date, - end_date date) - RETURNS text 
- LANGUAGE 'sql' - COST 100 - STABLE PARALLEL SAFE + end_date date +) +RETURNS text +LANGUAGE sql +COST 100 +STABLE PARALLEL SAFE AS $BODY$ SELECT street_version @@ -25,4 +26,7 @@ LIMIT 1; $BODY$; ALTER FUNCTION gwolofs.congestion_select_map_version(date, date) - OWNER TO gwolofs; +OWNER TO gwolofs; + +COMMENT ON FUNCTION gwolofs.congestion_select_map_version IS +'Implement TT App selectMapVersion.py'; From 82bf16a815d00e0d053f4b0bf3b2c34bc4a4e63e Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 3 Mar 2025 17:31:27 +0000 Subject: [PATCH 26/74] #1132 change path --- .../sql/{ => dynamic_bins}/select-congestion_raw_segments.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename here/traffic/sql/{ => dynamic_bins}/select-congestion_raw_segments.md (100%) diff --git a/here/traffic/sql/select-congestion_raw_segments.md b/here/traffic/sql/dynamic_bins/select-congestion_raw_segments.md similarity index 100% rename from here/traffic/sql/select-congestion_raw_segments.md rename to here/traffic/sql/dynamic_bins/select-congestion_raw_segments.md From 8bed8c267fb18a0facd72b7483c6da9b49353e1f Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 3 Mar 2025 19:20:29 +0000 Subject: [PATCH 27/74] #1132 update md with new examples --- ...unction-congestion_network_segment_agg.sql | 3 +- .../select-congestion_raw_segments.md | 299 ++++++++---------- 2 files changed, 140 insertions(+), 162 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index 0713b6c8f..1b1724675 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -174,4 +174,5 @@ ALTER FUNCTION gwolofs.congestion_network_segment_agg(date) OWNER TO gwolofs; COMMENT ON FUNCTION 
gwolofs.congestion_network_segment_agg(date) -IS 'Dynamic bin aggregation of the congestion network by hour and time periods.'; +IS 'Dynamic bin aggregation of the congestion network by hour and time periods. +Takes around 10 minutes to run for one day (hourly and period based aggregation)'; diff --git a/here/traffic/sql/dynamic_bins/select-congestion_raw_segments.md b/here/traffic/sql/dynamic_bins/select-congestion_raw_segments.md index 35f5a3e02..48370aeee 100644 --- a/here/traffic/sql/dynamic_bins/select-congestion_raw_segments.md +++ b/here/traffic/sql/dynamic_bins/select-congestion_raw_segments.md @@ -1,60 +1,33 @@ -This is a readme to describe the complex query [here](./select-congestion_raw_segments.sql). +This is a readme to describe the complex query [here](./function-congestion_network_segment_agg.sql). Samples from each of the CTEs are shown for one segment/time_grp. Not all columns are shown from each CTE result. -### time_bins -Contains hourly and period definitions, known as `time_grp`s. These define the extents within which to evaluate dynamic bin options. A dynamic bin must be fully within the time_grp. +### segments + +Identifies the links that make up each segment, along with total segment length from `congestion.network_links_*` table. 
```sql -WITH time_bins AS ( - SELECT - start_time, - start_time + '1 hour'::interval AS end_time, - tsrange(start_time, start_time + '1 hour'::interval, '[)') AS time_grp - FROM generate_series( - '2025-01-04'::date, - '2025-01-04'::date + interval '23 hours', - '1 hour'::interval - ) AS hours(start_time) - UNION +WITH segments AS ( SELECT - start_time + '2025-01-04'::date, - end_time + '2025-01-04'::date, - tsrange(start_time + '2025-01-04'::date, end_time + '2025-01-04'::date, '[)') - FROM ( - VALUES - ('07:00'::time, '10:00'::time), - ('10:00', '16:00'), - ('16:00', '19:00') - ) AS time_periods(start_time, end_time) - ORDER BY start_time -), + segment_id, + link_dir, + length, + SUM(length) OVER (PARTITION BY segment_id) AS total_length + FROM congestion.%2$I --eg. congestion.network_links_23_4_geom +) ``` -| "start_time" | "end_time" | "time_grp" | -|-----------------------|-----------------------|-----------------------------------------------------| -| "2025-01-04 00:00:00" | "2025-01-04 01:00:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | -| "2025-01-04 01:00:00" | "2025-01-04 02:00:00" | "[""2025-01-04 01:00:00"",""2025-01-04 02:00:00"")" | -| "2025-01-04 02:00:00" | "2025-01-04 03:00:00" | "[""2025-01-04 02:00:00"",""2025-01-04 03:00:00"")" | -| "2025-01-04 03:00:00" | "2025-01-04 04:00:00" | "[""2025-01-04 03:00:00"",""2025-01-04 04:00:00"")" | -| "2025-01-04 04:00:00" | "2025-01-04 05:00:00" | "[""2025-01-04 04:00:00"",""2025-01-04 05:00:00"")" | -| "2025-01-04 05:00:00" | "2025-01-04 06:00:00" | "[""2025-01-04 05:00:00"",""2025-01-04 06:00:00"")" | -| "2025-01-04 06:00:00" | "2025-01-04 07:00:00" | "[""2025-01-04 06:00:00"",""2025-01-04 07:00:00"")" | -| "2025-01-04 07:00:00" | "2025-01-04 08:00:00" | "[""2025-01-04 07:00:00"",""2025-01-04 08:00:00"")" | -| "2025-01-04 07:00:00" | "2025-01-04 10:00:00" | "[""2025-01-04 07:00:00"",""2025-01-04 10:00:00"")" | -| "2025-01-04 08:00:00" | "2025-01-04 09:00:00" | "[""2025-01-04 
08:00:00"",""2025-01-04 09:00:00"")" | - ### segment_5min_bins -In this step we pull the relevant data from `here.ta_path` for each segment / time_grp. We save the disaggregate travel time data by link in 3 arrays (link_dirs, tts, lengths), so that in future steps we can reaggregate average segment travel time and distinct length over different ranges without referring back to the here.ta_path table. The time bins (`tx`) are also ranked to make it easier to enumerate possible bin extents using generate_series in the next step. +In this step we pull the relevant data from `here.ta_path` for each segment / time_grp (gwolofs.congestion_time_grps). We save the disaggregate travel time data by link in 3 arrays (link_dirs, tts, lengths), so that in future steps we can reaggregate average segment travel time and distinct length over different ranges without referring back to the here.ta_path table. The time bins (`tx`) are also ranked to make it easier to enumerate possible bin extents using `generate_series` in the next step. 
```sql segment_5min_bins AS ( SELECT - segments.segment_id, - tb.time_grp, + links.segment_id, + timerange(tg.start_tod, tg.end_tod, '[)') AS time_grp, ta.tx, RANK() OVER w AS bin_rank, - segments.total_length, - SUM(links.length) / segments.total_length AS sum_length, + links.total_length, + SUM(links.length) / links.total_length AS sum_length, SUM(links.length) AS length_w_data, SUM(links.length / ta.mean * 3.6) AS unadjusted_tt, SUM(sample_size) AS num_obs, @@ -62,33 +35,37 @@ segment_5min_bins AS ( ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, ARRAY_AGG(links.length ORDER BY link_dir) AS lengths FROM here.ta_path AS ta - JOIN time_bins AS tb ON ta.tx >= tb.start_time AND ta.tx < tb.end_time - JOIN congestion.network_links_23_4_geom AS links USING (link_dir) - JOIN congestion.network_segments_23_4_geom AS segments USING (segment_id) - WHERE ta.dt = '2025-01-04' - --AND tx < '2025-01-04 01:00:00' - AND segment_id = 29 AND date_trunc('hour', ta.tx) = '2025-01-04 00:00:00' + JOIN gwolofs.congestion_time_grps AS tg ON + ta.tx >= %1$L::date + tg.start_tod + AND ta.tx < %1$L::date + tg.end_tod + JOIN segments AS links USING (link_dir) + WHERE + ta.dt >= %1$L::date + AND ta.dt < %1$L::date + interval '1 day' GROUP BY - segments.segment_id, - tb.time_grp, + links.segment_id, + tg.start_tod, + tg.end_tod, ta.tx, - segments.total_length - WINDOW w AS ( - PARTITION BY segments.segment_id, tb.time_grp + links.total_length + WINDOW w AS ( + PARTITION BY links.segment_id, tg.start_tod, tg.end_tod ORDER BY ta.tx - ) + ) ), ``` -`SELECT bin_rank, tx, round(sum_length, 2) AS sum_length, link_dirs, tts FROM segment_5min_bins;` +`SELECT * FROM segment_5min_bins WHERE segment_id = 1 AND time_grp = '[00:00:00,06:00:00)' LIMIT 7;` -| "bin_rank" | "tx" | "sum_length" | "link_dirs" | "tts" |
-|------------|-----------------------|--------------|----------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| -| 1 | "2025-01-04 00:00:00" | 1.01 | "{1000589822T,1000589823T,1280577167T,792343539T, 792343541T, 836248875T,836248876T,845737718T,845737719T}" | {4.59624489795918360,1.274693877551020408164, 4.96575000000000000,6.68329411764705876, 1.101306122448979591836, 1.526693877551020408164,1.196816326530612244884,4.79172413793103452,9.12626086956521724} | -| 2 | "2025-01-04 00:05:00" | 0.12 | "{845737718T}" | {19.85142857142857148} | -| 3 | "2025-01-04 00:15:00" | 0.07 | "{1280577167T}" | {2.787789473684210526300} | -| 4 | "2025-01-04 00:50:00" | 0.39 | "{845737718T,845737719T}" | {34.74000000000000000,28.62327272727272724} | -| 5 | "2025-01-04 00:55:00" | 1.01 | "{1000589822T,1000589823T,1280577167T,792343539T,792343541T, 836248875T,836248876T,845737718T,845737719T}" | {5.17737931034482764,1.435862068965517241376,1.826482758620689655172,3.91779310344827580,1.240551724137931034496,1.719724137931034482752, 1.348137931034482758612,2.459469026548672566372,6.42563265306122460} | +| segment_id | time_grp | tx | bin_rank | total_length | sum_length | length_w_data | unadjusted_tt | num_obs | link_dirs | tts | lengths | +|------------|---------------------|-------------------------|----------|--------------|------------------------|---------------|--------------------------|---------|---------------------------------------------------------------|-----------------------------------------------------------------------------------------------|--------------------------------| +| 1 | [00:00:00,06:00:00) | 2025-01-10 00:20:00.000 | 1 | 374.22 | 1.00000000000000000000 | 374.22 | 29.200422445479049559580 | 5 | {1328374158F,1328374159F,1328374160F,1328374165F,1328374166F} | 
{3.739245283018868,4.845056603773585,1.8298867924528301,3.109090909090909,15.677142857142858} | {55.05,71.33,26.94,38.0,182.9} | +| 1 | [00:00:00,06:00:00) | 2025-01-10 00:25:00.000 | 2 | 374.22 | 0.48874993319437763882 | 182.90 | 131.68800000000000000 | 1 | {1328374166F} | {131.688} | {182.9} | +| 1 | [00:00:00,06:00:00) | 2025-01-10 00:35:00.000 | 3 | 374.22 | 1.00000000000000000000 | 374.22 | 76.657011086474501198040 | 5 | {1328374158F,1328374159F,1328374160F,1328374165F,1328374166F} | {4.833658536585366,6.263121951219512,2.365463414634146,3.3365853658536584,59.85818181818182} | {55.05,71.33,26.94,38.0,182.9} | +| 1 | [00:00:00,06:00:00) | 2025-01-10 05:00:00.000 | 4 | 374.22 | 0.19060980172091283202 | 71.33 | 6.26312195121951216 | 1 | {1328374159F} | {6.263121951219512} | {71.33} | +| 1 | [00:00:00,06:00:00) | 2025-01-10 05:05:00.000 | 5 | 374.22 | 1.00000000000000000000 | 374.22 | 35.452421052631578833688 | 5 | {1328374158F,1328374159F,1328374160F,1328374165F,1328374166F} | {5.215263157894737,6.757578947368421,2.5522105263157893,3.6,17.327368421052633} | {55.05,71.33,26.94,38.0,182.9} | +| 1 | [00:00:00,06:00:00) | 2025-01-10 05:15:00.000 | 6 | 374.22 | 1.00000000000000000000 | 374.22 | 48.013722580645161104508 | 5 | {1328374158F,1328374159F,1328374160F,1328374165F,1328374166F} | {6.392903225806451,12.8394,3.128516129032258,4.412903225806452,21.24} | {55.05,71.33,26.94,38.0,182.9} | +| 1 | [00:00:00,06:00:00) | 2025-01-10 05:20:00.000 | 7 | 374.22 | 0.48874993319437763882 | 182.90 | 13.16880000000000000 | 1 | {1328374166F} | {13.1688} | {182.9} | ### dynamic_bin_options Here we enumerate all the possible dynamic bin options for each starting point. The number of combinations are cut down significantly with the `CASE` statements inside the `generate_series`: @@ -99,7 +76,7 @@ Here we enumerate all the possible dynamic bin options for each starting point. 
dynamic_bin_options AS ( --within each segment/hour, generate all possible forward looking bin combinations --don't generate options for bins with sufficient length - --also don't generate options past the next bin with 80% length + --also don't generate options past the next bin with 80%% length SELECT tx, time_grp, @@ -130,18 +107,23 @@ dynamic_bin_options AS ( ), ``` -In this case we find 8 dynamic bin options with the pruning conditions, down from max of 5+4+3+2+1 = 15. +In this case we find 13 dynamic bin options with the pruning conditions, down from max of 10+9+8+7+6+5+4+3+2+1 = 55. -| "tx" | "time_grp" | "segment_id" | "start_bin" | "end_bin" | -|-----------------------|-----------------------------------------------------|--------------|-------------|-----------| -| "2025-01-04 00:00:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 1 | 1 | -| "2025-01-04 00:05:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 2 | 3 | -| "2025-01-04 00:05:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 2 | 4 | -| "2025-01-04 00:05:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 2 | 5 | -| "2025-01-04 00:15:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 3 | 4 | -| "2025-01-04 00:15:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 3 | 5 | -| "2025-01-04 00:50:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 4 | 5 | -| "2025-01-04 00:55:00" | "[""2025-01-04 00:00:00"",""2025-01-04 01:00:00"")" | 29 | 5 | 5 | +| tx | time_grp | segment_id | start_bin | end_bin | +|-------------------------|---------------------|------------|-----------|---------| +| 2025-01-10 00:20:00.000 | [00:00:00,06:00:00) | 1 | 1 | 1 | +| 2025-01-10 00:25:00.000 | [00:00:00,06:00:00) | 1 | 2 | 3 | +| 2025-01-10 00:35:00.000 | [00:00:00,06:00:00) | 1 | 3 | 3 | +| 2025-01-10 05:00:00.000 | [00:00:00,06:00:00) | 1 | 4 | 5 | +| 2025-01-10 05:05:00.000 | [00:00:00,06:00:00) | 1 | 5 | 5 | +| 
2025-01-10 05:15:00.000 | [00:00:00,06:00:00) | 1 | 6 | 6 | +| 2025-01-10 05:20:00.000 | [00:00:00,06:00:00) | 1 | 7 | 8 | +| 2025-01-10 05:20:00.000 | [00:00:00,06:00:00) | 1 | 7 | 9 | +| 2025-01-10 05:20:00.000 | [00:00:00,06:00:00) | 1 | 7 | 10 | +| 2025-01-10 05:25:00.000 | [00:00:00,06:00:00) | 1 | 8 | 9 | +| 2025-01-10 05:25:00.000 | [00:00:00,06:00:00) | 1 | 8 | 10 | +| 2025-01-10 05:50:00.000 | [00:00:00,06:00:00) | 1 | 9 | 10 | +| 2025-01-10 05:55:00.000 | [00:00:00,06:00:00) | 1 | 10 | 10 | ### unnested_db_options Combining the previous two steps, we have enumerated all the possible bin start/end ranges (`dynamic_bin_options`), now we can unnest the disaggregate data (`segment_5min_bins`) and evaluate them. @@ -156,45 +138,51 @@ unnested_db_options AS ( s5b.total_length, dbo.tx AS dt_start, --exclusive end bin - MAX(s5b.tx) + interval '5 minutes' AS dt_end, + s5b_end.tx + interval '5 minutes' AS dt_end, unnested.link_dir, unnested.len, AVG(unnested.tt) AS tt, --avg TT for each link_dir - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir + SUM(s5b.num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir FROM dynamic_bin_options AS dbo LEFT JOIN segment_5min_bins AS s5b ON s5b.time_grp = dbo.time_grp AND s5b.segment_id = dbo.segment_id AND s5b.bin_rank >= dbo.start_bin - AND s5b.bin_rank <= dbo.end_bin, + AND s5b.bin_rank <= dbo.end_bin + --this join is used to get the tx info about the last bin only + LEFT JOIN segment_5min_bins AS s5b_end + ON s5b_end.time_grp = dbo.time_grp + AND s5b_end.segment_id = dbo.segment_id + AND s5b_end.bin_rank = dbo.end_bin, --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) - --we need to use nested data to determine length for these multi-period bins - WHERE dbo.start_bin != dbo.end_bin + --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) + 
WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' GROUP BY dbo.time_grp, dbo.segment_id, s5b.total_length, - dbo.tx, - dbo.end_bin, + dbo.tx, --start_bin + s5b_end.tx, --end_bin unnested.link_dir, unnested.len ) ``` -`SELECT dt_start, dt_end, link_dir, len, tt, num_obs FROM unnested_db_options WHERE dt_start = '2025-01-04 00:05:00' AND dt_end = '2025-01-04 01:00:00'` +`SELECT * FROM unnested_db_options WHERE time_grp = '[00:00:00,06:00:00)' LIMIT 10` -| "dt_start" | "dt_end" | "link_dir" | "len" | "tt" | "num_obs" | -|-----------------------|-----------------------|---------------|-------|--------------------------|-----------| -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "1000589822T" | 62.56 | 5.17737931034482764 | 18 | -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "1000589823T" | 17.35 | 1.435862068965517241376 | 18 | -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "1280577167T" | 22.07 | 2.307136116152450090736 | 20 | -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "792343539T" | 47.34 | 3.91779310344827580 | 18 | -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "792343541T" | 14.99 | 1.240551724137931034496 | 18 | -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "836248875T" | 20.78 | 1.719724137931034482752 | 18 | -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "836248876T" | 16.29 | 1.348137931034482758612 | 18 | -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "845737718T" | 38.60 | 19.016965865992414682124 | 21 | -| "2025-01-04 00:05:00" | "2025-01-04 01:00:00" | "845737719T" | 87.46 | 17.52445269016697592 | 20 | +| time_grp | segment_id | total_length | dt_start | dt_end | link_dir | len | tt | num_obs | +|---------------------|------------|--------------|-------------------------|-------------------------|-------------|--------|-------------------------|---------| +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:20:00.000 | 2025-01-10 00:25:00.000 | 1328374158F | 55.05 | 3.739 | 5 | +| [00:00:00,06:00:00)
| 1 | 374.22 | 2025-01-10 00:20:00.000 | 2025-01-10 00:25:00.000 | 1328374159F | 71.33 | 4.845 | 5 | +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:20:00.000 | 2025-01-10 00:25:00.000 | 1328374160F | 26.94 | 1.829 | 5 | +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:20:00.000 | 2025-01-10 00:25:00.000 | 1328374165F | 38.00 | 3.109 | 5 | +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:20:00.000 | 2025-01-10 00:25:00.000 | 1328374166F | 182.90 | 15.677 | 5 | +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:25:00.000 | 2025-01-10 00:40:00.000 | 1328374158F | 55.05 | 4.833 | 5 | +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:25:00.000 | 2025-01-10 00:40:00.000 | 1328374159F | 71.33 | 6.263 | 5 | +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:25:00.000 | 2025-01-10 00:40:00.000 | 1328374160F | 26.94 | 2.365 | 5 | +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:25:00.000 | 2025-01-10 00:40:00.000 | 1328374165F | 38.00 | 3.336 | 5 | +| [00:00:00,06:00:00) | 1 | 374.22 | 2025-01-10 00:25:00.000 | 2025-01-10 00:40:00.000 | 1328374166F | 182.90 | 95.773 | 6 | ### Insert statement Here we find bins with sufficient length, for the two cases: @@ -202,81 +190,70 @@ Here we find bins with sufficient length, for the two cases: - An original 5min bin, no group by needed to check length. ```sql -INSERT INTO gwolofs.congestion_raw_segments ( - time_grp, segment_id, dt_start, dt_end, bin_range, tt, - unadjusted_tt, total_length, length_w_data, num_obs -) ---this query contains overlapping values which get eliminated ---via on conflict with the exclusion constraint on congestion_raw_segments table. 
-SELECT DISTINCT ON (time_grp, segment_id, dt_start) - time_grp, - segment_id, - dt_start, - dt_end, - tsrange(dt_start, dt_end, '[)') AS bin_range, - total_length / SUM(len) * SUM(tt) AS tt, - SUM(tt) AS unadjusted_tt, - total_length, - SUM(len) AS length_w_data, - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment -FROM unnested_db_options AS udbo -GROUP BY - time_grp, - segment_id, - dt_start, - dt_end, - total_length -HAVING SUM(len) >= 0.8 * total_length -UNION ---these 5 minute bins already have sufficient length ---don't need to use nested data to validate. -SELECT - time_grp, - segment_id, - tx AS dt_start, - tx + interval '5 minutes' AS dt_end, - tsrange(tx, tx + interval '5 minutes', '[)') AS bin_range, - total_length / length_w_data * unadjusted_tt AS tt, - unadjusted_tt, - total_length, - length_w_data, - num_obs --sum of here.ta_path sample_size for each segment -FROM segment_5min_bins ---we do not need to use nested data to determine length here. -WHERE sum_length >= 0.8 -ORDER BY - time_grp, - segment_id, - dt_start, - dt_end ---exclusion constraint + ordered insert to prevent overlapping bins -ON CONFLICT ON CONSTRAINT dynamic_bins_unique -DO NOTHING; + --this query contains overlapping values which get eliminated + --via on conflict with the exclusion constraint on congestion_raw_segments table. 
+ INSERT INTO gwolofs.congestion_raw_segments ( + dt, time_grp, segment_id, bin_range, tt, num_obs + ) + --distinct on ensures only the shortest option gets proposed for insert + SELECT DISTINCT ON (time_grp, segment_id, dt_start) + dt_start::date AS dt, + time_grp, + segment_id, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment + FROM unnested_db_options + GROUP BY + time_grp, + segment_id, + dt_start, + dt_end, + total_length + HAVING SUM(len) >= 0.8 * total_length + ORDER BY + time_grp, + segment_id, + dt_start, + dt_end --uses the option that ends first + --exclusion constraint + ordered insert to prevent overlapping bins + ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude + DO NOTHING; ``` -`SELECT segment_id, bin_range, round(tt, 2) AS tt, total_length, length_w_data FROM inserted;` +`SELECT dt, time_grp, segment_id, bin_range, round(tt, 2), num_obs FROM inserted WHERE time_grp = '[00:00:00,06:00:00)';` -| "segment_id" | "bin_range" | "tt" | "total_length" | "length_w_data" | -|--------------|-----------------------------------------------------|-------|----------------|-----------------| -| 29 | "[""2025-01-04 00:00:00"",""2025-01-04 00:05:00"")" | 34.93 | 324.33 | 327.44 | -| 29 | "[""2025-01-04 00:05:00"",""2025-01-04 01:00:00"")" | 53.18 | 324.33 | 327.44 | -| 29 | "[""2025-01-04 00:15:00"",""2025-01-04 01:00:00"")" | 52.76 | 324.33 | 327.44 | -| 29 | "[""2025-01-04 00:50:00"",""2025-01-04 01:00:00"")" | 52.29 | 324.33 | 327.44 | -| 29 | "[""2025-01-04 00:55:00"",""2025-01-04 01:00:00"")" | 25.31 | 324.33 | 327.44 | +| dt | time_grp | segment_id | bin_range | round | num_obs | +|------------|---------------------|------------|-----------------------------------------------|--------|---------| +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 00:20:00","2025-01-10 00:25:00") | 29.20 | 25 | +| 2025-01-10 | [00:00:00,06:00:00) | 
1 | ["2025-01-10 00:25:00","2025-01-10 00:40:00") | 112.57 | 26 | +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 00:35:00","2025-01-10 00:40:00") | 76.66 | 25 | +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 05:00:00","2025-01-10 05:10:00") | 35.21 | 26 | +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 05:05:00","2025-01-10 05:10:00") | 35.45 | 25 | +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 05:15:00","2025-01-10 05:20:00") | 48.01 | 25 | +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 05:20:00","2025-01-10 05:55:00") | 51.34 | 14 | +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 05:25:00","2025-01-10 05:55:00") | 69.59 | 13 | +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 05:50:00","2025-01-10 06:00:00") | 53.28 | 62 | +| 2025-01-10 | [00:00:00,06:00:00) | 1 | ["2025-01-10 05:55:00","2025-01-10 06:00:00") | 39.36 | 50 | -After insert against exclusion constraint, only 2 remain, since records 3,4,5 overlap with record 2 above. -`SELECT segment_id, bin_range, round(tt, 2) AS tt, total_length, length_w_data FROM gwolofs.congestion_raw_segments WHERE segment_id = 29 AND time_grp = '["2025-01-04 00:00:00","2025-01-04 01:00:00")'::tsrange` +After insert against exclusion constraint, only 6 remain (of 10 above), since rows #3,5,8,9 overlap other records. 
+`SELECT segment_id, bin_range, tt, num_obs, dt, time_grp FROM gwolofs.congestion_raw_segments WHERE segment_id = 1 AND dt = '2025-01-10' AND time_grp = '[00:00:00,06:00:00)';` Constraint: ```sql - CONSTRAINT dynamic_bins_unique EXCLUDE USING gist ( - segment_id WITH =, + CONSTRAINT congestion_raw_segments_exclude EXCLUDE USING gist ( bin_range WITH &&, - time_grp WITH = + segment_id WITH =, + time_grp WITH =, + dt WITH = ) ``` -| "segment_id" | "bin_range" | "tt" | "total_length" | "length_w_data" | -|--------------|-----------------------------------------------------|-------|----------------|-----------------| -| 29 | "[""2025-01-04 00:00:00"",""2025-01-04 00:05:00"")" | 34.93 | 324.33 | 327.44 | -| 29 | "[""2025-01-04 00:05:00"",""2025-01-04 01:00:00"")" | 53.18 | 324.33 | 327.44 | \ No newline at end of file +| segment_id | bin_range | tt | num_obs | dt | time_grp | +|------------|-----------------------------------------------|-------------------|---------|------------|---------------------| +| 1 | ["2025-01-10 00:20:00","2025-01-10 00:25:00") | 29.2004224454790 | 25 | 2025-01-10 | [00:00:00,06:00:00) | +| 1 | ["2025-01-10 00:25:00","2025-01-10 00:40:00") | 112.5719201773835 | 26 | 2025-01-10 | [00:00:00,06:00:00) | +| 1 | ["2025-01-10 05:00:00","2025-01-10 05:10:00") | 35.2051925545571 | 26 | 2025-01-10 | [00:00:00,06:00:00) | +| 1 | ["2025-01-10 05:15:00","2025-01-10 05:20:00") | 48.0137225806451 | 25 | 2025-01-10 | [00:00:00,06:00:00) | +| 1 | ["2025-01-10 05:20:00","2025-01-10 05:55:00") | 51.34214 | 14 | 2025-01-10 | [00:00:00,06:00:00) | +| 1 | ["2025-01-10 05:55:00","2025-01-10 06:00:00") | 39.3552705888070 | 50 | 2025-01-10 | [00:00:00,06:00:00) | \ No newline at end of file From 401dada9ec05378f5ca8b9a952af93c47ef89428 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 18 Mar 2025 18:44:19 +0000 Subject: [PATCH 28/74] #1132 explore binning extents ipynb ---
.../here_dynamic_binning_explore.ipynb | 406 ++++++++++++++++++ 1 file changed, 406 insertions(+) create mode 100644 here/traffic/here_dynamic_binning_explore.ipynb diff --git a/here/traffic/here_dynamic_binning_explore.ipynb b/here/traffic/here_dynamic_binning_explore.ipynb new file mode 100644 index 000000000..b4c1b2a5f --- /dev/null +++ b/here/traffic/here_dynamic_binning_explore.ipynb @@ -0,0 +1,406 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8c6f35d7-fbd6-4336-91e3-4ab18b4009e5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/jupyterhub/.venv/lib/python3.10/site-packages/geopandas/io/sql.py:170: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", + " df = pd.read_sql(\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import configparser\n", + "from psycopg2 import connect\n", + "import struct\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import seaborn as sns\n", + "import geopandas as gpd\n", + "\n", + "CONFIG = configparser.ConfigParser()\n", + "CONFIG.read(str(Path.home().joinpath('db.cfg'))) #Creates a path to your db.cfg file\n", + "dbset = CONFIG['SQLALCHEMY']\n", + "\n", + "with connect(**dbset) as con:\n", + " basemap_query = '''select gis.geopandas_transform(ST_union(geom)) as geom from gis.neighbourhood'''\n", + " basemap = gpd.GeoDataFrame.from_postgis(basemap_query, con, geom_col='geom')\n", + " basemap = basemap.to_crs('epsg:26917')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "def42f51-59b0-42bd-b300-5d1f3e3dee15", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/tmp/ipykernel_818626/423225491.py:6: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", + " df = pd.read_sql(sql, con)\n" + ] + } + ], + "source": [ + "sql = '''SELECT hr, bin_length, count, legend\n", + "FROM gwolofs.congestion_bin_length_explore'''\n", + "\n", + "try:\n", + " with connect(**dbset) as con:\n", + " df = pd.read_sql(sql, con)\n", + "except Exception as e:\n", + " print(\"Error connecting to the database:\", e)\n", + " exit()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a1a4f215-4fc4-4b0b-8223-870934a9c7f0", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Convert bin_length to string for categorical x-axis\n", + "df['bin_length'] = df['bin_length'].astype(str)\n", + "\n", + "# Compute proportions within each bin_size group\n", + "df['proportion'] = df.groupby('hr')['count'].transform(lambda x: x / x.sum())\n", + "\n", + "# Plot multi-bar chart\n", + "plt.figure(figsize=(12, 6))\n", + "sns.barplot(x=df['bin_length'], y=df['proportion'], hue=df['hr'], palette='viridis')\n", + "plt.xlabel('Bin Size')\n", + "plt.ylabel('Proportion')\n", + "plt.title('Congestion Proportions by Bin Size and Hour')\n", + "plt.yticks([i * 0.01 for i in range(0, 100, 5)])\n", + "plt.xticks(rotation=45)\n", + "plt.legend(title='Hour of the Day')\n", + "plt.grid(axis='y', linestyle='--', alpha=0.7)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3e9c7d6d-6e1f-4214-b9cf-dd0b8e16daf5", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_842647/1604576776.py:15: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", + " df = pd.read_sql(sql, con)\n" + ] + } + ], + "source": [ + "sql = '''SELECT\n", + " CASE\n", + " WHEN time_grp = '[00:00:00,24:00:00)' THEN '24hr'\n", + " WHEN (upper(time_grp) - lower(time_grp)) = '01:00:00'::interval THEN '1hr'\n", + " ELSE 'Periods'\n", + " END AS legend,\n", + " lower(bin_range)::time AS bin_start,\n", + " upper(bin_range)::time AS bin_end\n", + "FROM gwolofs.congestion_raw_segments\n", + "WHERE dt >= '2024-12-01' AND dt < '2024-12-02' AND segment_id = 2511\n", + "ORDER BY 1, 2'''\n", + "\n", + "try:\n", + " with connect(**dbset) as con:\n", + " df = pd.read_sql(sql, con)\n", + " # Convert time columns to datetime\n", + "except Exception as e:\n", + " print(\"Error connecting to the database:\", e)\n", + " exit()\n", + "\n", + "# Convert time columns to seconds since midnight\n", + "def time_to_seconds(t):\n", + " return t.hour * 3600 + t.minute * 60 + t.second\n", + "\n", + "df['bin_start'] = df['bin_start'].apply(time_to_seconds)\n", + "df['bin_end'] = df['bin_end'].apply(time_to_seconds)\n", + "df['duration'] = df['bin_end'] - df['bin_start']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8d2ba18c-dc79-4a04-8da6-8139f5772b18", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Assign numeric values for y-axis based on legend\n", + "y_labels = df['legend'].unique()\n", + "y_mapping = {label: i for i, label in enumerate(y_labels)}\n", + "df['y_pos'] = df['legend'].map(y_mapping)\n", + "\n", + "# Define color mapping\n", + "palette = sns.color_palette('viridis', n_colors=len(y_labels))\n", + "legend_colors = {label: palette[i] for i, label in enumerate(y_labels)}\n", + "\n", + "# Plot timeline graph using broken_barh\n", + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "for i, row in df.iterrows():\n", + " ax.broken_barh([(row['bin_start'], row['duration'])], (row['y_pos'] - 0.4, 0.8),\n", + " color=legend_colors[row['legend']], edgecolor='white')\n", + "\n", + "ax.set_xlabel('Time of Day')\n", + "ax.set_ylabel('Legend')\n", + "ax.set_title('Timeline of Bin Ranges Colored by Legend')\n", + "ax.set_yticks(range(len(y_labels)))\n", + "ax.set_yticklabels(y_labels)\n", + "ax.grid(axis='x', linestyle='--', alpha=0.7)\n", + "\n", + "# Adjust x-ticks to every 3600 seconds (1 hour)\n", + "ax.set_xticks(range(0, 86400, 3600))\n", + "ax.set_xticklabels([f\"{h:02d}:00\" for h in range(24)])\n", + "plt.xticks(rotation=45)\n", + "\n", + "# Create legend\n", + "handles = [plt.Rectangle((0, 0), 1, 1, color=legend_colors[label]) for label in y_labels]\n", + "ax.legend(handles, y_labels, title='Legend')\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "697474c7-e0ba-49d0-9dcb-65e4538e9c06", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1286164/2162353716.py:42: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", + " df = pd.read_sql(sql, con)\n" + ] + } + ], + "source": [ + "#query to identify which segments have a big discrepancy.\n", + "sql = '''WITH bins AS (\n", + " SELECT\n", + " CASE\n", + " WHEN time_grp = '[00:00:00,24:00:00)' THEN '24hr'\n", + " WHEN (upper(time_grp) - lower(time_grp)) = '01:00:00'::interval THEN '1hr'\n", + " ELSE 'Periods'\n", + " END AS legend,\n", + " bin_range,\n", + " segment_id,\n", + " dt,\n", + " lower(bin_range) AS bin_start,\n", + " upper(bin_range) AS bin_end\n", + " FROM gwolofs.congestion_raw_segments\n", + " WHERE dt >= '2024-12-01' AND dt < '2024-12-02'\n", + " ORDER BY 1, 2\n", + "),\n", + "\n", + "overlap AS (\n", + " SELECT\n", + " segment_id,\n", + " bin_range,\n", + " dt,\n", + " COUNT(*) FILTER (WHERE legend = '24hr') AS count_24hr,\n", + " COUNT(*) FILTER (WHERE legend = '1hr') AS count_1hr,\n", + " COUNT(*) FILTER (WHERE legend = 'Periods') AS count_period\n", + " FROM bins\n", + " GROUP BY 1, 2, 3\n", + ")\n", + "\n", + "SELECT\n", + " segment_id,\n", + " dt,\n", + " COUNT(*) FILTER (WHERE count_24hr = 1 AND count_1hr = 1) / SUM(count_24hr) AS overlap_24hr_1_hr,\n", + " COUNT(*) FILTER (WHERE count_24hr = 1 AND count_period = 1) / SUM(count_24hr) AS overlap_24hr_period\n", + "FROM overlap\n", + "GROUP BY segment_id, dt\n", + "ORDER BY 3'''\n", + "\n", + "try:\n", + " with connect(**dbset) as con:\n", + " df = pd.read_sql(sql, con)\n", + " df.head\n", + "except Exception as e:\n", + " print(\"Error connecting to the database:\", e)\n", + " exit()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5a4f97c3-b3c3-4474-9925-fc908de6e09a", + "metadata": {}, + "outputs": [], + "source": [ + "def make_map(data, basemap_b, centreline_b, highway_b, color, title):\n", + " f, ax = plt.subplots(figsize=(20,20))\n", + " if basemap_b == True:\n", + " basemap.plot(ax=ax, color = 'grey', alpha=0.2)\n", + " if centreline_b == True:\n", + " centreline.plot(ax=ax, color =
'white', alpha=0.2)\n", + " if highway_b == True:\n", + " highway.plot(ax=ax, color = 'white', alpha=0.2) \n", + " \n", + " data.plot(column = 'id', ax=ax, cmap=color)\n", + " NUM_COLORS = len(data.col.unique())\n", + " cm = plt.get_cmap(color)\n", + " colors = [cm(1.*i/NUM_COLORS) for i in range(NUM_COLORS)]\n", + " handles, labels = [], []\n", + "\n", + " legend_id = data.col.unique()\n", + " for i in range(0, NUM_COLORS):\n", + " label_name = legend_id[i]\n", + " handles.append(mpl.patches.Patch(color=colors[i],label=label_name))\n", + " ax.legend(handles=handles,loc='lower right', ncol=1, title = title) \n", + " ax.set_axis_off()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "cce5f66c-01d9-4be6-9881-70eb8d2c994a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/jupyterhub/.venv/lib/python3.10/site-packages/geopandas/io/sql.py:170: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", + " df = pd.read_sql(\n" + ] + } + ], + "source": [ + "import geopandas as gpd\n", + "new_links = '''\n", + "WITH all_link_dirs_24_4 AS(\n", + "\tSELECT link_id || 'F'::text AS link_dir, func_class\n", + "\tFROM here_gis.traffic_streets_24_4\n", + "\tWHERE dir_travel = ANY (ARRAY['F', 'B'])\n", + "\tUNION ALL\n", + "\tSELECT link_id || 'T'::text AS link_dir, func_class\n", + "\tFROM here_gis.traffic_streets_24_4\n", + "\tWHERE dir_travel = ANY (ARRAY['T', 'B'])\n", + "\t)\n", + "\t,all_link_dirs_23_4 AS(\n", + "\tSELECT link_id || 'F'::text AS link_dir, func_class\n", + "\tFROM here_gis.traffic_streets_23_4\n", + "\tWHERE dir_travel = ANY (ARRAY['F', 'B'])\n", + "\tUNION ALL\n", + "\tSELECT link_id || 'T'::text AS link_dir, func_class\n", + "\tFROM here_gis.traffic_streets_23_4\n", + "\tWHERE dir_travel = ANY (ARRAY['T', 'B'])\n", + "\t)\n", + "SELECT link_dir, a.func_class::integer as id, a.func_class::text as col,gis.geopandas_transform(geom) as geom\n", + "FROM (select nu.* from all_link_dirs_23_4 ole\n", + "FULL OUTER JOIN all_link_dirs_24_4 nu on nu.link_dir = ole.link_dir\n", + "WHERE ole.link_dir IS NULL AND nu.link_dir IS NOT NULL)a\n", + "inner join (SELECT sts.link_id || 'F'::text AS link_dir,\n", + " sts.link_id,\n", + "\t\t\tfunc_class,\n", + " sts.geom\n", + " FROM here_gis.streets_24_4 sts\n", + "\t\t \t\tinner join here_gis.traffic_streets_24_4 using (link_id)\n", + " WHERE dir_travel in ('F', 'B')\n", + " UNION ALL\n", + " SELECT sts.link_id || 'T'::text AS link_dir,\n", + " sts.link_id,\n", + "\t\t\tfunc_class,\n", + " st_reverse(sts.geom) AS geom\n", + " FROM here_gis.streets_24_4 sts\n", + "\t\t inner join here_gis.traffic_streets_24_4 using (link_id)\n", + " WHERE dir_travel in ('T', 'B') )streets\n", + "using (link_dir)\n", + "--where ST_geometrytype(geom) = 'ST_LineString' \n", + "order by a.func_class\n", + " '''\n", + "\n", + "new_links = gpd.GeoDataFrame.from_postgis(new_links, con, geom_col='geom')\n", 
+ "new_links = new_links.to_crs('epsg:26917')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "fd3d8266-58d8-47fe-af42-cd163961e90c", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "make_map(new_links, True, False, False, 'magma', 'Functional Class');" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 84b68f29cf7a76b5c9e89ed4eac1198a332b48fd Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 18 Mar 2025 19:08:14 +0000 Subject: [PATCH 29/74] #1132 comments --- here/traffic/here_dynamic_binning_explore.ipynb | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/here/traffic/here_dynamic_binning_explore.ipynb b/here/traffic/here_dynamic_binning_explore.ipynb index b4c1b2a5f..1098f7ef9 100644 --- a/here/traffic/here_dynamic_binning_explore.ipynb +++ b/here/traffic/here_dynamic_binning_explore.ipynb @@ -201,6 +201,17 @@ "plt.show()\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "14483f0e-2cdf-46df-8284-a4662e7c4780", + "metadata": {}, + "outputs": [], + "source": [ + "#show the 95% context..\n", + "#raph's idea: cut off at 15 minutes, use the starting bin hour." 
+ ] + }, { "cell_type": "code", "execution_count": 2, From c5e5f918964cb654c906314f22f19d3e1d8ed6a5 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 24 Mar 2025 17:42:36 +0000 Subject: [PATCH 30/74] #1132 update graph title --- .../here_dynamic_binning_explore.ipynb | 23 ++++++------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/here/traffic/here_dynamic_binning_explore.ipynb b/here/traffic/here_dynamic_binning_explore.ipynb index 1098f7ef9..3658b4290 100644 --- a/here/traffic/here_dynamic_binning_explore.ipynb +++ b/here/traffic/here_dynamic_binning_explore.ipynb @@ -2,19 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "8c6f35d7-fbd6-4336-91e3-4ab18b4009e5", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/jupyterhub/.venv/lib/python3.10/site-packages/geopandas/io/sql.py:170: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", - " df = pd.read_sql(\n" - ] - } - ], + "outputs": [], "source": [ "from pathlib import Path\n", "import configparser\n", @@ -104,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "3e9c7d6d-6e1f-4214-b9cf-dd0b8e16daf5", "metadata": { "scrolled": true @@ -114,7 +105,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_842647/1604576776.py:15: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", + "/tmp/ipykernel_1286164/1604576776.py:15: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", " df = pd.read_sql(sql, con)\n" ] } @@ -151,13 +142,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "8d2ba18c-dc79-4a04-8da6-8139f5772b18", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -184,7 +175,7 @@ "\n", "ax.set_xlabel('Time of Day')\n", "ax.set_ylabel('Legend')\n", - "ax.set_title('Timeline of Bin Ranges Colored by Legend')\n", + "ax.set_title('Timeline of Bin Ranges by time_grp')\n", "ax.set_yticks(range(len(y_labels)))\n", "ax.set_yticklabels(y_labels)\n", "ax.grid(axis='x', linestyle='--', alpha=0.7)\n", From 53be92520a32a446a2b974355610bb094d7d4af8 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 26 Mar 2025 21:03:21 +0000 Subject: [PATCH 31/74] #1132 add cumulative distribution of tt --- .../here_dynamic_binning_explore.ipynb | 237 +++++++++++++++++- 1 file changed, 235 insertions(+), 2 deletions(-) diff --git a/here/traffic/here_dynamic_binning_explore.ipynb b/here/traffic/here_dynamic_binning_explore.ipynb index 3658b4290..d43b57fa1 100644 --- a/here/traffic/here_dynamic_binning_explore.ipynb +++ b/here/traffic/here_dynamic_binning_explore.ipynb @@ -2,15 +2,25 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 19, "id": "8c6f35d7-fbd6-4336-91e3-4ab18b4009e5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/jupyterhub/.venv/lib/python3.10/site-packages/geopandas/io/sql.py:170: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", + " df = pd.read_sql(\n" + ] + } + ], "source": [ "from pathlib import Path\n", "import configparser\n", "from psycopg2 import connect\n", "import struct\n", + "import numpy as np\n", "import pandas as pd\n", "from datetime import datetime\n", "import matplotlib as mpl\n", @@ -192,6 +202,101 @@ "plt.show()\n" ] }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d31bbf9c-f2f6-42c2-a0c6-9c43c8a5aeb9", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1286164/1721737923.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", + " df = pd.read_sql(sql, con)\n" + ] + } + ], + "source": [ + "sql = '''SELECT\n", + " max_bin AS legend,\n", + " lower(bin_range)::time AS bin_start,\n", + " upper(bin_range)::time AS bin_end\n", + "FROM gwolofs.congestion_raw_segments_max_bin_analysis\n", + "WHERE dt >= '2024-12-01' AND dt < '2024-12-02' AND segment_id = 2511\n", + "ORDER BY 1, 2'''\n", + "\n", + "try:\n", + " with connect(**dbset) as con:\n", + " df = pd.read_sql(sql, con)\n", + " # Convert time columns to datetime\n", + "except Exception as e:\n", + " print(\"Error connecting to the database:\", e)\n", + " exit()\n", + "\n", + "# Convert time columns to seconds since midnight\n", + "def time_to_seconds(t):\n", + " return t.hour * 3600 + t.minute * 60 + t.second\n", + "\n", + "df['bin_start'] = df['bin_start'].apply(time_to_seconds)\n", + "df['bin_end'] = df['bin_end'].apply(time_to_seconds)\n", + "df['duration'] = df['bin_end'] - df['bin_start']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3d5d92eb-711d-4b91-b2fd-4a26769b7538", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Assign numeric values for y-axis based on legend\n", + "y_labels = df['legend'].unique()\n", + "y_mapping = {label: i for i, label in enumerate(y_labels)}\n", + "df['y_pos'] = df['legend'].map(y_mapping)\n", + "\n", + "# Define color mapping\n", + "palette = sns.color_palette('viridis', n_colors=len(y_labels))\n", + "legend_colors = {label: palette[i] for i, label in enumerate(y_labels)}\n", + "\n", + "# Plot timeline graph using broken_barh\n", + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "for i, row in df.iterrows():\n", + " ax.broken_barh([(row['bin_start'], row['duration'])], (row['y_pos'] - 0.4, 0.8),\n", + " color=legend_colors[row['legend']], edgecolor='white')\n", + "\n", + "ax.set_xlabel('Time of Day')\n", + "ax.set_ylabel('Legend')\n", + "ax.set_title('Timeline of Bin Ranges by time_grp')\n", + "ax.set_yticks(range(len(y_labels)))\n", + "ax.set_yticklabels(y_labels)\n", + "ax.grid(axis='x', linestyle='--', alpha=0.7)\n", + "\n", + "# Adjust x-ticks to every 3600 seconds (1 hour)\n", + "ax.set_xticks(range(0, 86400, 3600))\n", + "ax.set_xticklabels([f\"{h:02d}:00\" for h in range(24)])\n", + "plt.xticks(rotation=45)\n", + "\n", + "# Create legend\n", + "handles = [plt.Rectangle((0, 0), 1, 1, color=legend_colors[label]) for label in y_labels]\n", + "ax.legend(handles, y_labels, title='Legend')\n", + "\n", + "plt.show()\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -203,6 +308,64 @@ "#raph's idea: cut off at 15 minutes, use the starting bin hour." ] }, + { + "cell_type": "code", + "execution_count": 12, + "id": "6d709c8a-1a4d-4aed-b402-f2eada4b3861", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1286164/271536835.py:14: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", + " df = pd.read_sql(sql, con)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " max_bin count percent_of_original\n", + "0 0 days 00:05:00 5647506 1.000000\n", + "1 0 days 00:10:00 5776554 1.022850\n", + "2 0 days 00:15:00 5793370 1.025828\n", + "3 0 days 00:20:00 5801453 1.027259\n", + "4 0 days 01:00:00 5813682 1.029425\n" + ] + } + ], + "source": [ + "sql = '''WITH count_5min_bins AS (\n", + "SELECT COUNT(*)\n", + "FROM gwolofs.congestion_raw_segments_max_bin_analysis\n", + "WHERE max_bin = '00:05:00'::interval\n", + ")\n", + "\n", + "SELECT max_bin, COUNT(*), COUNT(*) / (SELECT count::numeric FROM count_5min_bins) AS percent_of_original\n", + "FROM gwolofs.congestion_raw_segments_max_bin_analysis\n", + "GROUP BY max_bin\n", + "ORDER BY 1\n", + "'''\n", + "try:\n", + " with connect(**dbset) as con:\n", + " df = pd.read_sql(sql, con)\n", + " print(df)\n", + "except Exception as e:\n", + " print(\"Error connecting to the database:\", e)\n", + " exit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94d6d78f-f20a-4f95-aede-315e058361ce", + "metadata": {}, + "outputs": [], + "source": [ + "This shows 15 minutes -> 1 hour max bin only results in 0.3% more observations." + ] + }, { "cell_type": "code", "execution_count": 2, @@ -268,6 +431,76 @@ "\n" ] }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4966c18a-7678-4913-8d1d-9df2035afe35", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1286164/1152226507.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", + " df = pd.read_sql(sql, con)\n" + ] + } + ], + "source": [ + "sql = '''\n", + "SELECT ROUND(tt, 0) AS tt, COUNT(*)\n", + "FROM gwolofs.congestion_raw_segments_max_bin_analysis\n", + "WHERE max_bin = '00:15:00'::interval\n", + "GROUP BY 1\n", + "ORDER BY 1\n", + "'''\n", + "\n", + "try:\n", + " with connect(**dbset) as con:\n", + " df = pd.read_sql(sql, con)\n", + "except Exception as e:\n", + " print(\"Error connecting to the database:\", e)\n", + " exit()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "6781e1d7-012a-4b43-bbe0-7608008ced36", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Compute cumulative distribution\n", + "df = df.sort_values(by='tt')\n", + "df['cumulative'] = df['count'].cumsum() / df['count'].sum()\n", + "\n", + "# Plot cumulative distribution\n", + "plt.figure(figsize=(10, 6))\n", + "sns.lineplot(x=df['tt'], y=df['cumulative'], color='royalblue')\n", + "\n", + "plt.xlabel('Travel time (s)')\n", + "plt.ylabel('Cumulative Distribution')\n", + "plt.title('Cumulative Distribution of Congestion network travel times')\n", + "plt.xticks(np.arange(0, 241, 20)) # X-axis limits from 0 to 240\n", + "plt.yticks(np.arange(0, 1.05, 0.05), [f'{int(y*100)}%' for y in np.arange(0, 1.05, 0.05)]) # Y-axis every 5%\n", + "plt.xlim(0, 240)\n", + "plt.ylim(0, 1)\n", + "plt.grid(axis='both', linestyle='--', alpha=0.7)\n", + "\n", + "plt.show()" + ] + }, { "cell_type": "code", "execution_count": 30, From 0422ab098322d3b5eca20e82b0635741480dda28 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 1 Apr 2025 18:15:19 +0000 Subject: [PATCH 32/74] #1132 dynamic binning explore --- .../here_dynamic_binning_explore.ipynb | 120 +++--------------- 1 file changed, 18 insertions(+), 102 deletions(-) diff --git a/here/traffic/here_dynamic_binning_explore.ipynb b/here/traffic/here_dynamic_binning_explore.ipynb index d43b57fa1..5acc7016c 100644 --- a/here/traffic/here_dynamic_binning_explore.ipynb +++ b/here/traffic/here_dynamic_binning_explore.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 19, + "execution_count": 2, "id": "8c6f35d7-fbd6-4336-91e3-4ab18b4009e5", "metadata": {}, "outputs": [ @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "id": "def42f51-59b0-42bd-b300-5d1f3e3dee15", "metadata": {}, "outputs": [ @@ -49,7 +49,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_818626/423225491.py:6: UserWarning: pandas only 
supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", + "/tmp/ipykernel_2029777/423225491.py:6: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", " df = pd.read_sql(sql, con)\n" ] } @@ -68,13 +68,13 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "id": "a1a4f215-4fc4-4b0b-8223-870934a9c7f0", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -95,12 +95,13 @@ "sns.barplot(x=df['bin_length'], y=df['proportion'], hue=df['hr'], palette='viridis')\n", "plt.xlabel('Bin Size')\n", "plt.ylabel('Proportion')\n", - "plt.title('Congestion Proportions by Bin Size and Hour')\n", + "plt.title('Proportions of bins by duration and time of day')\n", "plt.yticks([i * 0.01 for i in range(0, 100, 5)])\n", "plt.xticks(rotation=45)\n", "plt.legend(title='Hour of the Day')\n", "plt.grid(axis='y', linestyle='--', alpha=0.7)\n", - "plt.show()\n" + "plt.show()\n", + "#note, higher proportion of longer bins before ~8am" ] }, { @@ -204,7 +205,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "id": "d31bbf9c-f2f6-42c2-a0c6-9c43c8a5aeb9", "metadata": { "scrolled": true @@ -214,7 +215,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1286164/1721737923.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", + "/tmp/ipykernel_2029777/1721737923.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", " df = pd.read_sql(sql, con)\n" ] } @@ -247,13 +248,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "id": "3d5d92eb-711d-4b91-b2fd-4a26769b7538", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -280,7 +281,7 @@ "\n", "ax.set_xlabel('Time of Day')\n", "ax.set_ylabel('Legend')\n", - "ax.set_title('Timeline of Bin Ranges by time_grp')\n", + "ax.set_title('Timeline of Bin Ranges for Different Max Bin Lengths')\n", "ax.set_yticks(range(len(y_labels)))\n", "ax.set_yticklabels(y_labels)\n", "ax.grid(axis='x', linestyle='--', alpha=0.7)\n", @@ -433,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "id": "4966c18a-7678-4913-8d1d-9df2035afe35", "metadata": {}, "outputs": [ @@ -441,7 +442,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1286164/1152226507.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", + "/tmp/ipykernel_2029777/1152226507.py:11: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", " df = pd.read_sql(sql, con)\n" ] } @@ -465,13 +466,13 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "id": "6781e1d7-012a-4b43-bbe0-7608008ced36", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -491,7 +492,7 @@ "\n", "plt.xlabel('Travel time (s)')\n", "plt.ylabel('Cumulative Distribution')\n", - "plt.title('Cumulative Distribution of Congestion network travel times')\n", + "plt.title('Cumulative Distribution of Congestion network travel times (for max_bin = ''00:15:00'')')\n", "plt.xticks(np.arange(0, 241, 20)) # X-axis limits from 0 to 240\n", "plt.yticks(np.arange(0, 1.05, 0.05), [f'{int(y*100)}%' for y in np.arange(0, 1.05, 0.05)]) # Y-axis every 5%\n", "plt.xlim(0, 240)\n", @@ -530,91 +531,6 @@ " ax.legend(handles=handles,loc='lower right', ncol=1, title = title) \n", " ax.set_axis_off()" ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "cce5f66c-01d9-4be6-9881-70eb8d2c994a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/jupyterhub/.venv/lib/python3.10/site-packages/geopandas/io/sql.py:170: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", - " df = pd.read_sql(\n" - ] - } - ], - "source": [ - "import geopandas as gpd\n", - "new_links = '''\n", - "WITH all_link_dirs_24_4 AS(\n", - "\tSELECT link_id || 'F'::text AS link_dir, func_class\n", - "\tFROM here_gis.traffic_streets_24_4\n", - "\tWHERE dir_travel = ANY (ARRAY['F', 'B'])\n", - "\tUNION ALL\n", - "\tSELECT link_id || 'T'::text AS link_dir, func_class\n", - "\tFROM here_gis.traffic_streets_24_4\n", - "\tWHERE dir_travel = ANY (ARRAY['T', 'B'])\n", - "\t)\n", - "\t,all_link_dirs_23_4 AS(\n", - "\tSELECT link_id || 'F'::text AS link_dir, func_class\n", - "\tFROM here_gis.traffic_streets_23_4\n", - "\tWHERE dir_travel = ANY (ARRAY['F', 'B'])\n", - "\tUNION ALL\n", - "\tSELECT link_id || 'T'::text AS link_dir, func_class\n", - "\tFROM here_gis.traffic_streets_23_4\n", - "\tWHERE dir_travel = ANY (ARRAY['T', 'B'])\n", - "\t)\n", - "SELECT link_dir, a.func_class::integer as id, a.func_class::text as col,gis.geopandas_transform(geom) as geom\n", - "FROM (select nu.* from all_link_dirs_23_4 ole\n", - "FULL OUTER JOIN all_link_dirs_24_4 nu on nu.link_dir = ole.link_dir\n", - "WHERE ole.link_dir IS NULL AND nu.link_dir IS NOT NULL)a\n", - "inner join (SELECT sts.link_id || 'F'::text AS link_dir,\n", - " sts.link_id,\n", - "\t\t\tfunc_class,\n", - " sts.geom\n", - " FROM here_gis.streets_24_4 sts\n", - "\t\t \t\tinner join here_gis.traffic_streets_24_4 using (link_id)\n", - " WHERE dir_travel in ('F', 'B')\n", - " UNION ALL\n", - " SELECT sts.link_id || 'T'::text AS link_dir,\n", - " sts.link_id,\n", - "\t\t\tfunc_class,\n", - " st_reverse(sts.geom) AS geom\n", - " FROM here_gis.streets_24_4 sts\n", - "\t\t inner join here_gis.traffic_streets_24_4 using (link_id)\n", - " WHERE dir_travel in ('T', 'B') )streets\n", - "using (link_dir)\n", - "--where ST_geometrytype(geom) = 'ST_LineString' \n", - "order by a.func_class\n", - " '''\n", - "\n", - "new_links = gpd.GeoDataFrame.from_postgis(new_links, con, geom_col='geom')\n", 
- "new_links = new_links.to_crs('epsg:26917')" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "fd3d8266-58d8-47fe-af42-cd163961e90c", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "make_map(new_links, True, False, False, 'magma', 'Functional Class');" - ] } ], "metadata": { From 40f9c99a89c97afa45ea3faa2d4b4e8dd215241a Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 8 Apr 2025 18:05:37 +0000 Subject: [PATCH 33/74] add trigger_dags_tasks to pull_here_path #1132 --- dags/pull_here_path.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/dags/pull_here_path.py b/dags/pull_here_path.py index aeaa4d8de..78f8f6c41 100644 --- a/dags/pull_here_path.py +++ b/dags/pull_here_path.py @@ -3,10 +3,11 @@ import pendulum from datetime import timedelta -from airflow.decorators import task, dag +from airflow.decorators import task, dag, task_group from airflow.hooks.base import BaseHook from airflow.models import Variable from airflow.macros import ds_add, ds_format +from airflow.operators.trigger_dagrun import TriggerDagRunOperator try: repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) @@ -85,6 +86,21 @@ def get_download_link(request_id: str, access_token: str): def load_data()->str: return '''curl $DOWNLOAD_URL | gunzip | psql -h $HOST -U $LOGIN -d bigdata -c "\\COPY here.ta_path_view FROM STDIN WITH (FORMAT csv, HEADER TRUE);" ''' - load_data() + # Create a task group for triggering the DAGs + @task_group + def trigger_dags_tasks(): + # Define TriggerDagRunOperator for each DAG to trigger + trigger_operators = [] + DAGS_TO_TRIGGER = Variable.get('here_path_dag_triggers', deserialize_json=True) + for dag_id in DAGS_TO_TRIGGER: + trigger_operator = TriggerDagRunOperator( + task_id=f'trigger_{dag_id}', + trigger_dag_id=dag_id, + logical_date='{{ ds }}', + reset_dag_run=True # Clear existing dag if already exists (for backfilling), old runs will not be in the logs + ) + trigger_operators.append(trigger_operator) + + load_data() >> 
trigger_dags_tasks() pull_here_path() \ No newline at end of file From 3ea3d8ad63d91c33cff307c13ed96cde9a243686 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 8 Apr 2025 18:27:17 +0000 Subject: [PATCH 34/74] #1132 add here_dynamic_binning_agg DAG --- dags/here_dynamic_binning_agg.py | 61 ++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 dags/here_dynamic_binning_agg.py diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py new file mode 100644 index 000000000..cea208e63 --- /dev/null +++ b/dags/here_dynamic_binning_agg.py @@ -0,0 +1,61 @@ +import sys +import os +import logging +import pendulum +from datetime import timedelta + +from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator +from airflow.models import Variable +from airflow.decorators import dag, task + +try: + repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + sys.path.insert(0, repo_path) + from dags.dag_functions import task_fail_slack_alert +except: + raise ImportError("Cannot import slack alert functions") + +LOGGER = logging.getLogger(__name__) +logging.basicConfig(level=logging.DEBUG) + +doc_md = "This DAG is running off the `1132-here-aggregation-proposal` branch to test dynamic binning aggregation." 
+DAG_NAME = 'here_dynamic_binning_agg' +DAG_OWNERS = Variable.get('dag_owners', deserialize_json=True).get(DAG_NAME, ["Unknown"]) + +# Slack alert +SLACK_CONN_ID = 'slack_data_pipeline' + +default_args = { + 'owner': ','.join(DAG_OWNERS), + 'depends_on_past':False, + 'start_date': pendulum.datetime(2025, 4, 1, tz="America/Toronto"), + 'email_on_failure': False, + 'email_on_success': False, + 'retries': 0, + 'retry_delay': timedelta(minutes=5), + 'on_failure_callback': task_fail_slack_alert +} + +@dag( + DAG_NAME, + default_args=default_args, + schedule=None, # triggered by `pull_here_path` DAG + doc_md = doc_md, + tags=["HERE", "aggregation"], + catchup=False +) + +#to add: catchup, one task at a time, depends on past. + +def here_dynamic_binning_agg(): + + aggregate_daily = SQLExecuteQueryOperator( + #sql="SELECT covid.generate_citywide_tti( '{{macros.ds_add(ds, -1)}}' )", + task_id='aggregate_daily', + conn_id='congestion_bot', + autocommit=True, + retries = 0 + ) + aggregate_daily + +here_dynamic_binning_agg() \ No newline at end of file From 17037fdd69cfb83bf069416c11185e8a92b71026 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:22:23 +0000 Subject: [PATCH 35/74] #1132 smol dag changes --- dags/here_dynamic_binning_agg.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index cea208e63..e813e276b 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -22,9 +22,6 @@ DAG_NAME = 'here_dynamic_binning_agg' DAG_OWNERS = Variable.get('dag_owners', deserialize_json=True).get(DAG_NAME, ["Unknown"]) -# Slack alert -SLACK_CONN_ID = 'slack_data_pipeline' - default_args = { 'owner': ','.join(DAG_OWNERS), 'depends_on_past':False, @@ -50,7 +47,7 @@ def here_dynamic_binning_agg(): aggregate_daily = SQLExecuteQueryOperator( - #sql="SELECT covid.generate_citywide_tti( '{{macros.ds_add(ds, 
-1)}}' )", + sql="SELECT '{{macros.ds_add(ds, -1)}}';", task_id='aggregate_daily', conn_id='congestion_bot', autocommit=True, From b3f8dac84d44591c2639d42c1d17e9ac99336e5c Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:27:41 +0000 Subject: [PATCH 36/74] #1132 fix schedule error from rebase --- dags/pull_here_path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/pull_here_path.py b/dags/pull_here_path.py index 78f8f6c41..6b2f0fdc7 100644 --- a/dags/pull_here_path.py +++ b/dags/pull_here_path.py @@ -46,7 +46,7 @@ @dag(dag_id = dag_name, default_args=default_args, - schedule='30 10 * * *' , + schedule='0 17 * * * ', catchup=False, doc_md = doc_md, tags=["HERE", "data_pull"] From ec107527734c1ac748ffb64d44971ae988e0d115 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 9 Apr 2025 16:14:17 +0000 Subject: [PATCH 37/74] #1132 add variable to test_dags --- test/integration/test_dags.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/integration/test_dags.py b/test/integration/test_dags.py index ce574db40..b41571003 100644 --- a/test/integration/test_dags.py +++ b/test/integration/test_dags.py @@ -27,6 +27,7 @@ 'AIRFLOW_VAR_COLLISIONS_TABLES': "["+",".join([f'["src_schema.table_{i}", "dst_schema.table_{i}"]' for i in range(0, 2)])+"]", 'AIRFLOW_VAR_COUNTS_TABLES': "["+",".join([f'["src_schema.table_{i}", "dst_schema.table_{i}"]' for i in range(0, 3)])+"]", 'AIRFLOW_VAR_HERE_DAG_TRIGGERS': "["+",".join([f'"dag_{i}"' for i in range(0, 3)])+"]", + 'AIRFLOW_VAR_HERE_PATH_DAG_TRIGGERS': "["+",".join([f'"dag_{i}"' for i in range(0, 3)])+"]", 'AIRFLOW_VAR_REPLICATORS': '{"dag": {"dag_name": "value", "tables": "value", "conn": "value"}}', 'AIRFLOW_VAR_TEST_DAG_TRIGGERS': "["+",".join([f'"dag_{i}"' for i in range(0, 3)])+"]", 'AIRFLOW_VAR_GCC_DAGS': '{"dag": {"conn": "value", "deployments": ["value"]}}', From 
57496b57d2167c79e52887cce7415863414ee6ac Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 9 Apr 2025 16:46:50 +0000 Subject: [PATCH 38/74] #1132 remove time_grp, add bin_start, add temp table to reduce constraint size --- .../create-table-congestion_raw_segments.sql | 42 +++++--- ...unction-congestion_network_segment_agg.sql | 102 ++++++++++-------- 2 files changed, 82 insertions(+), 62 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql index f73ecb566..940e6bf0f 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql @@ -6,19 +6,12 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_segments ( segment_id integer NOT NULL, dt date NOT NULL, - time_grp timerange NOT NULL, + bin_start timestamp without time zone NOT NULL, bin_range tsrange NOT NULL, tt numeric, num_obs integer, - CONSTRAINT congestion_raw_segments_exclude EXCLUDE USING gist ( - segment_id WITH =, - dt WITH =, - time_grp WITH =, - bin_range WITH && - ) -) - -TABLESPACE pg_default; + CONSTRAINT congestion_raw_segments_pkey PRIMARY KEY (segment_id, dt, bin_start) +) PARTITION BY RANGE (dt); ALTER TABLE IF EXISTS gwolofs.congestion_raw_segments OWNER TO gwolofs; @@ -28,23 +21,40 @@ REVOKE ALL ON TABLE gwolofs.congestion_raw_segments FROM bdit_humans; GRANT SELECT ON TABLE gwolofs.congestion_raw_segments TO bdit_humans; GRANT ALL ON TABLE gwolofs.congestion_raw_segments TO gwolofs; - -- Index: congestion_raw_segments_dt_idx -- DROP INDEX IF EXISTS gwolofs.congestion_raw_segments_dt_idx; CREATE INDEX IF NOT EXISTS congestion_raw_segments_dt_idx ON gwolofs.congestion_raw_segments USING brin -(dt) -TABLESPACE pg_default; +(dt); -- Index: congestion_raw_segments_segment_dt_idx -- DROP INDEX IF EXISTS 
gwolofs.congestion_raw_segments_segment_dt_idx; CREATE INDEX IF NOT EXISTS congestion_raw_segments_segment_dt_idx ON gwolofs.congestion_raw_segments USING btree -(segment_id ASC NULLS LAST, dt ASC NULLS LAST) +(segment_id ASC NULLS LAST, bin_start ASC NULLS LAST); + +-- Partitions SQL + +CREATE TABLE gwolofs.congestion_raw_segments_2023 PARTITION OF gwolofs.congestion_raw_segments +FOR VALUES FROM ('2023-01-01') TO ('2024-01-01') TABLESPACE pg_default; -COMMENT ON TABLE gwolofs.congestion_raw_corridors IS -'Stores dynamic binning results from standard HERE congestion network travel time aggregations.'; +ALTER TABLE IF EXISTS gwolofs.congestion_raw_segments_2023 +OWNER TO gwolofs; + +CREATE TABLE gwolofs.congestion_raw_segments_2024 PARTITION OF gwolofs.congestion_raw_segments +FOR VALUES FROM ('2024-01-01') TO ('2025-01-01') +TABLESPACE pg_default; + +ALTER TABLE IF EXISTS gwolofs.congestion_raw_segments_2024 +OWNER TO gwolofs; + +CREATE TABLE gwolofs.congestion_raw_segments_2025 PARTITION OF gwolofs.congestion_raw_segments +FOR VALUES FROM ('2025-01-01') TO ('2026-01-01') +TABLESPACE pg_default; + +ALTER TABLE IF EXISTS gwolofs.congestion_raw_segments_2025 +OWNER TO gwolofs; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index 1b1724675..e8c137cdf 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -18,6 +18,20 @@ DECLARE BEGIN +--using a temp table to aply the exclusion constraint should prevent the +--insert from getting bogged down by large constraint on main table over time +CREATE TEMPORARY TABLE congestion_raw_segments_temp ( + segment_id integer NOT NULL, + bin_start timestamp without time zone NOT NULL, + bin_range tsrange NOT NULL, + tt numeric, + num_obs integer, + CONSTRAINT congestion_raw_segments_exclude_temp EXCLUDE 
USING gist ( + bin_range WITH &&, + segment_id WITH = + ) +); + EXECUTE FORMAT( $$ WITH segments AS ( @@ -32,7 +46,6 @@ EXECUTE FORMAT( segment_5min_bins AS ( SELECT links.segment_id, - timerange(tg.start_tod, tg.end_tod, '[)') AS time_grp, ta.tx, RANK() OVER w AS bin_rank, links.total_length, @@ -44,21 +57,16 @@ EXECUTE FORMAT( ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, ARRAY_AGG(links.length ORDER BY link_dir) AS lengths FROM here.ta_path AS ta - JOIN gwolofs.congestion_time_grps AS tg ON - ta.tx >= %1$L::date + tg.start_tod - AND ta.tx < %1$L::date + tg.end_tod JOIN segments AS links USING (link_dir) WHERE ta.dt >= %1$L::date AND ta.dt < %1$L::date + interval '1 day' GROUP BY links.segment_id, - tg.start_tod, - tg.end_tod, ta.tx, links.total_length WINDOW w AS ( - PARTITION BY links.segment_id, tg.start_tod, tg.end_tod + PARTITION BY links.segment_id ORDER BY ta.tx ) ), @@ -69,7 +77,6 @@ EXECUTE FORMAT( --also don't generate options past the next bin with 80%% length SELECT tx, - time_grp, segment_id, bin_rank AS start_bin, --generate all the options for the end bin within the group. 
@@ -89,7 +96,7 @@ EXECUTE FORMAT( ) AS end_bin FROM segment_5min_bins WINDOW w AS ( - PARTITION BY time_grp, segment_id + PARTITION BY segment_id ORDER BY tx --look only forward for end_bin options RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING @@ -98,7 +105,6 @@ EXECUTE FORMAT( unnested_db_options AS ( SELECT - dbo.time_grp, dbo.segment_id, s5b.total_length, dbo.tx AS dt_start, @@ -110,63 +116,67 @@ EXECUTE FORMAT( SUM(s5b.num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir FROM dynamic_bin_options AS dbo LEFT JOIN segment_5min_bins AS s5b - ON s5b.time_grp = dbo.time_grp - AND s5b.segment_id = dbo.segment_id + ON s5b.segment_id = dbo.segment_id AND s5b.bin_rank >= dbo.start_bin AND s5b.bin_rank <= dbo.end_bin --this join is used to get the tx info about the last bin only LEFT JOIN segment_5min_bins AS s5b_end - ON s5b_end.time_grp = dbo.time_grp - AND s5b_end.segment_id = dbo.segment_id + ON s5b_end.segment_id = dbo.segment_id AND s5b_end.bin_rank = dbo.end_bin, --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) - --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' + --dynamic bins should not exceed 15 minutes (dt_end <= dt_start + 15 min) + WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '15 minutes' GROUP BY - dbo.time_grp, dbo.segment_id, s5b.total_length, dbo.tx, --stard_bin s5b_end.tx, --end_bin unnested.link_dir, unnested.len + ), + + inserted AS ( + --this query contains overlapping values which get eliminated + --via on conflict with the exclusion constraint on congestion_raw_segments table. 
+ INSERT INTO congestion_raw_segments_temp AS inserted ( + bin_start, segment_id, bin_range, tt, num_obs + ) + --distinct on ensures only the shortest option gets proposed for insert + SELECT DISTINCT ON (segment_id, dt_start) + dt_start AS bin_start, + segment_id, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment + FROM unnested_db_options + GROUP BY + segment_id, + dt_start, + dt_end, + total_length + HAVING SUM(len) >= 0.8 * total_length + ORDER BY + segment_id, + dt_start, + dt_end --uses the option that ends first + --exclusion constraint + ordered insert to prevent overlapping bins + ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude_temp + DO NOTHING + RETURNING inserted.bin_start, inserted.segment_id, inserted.bin_range, inserted.tt, inserted.num_obs ) + + INSERT INTO gwolofs.congestion_raw_segments (dt, bin_start, segment_id, bin_range, tt, num_obs) + SELECT bin_start::date AS dt, bin_start, segment_id, bin_range, tt, num_obs + FROM inserted; - --this query contains overlapping values which get eliminated - --via on conflict with the exclusion constraint on congestion_raw_segments table. 
- INSERT INTO gwolofs.congestion_raw_segments ( - dt, time_grp, segment_id, bin_range, tt, num_obs - ) - --distinct on ensures only the shortest option gets proposed for insert - SELECT DISTINCT ON (time_grp, segment_id, dt_start) - dt_start::date AS dt, - time_grp, - segment_id, - tsrange(dt_start, dt_end, '[)') AS bin_range, - total_length / SUM(len) * SUM(tt) AS tt, - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment - FROM unnested_db_options - GROUP BY - time_grp, - segment_id, - dt_start, - dt_end, - total_length - HAVING SUM(len) >= 0.8 * total_length - ORDER BY - time_grp, - segment_id, - dt_start, - dt_end --uses the option that ends first - --exclusion constraint + ordered insert to prevent overlapping bins - ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude - DO NOTHING; $$, start_date, congestion_network_table ); + DROP TABLE congestion_raw_segments_temp; + END; $BODY$; From d1ccac35afab4869549fef15e2fa7856e5feee0d Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 9 Apr 2025 21:03:47 +0000 Subject: [PATCH 39/74] #1132 add trigger command for here_dynamic_binning_agg --- dags/here_dynamic_binning_agg.py | 18 +++++++++++++++--- dags/pull_here_path.py | 2 +- ...function-congestion_network_segment_agg.sql | 2 ++ .../function-congestion_select_map_version.sql | 2 ++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index e813e276b..ec2af86c9 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -1,3 +1,14 @@ +''' +To trigger for past date (range) use CLI: +for i in {0..5}; do + end_date=$(date -I -d "2023-11-02 +$i days") + airflow dags trigger -e "${end_date}" here_dynamic_binning_agg +done + +or trigger just one day: airflow dags trigger -e 2023-11-02 here_dynamic_binning_agg +`airflow dags backfill ...` doesn't work because there are no scheduled run dates in 
that range. +''' + import sys import os import logging @@ -25,7 +36,7 @@ default_args = { 'owner': ','.join(DAG_OWNERS), 'depends_on_past':False, - 'start_date': pendulum.datetime(2025, 4, 1, tz="America/Toronto"), + 'start_date': pendulum.datetime(2023, 1, 1, tz="America/Toronto"), 'email_on_failure': False, 'email_on_success': False, 'retries': 0, @@ -39,15 +50,16 @@ schedule=None, # triggered by `pull_here_path` DAG doc_md = doc_md, tags=["HERE", "aggregation"], + max_active_runs=1, catchup=False ) #to add: catchup, one task at a time, depends on past. def here_dynamic_binning_agg(): - aggregate_daily = SQLExecuteQueryOperator( - sql="SELECT '{{macros.ds_add(ds, -1)}}';", + sql=["DELETE FROM gwolofs.congestion_raw_segments WHERE dt = '{{ ds }}'", + "SELECT gwolofs.congestion_network_segment_agg('{{ ds }}'::date);"], task_id='aggregate_daily', conn_id='congestion_bot', autocommit=True, diff --git a/dags/pull_here_path.py b/dags/pull_here_path.py index 6b2f0fdc7..ee6888009 100644 --- a/dags/pull_here_path.py +++ b/dags/pull_here_path.py @@ -96,7 +96,7 @@ def trigger_dags_tasks(): trigger_operator = TriggerDagRunOperator( task_id=f'trigger_{dag_id}', trigger_dag_id=dag_id, - logical_date='{{ ds }}', + logical_date='{{macros.ds_add(ds, 1)}}', reset_dag_run=True # Clear existing dag if already exists (for backfilling), old runs will not be in the logs ) trigger_operators.append(trigger_operator) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index e8c137cdf..4d89920fc 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -183,6 +183,8 @@ $BODY$; ALTER FUNCTION gwolofs.congestion_network_segment_agg(date) OWNER TO gwolofs; +GRANT EXECUTE ON FUNCTION gwolofs.congestion_network_segment_agg(date) TO congestion_bot; + COMMENT ON FUNCTION 
gwolofs.congestion_network_segment_agg(date) IS 'Dynamic bin aggregation of the congestion network by hour and time periods. Takes around 10 minutes to run for one day (hourly and period based aggregation)'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql index 5cdcb6ac7..5999cb713 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql @@ -28,5 +28,7 @@ $BODY$; ALTER FUNCTION gwolofs.congestion_select_map_version(date, date) OWNER TO gwolofs; +GRANT EXECUTE ON FUNCTION gwolofs.congestion_select_map_version(date, date) TO congestion_bot; + COMMENT ON FUNCTION gwolofs.congestion_select_map_version IS 'Implement TT App selectMapVersion.py'; From 98d7b95276d9a579e02c13d79f72d4644843c5c6 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Fri, 11 Apr 2025 16:14:24 +0000 Subject: [PATCH 40/74] #1132 add temp aggregation end date --- dags/here_dynamic_binning_agg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index ec2af86c9..4c8301df3 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -37,6 +37,8 @@ 'owner': ','.join(DAG_OWNERS), 'depends_on_past':False, 'start_date': pendulum.datetime(2023, 1, 1, tz="America/Toronto"), + #aggregation doesn't work on 24_4 yet (no congestion.network_links_24_4) + 'end_date': pendulum.datetime(2025, 3, 17, tz="America/Toronto"), 'email_on_failure': False, 'email_on_success': False, 'retries': 0, From 9e37d4a1e4513b378bde87791c4652025cc4cfa0 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 3 Jun 2025 19:49:06 +0000 Subject: [PATCH 41/74] #1132 an overloaded/simplified version of congestion_cache_tt_results_daily --- 
.../traffic/sql/dynamic_bins/corridor_agg.sql | 14 ++++++++++ .../function-congestion_cache_tt_results.sql | 27 +++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 here/traffic/sql/dynamic_bins/corridor_agg.sql diff --git a/here/traffic/sql/dynamic_bins/corridor_agg.sql b/here/traffic/sql/dynamic_bins/corridor_agg.sql new file mode 100644 index 000000000..5e81033f1 --- /dev/null +++ b/here/traffic/sql/dynamic_bins/corridor_agg.sql @@ -0,0 +1,14 @@ +--test: 35 projects, 1 day = 47s +SELECT gwolofs.congestion_cache_tt_results_daily( + node_start := congestion_corridors.node_start, + node_end := congestion_corridors.node_end, + start_date := dates.dt::date +) +FROM gwolofs.congestion_corridors +JOIN gwolofs.congestion_projects USING (project_id), +generate_series('2025-02-01', '2025-02-28', '1 day'::interval) AS dates(dt) +WHERE + congestion_projects.description IN ( + 'bluetooth_corridors', 'scrutinized-cycleway-corridors' + ) + AND map_version = '23_4'; \ No newline at end of file diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 4529296df..105f2766d 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -200,3 +200,30 @@ OWNER TO gwolofs; COMMENT ON FUNCTION gwolofs.congestion_cache_tt_results IS 'Caches the dynamic binning results for a request.'; + +-- overload the function for more straightforward situation of daily corridor agg +CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_tt_results_daily( + start_date date, + node_start bigint, + node_end bigint +) +RETURNS void +LANGUAGE sql +COST 100 +VOLATILE PARALLEL UNSAFE +AS +$BODY$ +SELECT gwolofs.congestion_cache_tt_results( + uri_string := NULL::text, + start_date := congestion_cache_tt_results_daily.start_date, + end_date := congestion_cache_tt_results_daily.start_date 
+ 1, + start_tod := '00:00'::time without time zone, + end_tod := '24:00'::time without time zone, + dow_list := ARRAY[extract('isodow' from congestion_cache_tt_results_daily.start_date)]::int[], + node_start := congestion_cache_tt_results_daily.node_start, + node_end := congestion_cache_tt_results_daily.node_end, + holidays := True) +$BODY$; + +COMMENT ON FUNCTION gwolofs.congestion_cache_tt_results_daily +IS 'A simplified version of `congestion_cache_tt_results` for aggregating entire days of data.'; \ No newline at end of file From 79b7cf75d490ab991e282a7e95893add63ea1b46 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 3 Jun 2025 19:49:44 +0000 Subject: [PATCH 42/74] #1132 add project table, fkey, and corridor descriptions --- .../create-table-congestion_corridors.sql | 11 +++++++++- .../create-table-congestion_projects.sql | 22 +++++++++++++++++++ .../function-congestion_cache_corridor.sql | 12 ++++++---- 3 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 here/traffic/sql/dynamic_bins/create-table-congestion_projects.sql diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql index 529d26905..827bb6710 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql @@ -12,7 +12,16 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_corridors node_start bigint NOT NULL, node_end bigint NOT NULL, map_version text COLLATE pg_catalog."default" NOT NULL, - CONSTRAINT congestion_corridors_pkey PRIMARY KEY (node_start, node_end, map_version) + corridor_streets text COLLATE pg_catalog."default", + corridor_start text COLLATE pg_catalog."default", + corridor_end text COLLATE pg_catalog."default", + project_id integer, + CONSTRAINT congestion_corridors_pkey PRIMARY KEY (node_start, node_end, map_version), + CONSTRAINT
project_id_fk FOREIGN KEY (project_id) + REFERENCES gwolofs.congestion_projects (project_id) MATCH SIMPLE + ON UPDATE NO ACTION + ON DELETE NO ACTION + NOT VALID ) TABLESPACE pg_default; diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_projects.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_projects.sql new file mode 100644 index 000000000..67ae3a2bc --- /dev/null +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_projects.sql @@ -0,0 +1,22 @@ +-- Table: gwolofs.congestion_projects + +-- DROP TABLE IF EXISTS gwolofs.congestion_projects; + +CREATE TABLE IF NOT EXISTS gwolofs.congestion_projects +( + project_id integer NOT NULL DEFAULT nextval('congestion_projects_project_id_seq'::regclass), + description text COLLATE pg_catalog."default" NOT NULL, + CONSTRAINT congestion_projects_pkey PRIMARY KEY (project_id), + CONSTRAINT unique_prj_description UNIQUE NULLS NOT DISTINCT (description) +) + +TABLESPACE pg_default; + +ALTER TABLE IF EXISTS gwolofs.congestion_projects +OWNER TO gwolofs; + +REVOKE ALL ON TABLE gwolofs.congestion_projects FROM bdit_humans; + +GRANT SELECT ON TABLE gwolofs.congestion_projects TO bdit_humans; + +GRANT ALL ON TABLE gwolofs.congestion_projects TO gwolofs; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql index 9840dfd7b..9d4878198 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql @@ -20,6 +20,7 @@ AS $BODY$ DECLARE routing_function text := 'get_links_btwn_nodes_' || map_version; street_geoms_table text := 'routing_streets_' || map_version; + traffic_streets_table text := 'traffic_streets_' || map_version; BEGIN @@ -49,7 +50,7 @@ EXECUTE format ( ) INSERT INTO gwolofs.congestion_corridors ( - node_start, node_end, map_version, link_dirs, lengths, geom, total_length + node_start, node_end, 
map_version, link_dirs, lengths, geom, total_length, corridor_streets ) SELECT %2$L AS node_start, @@ -59,9 +60,11 @@ EXECUTE format ( --lengths in m ARRAY_AGG(st_length(st_transform(streets.geom, 2952)) ORDER BY rl.seq) AS lengths, st_union(st_linemerge(streets.geom)) AS geom, - SUM(ST_Length(ST_Transform(streets.geom, 2952))) AS total_length + SUM(ST_Length(ST_Transform(streets.geom, 2952))) AS total_length, + string_agg(DISTINCT initcap(traffic_streets.st_name), ' / ') AS corridor_streets FROM routed_links AS rl JOIN here.%5$I AS streets USING (link_dir) + LEFT JOIN here_gis.%6$I AS traffic_streets USING (link_id) --conflict would occur because of null values ON CONFLICT (node_start, node_end, map_version) DO UPDATE @@ -72,8 +75,9 @@ EXECUTE format ( RETURNING corridor_id, link_dirs, lengths, total_length $$, routing_function, node_start, node_end, -- For routed_links - map_version, -- For INSERT SELECT values - street_geoms_table -- For JOIN table + map_version, -- For INSERT / SELECT values + street_geoms_table, -- For JOIN here.%5$I AS streets + traffic_streets_table -- For LEFT JOIN here_gis.%6$I AS traffic_streets ) INTO corridor_id, link_dirs, lengths, total_length; RETURN; END; From b3a49f84207e4a89e029dd3d5a790d0e4c2d4918 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 3 Jun 2025 19:50:01 +0000 Subject: [PATCH 43/74] #1132 project / corridor demo --- .../insert_projects_and_corridors.sql | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 here/traffic/sql/dynamic_bins/insert_projects_and_corridors.sql diff --git a/here/traffic/sql/dynamic_bins/insert_projects_and_corridors.sql b/here/traffic/sql/dynamic_bins/insert_projects_and_corridors.sql new file mode 100644 index 000000000..62c107fc0 --- /dev/null +++ b/here/traffic/sql/dynamic_bins/insert_projects_and_corridors.sql @@ -0,0 +1,46 @@ +--for naming corridor_streets. 
+--need help with corridor_start and corridor_end locations - not sure how to turn here nodes into names. Intersection conflation? +WITH named_corridors AS ( + SELECT corridor_id, string_agg(DISTINCT initcap(st_name), ' / ') AS corridor_streets + FROM gwolofs.congestion_corridors, + UNNEST (congestion_corridors.link_dirs) AS unnested(link_dir) + LEFT JOIN here_gis.traffic_streets_24_4 ON link_id = trim(trailing 'T|F' from link_dir)::int + WHERE map_version = '24_4' + GROUP BY corridor_id + ORDER BY corridor_id DESC +) + +UPDATE gwolofs.congestion_corridors AS cc +SET corridor_streets = nc.corridor_streets +FROM named_corridors AS nc +WHERE nc.corridor_id = cc.corridor_id; + +--look at bluetooth corridors +REFRESH MATERIALIZED VIEW bluetooth.here_cn_23_4_lookup; + +--cache project +WITH project AS ( + INSERT INTO gwolofs.congestion_projects (description) + VALUES ('bluetooth_corridors') + RETURNING project_id +), + +--cache corridors, repeat with multiple map versions +corridors AS ( + SELECT corridor_id + FROM bluetooth.here_cn_23_4_lookup AS bt, + gwolofs.congestion_cache_corridor(bt.here_fnode, bt.here_tnode, '24_4') +) + +--add project_id to corridors +UPDATE gwolofs.congestion_corridors +SET project_id = (SELECT project_id FROM project) +WHERE corridor_id IN (SELECT corridor_id FROM corridors) +RETURNING corridor_id; + +--examine the projects +SELECT congestion_corridors.* +FROM gwolofs.congestion_corridors +JOIN gwolofs.congestion_projects USING (project_id) +WHERE congestion_projects.description IN ('bluetooth_corridors', 'scrutinized-cycleway-corridors') + From c3501a7e91e837b00c5191e3580b786d1f6dfbcc Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 4 Jun 2025 20:24:59 +0000 Subject: [PATCH 44/74] #1132 add node start/end names to corridors --- .../function-congestion_cache_corridor.sql | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git 
a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql index 9d4878198..da926770f 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql @@ -21,6 +21,7 @@ DECLARE routing_function text := 'get_links_btwn_nodes_' || map_version; street_geoms_table text := 'routing_streets_' || map_version; traffic_streets_table text := 'traffic_streets_' || map_version; + routing_nodes_table text := 'routing_nodes_' || map_version; BEGIN @@ -50,7 +51,8 @@ EXECUTE format ( ) INSERT INTO gwolofs.congestion_corridors ( - node_start, node_end, map_version, link_dirs, lengths, geom, total_length, corridor_streets + node_start, node_end, map_version, link_dirs, lengths, geom, + total_length, corridor_streets, corridor_start, corridor_end ) SELECT %2$L AS node_start, @@ -61,23 +63,43 @@ EXECUTE format ( ARRAY_AGG(st_length(st_transform(streets.geom, 2952)) ORDER BY rl.seq) AS lengths, st_union(st_linemerge(streets.geom)) AS geom, SUM(ST_Length(ST_Transform(streets.geom, 2952))) AS total_length, - string_agg(DISTINCT initcap(traffic_streets.st_name), ' / ') AS corridor_streets + string_agg(DISTINCT initcap(traffic_streets.st_name), + ' / ' ORDER BY initcap(traffic_streets.st_name)) AS corridor_streets, + string_agg(DISTINCT initcap(from_streets.st_name), + ' / ' ORDER BY initcap(from_streets.st_name)) AS corridor_start, + string_agg(DISTINCT initcap(to_streets.st_name), + ' / ' ORDER BY initcap(to_streets.st_name)) AS corridor_end FROM routed_links AS rl JOIN here.%5$I AS streets USING (link_dir) LEFT JOIN here_gis.%6$I AS traffic_streets USING (link_id) + LEFT JOIN here.%7$I AS from_node ON from_node.node_id = %2$L + LEFT JOIN here_gis.%6$I AS from_streets + ON from_node.link_id = from_streets.link_id + AND from_streets.st_name <> traffic_streets.st_name + AND from_streets.st_name IS NOT NULL + LEFT JOIN 
here.%7$I AS to_node ON to_node.node_id = %3$L + LEFT JOIN here_gis.%6$I AS to_streets + ON to_node.link_id = to_streets.link_id + AND to_streets.st_name <> traffic_streets.st_name + AND to_streets.st_name IS NOT NULL --conflict would occur because of null values ON CONFLICT (node_start, node_end, map_version) DO UPDATE SET link_dirs = excluded.link_dirs, lengths = excluded.lengths, - total_length = excluded.total_length + total_length = excluded.total_length, + corridor_streets = excluded.corridor_streets, + corridor_start = excluded.corridor_start, + corridor_end = excluded.corridor_end + --returned values are used by fn congestion_cache_tt_results RETURNING corridor_id, link_dirs, lengths, total_length $$, routing_function, node_start, node_end, -- For routed_links map_version, -- For INSERT / SELECT values street_geoms_table, -- For JOIN here.%5$I AS streets - traffic_streets_table -- For LEFT JOIN here_gis.%6$I AS traffic_streets + traffic_streets_table, -- For LEFT JOIN here_gis.%6$I AS traffic_streets + routing_nodes_table -- For LEFT JOIN here.%7$I AS from_node / to_node ) INTO corridor_id, link_dirs, lengths, total_length; RETURN; END; @@ -90,4 +112,4 @@ COMMENT ON FUNCTION gwolofs.congestion_cache_corridor IS 'Returns definition of a HERE corridor, given input nodes and map_version. First checks if corridor has already been cached and if so retrieves the cached values. 
If not, a new entry is added to gwolofs.congestion_corridors -table and returned.'; +table and returned.'; \ No newline at end of file From 0254db5c916f4993c30bbdcf4033a72ca834db4a Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 5 Jun 2025 18:34:53 +0000 Subject: [PATCH 45/74] #1132 reconcile some differences between segment and corridor functions --- .../function-congestion_cache_tt_results.sql | 128 +++++++++++------- ...unction-congestion_network_segment_agg.sql | 38 +++--- 2 files changed, 103 insertions(+), 63 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 105f2766d..9cfbf8c4d 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -23,23 +23,41 @@ DECLARE map_version text; BEGIN +--using a temp table to aply the exclusion constraint should prevent the +--insert from getting bogged down by large constraint on main table over time +CREATE TEMPORARY TABLE congestion_raw_corridors_temp ( + corridor_id smallint, + time_grp timerange NOT NULL, + bin_range tsrange NOT NULL, + tt numeric, + num_obs integer, + uri_string text, + dt date, + CONSTRAINT congestion_raw_corridors_exclude_temp EXCLUDE USING gist ( + bin_range WITH &&, + corridor_id WITH =, + time_grp WITH =, + uri_string WITH = + ) +); + SELECT gwolofs.congestion_select_map_version( congestion_cache_tt_results.start_date, congestion_cache_tt_results.end_date ) INTO map_version; -EXECUTE format( +EXECUTE FORMAT( $$ - WITH segment AS ( + WITH corridor AS ( SELECT - corridor_id, + ccc.corridor_id, unnested.link_dir, unnested.length, - total_length - FROM gwolofs.congestion_cache_corridor(%L, %L, %L), + ccc.total_length + FROM gwolofs.congestion_cache_corridor(%1$L, %2$L, %3$L) AS ccc, UNNEST( - 
congestion_cache_corridor.link_dirs, - congestion_cache_corridor.lengths + ccc.link_dirs, + ccc.lengths ) AS unnested(link_dir, length) ), @@ -49,8 +67,8 @@ EXECUTE format( ta.tx, seg.total_length, tsrange( - ta.dt + %L::time, - ta.dt + %L::time, '[)') AS time_grp, + ta.dt + %4$L::time, + ta.dt + %5$L::time, '[)') AS time_grp, RANK() OVER w AS bin_rank, SUM(seg.length) / seg.total_length AS sum_length, SUM(seg.length) AS length_w_data, @@ -60,25 +78,25 @@ EXECUTE format( ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY ta.link_dir) AS tts, ARRAY_AGG(seg.length ORDER BY ta.link_dir) AS lengths FROM here.ta_path AS ta - JOIN segment AS seg USING (link_dir) + JOIN corridor AS seg USING (link_dir) WHERE ( - ta.tod >= %L + ta.tod >= %4$L AND --{ToD_and_or} - ta.tod < %L + ta.tod < %5$L ) - AND date_part('isodow', ta.dt) = ANY(%L::int[]) - AND ta.dt >= %L - AND ta.dt < %L + AND date_part('isodow', ta.dt) = ANY(%6$L::int[]) + AND ta.dt >= %7$L + AND ta.dt < %8$L /*--{holiday_clause} AND NOT EXISTS ( SELECT 1 FROM ref.holiday WHERE ta.dt = holiday.dt )*/ GROUP BY + seg.corridor_id, ta.tx, ta.dt, - seg.total_length, - seg.corridor_id + seg.total_length WINDOW w AS ( PARTITION BY seg.corridor_id, ta.dt ORDER BY ta.tx @@ -86,11 +104,12 @@ EXECUTE format( ), dynamic_bin_options AS ( - --within each segment/hour, generate all possible forward looking bin combinations + --within each corridor/hour, generate all possible forward looking bin combinations --don't generate options for bins with sufficient length --also don't generate options past the next bin with 80%% length SELECT tx, + corridor_id, time_grp, bin_rank AS start_bin, --generate all the options for the end bin within the group. 
@@ -113,7 +132,7 @@ EXECUTE format( ) AS end_bin FROM segment_5min_bins WINDOW w AS ( - PARTITION BY time_grp + PARTITION BY corridor_id, time_grp ORDER BY tx --look only forward for end_bin options RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING @@ -143,6 +162,8 @@ EXECUTE format( AND s5b_end.bin_rank = dbo.end_bin, --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) + --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) + --WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' GROUP BY s5b.corridor_id, dbo.time_grp, @@ -151,44 +172,57 @@ EXECUTE format( s5b_end.tx, --end_bin unnested.link_dir, unnested.len - --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - --HAVING MAX(s5b.tx) + interval '5 minutes' <= dbo.tx + interval '1 hour' + ), + + inserted AS ( + --this query contains overlapping values which get eliminated + --via on conflict with the exclusion constraint on congestion_raw_segments table. 
+ INSERT INTO congestion_raw_corridors_temp AS inserted ( + uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs + ) + --distinct on ensures only the shortest option gets proposed for insert + SELECT DISTINCT ON (dt_start) + %9$L, --uristring + dt_start::date AS dt, + timerange(lower(time_grp)::time, upper(time_grp)::time, '[)') AS time_grp, + corridor_id, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment + FROM unnested_db_options + GROUP BY + time_grp, + corridor_id, + dt_start, + dt_end, + total_length + HAVING SUM(len) >= 0.8 * total_length + ORDER BY + dt_start, + dt_end + --exclusion constraint + ordered insert to prevent overlapping bins + ON CONFLICT ON CONSTRAINT congestion_raw_corridors_exclude_temp + DO NOTHING + RETURNING inserted.uri_string, inserted.dt, inserted.time_grp, inserted.corridor_id, inserted.bin_range, inserted.tt, inserted.num_obs ) + --insert into the final table INSERT INTO gwolofs.congestion_raw_corridors ( uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs ) - --this query contains overlapping values which get eliminated - --via on conflict with the exclusion constraint on congestion_raw_segments table. 
- SELECT DISTINCT ON (dt_start) --distinct on ensures only the shortest option gets proposed for insert - %L, - dt_start::date AS dt, - timerange(lower(time_grp)::time, upper(time_grp)::time, '[)') AS time_grp, - corridor_id, - tsrange(dt_start, dt_end, '[)') AS bin_range, - total_length / SUM(len) * SUM(tt) AS tt, - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment - FROM unnested_db_options - GROUP BY - time_grp, - corridor_id, - dt_start, - dt_end, - total_length - HAVING SUM(len) >= 0.8 * total_length - ORDER BY - dt_start, - dt_end - --exclusion constraint + ordered insert to prevent overlapping bins - ON CONFLICT ON CONSTRAINT congestion_raw_corridors_exclude - DO NOTHING; + SELECT uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs + FROM inserted + ON CONFLICT DO NOTHING; + $$, node_start, node_end, map_version, --segment CTE start_tod, end_tod, --segment_5min_bins CTE SELECT - start_tod, end_tod, dow_list, start_date, end_date, --segment_5min_bins CTE WHERE - congestion_cache_tt_results.uri_string, congestion_cache_tt_results.uri_string --INSERT + dow_list, start_date, end_date, --segment_5min_bins CTE WHERE + congestion_cache_tt_results.uri_string --INSERT ); + DROP TABLE congestion_raw_corridors_temp; + END; $BODY$; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index 4d89920fc..813c37203 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -45,28 +45,28 @@ EXECUTE FORMAT( segment_5min_bins AS ( SELECT - links.segment_id, + seg.segment_id, ta.tx, + seg.total_length, RANK() OVER w AS bin_rank, - links.total_length, - SUM(links.length) / links.total_length AS sum_length, - SUM(links.length) AS length_w_data, - SUM(links.length / ta.mean * 3.6) AS unadjusted_tt, + SUM(seg.length) / 
seg.total_length AS sum_length, + SUM(seg.length) AS length_w_data, + SUM(seg.length / ta.mean * 3.6) AS unadjusted_tt, SUM(sample_size) AS num_obs, - ARRAY_AGG(ta.link_dir ORDER BY link_dir) AS link_dirs, - ARRAY_AGG(links.length / ta.mean * 3.6 ORDER BY link_dir) AS tts, - ARRAY_AGG(links.length ORDER BY link_dir) AS lengths + ARRAY_AGG(ta.link_dir ORDER BY ta.link_dir) AS link_dirs, + ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY ta.link_dir) AS tts, + ARRAY_AGG(seg.length ORDER BY ta.link_dir) AS lengths FROM here.ta_path AS ta - JOIN segments AS links USING (link_dir) + JOIN segments AS seg USING (link_dir) WHERE ta.dt >= %1$L::date AND ta.dt < %1$L::date + interval '1 day' GROUP BY - links.segment_id, + seg.segment_id, ta.tx, - links.total_length + seg.total_length WINDOW w AS ( - PARTITION BY links.segment_id + PARTITION BY seg.segment_id ORDER BY ta.tx ) ), @@ -90,7 +90,10 @@ EXECUTE FORMAT( --dont need to generate options when start segment is already sufficient WHEN sum_length >= 0.8 THEN bin_rank --generate options until 1 bin has sufficient length, otherwise until last bin in group - ELSE COALESCE(MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, MAX(bin_rank) OVER w) + ELSE COALESCE( + MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, + MAX(bin_rank) OVER w + ) END, 1 ) AS end_bin @@ -166,11 +169,14 @@ EXECUTE FORMAT( RETURNING inserted.bin_start, inserted.segment_id, inserted.bin_range, inserted.tt, inserted.num_obs ) - INSERT INTO gwolofs.congestion_raw_segments (dt, bin_start, segment_id, bin_range, tt, num_obs) + INSERT INTO gwolofs.congestion_raw_segments ( + dt, bin_start, segment_id, bin_range, tt, num_obs + ) SELECT bin_start::date AS dt, bin_start, segment_id, bin_range, tt, num_obs - FROM inserted; + FROM inserted + ON CONFLICT DO NOTHING; - $$, + $$, start_date, congestion_network_table ); From bbf677954cad2d1f92e4a1e0e7d77ae3742ac5d8 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: 
Fri, 6 Jun 2025 19:34:29 +0000 Subject: [PATCH 46/74] #1132 account for streets_valid_range_path --- .../function-congestion_cache_tt_results.sql | 3 +- ...unction-congestion_network_segment_agg.sql | 2 +- ...function-congestion_select_map_version.sql | 40 ++++++++++++------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 9cfbf8c4d..39932d0df 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -43,7 +43,8 @@ CREATE TEMPORARY TABLE congestion_raw_corridors_temp ( SELECT gwolofs.congestion_select_map_version( congestion_cache_tt_results.start_date, - congestion_cache_tt_results.end_date + congestion_cache_tt_results.end_date, + 'path' ) INTO map_version; EXECUTE FORMAT( diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index 813c37203..e9d587931 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -12,7 +12,7 @@ VOLATILE PARALLEL UNSAFE AS $BODY$ DECLARE - map_version text := gwolofs.congestion_select_map_version(start_date, start_date + 1); + map_version text := gwolofs.congestion_select_map_version(start_date, start_date + 1, 'path'); congestion_network_table text := 'network_links_' || map_version || CASE map_version WHEN '23_4' THEN '_geom' ELSE '' END; --temp fix version diff --git a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql index 5999cb713..71c635e7b 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql +++ 
b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql @@ -4,31 +4,43 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_select_map_version( start_date date, - end_date date + end_date date, + agg_type text default null, --null or 'path' + OUT selected_version text ) RETURNS text -LANGUAGE sql +LANGUAGE plpgsql COST 100 STABLE PARALLEL SAFE AS $BODY$ -SELECT street_version -FROM here.street_valid_range AS svr, -LATERAL ( - SELECT svr.valid_range * daterange( - congestion_select_map_version.start_date, - congestion_select_map_version.end_date, '[)') AS overlap -) AS lat -WHERE UPPER(lat.overlap) - LOWER(lat.overlap) IS NOT NULL -ORDER BY UPPER(lat.overlap) - LOWER(lat.overlap) DESC NULLS LAST -LIMIT 1; +DECLARE + svr text := 'street_valid_range' || CASE agg_type WHEN 'path' THEN '_path' ELSE '' END; +BEGIN +EXECUTE FORMAT( + $$ + SELECT street_version + FROM here.%I AS svr, + LATERAL ( + SELECT svr.valid_range * daterange(%L, %L, '[)') AS overlap + ) AS lat + WHERE UPPER(lat.overlap) - LOWER(lat.overlap) IS NOT NULL + ORDER BY UPPER(lat.overlap) - LOWER(lat.overlap) DESC NULLS LAST + LIMIT 1; + $$, svr, congestion_select_map_version.start_date, congestion_select_map_version.end_date +) INTO selected_version; +END; $BODY$; -ALTER FUNCTION gwolofs.congestion_select_map_version(date, date) +ALTER FUNCTION gwolofs.congestion_select_map_version(date, date, text) OWNER TO gwolofs; -GRANT EXECUTE ON FUNCTION gwolofs.congestion_select_map_version(date, date) TO congestion_bot; +GRANT EXECUTE ON FUNCTION gwolofs.congestion_select_map_version(date, date, text) TO congestion_bot; COMMENT ON FUNCTION gwolofs.congestion_select_map_version IS 'Implement TT App selectMapVersion.py'; + +--test cases +SELECT * FROM gwolofs.congestion_select_map_version('2022-01-01', '2023-01-01'); +SELECT * FROM gwolofs.congestion_select_map_version('2022-01-01', '2023-01-01', 'path'); \ No newline at end of file From e1e9e38eb4be568dc203fe38161f44cfa8a25d15 Mon Sep 17 00:00:00 
2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Fri, 6 Jun 2025 20:17:42 +0000 Subject: [PATCH 47/74] #1132 fn to calculate hourly overlap --- .../dynamic_bins/function-assign_bin_hr.sql | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 here/traffic/sql/dynamic_bins/function-assign_bin_hr.sql diff --git a/here/traffic/sql/dynamic_bins/function-assign_bin_hr.sql b/here/traffic/sql/dynamic_bins/function-assign_bin_hr.sql new file mode 100644 index 000000000..59bc7c7ec --- /dev/null +++ b/here/traffic/sql/dynamic_bins/function-assign_bin_hr.sql @@ -0,0 +1,30 @@ +CREATE OR REPLACE FUNCTION gwolofs.assign_bin_hour(bin_range tsrange) +RETURNS timestamp +LANGUAGE plpgsql +IMMUTABLE +AS $$ + DECLARE + lower_hour timestamp; + upper_hour timestamp; + + BEGIN + --calculate hour boundaries + lower_hour := date_trunc('hour', lower(bin_range)); + upper_hour := date_trunc('hour', upper(bin_range)); + + --early return if same hour + IF lower_hour = upper_hour THEN + RETURN lower_hour; + --if the intersection between the hours is equal or in favour of lower. 
+ ELSIF + (least(upper(bin_range), upper_hour) - lower(bin_range)) >= + upper(bin_range) - least(upper_hour, upper(bin_range)) THEN + RETURN lower_hour; + ELSE + RETURN upper_hour; + END IF; + END; +$$; + +COMMENT ON FUNCTION gwolofs.assign_bin_hour +IS 'Assign hour to a tsrange based on how much overlap there is with start/end hour.'; From afd63a861f02c9d15ee2299e70ed1192560ec013 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 9 Jun 2025 17:55:10 +0000 Subject: [PATCH 48/74] #1132 new method to calcualte hr; midpoint --- .../create-table-congestion_raw_corridors.sql | 21 ++++++++++++- .../create-table-congestion_raw_segments.sql | 19 ++++++++++++ .../dynamic_bins/function-assign_bin_hr.sql | 30 ------------------- .../function-congestion_cache_tt_results.sql | 20 +++++++++---- ...unction-congestion_network_segment_agg.sql | 15 ++++++++-- 5 files changed, 65 insertions(+), 40 deletions(-) delete mode 100644 here/traffic/sql/dynamic_bins/function-assign_bin_hr.sql diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql index 9a823249c..cd6d73ee1 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql @@ -5,12 +5,13 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_corridors ( corridor_id smallint, - dt date, time_grp timerange NOT NULL, bin_range tsrange NOT NULL, tt numeric, num_obs integer, uri_string text COLLATE pg_catalog."default", + dt date, + hr timestamp without time zone, CONSTRAINT congestion_raw_corridors_exclude EXCLUDE USING gist ( bin_range WITH &&, corridor_id WITH =, @@ -59,3 +60,21 @@ TABLESPACE pg_default; COMMENT ON TABLE gwolofs.congestion_raw_corridors IS 'Stores dynamic binning results for custom corridor based travel time requests.'; + +COMMENT ON TABLE 
gwolofs.congestion_raw_corridors + IS 'Stores dynamic binning results from standard HERE congestion network travel time aggregations.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_corridors.bin_range + IS 'Bin range. An exclusion constraint on a temp table prevents overlapping ranges during insert.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_corridors.tt + IS 'Travel time in seconds.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_corridors.num_obs + IS 'The sum of the sample size from here.ta_path.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_corridors.dt + IS 'The date of aggregation for the record. Records may not overlap dates.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_corridors.hr + IS 'The hour the majority of the record occured in. Ties are rounded up.'; diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql index 940e6bf0f..dc2c090c6 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql @@ -10,6 +10,7 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_segments bin_range tsrange NOT NULL, tt numeric, num_obs integer, + hr timestamp without time zone, CONSTRAINT congestion_raw_segments_pkey PRIMARY KEY (segment_id, dt, bin_start) ) PARTITION BY RANGE (dt); @@ -58,3 +59,21 @@ TABLESPACE pg_default; ALTER TABLE IF EXISTS gwolofs.congestion_raw_segments_2025 OWNER TO gwolofs; + +COMMENT ON COLUMN gwolofs.congestion_raw_segments.dt + IS 'The date of aggregation for the record. Records may not overlap dates.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_segments.bin_start + IS 'The start of the observation. It is recommended to use `hr` to group the bin instead. 
This column is used in the primary key, although the main constraint occurs during insert (non overlapping ranges).'; + +COMMENT ON COLUMN gwolofs.congestion_raw_segments.bin_range + IS 'Bin range. An exclusion constraint on a temp table prevents overlapping ranges during insert.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_segments.tt + IS 'Travel time in seconds.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_segments.num_obs + IS 'The sum of the sample size from here.ta_path.'; + +COMMENT ON COLUMN gwolofs.congestion_raw_segments.hr + IS 'The hour the majority of the record occured in. Ties are rounded up.'; diff --git a/here/traffic/sql/dynamic_bins/function-assign_bin_hr.sql b/here/traffic/sql/dynamic_bins/function-assign_bin_hr.sql deleted file mode 100644 index 59bc7c7ec..000000000 --- a/here/traffic/sql/dynamic_bins/function-assign_bin_hr.sql +++ /dev/null @@ -1,30 +0,0 @@ -CREATE OR REPLACE FUNCTION gwolofs.assign_bin_hour(bin_range tsrange) -RETURNS timestamp -LANGUAGE plpgsql -IMMUTABLE -AS $$ - DECLARE - lower_hour timestamp; - upper_hour timestamp; - - BEGIN - --calculate hour boundaries - lower_hour := date_trunc('hour', lower(bin_range)); - upper_hour := date_trunc('hour', upper(bin_range)); - - --early return if same hour - IF lower_hour = upper_hour THEN - RETURN lower_hour; - --if the intersection between the hours is equal or in favour of lower. 
- ELSIF - (least(upper(bin_range), upper_hour) - lower(bin_range)) >= - upper(bin_range) - least(upper_hour, upper(bin_range)) THEN - RETURN lower_hour; - ELSE - RETURN upper_hour; - END IF; - END; -$$; - -COMMENT ON FUNCTION gwolofs.assign_bin_hour -IS 'Assign hour to a tsrange based on how much overlap there is with start/end hour.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 39932d0df..968a2e11a 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -32,7 +32,6 @@ CREATE TEMPORARY TABLE congestion_raw_corridors_temp ( tt numeric, num_obs integer, uri_string text, - dt date, CONSTRAINT congestion_raw_corridors_exclude_temp EXCLUDE USING gist ( bin_range WITH &&, corridor_id WITH =, @@ -179,12 +178,11 @@ EXECUTE FORMAT( --this query contains overlapping values which get eliminated --via on conflict with the exclusion constraint on congestion_raw_segments table. 
INSERT INTO congestion_raw_corridors_temp AS inserted ( - uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs + uri_string, time_grp, corridor_id, bin_range, tt, num_obs ) --distinct on ensures only the shortest option gets proposed for insert SELECT DISTINCT ON (dt_start) %9$L, --uristring - dt_start::date AS dt, timerange(lower(time_grp)::time, upper(time_grp)::time, '[)') AS time_grp, corridor_id, tsrange(dt_start, dt_end, '[)') AS bin_range, @@ -204,14 +202,24 @@ EXECUTE FORMAT( --exclusion constraint + ordered insert to prevent overlapping bins ON CONFLICT ON CONSTRAINT congestion_raw_corridors_exclude_temp DO NOTHING - RETURNING inserted.uri_string, inserted.dt, inserted.time_grp, inserted.corridor_id, inserted.bin_range, inserted.tt, inserted.num_obs + RETURNING + inserted.uri_string, inserted.time_grp, inserted.corridor_id, + inserted.bin_range, inserted.tt, inserted.num_obs ) --insert into the final table INSERT INTO gwolofs.congestion_raw_corridors ( - uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs + uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs, hr ) - SELECT uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs + SELECT + uri_string, + lower(bin_range)::date AS dt, + time_grp, + corridor_id, + bin_range, + tt, + num_obs, + date_trunc('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2) AS hr FROM inserted ON CONFLICT DO NOTHING; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index e9d587931..e838ca00c 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -166,13 +166,22 @@ EXECUTE FORMAT( --exclusion constraint + ordered insert to prevent overlapping bins ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude_temp DO NOTHING - RETURNING inserted.bin_start, 
inserted.segment_id, inserted.bin_range, inserted.tt, inserted.num_obs + RETURNING + inserted.bin_start, inserted.segment_id, inserted.bin_range, + inserted.tt, inserted.num_obs ) INSERT INTO gwolofs.congestion_raw_segments ( - dt, bin_start, segment_id, bin_range, tt, num_obs + dt, bin_start, segment_id, bin_range, tt, num_obs, hr ) - SELECT bin_start::date AS dt, bin_start, segment_id, bin_range, tt, num_obs + SELECT + bin_start::date AS dt, + bin_start, + segment_id, + bin_range, + tt, + num_obs, + date_trunc('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2) AS hr FROM inserted ON CONFLICT DO NOTHING; From b69fcd77e79595faaee20bc469bbc5172c1e0cf8 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 9 Jun 2025 17:55:54 +0000 Subject: [PATCH 49/74] #1132 new fn to calculate node streets --- .../function-congestion_cache_corridor.sql | 22 +++------- .../function-identify_node_streets.sql | 43 +++++++++++++++++++ 2 files changed, 48 insertions(+), 17 deletions(-) create mode 100644 here/traffic/sql/dynamic_bins/function-identify_node_streets.sql diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql index da926770f..ef95b0de5 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_corridor.sql @@ -21,7 +21,6 @@ DECLARE routing_function text := 'get_links_btwn_nodes_' || map_version; street_geoms_table text := 'routing_streets_' || map_version; traffic_streets_table text := 'traffic_streets_' || map_version; - routing_nodes_table text := 'routing_nodes_' || map_version; BEGIN @@ -65,23 +64,13 @@ EXECUTE format ( SUM(ST_Length(ST_Transform(streets.geom, 2952))) AS total_length, string_agg(DISTINCT initcap(traffic_streets.st_name), ' / ' ORDER BY initcap(traffic_streets.st_name)) AS corridor_streets, - string_agg(DISTINCT 
initcap(from_streets.st_name), - ' / ' ORDER BY initcap(from_streets.st_name)) AS corridor_start, - string_agg(DISTINCT initcap(to_streets.st_name), - ' / ' ORDER BY initcap(to_streets.st_name)) AS corridor_end + gwolofs.identify_node_streets(%2$L, %4$L, + array_agg(DISTINCT initcap(traffic_streets.st_name))) AS corridor_start, + gwolofs.identify_node_streets(%3$L, %4$L, + array_agg(DISTINCT initcap(traffic_streets.st_name))) AS corridor_end FROM routed_links AS rl JOIN here.%5$I AS streets USING (link_dir) LEFT JOIN here_gis.%6$I AS traffic_streets USING (link_id) - LEFT JOIN here.%7$I AS from_node ON from_node.node_id = %2$L - LEFT JOIN here_gis.%6$I AS from_streets - ON from_node.link_id = from_streets.link_id - AND from_streets.st_name <> traffic_streets.st_name - AND from_streets.st_name IS NOT NULL - LEFT JOIN here.%7$I AS to_node ON to_node.node_id = %3$L - LEFT JOIN here_gis.%6$I AS to_streets - ON to_node.link_id = to_streets.link_id - AND to_streets.st_name <> traffic_streets.st_name - AND to_streets.st_name IS NOT NULL --conflict would occur because of null values ON CONFLICT (node_start, node_end, map_version) DO UPDATE @@ -98,8 +87,7 @@ EXECUTE format ( routing_function, node_start, node_end, -- For routed_links map_version, -- For INSERT / SELECT values street_geoms_table, -- For JOIN here.%5$I AS streets - traffic_streets_table, -- For LEFT JOIN here_gis.%6$I AS traffic_streets - routing_nodes_table -- For LEFT JOIN here.%7$I AS from_node / to_node + traffic_streets_table -- For LEFT JOIN here_gis.%6$I AS traffic_streets ) INTO corridor_id, link_dirs, lengths, total_length; RETURN; END; diff --git a/here/traffic/sql/dynamic_bins/function-identify_node_streets.sql b/here/traffic/sql/dynamic_bins/function-identify_node_streets.sql new file mode 100644 index 000000000..2793e3f9f --- /dev/null +++ b/here/traffic/sql/dynamic_bins/function-identify_node_streets.sql @@ -0,0 +1,43 @@ +CREATE OR REPLACE FUNCTION gwolofs.identify_node_streets( + node bigint, + 
map_version text, + exclude_streets text [], + OUT streets text +) +RETURNS text +LANGUAGE plpgsql +COST 100 +VOLATILE PARALLEL SAFE +AS $BODY$ + +DECLARE + routing_nodes_table text := 'routing_nodes_' || map_version; + traffic_streets_table text := 'traffic_streets_' || map_version; + +BEGIN +EXECUTE format ( + $$ + SELECT string_agg(DISTINCT initcap(streets.st_name), ' / ' ORDER BY initcap(streets.st_name)) + FROM here.%1$I AS node + LEFT JOIN here_gis.%2$I AS streets + ON node.link_id = streets.link_id + AND NOT(initcap(streets.st_name) = ANY(%3$L)) + AND streets.st_name IS NOT NULL + WHERE node.node_id = %4$L + $$, + routing_nodes_table, + traffic_streets_table, + identify_node_streets.exclude_streets, + identify_node_streets.node +) INTO streets; + +RETURN; +END; +$BODY$; + +ALTER FUNCTION gwolofs.identify_node_streets(bigint, text, text []) +OWNER TO gwolofs; + +COMMENT ON FUNCTION gwolofs.identify_node_streets IS +'Identifies the streets intersecting with a HERE node_id, given a node, map_version, +and a list of streets to exclude (generally those which form the corridor).'; From aa724cece12ae128d877c14e5253543729eb1cd7 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 30 Jun 2025 21:29:59 +0000 Subject: [PATCH 50/74] #1132 add task timeout --- dags/here_dynamic_binning_agg.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index 4c8301df3..05c57daaa 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -12,8 +12,7 @@ import sys import os import logging -import pendulum -from datetime import timedelta +from pendulum import duration, datetime from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator from airflow.models import Variable @@ -36,13 +35,13 @@ default_args = { 'owner': ','.join(DAG_OWNERS), 'depends_on_past':False, - 'start_date': 
pendulum.datetime(2023, 1, 1, tz="America/Toronto"), + 'start_date': datetime(2019, 1, 1, tz="America/Toronto"), #aggregation doesn't work on 24_4 yet (no congestion.network_links_24_4) - 'end_date': pendulum.datetime(2025, 3, 17, tz="America/Toronto"), + #'end_date': datetime(2025, 3, 17, tz="America/Toronto"), 'email_on_failure': False, 'email_on_success': False, - 'retries': 0, - 'retry_delay': timedelta(minutes=5), + 'retries': 1, + 'retry_delay': duration(minutes=5), 'on_failure_callback': task_fail_slack_alert } @@ -65,7 +64,8 @@ def here_dynamic_binning_agg(): task_id='aggregate_daily', conn_id='congestion_bot', autocommit=True, - retries = 0 + retries = 0, + execution_timeout=duration(minutes=30) ) aggregate_daily From 5c71af76a899e63b8654f33ba9477a42f31c2309 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 30 Jun 2025 21:30:15 +0000 Subject: [PATCH 51/74] #1132 adjust constraints --- here/traffic/sql/dynamic_bins/corridor_agg.sql | 11 ++++++++--- .../create-table-congestion_corridors.sql | 1 + .../create-table-congestion_raw_corridors.sql | 12 ++++++------ .../function-congestion_cache_tt_results.sql | 4 ++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/corridor_agg.sql b/here/traffic/sql/dynamic_bins/corridor_agg.sql index 5e81033f1..a16c0dbd2 100644 --- a/here/traffic/sql/dynamic_bins/corridor_agg.sql +++ b/here/traffic/sql/dynamic_bins/corridor_agg.sql @@ -6,9 +6,14 @@ SELECT gwolofs.congestion_cache_tt_results_daily( ) FROM gwolofs.congestion_corridors JOIN gwolofs.congestion_projects USING (project_id), -generate_series('2025-02-01', '2025-02-28', '1 day'::interval) AS dates(dt) -WHERE +generate_series('2025-01-01', '2025-02-28', '1 day'::interval) AS dates(dt) +WHERE congestion_projects.description IN ( - 'bluetooth_corridors', 'scrutinized-cycleway-corridors' + 'Avenue Road cycleway installation', + 'bluetooth_corridors', + 
'scrutinized-cycleway-corridors' + ) + AND corridor_id NOT IN ( + SELECT DISTINCT corridor_id FROM gwolofs.congestion_raw_corridors WHERE dt >= '2025-01-01' AND dt < '2025-02-28' ) AND map_version = '23_4'; \ No newline at end of file diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql index 827bb6710..2d905a19e 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql @@ -17,6 +17,7 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_corridors corridor_end text COLLATE pg_catalog."default", project_id integer, CONSTRAINT congestion_corridors_pkey PRIMARY KEY (node_start, node_end, map_version), + CONSTRAINT corridor_pkey UNIQUE NULLS NOT DISTINCT (corridor_id), CONSTRAINT project_id_fk FOREIGN KEY (project_id) REFERENCES gwolofs.congestion_projects (project_id) MATCH SIMPLE ON UPDATE NO ACTION diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql index cd6d73ee1..9088b1a6a 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql @@ -12,12 +12,12 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_corridors uri_string text COLLATE pg_catalog."default", dt date, hr timestamp without time zone, - CONSTRAINT congestion_raw_corridors_exclude EXCLUDE USING gist ( - bin_range WITH &&, - corridor_id WITH =, - time_grp WITH =, - uri_string WITH = - ) + CONSTRAINT congestion_raw_corridors_pkey PRIMARY KEY (corridor_id, bin_range, time_grp), + CONSTRAINT corridor_fkey FOREIGN KEY (corridor_id) + REFERENCES gwolofs.congestion_corridors (corridor_id) MATCH SIMPLE + ON UPDATE NO ACTION + ON DELETE CASCADE + NOT VALID ) TABLESPACE pg_default; diff --git 
a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 968a2e11a..ff29ab8b5 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -36,7 +36,7 @@ CREATE TEMPORARY TABLE congestion_raw_corridors_temp ( bin_range WITH &&, corridor_id WITH =, time_grp WITH =, - uri_string WITH = + coalesce(uri_string, '') WITH = --this is the only column in constraint which is nullable ) ); @@ -268,5 +268,5 @@ SELECT gwolofs.congestion_cache_tt_results( holidays := True) $BODY$; -COMMENT ON FUNCITON gwolofs.congestion_cache_tt_results_daily +COMMENT ON FUNCTION gwolofs.congestion_cache_tt_results_daily IS 'A simplified version of `congestion_cache_tt_results` for aggregating entire days of data.' \ No newline at end of file From ecb658591b64ef6695199821a53b8a2ac93b6e5c Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 15 Jul 2025 17:53:25 +0000 Subject: [PATCH 52/74] #1132 add max bin length constraint --- .../sql/dynamic_bins/function-congestion_cache_tt_results.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index ff29ab8b5..4398838e9 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -163,7 +163,7 @@ EXECUTE FORMAT( --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - --WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '1 hour' + WHERE s5b_end.tx + interval 
'5 minutes' <= dbo.tx + interval '30 minutes' GROUP BY s5b.corridor_id, dbo.time_grp, From d2dafe6c2d2d7a1b5bd113c4ee31911854499991 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Tue, 15 Jul 2025 17:53:55 +0000 Subject: [PATCH 53/74] #1132 add DROP (temp) TABLE IF EXISTS --- .../sql/dynamic_bins/function-congestion_cache_tt_results.sql | 1 + .../sql/dynamic_bins/function-congestion_network_segment_agg.sql | 1 + 2 files changed, 2 insertions(+) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 4398838e9..f55190539 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -25,6 +25,7 @@ BEGIN --using a temp table to aply the exclusion constraint should prevent the --insert from getting bogged down by large constraint on main table over time +DROP TABLE IF EXISTS congestion_raw_corridors_temp; CREATE TEMPORARY TABLE congestion_raw_corridors_temp ( corridor_id smallint, time_grp timerange NOT NULL, diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index e838ca00c..557c81756 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -20,6 +20,7 @@ BEGIN --using a temp table to aply the exclusion constraint should prevent the --insert from getting bogged down by large constraint on main table over time +DROP TABLE IF EXISTS congestion_raw_segments_temp; CREATE TEMPORARY TABLE congestion_raw_segments_temp ( segment_id integer NOT NULL, bin_start timestamp without time zone NOT NULL, From 3ff60e4c6ce58c6b4b30f98ab0bad161c978de99 Mon Sep 17 00:00:00 2001 From: gabrielwol 
<80077912+gabrielwol@users.noreply.github.com> Date: Wed, 16 Jul 2025 18:38:23 +0000 Subject: [PATCH 54/74] #1132 add task timeout --- dags/here_dynamic_binning_agg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index 05c57daaa..2e89c0e2d 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -42,7 +42,7 @@ 'email_on_success': False, 'retries': 1, 'retry_delay': duration(minutes=5), - 'on_failure_callback': task_fail_slack_alert + #'on_failure_callback': task_fail_slack_alert } @dag( @@ -64,8 +64,8 @@ def here_dynamic_binning_agg(): task_id='aggregate_daily', conn_id='congestion_bot', autocommit=True, - retries = 0, - execution_timeout=duration(minutes=30) + retries = 1, + execution_timeout=duration(hours=1) ) aggregate_daily From bb3880038648ac09a3ef6559f084bef72a29858f Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 30 Jul 2025 20:10:04 +0000 Subject: [PATCH 55/74] #1132 monthly agg dag --- dags/here_dynamic_binning_monthly_agg.py | 73 +++++++++++++++++++ ...ble-congestion_segments_monthy_summary.sql | 30 ++++++++ ...unction-congestion_segment_monthly_agg.sql | 51 +++++++++++++ .../select-check_missing_days.sql | 19 +++++ 4 files changed, 173 insertions(+) create mode 100644 dags/here_dynamic_binning_monthly_agg.py create mode 100644 here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthy_summary.sql create mode 100644 here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql create mode 100644 here/traffic/sql/dynamic_bins/select-check_missing_days.sql diff --git a/dags/here_dynamic_binning_monthly_agg.py b/dags/here_dynamic_binning_monthly_agg.py new file mode 100644 index 000000000..cd42d9b38 --- /dev/null +++ b/dags/here_dynamic_binning_monthly_agg.py @@ -0,0 +1,73 @@ +import os +import sys +import logging +from datetime import timedelta +from pendulum 
import duration, datetime + +from airflow.models import Variable +from airflow.decorators import dag +from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator + +try: + repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + sys.path.insert(0, repo_path) + from dags.dag_functions import task_fail_slack_alert + from dags.custom_operators import SQLCheckOperatorWithReturnValue +except: + raise ImportError("Cannot import slack alert functions") + +LOGGER = logging.getLogger(__name__) +logging.basicConfig(level=logging.DEBUG) + +doc_md = "This DAG is running off the `1132-here-aggregation-proposal` branch to test dynamic binning aggregation." +DAG_NAME = 'here_dynamic_binning_monthly_agg' +DAG_OWNERS = Variable.get('dag_owners', deserialize_json=True).get(DAG_NAME, ["Unknown"]) + +default_args = { + 'owner': ','.join(DAG_OWNERS), + 'depends_on_past':False, + 'start_date': datetime(2019, 1, 1, tz="America/Toronto"), + 'email_on_failure': False, + 'email_on_success': False, + 'retries': 1, + 'retry_delay': duration(hours=1), + 'on_failure_callback': task_fail_slack_alert +} + +@dag( + DAG_NAME, + default_args=default_args, + schedule='* 10 1 * *', # 10am, first day of month + template_searchpath=os.path.join(repo_path,'here/traffic/sql/dynamic_bins'), + doc_md = doc_md, + tags=["HERE", "aggregation"], + max_active_runs=1, + catchup=True +) + +#to add: catchup, one task at a time, depends on past. 
+ +def here_dynamic_binning_monthly_agg(): + + check_missing_dates = SQLCheckOperatorWithReturnValue( + sql="select-check_missing_days.sql", + task_id="check_missing_dates", + conn_id='congestion_bot', + retries = 1, + execution_timeout=timedelta(minutes=10) + ) + + aggregate_monthly = SQLExecuteQueryOperator( + sql=[ + "DELETE FROM gwolofs.congestion_segments_monthy_summary WHERE mnth = '{{ ds }}'", + "SELECT gwolofs.congestion_segment_monthly_agg('{{ ds }}')" + ], + task_id='aggregate_monthly', + conn_id='congestion_bot', + autocommit=True, + retries = 1, + execution_timeout=timedelta(hours=1) + ) + check_missing_dates >> aggregate_monthly + +here_dynamic_binning_monthly_agg() \ No newline at end of file diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthy_summary.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthy_summary.sql new file mode 100644 index 000000000..462ae6b59 --- /dev/null +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthy_summary.sql @@ -0,0 +1,30 @@ +-- Table: gwolofs.congestion_segments_monthy_summary + +-- DROP TABLE IF EXISTS gwolofs.congestion_segments_monthy_summary; + +CREATE TABLE IF NOT EXISTS gwolofs.congestion_segments_monthy_summary +( + segment_id integer, + mnth date, + is_wkdy boolean, + hr double precision, + avg_tt numeric, + stdev numeric, + percentile_05 numeric, + percentile_15 numeric, + percentile_50 numeric, + percentile_85 numeric, + percentile_95 numeric, + num_quasi_obs bigint +) + +TABLESPACE pg_default; + +ALTER TABLE IF EXISTS gwolofs.congestion_segments_monthy_summary +OWNER TO gwolofs; + +REVOKE ALL ON TABLE gwolofs.congestion_segments_monthy_summary FROM bdit_humans; + +GRANT SELECT ON TABLE gwolofs.congestion_segments_monthy_summary TO bdit_humans; + +GRANT ALL ON TABLE gwolofs.congestion_segments_monthy_summary TO gwolofs; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql 
b/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql new file mode 100644 index 000000000..b80610c33 --- /dev/null +++ b/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql @@ -0,0 +1,51 @@ +-- FUNCTION: gwolofs.congestion_segment_monthly_agg(date) + +-- DROP FUNCTION IF EXISTS gwolofs.congestion_segment_monthly_agg(date); + +CREATE OR REPLACE FUNCTION gwolofs.congestion_segment_monthly_agg( + mon date) + RETURNS void + LANGUAGE 'sql' + COST 100 + VOLATILE PARALLEL UNSAFE +AS $BODY$ + +INSERT INTO gwolofs.congestion_segments_monthy_summary ( + segment_id, mnth, is_wkdy, hr, avg_tt, stdev, percentile_05, percentile_15, + percentile_50, percentile_85, percentile_95, num_quasi_obs +) +SELECT + segment_id, + congestion_segment_monthly_agg.mon AS mnth, + date_part('isodow', dt) <= 5 AS is_wkdy, + date_part('hour', hr) AS hr, + ROUND(AVG(tt), 2) AS avg_tt, + ROUND(stddev(tt), 2) AS stdev, + ROUND(PERCENTILE_CONT(0.05) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_05, + ROUND(PERCENTILE_CONT(0.15) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_15, + ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_50, + ROUND(PERCENTILE_CONT(0.85) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_85, + ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_95, + COUNT(*) AS num_quasi_obs +FROM gwolofs.congestion_raw_segments +LEFT JOIN ref.holiday USING (dt) +WHERE + dt >= congestion_segment_monthly_agg.mon + AND dt < congestion_segment_monthly_agg.mon + interval '1 month' + AND holiday.holiday IS NULL +GROUP BY + segment_id, + date_part('hour', hr), + is_wkdy; + +$BODY$; + +ALTER FUNCTION gwolofs.congestion_segment_monthly_agg(date) + OWNER TO gwolofs; + +GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_monthly_agg(date) TO PUBLIC; + +GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_monthly_agg(date) TO congestion_bot; + +GRANT EXECUTE ON FUNCTION 
gwolofs.congestion_segment_monthly_agg(date) TO gwolofs; + diff --git a/here/traffic/sql/dynamic_bins/select-check_missing_days.sql b/here/traffic/sql/dynamic_bins/select-check_missing_days.sql new file mode 100644 index 000000000..a993ba57c --- /dev/null +++ b/here/traffic/sql/dynamic_bins/select-check_missing_days.sql @@ -0,0 +1,19 @@ +WITH distinct_days AS ( + SELECT DISTINCT dt + FROM gwolofs.congestion_raw_segments + WHERE + dt >= '{{ ds }}'::date + AND dt < '{{ ds }}'::date + interval '1 month' +) + +SELECT + COUNT(*) = 0, + 'The following days are missing from `congestion_raw_segments`: ' + || string_agg(dates.dt::date::text, ', ') AS summary +FROM generate_series( + '{{ ds }}'::date, + --one day before start of next month + ('{{ ds }}'::date + interval '1 month')::date - 1, + '1 day') AS dates(dt) +LEFT JOIN distinct_days USING (dt) +WHERE distinct_days.dt IS NULL; \ No newline at end of file From d742eb9bec4ff93db5388e63ea56fe7b2b32b7c8 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 30 Jul 2025 20:10:22 +0000 Subject: [PATCH 56/74] #1132 try execution_timeout with timedelta rather than duration --- dags/here_dynamic_binning_agg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index 2e89c0e2d..0dc6d349b 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -12,6 +12,7 @@ import sys import os import logging +from datetime import timedelta from pendulum import duration, datetime from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator @@ -65,7 +66,7 @@ def here_dynamic_binning_agg(): conn_id='congestion_bot', autocommit=True, retries = 1, - execution_timeout=duration(hours=1) + execution_timeout=timedelta(hours=1) ) aggregate_daily From 70e0fc9ede3fddb4c79287b09d099ab447059035 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> 
Date: Wed, 30 Jul 2025 20:14:44 +0000 Subject: [PATCH 57/74] #1132 fix monthly cron schedule --- dags/here_dynamic_binning_monthly_agg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dags/here_dynamic_binning_monthly_agg.py b/dags/here_dynamic_binning_monthly_agg.py index cd42d9b38..57b9eb201 100644 --- a/dags/here_dynamic_binning_monthly_agg.py +++ b/dags/here_dynamic_binning_monthly_agg.py @@ -37,7 +37,7 @@ @dag( DAG_NAME, default_args=default_args, - schedule='* 10 1 * *', # 10am, first day of month + schedule='0 16 1 * *', # 4pm, first day of month template_searchpath=os.path.join(repo_path,'here/traffic/sql/dynamic_bins'), doc_md = doc_md, tags=["HERE", "aggregation"], From f1679d18019a186be7447d2975b1791b197acaf2 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:51:17 +0000 Subject: [PATCH 58/74] #1132 fix sql operator timeout --- dags/here_dynamic_binning_agg.py | 4 ++-- dags/here_dynamic_binning_monthly_agg.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index 0dc6d349b..a516fcaa8 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -65,8 +65,8 @@ def here_dynamic_binning_agg(): task_id='aggregate_daily', conn_id='congestion_bot', autocommit=True, - retries = 1, - execution_timeout=timedelta(hours=1) + retries = 2, + hook_params={"options": "-c statement_timeout=10800000ms"} #3 hours ) aggregate_daily diff --git a/dags/here_dynamic_binning_monthly_agg.py b/dags/here_dynamic_binning_monthly_agg.py index 57b9eb201..fd8f1e0d9 100644 --- a/dags/here_dynamic_binning_monthly_agg.py +++ b/dags/here_dynamic_binning_monthly_agg.py @@ -31,7 +31,7 @@ 'email_on_success': False, 'retries': 1, 'retry_delay': duration(hours=1), - 'on_failure_callback': task_fail_slack_alert + #'on_failure_callback': task_fail_slack_alert } @dag( @@ -53,8 +53,7 @@ def 
here_dynamic_binning_monthly_agg(): sql="select-check_missing_days.sql", task_id="check_missing_dates", conn_id='congestion_bot', - retries = 1, - execution_timeout=timedelta(minutes=10) + retries = 0 ) aggregate_monthly = SQLExecuteQueryOperator( From cdb888674f9e3f57743a81d7d8d2afb332648cfa Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:51:30 +0000 Subject: [PATCH 59/74] #1132 implementation of bootstrapping method --- ...-function-congestion_segment_bootstrap.sql | 108 ++++++++++++++++++ .../sql/dynamic_bins/segment_grouping.sql | 22 ++++ 2 files changed, 130 insertions(+) create mode 100644 here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql create mode 100644 here/traffic/sql/dynamic_bins/segment_grouping.sql diff --git a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql new file mode 100644 index 000000000..5d2dd9418 --- /dev/null +++ b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql @@ -0,0 +1,108 @@ +/* +--test +SELECT tt_array[ceiling(random() * 3)] +FROM (VALUES(ARRAY[1,2,3])) AS val(tt_array) +CROSS JOIN generate_series(1,100,1) +*/ + + +CREATE OR REPLACE FUNCTION gwolofs.congestion_segment_bootstrap( + mnth date, + segment_ids bigint[], + n_resamples int) + RETURNS TABLE( + segment_id integer, mnth date, is_wkdy boolean, hr numeric, avg_tt real, n bigint, ci_lower real, ci_upper real + ) + LANGUAGE SQL + COST 100 + VOLATILE PARALLEL SAFE +AS $BODY$ + +WITH raw_obs AS ( + SELECT + segment_id, + date_trunc('month', dt)::date AS mnth, + EXTRACT('isodow' FROM dt) IN (1, 2, 3, 4, 5) AS is_wkdy, + EXTRACT('hour' FROM hr) AS hr, + ARRAY_AGG(tt::real) AS tt_array, + AVG(tt::real) AS avg_tt, + COUNT(*) AS n + FROM gwolofs.congestion_raw_segments + WHERE -- same params as the above aggregation + dt >= 
congestion_segment_bootstrap.mnth + AND dt < congestion_segment_bootstrap.mnth + interval '1 month' + AND segment_id = ANY(congestion_segment_bootstrap.segment_ids) + GROUP BY + segment_id, + mnth, + is_wkdy, + EXTRACT('hour' FROM hr) +), + +random_selections AS ( + SELECT + raw_obs.segment_id, + raw_obs.mnth, + raw_obs.is_wkdy, + raw_obs.hr, + sample_group.group_id, + raw_obs.avg_tt, + raw_obs.n, + --get a random observation from the array of tts + raw_obs.tt_array[ceiling(random() * raw_obs.n)] AS rnd_tt + FROM raw_obs + CROSS JOIN generate_series(1, n) + -- 200 resamples (could be any number) + CROSS JOIN generate_series(1, congestion_segment_bootstrap.n_resamples) AS sample_group(group_id) +), + +resampled_averages AS ( + SELECT + segment_id, + mnth, + is_wkdy, + hr, + group_id, + AVG(rnd_tt) AS rnd_avg_tt + FROM random_selections + GROUP BY + segment_id, + mnth, + is_wkdy, + hr, + group_id +) + +SELECT + ra.segment_id, + ra.mnth, + ra.is_wkdy, + ra.hr, + raw_obs.avg_tt::real, + raw_obs.n, + percentile_disc(0.025) WITHIN GROUP (ORDER BY ra.rnd_avg_tt)::real AS ci_lower, + percentile_disc(0.975) WITHIN GROUP (ORDER BY ra.rnd_avg_tt)::real AS ci_upper +FROM resampled_averages AS ra +JOIN raw_obs USING (segment_id, mnth, is_wkdy, hr) +GROUP BY + ra.segment_id, + ra.mnth, + ra.is_wkdy, + ra.hr, + raw_obs.avg_tt, + raw_obs.n; + + $BODY$; + +--6:52 for 100 +/*example +SELECT congestion_segment_bootstrap.* +FROM gwolofs.congestion_segment_bootstrap( + mnth := '2025-05-01'::date, + segment_ids := (SELECT ARRAY(SELECT segment_id FROM generate_series(1,100) AS a(segment_id))), + n_resamples := 300 +) +*/ + +--6.5 hours estimate for all +--SELECT COUNT(DISTINCT segment_id) / 100.0 * 352 / 60 / 60 FROM congestion.network_segments_24_4 ORDER BY 1 diff --git a/here/traffic/sql/dynamic_bins/segment_grouping.sql b/here/traffic/sql/dynamic_bins/segment_grouping.sql new file mode 100644 index 000000000..6343181f8 --- /dev/null +++ 
b/here/traffic/sql/dynamic_bins/segment_grouping.sql @@ -0,0 +1,22 @@ +WITH summary AS ( + --find the number of groups required to have no more than 100 per group + SELECT + FLOOR(COUNT(DISTINCT segment_id) + / CEIL((COUNT(DISTINCT segment_id)) / 100.0)) AS num_per_group + FROM congestion.network_segments_24_4 +), + +groups AS ( + SELECT + CEIL(ROW_NUMBER() OVER (ORDER BY segment_id) / summary.num_per_group) AS group_id, + segment_id + FROM congestion.network_segments_24_4, summary +) + +SELECT + group_id, + array_agg(segment_id), + COUNT(*) +FROM groups +GROUP BY group_id +ORDER BY group_id \ No newline at end of file From 065747123930d0b463cd4b3943a96199bbe40adf Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Fri, 29 Aug 2025 14:14:30 +0000 Subject: [PATCH 60/74] #1132 bootstrapping method - eliminate another cte --- ...-function-congestion_segment_bootstrap.sql | 67 ++++++++----------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql index 5d2dd9418..5c1a0c832 100644 --- a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql +++ b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql @@ -6,12 +6,14 @@ CROSS JOIN generate_series(1,100,1) */ + CREATE OR REPLACE FUNCTION gwolofs.congestion_segment_bootstrap( mnth date, segment_ids bigint[], n_resamples int) RETURNS TABLE( - segment_id integer, mnth date, is_wkdy boolean, hr numeric, avg_tt real, n bigint, ci_lower real, ci_upper real + segment_id integer, mnth date, is_wkdy boolean, hr numeric, + avg_tt real, n bigint, ci_lower real, ci_upper real ) LANGUAGE SQL COST 100 @@ -21,7 +23,7 @@ AS $BODY$ WITH raw_obs AS ( SELECT segment_id, - date_trunc('month', dt)::date AS mnth, + congestion_segment_bootstrap.mnth AS mnth, EXTRACT('isodow' FROM dt) IN 
(1, 2, 3, 4, 5) AS is_wkdy, EXTRACT('hour' FROM hr) AS hr, ARRAY_AGG(tt::real) AS tt_array, @@ -34,7 +36,6 @@ WITH raw_obs AS ( AND segment_id = ANY(congestion_segment_bootstrap.segment_ids) GROUP BY segment_id, - mnth, is_wkdy, EXTRACT('hour' FROM hr) ), @@ -42,55 +43,45 @@ WITH raw_obs AS ( random_selections AS ( SELECT raw_obs.segment_id, - raw_obs.mnth, raw_obs.is_wkdy, raw_obs.hr, - sample_group.group_id, raw_obs.avg_tt, raw_obs.n, + raw_obs.mnth, + sample_group.group_id, --get a random observation from the array of tts - raw_obs.tt_array[ceiling(random() * raw_obs.n)] AS rnd_tt + AVG(raw_obs.tt_array[ceiling(random() * raw_obs.n)]) AS rnd_avg_tt FROM raw_obs CROSS JOIN generate_series(1, n) -- 200 resamples (could be any number) CROSS JOIN generate_series(1, congestion_segment_bootstrap.n_resamples) AS sample_group(group_id) -), - -resampled_averages AS ( - SELECT - segment_id, - mnth, - is_wkdy, - hr, - group_id, - AVG(rnd_tt) AS rnd_avg_tt - FROM random_selections GROUP BY - segment_id, - mnth, - is_wkdy, - hr, - group_id + raw_obs.segment_id, + raw_obs.is_wkdy, + raw_obs.hr, + raw_obs.avg_tt, + raw_obs.n, + raw_obs.mnth, + sample_group.group_id ) SELECT - ra.segment_id, - ra.mnth, - ra.is_wkdy, - ra.hr, - raw_obs.avg_tt::real, - raw_obs.n, - percentile_disc(0.025) WITHIN GROUP (ORDER BY ra.rnd_avg_tt)::real AS ci_lower, - percentile_disc(0.975) WITHIN GROUP (ORDER BY ra.rnd_avg_tt)::real AS ci_upper -FROM resampled_averages AS ra -JOIN raw_obs USING (segment_id, mnth, is_wkdy, hr) + segment_id, + mnth, + is_wkdy, + hr, + avg_tt::real, + n, + percentile_disc(0.025) WITHIN GROUP (ORDER BY rnd_avg_tt)::real AS ci_lower, + percentile_disc(0.975) WITHIN GROUP (ORDER BY rnd_avg_tt)::real AS ci_upper +FROM random_selections GROUP BY - ra.segment_id, - ra.mnth, - ra.is_wkdy, - ra.hr, - raw_obs.avg_tt, - raw_obs.n; + segment_id, + mnth, + is_wkdy, + hr, + avg_tt, + n; $BODY$; From e77503ac80e48f4070580a422ba817167160a849 Mon Sep 17 00:00:00 2001 From: gabrielwol 
<80077912+gabrielwol@users.noreply.github.com> Date: Wed, 3 Sep 2025 17:24:59 +0000 Subject: [PATCH 61/74] #1132 update segment_grouping to work for all map versions --- .../sql/dynamic_bins/segment_grouping.sql | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/segment_grouping.sql b/here/traffic/sql/dynamic_bins/segment_grouping.sql index 6343181f8..e18f762b2 100644 --- a/here/traffic/sql/dynamic_bins/segment_grouping.sql +++ b/here/traffic/sql/dynamic_bins/segment_grouping.sql @@ -1,22 +1,38 @@ -WITH summary AS ( - --find the number of groups required to have no more than 100 per group +WITH segments AS ( + --segments active in relevant month + SELECT DISTINCT segment_id + FROM gwolofs.congestion_raw_segments + WHERE + dt >= {{ ds }}::date + AND dt < {{ ds }}::date + '1 month'::interval +), + +group_size AS ( + --find the number of groups required to have no more than `max_group_size` per group SELECT - FLOOR(COUNT(DISTINCT segment_id) - / CEIL((COUNT(DISTINCT segment_id)) / 100.0)) AS num_per_group - FROM congestion.network_segments_24_4 + FLOOR(COUNT(*) + / CEIL((COUNT(*)) / {{ params.max_group_size }}::numeric)) AS num_per_group + FROM segments ), groups AS ( SELECT - CEIL(ROW_NUMBER() OVER (ORDER BY segment_id) / summary.num_per_group) AS group_id, + --assign group_ids using row number + CEIL(ROW_NUMBER() OVER (ORDER BY segment_id) / group_size.num_per_group) AS group_id, segment_id - FROM congestion.network_segments_24_4, summary + FROM segments, group_size +), + +groups_summarized AS ( + SELECT + group_id, + array_agg(segment_id) AS segment_ids, + COUNT(*) + FROM groups + GROUP BY group_id + ORDER BY group_id ) -SELECT - group_id, - array_agg(segment_id), - COUNT(*) -FROM groups -GROUP BY group_id -ORDER BY group_id \ No newline at end of file +--return list of lists for xcom +SELECT array_agg(segment_ids::text) +FROM groups_summarized \ No newline at end of file From 
f50d023fb3873e89678f8866d0fc7fa2caa3ce3d Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 3 Sep 2025 17:25:34 +0000 Subject: [PATCH 62/74] #1132 check day is not empty before aggregating --- dags/here_dynamic_binning_agg.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index a516fcaa8..3d46af93b 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -12,7 +12,6 @@ import sys import os import logging -from datetime import timedelta from pendulum import duration, datetime from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator @@ -23,6 +22,7 @@ repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.insert(0, repo_path) from dags.dag_functions import task_fail_slack_alert + from dags.custom_operators import SQLCheckOperatorWithReturnValue except: raise ImportError("Cannot import slack alert functions") @@ -37,8 +37,6 @@ 'owner': ','.join(DAG_OWNERS), 'depends_on_past':False, 'start_date': datetime(2019, 1, 1, tz="America/Toronto"), - #aggregation doesn't work on 24_4 yet (no congestion.network_links_24_4) - #'end_date': datetime(2025, 3, 17, tz="America/Toronto"), 'email_on_failure': False, 'email_on_success': False, 'retries': 1, @@ -59,6 +57,14 @@ #to add: catchup, one task at a time, depends on past. 
def here_dynamic_binning_agg(): + check_not_empty = SQLCheckOperatorWithReturnValue( + task_id="check_not_empty", + sql="SELECT COUNT(*), COUNT(*) FROM here.ta_path WHERE dt = '{{ ds }}'", + conn_id="congestion_bot", + retries=1, + retry_delay=duration(days=1) + ) + aggregate_daily = SQLExecuteQueryOperator( sql=["DELETE FROM gwolofs.congestion_raw_segments WHERE dt = '{{ ds }}'", "SELECT gwolofs.congestion_network_segment_agg('{{ ds }}'::date);"], @@ -68,6 +74,7 @@ def here_dynamic_binning_agg(): retries = 2, hook_params={"options": "-c statement_timeout=10800000ms"} #3 hours ) - aggregate_daily + + check_not_empty >> aggregate_daily here_dynamic_binning_agg() \ No newline at end of file From 57cc194b4091ca18e5d4bf2b44e5e8cc6f0f9386 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 3 Sep 2025 17:26:04 +0000 Subject: [PATCH 63/74] #1132 perform bootstrapping by group of segment_ids --- dags/here_dynamic_binning_monthly_agg.py | 56 ++++++++++++++++++++---- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/dags/here_dynamic_binning_monthly_agg.py b/dags/here_dynamic_binning_monthly_agg.py index fd8f1e0d9..d46989cf9 100644 --- a/dags/here_dynamic_binning_monthly_agg.py +++ b/dags/here_dynamic_binning_monthly_agg.py @@ -1,12 +1,12 @@ import os import sys import logging -from datetime import timedelta from pendulum import duration, datetime from airflow.models import Variable -from airflow.decorators import dag +from airflow.decorators import dag, task from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator +from airflow.providers.postgres.hooks.postgres import PostgresHook try: repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) @@ -27,10 +27,8 @@ 'owner': ','.join(DAG_OWNERS), 'depends_on_past':False, 'start_date': datetime(2019, 1, 1, tz="America/Toronto"), - 'email_on_failure': False, - 'email_on_success': False, 'retries': 1, - 'retry_delay': 
duration(hours=1), + 'retry_delay': duration(hours=1) #'on_failure_callback': task_fail_slack_alert } @@ -64,9 +62,51 @@ def here_dynamic_binning_monthly_agg(): task_id='aggregate_monthly', conn_id='congestion_bot', autocommit=True, - retries = 1, - execution_timeout=timedelta(hours=1) + retries = 1 ) - check_missing_dates >> aggregate_monthly + + create_groups = SQLExecuteQueryOperator( + sql="segment_grouping.sql", + task_id="create_segment_groups", + #TODO: update sql to work for different map versions + start_date=datetime(2025, 4, 1, tz="America/Toronto"), + conn_id='congestion_bot', + retries = 0, + params={"max_group_size": 100} + ) + + delete_data = SQLExecuteQueryOperator( + sql="DELETE FROM gwolofs.congestion_segments_monthly_bootstrap WHERE mnth = '{{ ds }}' AND n_resamples = 300", + task_id="delete_bootstrap_results", + conn_id='congestion_bot', + retries=0 + ) + + @task + def expand_groups(**context): + return context["ti"].xcom_pull(task_ids="create_segment_groups")[0][0] + + @task(retries=0, max_active_tis_per_dag=1) + def bootstrap_agg(segments, ds): + print(f"segments: {segments}") + postgres_cred = PostgresHook("congestion_bot") + query="""SELECT * + FROM UNNEST(%s::bigint[]) AS unnested(segment_id), + LATERAL ( + SELECT gwolofs.congestion_segment_bootstrap( + mnth := %s::date, + segment_id := segment_id, + n_resamples := 300) + ) AS lat""" + with postgres_cred.get_conn() as conn: + with conn.cursor() as cur: + cur.execute(query, (segments, ds)) + conn.commit() + + expand = expand_groups() + + check_missing_dates >> aggregate_monthly >> create_groups >> delete_data + delete_data >> expand + bootstrap_agg.expand(segments=expand) here_dynamic_binning_monthly_agg() \ No newline at end of file From 2c3763140ecf9adeaaa3c850c2cc58a128e25733 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 3 Sep 2025 17:26:33 +0000 Subject: [PATCH 64/74] #1132 much faster over a single segment_id rather than array --- 
...-function-congestion_segment_bootstrap.sql | 160 ++++++++---------- 1 file changed, 75 insertions(+), 85 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql index 5c1a0c832..d23110761 100644 --- a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql +++ b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql @@ -1,99 +1,89 @@ -/* ---test -SELECT tt_array[ceiling(random() * 3)] -FROM (VALUES(ARRAY[1,2,3])) AS val(tt_array) -CROSS JOIN generate_series(1,100,1) -*/ - - +--DROP FUNCTION gwolofs.congestion_segment_bootstrap(date,bigint,integer); CREATE OR REPLACE FUNCTION gwolofs.congestion_segment_bootstrap( - mnth date, - segment_ids bigint[], - n_resamples int) - RETURNS TABLE( - segment_id integer, mnth date, is_wkdy boolean, hr numeric, - avg_tt real, n bigint, ci_lower real, ci_upper real - ) + mnth date, + segment_id bigint, + n_resamples int + ) + RETURNS VOID LANGUAGE SQL COST 100 VOLATILE PARALLEL SAFE AS $BODY$ -WITH raw_obs AS ( + SELECT setseed(('0.'||replace(mnth::text, '-', ''))::numeric); + + WITH raw_obs AS ( + SELECT + --segment_id and mnth don't need to be in group by until end + EXTRACT('isodow' FROM dt) IN (1, 2, 3, 4, 5) AS is_wkdy, + EXTRACT('hour' FROM hr) AS hr, + ARRAY_AGG(tt::real) AS tt_array, + AVG(tt::real) AS avg_tt, + COUNT(*) AS n + FROM gwolofs.congestion_raw_segments + WHERE -- same params as the above aggregation + dt >= congestion_segment_bootstrap.mnth + AND dt < congestion_segment_bootstrap.mnth + interval '1 month' + AND segment_id = congestion_segment_bootstrap.segment_id + GROUP BY + segment_id, + is_wkdy, + EXTRACT('hour' FROM hr) + ), + + random_selections AS ( + SELECT + raw_obs.is_wkdy, + raw_obs.hr, + raw_obs.avg_tt, + raw_obs.n, + sample_group.group_id, + --get a random observation from the array of tts + 
AVG(raw_obs.tt_array[ceiling(random() * raw_obs.n)]) AS rnd_avg_tt + FROM raw_obs + CROSS JOIN generate_series(1, n) + -- 200 resamples (could be any number) + CROSS JOIN generate_series(1, congestion_segment_bootstrap.n_resamples) AS sample_group(group_id) + GROUP BY + raw_obs.is_wkdy, + raw_obs.hr, + raw_obs.avg_tt, + raw_obs.n, + sample_group.group_id + ) + + INSERT INTO gwolofs.congestion_segments_monthly_bootstrap ( + segment_id, mnth, is_wkdy, hr, avg_tt, n, n_resamples, ci_lower, ci_upper + ) SELECT - segment_id, - congestion_segment_bootstrap.mnth AS mnth, - EXTRACT('isodow' FROM dt) IN (1, 2, 3, 4, 5) AS is_wkdy, - EXTRACT('hour' FROM hr) AS hr, - ARRAY_AGG(tt::real) AS tt_array, - AVG(tt::real) AS avg_tt, - COUNT(*) AS n - FROM gwolofs.congestion_raw_segments - WHERE -- same params as the above aggregation - dt >= congestion_segment_bootstrap.mnth - AND dt < congestion_segment_bootstrap.mnth + interval '1 month' - AND segment_id = ANY(congestion_segment_bootstrap.segment_ids) - GROUP BY - segment_id, + congestion_segment_bootstrap.segment_id, + congestion_segment_bootstrap.mnth, is_wkdy, - EXTRACT('hour' FROM hr) -), - -random_selections AS ( - SELECT - raw_obs.segment_id, - raw_obs.is_wkdy, - raw_obs.hr, - raw_obs.avg_tt, - raw_obs.n, - raw_obs.mnth, - sample_group.group_id, - --get a random observation from the array of tts - AVG(raw_obs.tt_array[ceiling(random() * raw_obs.n)]) AS rnd_avg_tt - FROM raw_obs - CROSS JOIN generate_series(1, n) - -- 200 resamples (could be any number) - CROSS JOIN generate_series(1, congestion_segment_bootstrap.n_resamples) AS sample_group(group_id) + hr, + avg_tt::real, + n, + n_resamples, + percentile_disc(0.025) WITHIN GROUP (ORDER BY rnd_avg_tt)::real AS ci_lower, + percentile_disc(0.975) WITHIN GROUP (ORDER BY rnd_avg_tt)::real AS ci_upper + FROM random_selections GROUP BY - raw_obs.segment_id, - raw_obs.is_wkdy, - raw_obs.hr, - raw_obs.avg_tt, - raw_obs.n, - raw_obs.mnth, - sample_group.group_id -) - -SELECT - 
segment_id, - mnth, - is_wkdy, - hr, - avg_tt::real, - n, - percentile_disc(0.025) WITHIN GROUP (ORDER BY rnd_avg_tt)::real AS ci_lower, - percentile_disc(0.975) WITHIN GROUP (ORDER BY rnd_avg_tt)::real AS ci_upper -FROM random_selections -GROUP BY - segment_id, - mnth, - is_wkdy, - hr, - avg_tt, - n; + is_wkdy, + hr, + avg_tt, + n; $BODY$; ---6:52 for 100 -/*example -SELECT congestion_segment_bootstrap.* -FROM gwolofs.congestion_segment_bootstrap( - mnth := '2025-05-01'::date, - segment_ids := (SELECT ARRAY(SELECT segment_id FROM generate_series(1,100) AS a(segment_id))), - n_resamples := 300 -) -*/ +GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_bootstrap(date,bigint,integer) TO congestion_bot; ---6.5 hours estimate for all ---SELECT COUNT(DISTINCT segment_id) / 100.0 * 352 / 60 / 60 FROM congestion.network_segments_24_4 ORDER BY 1 +/*Usage example: (works best one segment at a time with Lateral) +SELECT * +FROM UNNEST('{1,2,3,4,5,6,7,8,9}'::bigint[]) AS unnested(segment_id) +LATERAL ( + SELECT gwolofs.congestion_segment_bootstrap( + mnth := '2025-06-01'::date, + segment_ids := segment_id, + n_resamples := 300) +) +*/ \ No newline at end of file From 0effb5ff1f7e258e3bc93ecffe3d0f31fee5b02a Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 3 Sep 2025 17:27:00 +0000 Subject: [PATCH 65/74] #1132 bootstrap table structure --- ...-congestion_segments_monthly_bootstrap.sql | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthly_bootstrap.sql diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthly_bootstrap.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthly_bootstrap.sql new file mode 100644 index 000000000..4b9ea844a --- /dev/null +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthly_bootstrap.sql @@ -0,0 +1,33 @@ +-- Table: 
gwolofs.congestion_segments_monthly_bootstrap + +-- DROP TABLE IF EXISTS gwolofs.congestion_segments_monthly_bootstrap; + +CREATE TABLE IF NOT EXISTS gwolofs.congestion_segments_monthly_bootstrap +( + segment_id integer NOT NULL, + mnth date NOT NULL, + is_wkdy boolean NOT NULL, + hr numeric NOT NULL, + avg_tt real, + n bigint, + ci_lower real, + ci_upper real, + n_resamples integer NOT NULL, + CONSTRAINT congestion_segments_monthly_bootstrap_pkey PRIMARY KEY (segment_id, mnth, is_wkdy, hr, n_resamples) +) + +TABLESPACE pg_default; + +ALTER TABLE IF EXISTS gwolofs.congestion_segments_monthly_bootstrap +OWNER TO gwolofs; + +REVOKE ALL ON TABLE gwolofs.congestion_segments_monthly_bootstrap FROM bdit_humans; +REVOKE ALL ON TABLE gwolofs.congestion_segments_monthly_bootstrap FROM congestion_bot; + +GRANT SELECT, TRIGGER, REFERENCES ON TABLE gwolofs.congestion_segments_monthly_bootstrap TO bdit_humans WITH GRANT OPTION; + +GRANT INSERT, SELECT, DELETE ON TABLE gwolofs.congestion_segments_monthly_bootstrap TO congestion_bot; + +GRANT ALL ON TABLE gwolofs.congestion_segments_monthly_bootstrap TO dbadmin; + +GRANT ALL ON TABLE gwolofs.congestion_segments_monthly_bootstrap TO rds_superuser WITH GRANT OPTION; From a058f7cc833d75329d074874fe8fa1c652e529b4 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 10 Sep 2025 21:20:02 +0000 Subject: [PATCH 66/74] #1132 tt->real, hr->smallint --- .../create-table-congestion_raw_corridors.sql | 4 ++-- .../create-table-congestion_raw_segments.sql | 4 ++-- ...-congestion_segments_monthly_bootstrap.sql | 6 ++--- ...ble-congestion_segments_monthy_summary.sql | 18 +++++++-------- .../function-congestion_cache_tt_results.sql | 4 ++-- ...unction-congestion_network_segment_agg.sql | 6 ++--- ...unction-congestion_segment_monthly_agg.sql | 23 ++++++++++--------- 7 files changed, 33 insertions(+), 32 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql 
b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql index 9088b1a6a..b4471f323 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql @@ -7,11 +7,11 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_corridors corridor_id smallint, time_grp timerange NOT NULL, bin_range tsrange NOT NULL, - tt numeric, + tt real, num_obs integer, uri_string text COLLATE pg_catalog."default", dt date, - hr timestamp without time zone, + hr smallint, CONSTRAINT congestion_raw_corridors_pkey PRIMARY KEY (corridor_id, bin_range, time_grp), CONSTRAINT corridor_fkey FOREIGN KEY (corridor_id) REFERENCES gwolofs.congestion_corridors (corridor_id) MATCH SIMPLE diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql index dc2c090c6..ce988d66b 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql @@ -8,9 +8,9 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_segments dt date NOT NULL, bin_start timestamp without time zone NOT NULL, bin_range tsrange NOT NULL, - tt numeric, + tt real, num_obs integer, - hr timestamp without time zone, + hr smallint, CONSTRAINT congestion_raw_segments_pkey PRIMARY KEY (segment_id, dt, bin_start) ) PARTITION BY RANGE (dt); diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthly_bootstrap.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthly_bootstrap.sql index 4b9ea844a..52d71097d 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthly_bootstrap.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthly_bootstrap.sql @@ -7,12 +7,12 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_segments_monthly_bootstrap segment_id integer 
NOT NULL, mnth date NOT NULL, is_wkdy boolean NOT NULL, - hr numeric NOT NULL, + hr smallint NOT NULL, avg_tt real, - n bigint, + n smallint, ci_lower real, ci_upper real, - n_resamples integer NOT NULL, + n_resamples smallint NOT NULL, CONSTRAINT congestion_segments_monthly_bootstrap_pkey PRIMARY KEY (segment_id, mnth, is_wkdy, hr, n_resamples) ) diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthy_summary.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthy_summary.sql index 462ae6b59..09e2c0733 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthy_summary.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_segments_monthy_summary.sql @@ -7,15 +7,15 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_segments_monthy_summary segment_id integer, mnth date, is_wkdy boolean, - hr double precision, - avg_tt numeric, - stdev numeric, - percentile_05 numeric, - percentile_15 numeric, - percentile_50 numeric, - percentile_85 numeric, - percentile_95 numeric, - num_quasi_obs bigint + hr smallint, + avg_tt real, + stdev real, + percentile_05 real, + percentile_15 real, + percentile_50 real, + percentile_85 real, + percentile_95 real, + num_quasi_obs smallint ) TABLESPACE pg_default; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index f55190539..221c36bf7 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -218,9 +218,9 @@ EXECUTE FORMAT( time_grp, corridor_id, bin_range, - tt, + tt::real, num_obs, - date_trunc('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2) AS hr + date_part('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2) AS hr FROM inserted ON CONFLICT DO NOTHING; diff --git 
a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index 557c81756..a99a94549 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -140,7 +140,7 @@ EXECUTE FORMAT( unnested.len ), - inserted AS ( + inserted AS ( --this query contains overlapping values which get eliminated --via on conflict with the exclusion constraint on congestion_raw_segments table. INSERT INTO congestion_raw_segments_temp AS inserted ( @@ -180,9 +180,9 @@ EXECUTE FORMAT( bin_start, segment_id, bin_range, - tt, + tt::real, num_obs, - date_trunc('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2) AS hr + date_part('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2) AS hr FROM inserted ON CONFLICT DO NOTHING; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql index b80610c33..93e8741d6 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql @@ -3,9 +3,10 @@ -- DROP FUNCTION IF EXISTS gwolofs.congestion_segment_monthly_agg(date); CREATE OR REPLACE FUNCTION gwolofs.congestion_segment_monthly_agg( - mon date) - RETURNS void - LANGUAGE 'sql' + mon date +) + RETURNS VOID + LANGUAGE SQL COST 100 VOLATILE PARALLEL UNSAFE AS $BODY$ @@ -19,13 +20,13 @@ SELECT congestion_segment_monthly_agg.mon AS mnth, date_part('isodow', dt) <= 5 AS is_wkdy, date_part('hour', hr) AS hr, - ROUND(AVG(tt), 2) AS avg_tt, - ROUND(stddev(tt), 2) AS stdev, - ROUND(PERCENTILE_CONT(0.05) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_05, - ROUND(PERCENTILE_CONT(0.15) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_15, - ROUND(PERCENTILE_CONT(0.50) 
WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_50, - ROUND(PERCENTILE_CONT(0.85) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_85, - ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY tt)::numeric, 2) AS percentile_95, + AVG(tt) AS avg_tt, + stddev(tt) AS stdev, + PERCENTILE_CONT(0.05) WITHIN GROUP (ORDER BY tt) AS percentile_05, + PERCENTILE_CONT(0.15) WITHIN GROUP (ORDER BY tt) AS percentile_15, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY tt) AS percentile_50, + PERCENTILE_CONT(0.85) WITHIN GROUP (ORDER BY tt) AS percentile_85, + PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY tt) AS percentile_95, COUNT(*) AS num_quasi_obs FROM gwolofs.congestion_raw_segments LEFT JOIN ref.holiday USING (dt) @@ -41,7 +42,7 @@ GROUP BY $BODY$; ALTER FUNCTION gwolofs.congestion_segment_monthly_agg(date) - OWNER TO gwolofs; +OWNER TO gwolofs; GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_monthly_agg(date) TO PUBLIC; From c6ef2082cd65cf5cc014beb56098f61e6eb676de Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 15 Sep 2025 14:42:30 +0000 Subject: [PATCH 67/74] #1132 fluff --- .../traffic/sql/dynamic_bins/corridor_agg.sql | 19 ++++++++++-------- ...-function-congestion_segment_bootstrap.sql | 14 +++++++------ .../create-table-congestion_corridors.sql | 8 ++++---- .../create-table-congestion_raw_corridors.sql | 20 +++++++++---------- .../create-table-congestion_raw_segments.sql | 12 +++++------ ...unction-congestion_segment_monthly_agg.sql | 12 +++++------ ...function-congestion_select_map_version.sql | 2 +- .../insert_projects_and_corridors.sql | 12 ++++++----- .../sql/dynamic_bins/segment_grouping.sql | 10 ++++++---- .../select-check_missing_days.sql | 20 ++++++++++--------- 10 files changed, 70 insertions(+), 59 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/corridor_agg.sql b/here/traffic/sql/dynamic_bins/corridor_agg.sql index a16c0dbd2..b1cb9e9b8 100644 --- 
a/here/traffic/sql/dynamic_bins/corridor_agg.sql +++ b/here/traffic/sql/dynamic_bins/corridor_agg.sql @@ -1,19 +1,22 @@ --test: 35 projects, 1 day = 47s -SELECT gwolofs.congestion_cache_tt_results_daily( - node_start := congestion_corridors.node_start, - node_end := congestion_corridors.node_end, - start_date := dates.dt::date -) +SELECT + gwolofs.congestion_cache_tt_results_daily( + node_start := congestion_corridors.node_start, + node_end := congestion_corridors.node_end, + start_date := dates.dt::date + ) FROM gwolofs.congestion_corridors JOIN gwolofs.congestion_projects USING (project_id), -generate_series('2025-01-01', '2025-02-28', '1 day'::interval) AS dates(dt) -WHERE + generate_series('2025-01-01', '2025-02-28', '1 day'::interval) AS dates (dt) +WHERE congestion_projects.description IN ( 'Avenue Road cycleway installation', 'bluetooth_corridors', 'scrutinized-cycleway-corridors' ) AND corridor_id NOT IN ( - SELECT DISTINCT corridor_id FROM gwolofs.congestion_raw_corridors WHERE dt >= '2025-01-01' AND dt < '2025-02-28' + SELECT DISTINCT corridor_id + FROM gwolofs.congestion_raw_corridors + WHERE dt >= '2025-01-01' AND dt < '2025-02-28' ) AND map_version = '23_4'; \ No newline at end of file diff --git a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql index d23110761..7a9e7ad8f 100644 --- a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql +++ b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql @@ -4,11 +4,11 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_segment_bootstrap( mnth date, segment_id bigint, n_resamples int - ) - RETURNS VOID - LANGUAGE SQL - COST 100 - VOLATILE PARALLEL SAFE +) +RETURNS void +LANGUAGE SQL +COST 100 +VOLATILE PARALLEL SAFE AS $BODY$ SELECT setseed(('0.'||replace(mnth::text, '-', ''))::numeric); @@ -75,7 +75,9 @@ AS $BODY$ $BODY$; -GRANT EXECUTE ON FUNCTION 
gwolofs.congestion_segment_bootstrap(date,bigint,integer) TO congestion_bot; +GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_bootstrap( + date, bigint, integer +) TO CONGESTION_BOT; /*Usage example: (works best one segment at a time with Lateral) SELECT * diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql index 2d905a19e..7c09a0628 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_corridors.sql @@ -19,10 +19,10 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_corridors CONSTRAINT congestion_corridors_pkey PRIMARY KEY (node_start, node_end, map_version), CONSTRAINT corridor_pkey UNIQUE NULLS NOT DISTINCT (corridor_id), CONSTRAINT project_id_fk FOREIGN KEY (project_id) - REFERENCES gwolofs.congestion_projects (project_id) MATCH SIMPLE - ON UPDATE NO ACTION - ON DELETE NO ACTION - NOT VALID + REFERENCES gwolofs.congestion_projects (project_id) MATCH SIMPLE + ON UPDATE NO ACTION + ON DELETE NO ACTION + NOT VALID ) TABLESPACE pg_default; diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql index b4471f323..2d6f294ca 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_corridors.sql @@ -14,10 +14,10 @@ CREATE TABLE IF NOT EXISTS gwolofs.congestion_raw_corridors hr smallint, CONSTRAINT congestion_raw_corridors_pkey PRIMARY KEY (corridor_id, bin_range, time_grp), CONSTRAINT corridor_fkey FOREIGN KEY (corridor_id) - REFERENCES gwolofs.congestion_corridors (corridor_id) MATCH SIMPLE - ON UPDATE NO ACTION - ON DELETE CASCADE - NOT VALID + REFERENCES gwolofs.congestion_corridors (corridor_id) MATCH SIMPLE + ON UPDATE NO ACTION + ON DELETE CASCADE + NOT VALID ) TABLESPACE pg_default; @@ 
-62,19 +62,19 @@ COMMENT ON TABLE gwolofs.congestion_raw_corridors IS 'Stores dynamic binning results for custom corridor based travel time requests.'; COMMENT ON TABLE gwolofs.congestion_raw_corridors - IS 'Stores dynamic binning results from standard HERE congestion network travel time aggregations.'; +IS 'Stores dynamic binning results from standard HERE congestion network travel time aggregations.'; COMMENT ON COLUMN gwolofs.congestion_raw_corridors.bin_range - IS 'Bin range. An exclusion constraint on a temp table prevents overlapping ranges during insert.'; +IS 'Bin range. An exclusion constraint on a temp table prevents overlapping ranges during insert.'; COMMENT ON COLUMN gwolofs.congestion_raw_corridors.tt - IS 'Travel time in seconds.'; +IS 'Travel time in seconds.'; COMMENT ON COLUMN gwolofs.congestion_raw_corridors.num_obs - IS 'The sum of the sample size from here.ta_path.'; +IS 'The sum of the sample size from here.ta_path.'; COMMENT ON COLUMN gwolofs.congestion_raw_corridors.dt - IS 'The date of aggregation for the record. Records may not overlap dates.'; +IS 'The date of aggregation for the record. Records may not overlap dates.'; COMMENT ON COLUMN gwolofs.congestion_raw_corridors.hr - IS 'The hour the majority of the record occured in. Ties are rounded up.'; +IS 'The hour the majority of the record occured in. Ties are rounded up.'; diff --git a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql index ce988d66b..cfcc515bb 100644 --- a/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql +++ b/here/traffic/sql/dynamic_bins/create-table-congestion_raw_segments.sql @@ -61,19 +61,19 @@ ALTER TABLE IF EXISTS gwolofs.congestion_raw_segments_2025 OWNER TO gwolofs; COMMENT ON COLUMN gwolofs.congestion_raw_segments.dt - IS 'The date of aggregation for the record. Records may not overlap dates.'; +IS 'The date of aggregation for the record. 
Records may not overlap dates.'; COMMENT ON COLUMN gwolofs.congestion_raw_segments.bin_start - IS 'The start of the observation. It is recommended to use `hr` to group the bin instead. This column is used in the primary key, although the main constraint occurs during insert (non overlapping ranges).'; +IS 'The start of the observation. It is recommended to use `hr` to group the bin instead. This column is used in the primary key, although the main constraint occurs during insert (non overlapping ranges).'; COMMENT ON COLUMN gwolofs.congestion_raw_segments.bin_range - IS 'Bin range. An exclusion constraint on a temp table prevents overlapping ranges during insert.'; +IS 'Bin range. An exclusion constraint on a temp table prevents overlapping ranges during insert.'; COMMENT ON COLUMN gwolofs.congestion_raw_segments.tt - IS 'Travel time in seconds.'; +IS 'Travel time in seconds.'; COMMENT ON COLUMN gwolofs.congestion_raw_segments.num_obs - IS 'The sum of the sample size from here.ta_path.'; +IS 'The sum of the sample size from here.ta_path.'; COMMENT ON COLUMN gwolofs.congestion_raw_segments.hr - IS 'The hour the majority of the record occured in. Ties are rounded up.'; +IS 'The hour the majority of the record occured in. 
Ties are rounded up.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql index 93e8741d6..4c13186a8 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql @@ -5,10 +5,10 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_segment_monthly_agg( mon date ) - RETURNS VOID - LANGUAGE SQL - COST 100 - VOLATILE PARALLEL UNSAFE +RETURNS void +LANGUAGE SQL +COST 100 +VOLATILE PARALLEL UNSAFE AS $BODY$ INSERT INTO gwolofs.congestion_segments_monthy_summary ( @@ -46,7 +46,7 @@ OWNER TO gwolofs; GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_monthly_agg(date) TO PUBLIC; -GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_monthly_agg(date) TO congestion_bot; +GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_monthly_agg(date) TO CONGESTION_BOT; -GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_monthly_agg(date) TO gwolofs; +GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_monthly_agg(date) TO GWOLOFS; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql index 71c635e7b..cd5371ef0 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_select_map_version.sql @@ -5,7 +5,7 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_select_map_version( start_date date, end_date date, - agg_type text default null, --null or 'path' + agg_type text DEFAULT NULL, --null or 'path' OUT selected_version text ) RETURNS text diff --git a/here/traffic/sql/dynamic_bins/insert_projects_and_corridors.sql b/here/traffic/sql/dynamic_bins/insert_projects_and_corridors.sql index 62c107fc0..cbde86da4 100644 --- a/here/traffic/sql/dynamic_bins/insert_projects_and_corridors.sql +++ 
b/here/traffic/sql/dynamic_bins/insert_projects_and_corridors.sql @@ -1,10 +1,12 @@ --for naming corridor_streets. --need help with corridor_start and corridor_end locations - not sure how to turn here nodes into names. Intersection conflation? WITH named_corridors AS ( - SELECT corridor_id, string_agg(DISTINCT initcap(st_name), ' / ') AS corridor_streets + SELECT + corridor_id, + string_agg(DISTINCT initcap(st_name), ' / ') AS corridor_streets FROM gwolofs.congestion_corridors, - UNNEST (congestion_corridors.link_dirs) AS unnested(link_dir) - LEFT JOIN here_gis.traffic_streets_24_4 ON link_id = trim(trailing 'T|F' from link_dir)::int + UNNEST(congestion_corridors.link_dirs) AS unnested (link_dir) + LEFT JOIN here_gis.traffic_streets_24_4 ON link_id = trim(TRAILING 'T|F' FROM link_dir)::int WHERE map_version = '24_4' GROUP BY corridor_id ORDER BY corridor_id DESC @@ -29,12 +31,12 @@ WITH project AS ( corridors AS ( SELECT corridor_id FROM bluetooth.here_cn_23_4_lookup AS bt, - gwolofs.congestion_cache_corridor(bt.here_fnode, bt.here_tnode, '24_4') + gwolofs.congestion_cache_corridor(bt.here_fnode, bt.here_tnode, '24_4') ) --add project_id to corridors UPDATE gwolofs.congestion_corridors -SET project_id = (SELECT project_id FROM project) +SET project_id = (SELECT project_id FROM project) WHERE corridor_id IN (SELECT corridor_id FROM corridors) RETURNING corridor_id; diff --git a/here/traffic/sql/dynamic_bins/segment_grouping.sql b/here/traffic/sql/dynamic_bins/segment_grouping.sql index e18f762b2..93303ca43 100644 --- a/here/traffic/sql/dynamic_bins/segment_grouping.sql +++ b/here/traffic/sql/dynamic_bins/segment_grouping.sql @@ -3,15 +3,17 @@ WITH segments AS ( SELECT DISTINCT segment_id FROM gwolofs.congestion_raw_segments WHERE - dt >= {{ ds }}::date - AND dt < {{ ds }}::date + '1 month'::interval + dt >= '{{ ds }}'::date --noqa: TMP + AND dt < '{{ ds }}'::date + '1 month'::interval --noqa: TMP ), group_size AS ( --find the number of groups required to have no 
more than `max_group_size` per group SELECT - FLOOR(COUNT(*) - / CEIL((COUNT(*)) / {{ params.max_group_size }}::numeric)) AS num_per_group + FLOOR( + COUNT(*) + / CEIL((COUNT(*)) / {{ params.max_group_size }}::numeric) --noqa: TMP + ) AS num_per_group FROM segments ), diff --git a/here/traffic/sql/dynamic_bins/select-check_missing_days.sql b/here/traffic/sql/dynamic_bins/select-check_missing_days.sql index a993ba57c..0b09c3673 100644 --- a/here/traffic/sql/dynamic_bins/select-check_missing_days.sql +++ b/here/traffic/sql/dynamic_bins/select-check_missing_days.sql @@ -2,18 +2,20 @@ WITH distinct_days AS ( SELECT DISTINCT dt FROM gwolofs.congestion_raw_segments WHERE - dt >= '{{ ds }}'::date - AND dt < '{{ ds }}'::date + interval '1 month' + dt >= '{{ ds }}'::date --noqa: TMP + AND dt < '{{ ds }}'::date + interval '1 month' --noqa: TMP ) SELECT - COUNT(*) = 0, + COUNT(*) = 0 AS _check, 'The following days are missing from `congestion_raw_segments`: ' - || string_agg(dates.dt::date::text, ', ') AS summary -FROM generate_series( - '{{ ds }}'::date, - --one day before start of next month - ('{{ ds }}'::date + interval '1 month')::date - 1, - '1 day') AS dates(dt) + || string_agg(dates.dt::date::text, ', ') AS _summary +FROM + generate_series( + '{{ ds }}'::date, + --one day before start of next month + ('{{ ds }}'::date + interval '1 month')::date - 1, + '1 day' + ) AS dates (dt) LEFT JOIN distinct_days USING (dt) WHERE distinct_days.dt IS NULL; \ No newline at end of file From 6dc942fa671ea2173ee3bd4a079c9af3f2ae30f7 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 29 Sep 2025 20:50:55 +0000 Subject: [PATCH 68/74] #1132 separate out insert and select funcionality --- .../function-congestion_cache_tt_results.sql | 224 +-------------- ...unction-congestion_return_dynamic_bins.sql | 256 ++++++++++++++++++ 2 files changed, 270 insertions(+), 210 deletions(-) create mode 100644 
here/traffic/sql/dynamic_bins/function-congestion_return_dynamic_bins.sql diff --git a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql index 221c36bf7..1c486eada 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_cache_tt_results.sql @@ -14,226 +14,30 @@ CREATE OR REPLACE FUNCTION gwolofs.congestion_cache_tt_results( holidays boolean ) RETURNS void -LANGUAGE plpgsql +LANGUAGE SQL COST 100 VOLATILE PARALLEL UNSAFE AS $BODY$ -DECLARE map_version text; - -BEGIN - ---using a temp table to aply the exclusion constraint should prevent the ---insert from getting bogged down by large constraint on main table over time -DROP TABLE IF EXISTS congestion_raw_corridors_temp; -CREATE TEMPORARY TABLE congestion_raw_corridors_temp ( - corridor_id smallint, - time_grp timerange NOT NULL, - bin_range tsrange NOT NULL, - tt numeric, - num_obs integer, - uri_string text, - CONSTRAINT congestion_raw_corridors_exclude_temp EXCLUDE USING gist ( - bin_range WITH &&, - corridor_id WITH =, - time_grp WITH =, - coalesce(uri_string, '') WITH = --this is the only column in constraint which is nullable - ) -); - -SELECT gwolofs.congestion_select_map_version( - congestion_cache_tt_results.start_date, - congestion_cache_tt_results.end_date, - 'path' -) INTO map_version; - -EXECUTE FORMAT( - $$ - WITH corridor AS ( - SELECT - ccc.corridor_id, - unnested.link_dir, - unnested.length, - ccc.total_length - FROM gwolofs.congestion_cache_corridor(%1$L, %2$L, %3$L) AS ccc, - UNNEST( - ccc.link_dirs, - ccc.lengths - ) AS unnested(link_dir, length) - ), - - segment_5min_bins AS ( - SELECT - seg.corridor_id, - ta.tx, - seg.total_length, - tsrange( - ta.dt + %4$L::time, - ta.dt + %5$L::time, '[)') AS time_grp, - RANK() OVER w AS bin_rank, - SUM(seg.length) / seg.total_length AS sum_length, - SUM(seg.length) AS length_w_data, - 
SUM(seg.length / ta.mean * 3.6) AS unadjusted_tt, - SUM(sample_size) AS num_obs, - ARRAY_AGG(ta.link_dir ORDER BY ta.link_dir) AS link_dirs, - ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY ta.link_dir) AS tts, - ARRAY_AGG(seg.length ORDER BY ta.link_dir) AS lengths - FROM here.ta_path AS ta - JOIN corridor AS seg USING (link_dir) - WHERE - ( - ta.tod >= %4$L - AND --{ToD_and_or} - ta.tod < %5$L - ) - AND date_part('isodow', ta.dt) = ANY(%6$L::int[]) - AND ta.dt >= %7$L - AND ta.dt < %8$L - /*--{holiday_clause} - AND NOT EXISTS ( - SELECT 1 FROM ref.holiday WHERE ta.dt = holiday.dt - )*/ - GROUP BY - seg.corridor_id, - ta.tx, - ta.dt, - seg.total_length - WINDOW w AS ( - PARTITION BY seg.corridor_id, ta.dt - ORDER BY ta.tx - ) - ), - - dynamic_bin_options AS ( - --within each corridor/hour, generate all possible forward looking bin combinations - --don't generate options for bins with sufficient length - --also don't generate options past the next bin with 80%% length - SELECT - tx, - corridor_id, - time_grp, - bin_rank AS start_bin, - --generate all the options for the end bin within the group. 
- generate_series( - CASE - WHEN sum_length >= 0.8 THEN bin_rank - --if length is insufficient, need at least 1 more bin - ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) - END, - CASE - --dont need to generate options when start segment is already sufficient - WHEN sum_length >= 0.8 THEN bin_rank - --generate options until 1 bin has sufficient length, otherwise until last bin in group - ELSE COALESCE( - MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, - MAX(bin_rank) OVER w - ) - END, - 1 - ) AS end_bin - FROM segment_5min_bins - WINDOW w AS ( - PARTITION BY corridor_id, time_grp - ORDER BY tx - --look only forward for end_bin options - RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING - ) - ), - - unnested_db_options AS ( - SELECT - s5b.corridor_id, - dbo.time_grp, - s5b.total_length, - dbo.tx AS dt_start, - --exclusive end bin - s5b_end.tx + interval '5 minutes' AS dt_end, - unnested.link_dir, - unnested.len, - AVG(unnested.tt) AS tt, --avg TT for each link_dir - SUM(s5b.num_obs) AS num_obs --sum of here.ta_path sample_size for each link_dir - FROM dynamic_bin_options AS dbo - LEFT JOIN segment_5min_bins AS s5b - ON s5b.time_grp = dbo.time_grp - AND s5b.bin_rank >= dbo.start_bin - AND s5b.bin_rank <= dbo.end_bin - --this join is used to get the tx info about the last bin only - LEFT JOIN segment_5min_bins AS s5b_end - ON s5b_end.time_grp = dbo.time_grp - AND s5b_end.bin_rank = dbo.end_bin, - --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin - UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) - --dynamic bins should not exceed one hour (dt_end <= dt_start + 1 hr) - WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '30 minutes' - GROUP BY - s5b.corridor_id, - dbo.time_grp, - s5b.total_length, - dbo.tx, --stard_bin - s5b_end.tx, --end_bin - unnested.link_dir, - unnested.len - ), - - inserted AS ( - --this query contains overlapping values which get eliminated - --via on 
conflict with the exclusion constraint on congestion_raw_segments table. - INSERT INTO congestion_raw_corridors_temp AS inserted ( - uri_string, time_grp, corridor_id, bin_range, tt, num_obs - ) - --distinct on ensures only the shortest option gets proposed for insert - SELECT DISTINCT ON (dt_start) - %9$L, --uristring - timerange(lower(time_grp)::time, upper(time_grp)::time, '[)') AS time_grp, - corridor_id, - tsrange(dt_start, dt_end, '[)') AS bin_range, - total_length / SUM(len) * SUM(tt) AS tt, - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment - FROM unnested_db_options - GROUP BY - time_grp, - corridor_id, - dt_start, - dt_end, - total_length - HAVING SUM(len) >= 0.8 * total_length - ORDER BY - dt_start, - dt_end - --exclusion constraint + ordered insert to prevent overlapping bins - ON CONFLICT ON CONSTRAINT congestion_raw_corridors_exclude_temp - DO NOTHING - RETURNING - inserted.uri_string, inserted.time_grp, inserted.corridor_id, - inserted.bin_range, inserted.tt, inserted.num_obs - ) - --insert into the final table INSERT INTO gwolofs.congestion_raw_corridors ( uri_string, dt, time_grp, corridor_id, bin_range, tt, num_obs, hr ) SELECT - uri_string, - lower(bin_range)::date AS dt, - time_grp, - corridor_id, - bin_range, - tt::real, - num_obs, - date_part('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2) AS hr - FROM inserted + congestion_cache_tt_results.uri_string, + dt, time_grp, corridor_id, bin_range, tt, num_obs, hr + FROM gwolofs.congestion_return_dynamic_bins( + congestion_cache_tt_results.start_date, + congestion_cache_tt_results.end_date, + congestion_cache_tt_results.start_tod, + congestion_cache_tt_results.end_tod, + congestion_cache_tt_results.dow_list, + congestion_cache_tt_results.node_start, + congestion_cache_tt_results.node_end, + congestion_cache_tt_results.holidays + ) ON CONFLICT DO NOTHING; - $$, - node_start, node_end, map_version, --segment CTE - start_tod, end_tod, --segment_5min_bins CTE 
SELECT - dow_list, start_date, end_date, --segment_5min_bins CTE WHERE - congestion_cache_tt_results.uri_string --INSERT -); - - DROP TABLE congestion_raw_corridors_temp; - -END; $BODY$; ALTER FUNCTION gwolofs.congestion_cache_tt_results( @@ -270,4 +74,4 @@ SELECT gwolofs.congestion_cache_tt_results( $BODY$; COMMENT ON FUNCTION gwolofs.congestion_cache_tt_results_daily -IS 'A simplified version of `congestion_cache_tt_results` for aggregating entire days of data.' \ No newline at end of file +IS 'A simplified version of `congestion_cache_tt_results` for aggregating entire days of data.'; diff --git a/here/traffic/sql/dynamic_bins/function-congestion_return_dynamic_bins.sql b/here/traffic/sql/dynamic_bins/function-congestion_return_dynamic_bins.sql new file mode 100644 index 000000000..38ccbc8a8 --- /dev/null +++ b/here/traffic/sql/dynamic_bins/function-congestion_return_dynamic_bins.sql @@ -0,0 +1,256 @@ +-- FUNCTION: gwolofs.congestion_return_dynamic_bins(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean) --noqa: LT05 + +-- DROP FUNCTION IF EXISTS gwolofs.congestion_return_dynamic_bins(date, date, time without time zone, time without time zone, integer[], bigint, bigint, boolean); --noqa: LT05 + +CREATE OR REPLACE FUNCTION gwolofs.congestion_return_dynamic_bins( + start_date date, + end_date date, + start_tod time without time zone, + end_tod time without time zone, + dow_list integer [], + node_start bigint, + node_end bigint, + holidays boolean +) +RETURNS TABLE ( + dt date, + time_grp timerange, + corridor_id smallint, + bin_range tsrange, + tt real, + num_obs integer, + hr smallint +) +LANGUAGE plpgsql +COST 100 +VOLATILE PARALLEL RESTRICTED +AS $BODY$ + +DECLARE +map_version text; + +BEGIN + +--using a temp table to apply the exclusion constraint should prevent the +--insert from getting bogged down by large constraint on main table over time +DROP TABLE IF EXISTS congestion_raw_corridors_temp; +CREATE
TEMPORARY TABLE congestion_raw_corridors_temp ( + dt date GENERATED ALWAYS AS (lower(bin_range)) STORED, + corridor_id smallint, + time_grp timerange NOT NULL, + bin_range tsrange NOT NULL, + tt real, + num_obs integer, + hr smallint GENERATED ALWAYS AS (date_part('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2)) STORED, + CONSTRAINT congestion_raw_corridors_exclude_temp EXCLUDE USING gist ( + bin_range WITH &&, + corridor_id WITH =, + time_grp WITH = + ) +); + +SELECT gwolofs.congestion_select_map_version( + congestion_return_dynamic_bins.start_date, + congestion_return_dynamic_bins.end_date, + 'path' +) INTO map_version; + +RETURN QUERY EXECUTE FORMAT( + $$ + WITH corridor AS ( + SELECT + ccc.corridor_id, + unnested.link_dir, + unnested.length, + ccc.total_length + FROM gwolofs.congestion_cache_corridor(%1$L, %2$L, %3$L) AS ccc, + UNNEST( + ccc.link_dirs, + ccc.lengths + ) AS unnested(link_dir, length) + ), + + segment_5min_bins AS ( + SELECT + seg.corridor_id, + ta.tx, + seg.total_length, + tsrange( + ta.dt + %4$L::time, + ta.dt + %5$L::time, '[)') AS time_grp, + RANK() OVER w AS bin_rank, + SUM(seg.length) / seg.total_length AS sum_length, + SUM(seg.length) AS length_w_data, + SUM(seg.length / ta.mean * 3.6) AS unadjusted_tt, + SUM(sample_size) AS num_obs, + ARRAY_AGG(ta.link_dir ORDER BY ta.link_dir) AS link_dirs, + ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY ta.link_dir) AS tts, + ARRAY_AGG(seg.length ORDER BY ta.link_dir) AS lengths + FROM here.ta_path AS ta + JOIN corridor AS seg USING (link_dir) + LEFT JOIN ref.holiday USING (dt) + WHERE + ( + ta.tod >= %4$L + AND --{ToD_and_or} + ta.tod < %5$L + ) + AND date_part('isodow', ta.dt) = ANY(%6$L::int[]) + AND ta.dt >= %7$L + AND ta.dt < %8$L + AND (%9$L OR holiday.dt IS NULL) --holiday clause + GROUP BY + seg.corridor_id, + ta.tx, + ta.dt, + seg.total_length + WINDOW w AS ( + PARTITION BY seg.corridor_id, ta.dt + ORDER BY ta.tx + ) + ), + + dynamic_bin_options AS ( + --within each
corridor/time_grp, generate all possible forward looking bin combinations + --don't generate options for bins with sufficient length + --also don't generate options past the next bin with 80%% length + SELECT + tx, + corridor_id, + time_grp, + bin_rank AS start_bin, + --generate all the options for the end bin within the group. + generate_series( + CASE + WHEN sum_length >= 0.8 THEN bin_rank + --if length is insufficient, need at least 1 more bin + ELSE LEAST(bin_rank + 1, MAX(bin_rank) OVER w) + END, + CASE + --don't need to generate options when start bin is already sufficient + WHEN sum_length >= 0.8 THEN bin_rank + --generate options until 1 bin has sufficient length, otherwise until last bin in group + ELSE COALESCE( + MIN(bin_rank) FILTER (WHERE sum_length >= 0.8) OVER w, + MAX(bin_rank) OVER w + ) + END, + 1 + ) AS end_bin + FROM segment_5min_bins + WINDOW w AS ( + PARTITION BY corridor_id, time_grp + ORDER BY tx + --look only forward for end_bin options + RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + ) + ), + + unnested_db_options AS ( + SELECT + s5b.corridor_id, + dbo.time_grp, + s5b.total_length, + dbo.tx AS dt_start, + --exclusive end bin + s5b_end.tx + interval '5 minutes' AS dt_end, + unnested.link_dir, + unnested.len, + AVG(unnested.tt) AS tt, --avg TT for each link_dir + SUM(s5b.num_obs) AS num_obs --NOTE(review): s5b.num_obs is the whole-bin sample_size total, so it is repeated for every link_dir here and the final SUM(num_obs) may overcount; confirm intent + FROM dynamic_bin_options AS dbo + LEFT JOIN segment_5min_bins AS s5b + ON s5b.time_grp = dbo.time_grp + AND s5b.bin_rank >= dbo.start_bin + AND s5b.bin_rank <= dbo.end_bin + --this join is used to get the tx info about the last bin only + LEFT JOIN segment_5min_bins AS s5b_end + ON s5b_end.time_grp = dbo.time_grp + AND s5b_end.bin_rank = dbo.end_bin, + --unnest all the observations from individual link_dirs to reaggregate them within new dynamic bin + UNNEST(s5b.link_dirs, s5b.lengths, s5b.tts) AS unnested(link_dir, len, tt) + --dynamic bins should not exceed 30 minutes (dt_end <= dt_start + 30 min) +
WHERE s5b_end.tx + interval '5 minutes' <= dbo.tx + interval '30 minutes' + GROUP BY + s5b.corridor_id, + dbo.time_grp, + s5b.total_length, + dbo.tx, --start_bin + s5b_end.tx, --end_bin + unnested.link_dir, + unnested.len + ) + + --this query contains overlapping values which get eliminated + --via on conflict with the exclusion constraint on congestion_raw_corridors_temp table. + INSERT INTO congestion_raw_corridors_temp AS inserted ( + time_grp, corridor_id, bin_range, tt, num_obs + ) + --distinct on ensures only the shortest option gets proposed for insert + SELECT DISTINCT ON (dt_start) + timerange(lower(time_grp)::time, upper(time_grp)::time, '[)') AS time_grp, + corridor_id, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(num_obs) AS num_obs --sum of the per-link_dir num_obs values for each dynamic bin + FROM unnested_db_options + GROUP BY + time_grp, + corridor_id, + dt_start, + dt_end, + total_length + HAVING SUM(len) >= 0.8 * total_length + ORDER BY + dt_start, + dt_end + --exclusion constraint + ordered insert to prevent overlapping bins + ON CONFLICT ON CONSTRAINT congestion_raw_corridors_exclude_temp + DO NOTHING + RETURNING + inserted.dt, inserted.time_grp, inserted.corridor_id, + inserted.bin_range, inserted.tt, inserted.num_obs, inserted.hr; + + $$, + node_start, node_end, map_version, --corridor CTE + start_tod, end_tod, --segment_5min_bins CTE SELECT + dow_list, start_date, end_date, holidays --segment_5min_bins CTE WHERE +); + +END; +$BODY$; + +ALTER FUNCTION gwolofs.congestion_return_dynamic_bins( + date, date, time without time zone, + time without time zone, integer [], bigint, bigint, boolean +) +OWNER TO gwolofs; + +COMMENT ON FUNCTION gwolofs.congestion_return_dynamic_bins IS +'Returns the dynamic binning results for a request.'; + +-- convenience wrapper for the more straightforward situation of daily corridor agg; NOTE(review): declared RETURNS void, so the bins returned by the inner call are discarded - confirm intended +CREATE OR REPLACE FUNCTION gwolofs.congestion_return_dynamic_bins_daily( + start_date date, +
node_start bigint, + node_end bigint +) +RETURNS void +LANGUAGE sql +COST 100 +VOLATILE PARALLEL UNSAFE +AS +$BODY$ +SELECT gwolofs.congestion_return_dynamic_bins( + start_date := congestion_return_dynamic_bins_daily.start_date, + end_date := congestion_return_dynamic_bins_daily.start_date + 1, + start_tod := '00:00'::time without time zone, + end_tod := '24:00'::time without time zone, + dow_list := ARRAY[extract('isodow' from congestion_return_dynamic_bins_daily.start_date)]::int[], + node_start := congestion_return_dynamic_bins_daily.node_start, + node_end := congestion_return_dynamic_bins_daily.node_end, + holidays := True) +$BODY$; + +COMMENT ON FUNCTION gwolofs.congestion_return_dynamic_bins_daily +IS 'A simplified version of `congestion_return_dynamic_bins` for aggregating entire days of data.'; From dd3170b0f4fb49ea4bb615df4e4951c86a838b29 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Mon, 29 Sep 2025 20:51:17 +0000 Subject: [PATCH 69/74] #1132 fix logical date of TriggerDagRunOperator --- dags/here_dynamic_binning_monthly_agg.py | 2 -- dags/pull_here_path.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/dags/here_dynamic_binning_monthly_agg.py b/dags/here_dynamic_binning_monthly_agg.py index d46989cf9..75d7e9a29 100644 --- a/dags/here_dynamic_binning_monthly_agg.py +++ b/dags/here_dynamic_binning_monthly_agg.py @@ -68,8 +68,6 @@ def here_dynamic_binning_monthly_agg(): create_groups = SQLExecuteQueryOperator( sql="segment_grouping.sql", task_id="create_segment_groups", - #TODO: update sql to work for different map versions - start_date=datetime(2025, 4, 1, tz="America/Toronto"), conn_id='congestion_bot', retries = 0, params={"max_group_size": 100} diff --git a/dags/pull_here_path.py b/dags/pull_here_path.py index ee6888009..421c82987 100644 --- a/dags/pull_here_path.py +++ b/dags/pull_here_path.py @@ -96,7 +96,7 @@ def trigger_dags_tasks(): trigger_operator = TriggerDagRunOperator(
task_id=f'trigger_{dag_id}', trigger_dag_id=dag_id, - logical_date='{{macros.ds_add(ds, 1)}}', + logical_date='{{macros.ds_add(ds, -1)}}', reset_dag_run=True # Clear existing dag if already exists (for backfilling), old runs will not be in the logs ) trigger_operators.append(trigger_operator) From 7863ef953c5bb95d658d3b6724b7e55b0c0369a3 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 9 Oct 2025 20:58:04 +0000 Subject: [PATCH 70/74] #1132 changes to reflect change in hr datatype --- .../create-function-congestion_segment_bootstrap.sql | 6 +++--- .../function-congestion_segment_monthly_agg.sql | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql index 7a9e7ad8f..d95e82638 100644 --- a/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql +++ b/here/traffic/sql/dynamic_bins/create-function-congestion_segment_bootstrap.sql @@ -17,7 +17,7 @@ AS $BODY$ SELECT --segment_id and mnth don't need to be in group by until end EXTRACT('isodow' FROM dt) IN (1, 2, 3, 4, 5) AS is_wkdy, - EXTRACT('hour' FROM hr) AS hr, + hr, ARRAY_AGG(tt::real) AS tt_array, AVG(tt::real) AS avg_tt, COUNT(*) AS n @@ -29,7 +29,7 @@ AS $BODY$ GROUP BY segment_id, is_wkdy, - EXTRACT('hour' FROM hr) + hr ), random_selections AS ( @@ -77,7 +77,7 @@ AS $BODY$ GRANT EXECUTE ON FUNCTION gwolofs.congestion_segment_bootstrap( date, bigint, integer -) TO CONGESTION_BOT; +) TO congestion_bot; /*Usage example: (works best one segment at a time with Lateral) SELECT * diff --git a/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql index 4c13186a8..ab2e7602a 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql +++ 
b/here/traffic/sql/dynamic_bins/function-congestion_segment_monthly_agg.sql @@ -19,7 +19,7 @@ SELECT segment_id, congestion_segment_monthly_agg.mon AS mnth, date_part('isodow', dt) <= 5 AS is_wkdy, - date_part('hour', hr) AS hr, + hr, AVG(tt) AS avg_tt, stddev(tt) AS stdev, PERCENTILE_CONT(0.05) WITHIN GROUP (ORDER BY tt) AS percentile_05, @@ -36,7 +36,7 @@ WHERE AND holiday.holiday IS NULL GROUP BY segment_id, - date_part('hour', hr), + hr, is_wkdy; $BODY$; From 9c63a32c1daa31e21e2568c9dda497fe2e22f79f Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 9 Oct 2025 20:58:14 +0000 Subject: [PATCH 71/74] #1132 try adding an analyze on temp table --- ...unction-congestion_network_segment_agg.sql | 71 +++++++++---------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index a99a94549..e8d51f6b2 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -138,40 +138,42 @@ EXECUTE FORMAT( s5b_end.tx, --end_bin unnested.link_dir, unnested.len - ), + ) - inserted AS ( - --this query contains overlapping values which get eliminated - --via on conflict with the exclusion constraint on congestion_raw_segments table. 
- INSERT INTO congestion_raw_segments_temp AS inserted ( - bin_start, segment_id, bin_range, tt, num_obs - ) - --distinct on ensures only the shortest option gets proposed for insert - SELECT DISTINCT ON (segment_id, dt_start) - dt_start AS bin_start, - segment_id, - tsrange(dt_start, dt_end, '[)') AS bin_range, - total_length / SUM(len) * SUM(tt) AS tt, - SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment - FROM unnested_db_options - GROUP BY - segment_id, - dt_start, - dt_end, - total_length - HAVING SUM(len) >= 0.8 * total_length - ORDER BY - segment_id, - dt_start, - dt_end --uses the option that ends first - --exclusion constraint + ordered insert to prevent overlapping bins - ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude_temp - DO NOTHING - RETURNING - inserted.bin_start, inserted.segment_id, inserted.bin_range, - inserted.tt, inserted.num_obs + --this query contains overlapping values which get eliminated + --via on conflict with the exclusion constraint on congestion_raw_segments table. 
+ INSERT INTO congestion_raw_segments_temp AS inserted ( + bin_start, segment_id, bin_range, tt, num_obs ) + --distinct on ensures only the shortest option gets proposed for insert + SELECT DISTINCT ON (segment_id, dt_start) + dt_start AS bin_start, + segment_id, + tsrange(dt_start, dt_end, '[)') AS bin_range, + total_length / SUM(len) * SUM(tt) AS tt, + SUM(num_obs) AS num_obs --sum of here.ta_path sample_size for each segment + FROM unnested_db_options + GROUP BY + segment_id, + dt_start, + dt_end, + total_length + HAVING SUM(len) >= 0.8 * total_length + ORDER BY + segment_id, + dt_start, + dt_end --uses the option that ends first + --exclusion constraint + ordered insert to prevent overlapping bins + ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude_temp + DO NOTHING; + + $$, + start_date, + congestion_network_table + ); + ANALYZE congestion_raw_segments_temp; + INSERT INTO gwolofs.congestion_raw_segments ( dt, bin_start, segment_id, bin_range, tt, num_obs, hr ) @@ -183,13 +185,8 @@ EXECUTE FORMAT( tt::real, num_obs, date_part('hour', lower(bin_range) + (upper(bin_range) - lower(bin_range))/2) AS hr - FROM inserted + FROM congestion_raw_segments_temp ON CONFLICT DO NOTHING; - - $$, - start_date, - congestion_network_table - ); DROP TABLE congestion_raw_segments_temp; From 191a2c22e6207a899d8597a102da1d70bcce0387 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Thu, 9 Oct 2025 21:06:08 +0000 Subject: [PATCH 72/74] #1132 materialize? 
--- .../dynamic_bins/function-congestion_network_segment_agg.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index e8d51f6b2..a314837cb 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -44,7 +44,7 @@ EXECUTE FORMAT( FROM congestion.%2$I ), - segment_5min_bins AS ( + segment_5min_bins AS MATERIALIZED ( SELECT seg.segment_id, ta.tx, From 4e14aee95cd5dd5a100dc19da5d9090cd9c08c1a Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 15 Oct 2025 14:18:36 +0000 Subject: [PATCH 73/74] #1132 separate out delete query --- dags/here_dynamic_binning_agg.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/dags/here_dynamic_binning_agg.py b/dags/here_dynamic_binning_agg.py index 3d46af93b..166031952 100644 --- a/dags/here_dynamic_binning_agg.py +++ b/dags/here_dynamic_binning_agg.py @@ -65,9 +65,16 @@ def here_dynamic_binning_agg(): retry_delay=duration(days=1) ) + delete_daily = SQLExecuteQueryOperator( + sql="DELETE FROM gwolofs.congestion_raw_segments WHERE dt = '{{ ds }}'", + task_id='delete_daily', + conn_id='congestion_bot', + autocommit=True, + retries = 2 + ) + aggregate_daily = SQLExecuteQueryOperator( - sql=["DELETE FROM gwolofs.congestion_raw_segments WHERE dt = '{{ ds }}'", - "SELECT gwolofs.congestion_network_segment_agg('{{ ds }}'::date);"], + sql="SELECT gwolofs.congestion_network_segment_agg('{{ ds }}'::date);", task_id='aggregate_daily', conn_id='congestion_bot', autocommit=True, @@ -75,6 +82,6 @@ def here_dynamic_binning_agg(): hook_params={"options": "-c statement_timeout=10800000ms"} #3 hours ) - check_not_empty >> aggregate_daily + check_not_empty >> delete_daily >> aggregate_daily 
here_dynamic_binning_agg() \ No newline at end of file From b3c62df9b26a2770e25bf5b8c033a91e156c2232 Mon Sep 17 00:00:00 2001 From: gabrielwol <80077912+gabrielwol@users.noreply.github.com> Date: Wed, 15 Oct 2025 14:19:07 +0000 Subject: [PATCH 74/74] #1132 change cte's to temp tables with indices to speed up congestion_network_segment_agg --- ...unction-congestion_network_segment_agg.sql | 70 +++++++++---------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql index a314837cb..f5905d600 100644 --- a/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql +++ b/here/traffic/sql/dynamic_bins/function-congestion_network_segment_agg.sql @@ -31,52 +31,52 @@ CREATE TEMPORARY TABLE congestion_raw_segments_temp ( bin_range WITH &&, segment_id WITH = ) -); +) ON COMMIT DROP; EXECUTE FORMAT( $$ - WITH segments AS ( - SELECT - segment_id, - link_dir, - length, - SUM(length) OVER (PARTITION BY segment_id) AS total_length - FROM congestion.%2$I - ), - - segment_5min_bins AS MATERIALIZED ( + DROP TABLE IF EXISTS segment_5min_bins; + CREATE TEMP TABLE segment_5min_bins ON COMMIT DROP AS SELECT seg.segment_id, ta.tx, - seg.total_length, - RANK() OVER w AS bin_rank, - SUM(seg.length) / seg.total_length AS sum_length, + seg.segment_length AS total_length, + ROW_NUMBER() OVER w AS bin_rank, + SUM(seg.length) / seg.segment_length AS sum_length, SUM(seg.length) AS length_w_data, SUM(seg.length / ta.mean * 3.6) AS unadjusted_tt, SUM(sample_size) AS num_obs, ARRAY_AGG(ta.link_dir ORDER BY ta.link_dir) AS link_dirs, - ARRAY_AGG(seg.length / ta.mean * 3.6 ORDER BY ta.link_dir) AS tts, + ARRAY_AGG(lat.tt ORDER BY ta.link_dir) AS tts, ARRAY_AGG(seg.length ORDER BY ta.link_dir) AS lengths FROM here.ta_path AS ta - JOIN segments AS seg USING (link_dir) + JOIN congestion.%1$I AS seg USING (link_dir), + LATERAL ( + 
SELECT seg.length / ta.mean * 3.6 AS tt + ) AS lat WHERE - ta.dt >= %1$L::date - AND ta.dt < %1$L::date + interval '1 day' + ta.dt >= %2$L::date + AND ta.dt < %2$L::date + interval '1 day' GROUP BY seg.segment_id, ta.tx, - seg.total_length - WINDOW w AS ( + seg.segment_length + WINDOW w AS ( PARTITION BY seg.segment_id ORDER BY ta.tx - ) - ), + ); + $$, congestion_network_table, start_date); + + CREATE INDEX idx_s5b_segment_rank ON segment_5min_bins(segment_id, bin_rank); + CREATE INDEX idx_s5b_segment_tx ON segment_5min_bins(segment_id, tx); + ANALYZE segment_5min_bins; - dynamic_bin_options AS ( - --within each segment/hour, generate all possible forward looking bin combinations - --don't generate options for bins with sufficient length - --also don't generate options past the next bin with 80%% length - SELECT + --within each segment/hour, generate all possible forward looking bin combinations + --don't generate options for bins with sufficient length + --also don't generate options past the next bin with 80% length + DROP TABLE IF EXISTS dynamic_bin_options; + CREATE TEMP TABLE dynamic_bin_options ON COMMIT DROP AS + SELECT tx, segment_id, bin_rank AS start_bin, @@ -103,11 +103,14 @@ EXECUTE FORMAT( PARTITION BY segment_id ORDER BY tx --look only forward for end_bin options - RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING - ) - ), + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + ); - unnested_db_options AS ( + CREATE INDEX idx_dbo_composite ON dynamic_bin_options(segment_id, start_bin, end_bin); + CREATE INDEX idx_dbo_segment_tx ON dynamic_bin_options(segment_id, tx); + ANALYZE dynamic_bin_options; + + WITH unnested_db_options AS ( SELECT dbo.segment_id, s5b.total_length, @@ -167,11 +170,6 @@ EXECUTE FORMAT( ON CONFLICT ON CONSTRAINT congestion_raw_segments_exclude_temp DO NOTHING; - $$, - start_date, - congestion_network_table - ); - ANALYZE congestion_raw_segments_temp; INSERT INTO gwolofs.congestion_raw_segments (