alarm-redist · philipwosull · Jul 19, 2024 · Aug 29, 2024 · Aug 29, 2024 · Sep 1, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -21,4 +21,4 @@ builder.sh
 ^explore$
 ^\.github$
 ^LICENSE\.md$
-^CRAN-SUBMISSION$
+^CRAN-SUBMISSION$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,14 @@
 Package: redist
-Version: 4.3.0
-Date: 2025-07-07
+Version: 5.0.0.1
+Date: 2025-08-08
 Title: Simulation Methods for Legislative Redistricting
 Authors@R: c(
     person("Christopher T.", "Kenny", email = "christopherkenny@fas.harvard.edu", 
            role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9386-6860")),
     person("Cory", "McCartan", email = "mccartan@psu.edu", role = "aut",
            comment = c(ORCID = "0000-0002-6251-669X")),
+    person("Philip", "O'Sullivan", email = "posullivan@fas.harvard.edu", role = "aut",
+           comment = c(ORCID = "0000-0002-9665-2462")),
     person("Ben", "Fifield", email = "benfifield@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0002-2247-0201")),
     person("Kosuke", "Imai", email = "imai@harvard.edu", role = "aut",

diff --git a/NAMESPACE b/NAMESPACE
@@ -59,6 +59,7 @@ S3method(ungroup,redist_plans)
 S3method(weights,redist_plans)
 export(add_constr_compet)
 export(add_constr_custom)
+export(add_constr_custom_plan)
 export(add_constr_edges_rem)
 export(add_constr_fry_hold)
 export(add_constr_grp_hinge)
@@ -67,6 +68,7 @@ export(add_constr_grp_pow)
 export(add_constr_incumbency)
 export(add_constr_log_st)
 export(add_constr_multisplits)
+export(add_constr_plan_splits)
 export(add_constr_polsby)
 export(add_constr_pop_dev)
 export(add_constr_segregation)
@@ -86,11 +88,13 @@ export(filter)
 export(freeze)
 export(get_adj)
 export(get_existing)
+export(get_existing_seats)
 export(get_mh_acceptance_rate)
 export(get_plans_matrix)
 export(get_plans_weights)
 export(get_pop_tol)
 export(get_sampling_info)
+export(get_seats_matrix)
 export(get_target)
 export(group_frac)
 export(is_contiguous)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,45 @@
+# 5.0.0
+* Replaces old SMC weights with generally lower variance optimal weights.
+* Adds the option to add Mergesplit MCMC steps at any point during an SMC run. 
+Adding mergesplit steps can help achieve convergence for plans with a larger 
+number of districts without increasing the sample size. 
+* Improves SMC and Mergesplit MCMC performance by pre-allocating and reusing as 
+much memory as possible while drawing spanning trees.
+* Introduces new methods for sampling plans for both SMC and Mergesplit MCMC. 
+The final output is still plans from the same distribution as before, but the
+new sampling spaces and splitting methods may perform better in some
+scenarios.
+* Introduces a new method for splitting in SMC - generalized region splits.
+Instead of splitting off one district at a time this allows for splitting into
+two arbitrarily sized regions. For an equal sample size, generalized region
+splitting tends to converge more slowly, but it typically runs much faster (up
+to twice as fast or more) since on average it draws spanning trees on smaller
+subgraphs than single district splits.
+* Adds support for sampling multimember district plans with both SMC and 
+mergesplit MCMC under some mild conditions. The district seat sizes (how many 
+legislators a district can have) must be a range of values e.g. (3,4,5) and no 
+district seat size can be the sum of two others. 
+* When counties are used `redist_mergesplit` now samples from the same target 
+distribution as `redist_smc` (it guarantees no more than the number of districts
+minus 1 splits). 
+* `redist_mergesplit` inputs now work differently. 
+    * `nsims` is now the number of plans saved. 
+    * `warmup` is the number of steps to run the chain for before collecting any samples.
+    * `thin` means we will run the chain for `thin - 1` steps between saved plans.
+    * Overall the chain will be run for `warmup + nsims * thin` steps and return `nsims` plans.
+* Adds the option to incorporate rejection sampling for all constraints in SMC
+and mergesplit MCMC. Any constraint can now include a threshold argument `thresh`:
+for a newly split plan, if either of the two new regions has a raw score
+greater than or equal to `thresh` then the plan will be automatically rejected.
+This amounts to giving plans where any region has a score at or above `thresh` a
+probability of 0.
+* Updates the target distribution when counties are turned on. For more details 
+see the forthcoming working paper. 
+* The mergesplit backend for `redist_shortburst` now uses uniform edge sampling
+with forest space instead of sampling with graph space, and all
+`k`-related parameters have been removed.
+
+
 # 4.3.0
 * Improves SMC performance by pre-allocating some memory while drawing spanning trees.
 * Replaces SMC label-counting adjustments (exact and importance-sampling-based) with a new backward kernel that eliminates approximation error and requires far less computation

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -53,6 +53,14 @@ dist_dist_diff <- function(p, i_dist, j_dist, x_center, y_center, x, y) {
     .Call(`_redist_dist_dist_diff`, p, i_dist, j_dist, x_center, y_center, x, y)
 }
 
+get_region_multigraph <- function(adj_list, region_ids) {
+    .Call(`_redist_get_region_multigraph`, adj_list, region_ids)
+}
+
+get_region_laplacian <- function(adj_list, region_ids) {
+    .Call(`_redist_get_region_laplacian`, adj_list, region_ids)
+}
+
 log_st_map <- function(g, districts, counties, n_distr) {
     .Call(`_redist_log_st_map`, g, districts, counties, n_distr)
 }
@@ -69,6 +77,52 @@ calcPWDh <- function(x) {
     .Call(`_redist_calcPWDh`, x)
 }
 
+#'
+#' @returns A list with the following elements:
+#'     - `uncut_tree`: The spanning tree drawn on the region stored as a
+#'     0-indexed directed edge adjacency graph.
+#'     - `num_attempts`: The number of attempts it took to draw the tree.
+#' 
+#' @keywords internal
+#' @noRd
+draw_a_tree_on_a_region <- function(adj_list, counties, pop, ndists, num_regions, num_districts, region_id_to_draw_tree_on, lower, upper, region_ids, region_sizes, verbose) {
+    .Call(`_redist_draw_a_tree_on_a_region`, adj_list, counties, pop, ndists, num_regions, num_districts, region_id_to_draw_tree_on, lower, upper, region_ids, region_sizes, verbose)
+}
+
+#' Splits a multidistrict into two new regions within population bounds
+#'
+#' Splits a multidistrict into two new valid regions by drawing spanning
+#' trees uniformly at random and attempting to find an edge to cut until
+#' a successful cut is made.
+#'
+#' @title Split a multidistrict into two regions
+#'
+#' @inheritParams run_redist_smc
+#' @noRd
+perform_a_valid_multidistrict_split <- function(adj_list, counties, pop, ndists, num_regions, num_districts, region_id_to_split, target, lower, upper, region_ids, region_sizes, split_dval_min, split_dval_max, split_district_only, verbose = FALSE, k_param = 1L) {
+    .Call(`_redist_perform_a_valid_multidistrict_split`, adj_list, counties, pop, ndists, num_regions, num_districts, region_id_to_split, target, lower, upper, region_ids, region_sizes, split_dval_min, split_dval_max, split_district_only, verbose, k_param)
+}
+
+draw_trees_on_a_region <- function(adj_list, counties, pop, ndists, region_id_to_draw_tree_on, region_size, lower, target, upper, region_ids, num_tree, num_threads, verbose) {
+    .Call(`_redist_draw_trees_on_a_region`, adj_list, counties, pop, ndists, region_id_to_draw_tree_on, region_size, lower, target, upper, region_ids, num_tree, num_threads, verbose)
+}
+
+attempt_splits_on_a_region <- function(adj_list, counties, pop, ndists, init_num_regions, region_id_to_split, lower, target, upper, region_ids, region_sizes, splitting_schedule_str, k_param, num_plans, num_threads, verbose) {
+    .Call(`_redist_attempt_splits_on_a_region`, adj_list, counties, pop, ndists, init_num_regions, region_id_to_split, lower, target, upper, region_ids, region_sizes, splitting_schedule_str, k_param, num_plans, num_threads, verbose)
+}
+
+compute_log_unnormalized_target_density_components <- function(adj_list, counties, pop, constraints, pop_temper, compute_pop_temper, rho, ndists, total_seats, num_regions, district_seat_sizes, lower, target, upper, region_ids, region_sizes, output_type, num_threads) {
+    .Call(`_redist_compute_log_unnormalized_target_density_components`, adj_list, counties, pop, constraints, pop_temper, compute_pop_temper, rho, ndists, total_seats, num_regions, district_seat_sizes, lower, target, upper, region_ids, region_sizes, output_type, num_threads)
+}
+
+compute_plans_log_optimal_weights <- function(adj_list, counties, pop, constraints, pop_temper, rho, splitting_schedule_str, ndists, total_seats, district_seat_sizes, num_regions, lower, target, upper, region_ids, region_sizes, num_threads) {
+    .Call(`_redist_compute_plans_log_optimal_weights`, adj_list, counties, pop, constraints, pop_temper, rho, splitting_schedule_str, ndists, total_seats, district_seat_sizes, num_regions, lower, target, upper, region_ids, region_sizes, num_threads)
+}
+
+compute_plans_log_simple_weights <- function(adj_list, counties, pop, constraints, pop_temper, rho, splitting_schedule_str, ndists, total_seats, district_seat_sizes, num_regions, lower, target, upper, region_ids, region_sizes, num_threads) {
+    .Call(`_redist_compute_plans_log_simple_weights`, adj_list, counties, pop, constraints, pop_temper, rho, splitting_schedule_str, ndists, total_seats, district_seat_sizes, num_regions, lower, target, upper, region_ids, region_sizes, num_threads)
+}
+
 group_pct_top_k <- function(m, group_pop, total_pop, k, n_distr) {
     .Call(`_redist_group_pct_top_k`, m, group_pop, total_pop, k, n_distr)
 }
@@ -89,20 +143,32 @@ prec_cooccur <- function(m, idxs, ncores = 0L) {
     .Call(`_redist_prec_cooccur`, m, idxs, ncores)
 }
 
-group_pct <- function(m, group_pop, total_pop, n_distr) {
-    .Call(`_redist_group_pct`, m, group_pop, total_pop, n_distr)
+group_pct <- function(plans_mat, group_pop, total_pop, n_distr, ncores = 0L) {
+    .Call(`_redist_group_pct`, plans_mat, group_pop, total_pop, n_distr, ncores)
+}
+
+pop_tally <- function(districts, pop, n_distr, ncores = 0L) {
+    .Call(`_redist_pop_tally`, districts, pop, n_distr, ncores)
 }
 
-pop_tally <- function(districts, pop, n_distr) {
-    .Call(`_redist_pop_tally`, districts, pop, n_distr)
+infer_region_seats <- function(region_pops, lower, upper, total_seats, num_threads = 0L) {
+    .Call(`_redist_infer_region_seats`, region_pops, lower, upper, total_seats, num_threads)
 }
 
-max_dev <- function(districts, pop, n_distr) {
-    .Call(`_redist_max_dev`, districts, pop, n_distr)
+max_dev <- function(districts, pop, n_distr, multimember_districts = FALSE, nseats = -1L, seats_matrix = matrix(1,1), num_threads = 1L) {
+    .Call(`_redist_max_dev`, districts, pop, n_distr, multimember_districts, nseats, seats_matrix, num_threads)
 }
 
-ms_plans <- function(N, l, init, counties, pop, n_distr, target, lower, upper, rho, constraints, control, k, thin, verbosity) {
-    .Call(`_redist_ms_plans`, N, l, init, counties, pop, n_distr, target, lower, upper, rho, constraints, control, k, thin, verbosity)
+order_district_stats <- function(district_stats, ndists, num_threads) {
+    .Call(`_redist_order_district_stats`, district_stats, ndists, num_threads)
+}
+
+order_columns_by_district <- function(df, columns, ndists, num_threads = 0L) {
+    .Call(`_redist_order_columns_by_district`, df, columns, ndists, num_threads)
+}
+
+ms_plans <- function(nsims, warmup, thin, ndists, total_seats, district_seat_sizes, adj_list, counties, pop, target, lower, upper, rho, init_plan, init_seats, sampling_space_str, pair_rule, control, constraints, verbosity = 3L, diagnostic_mode = FALSE) {
+    .Call(`_redist_ms_plans`, nsims, warmup, thin, ndists, total_seats, district_seat_sizes, adj_list, counties, pop, target, lower, upper, rho, init_plan, init_seats, sampling_space_str, pair_rule, control, constraints, verbosity, diagnostic_mode)
 }
 
 pareto_dominated <- function(x) {
@@ -125,6 +191,105 @@ resample_lowvar <- function(wgts) {
     .Call(`_redist_resample_lowvar`, wgts)
 }
 
+maximum_input_sizes <- function() {
+    .Call(`_redist_maximum_input_sizes`)
+}
+
+#' Checks a matrix of seat counts is valid
+#'
+#' Checks that a matrix of seat counts associated with a plan is valid
+#' meaning that every region has a positive seat value and for each plan
+#' the sum of seats is equal to the total number of seats (`nseats`). 
+#' If anything is not correct an error will be thrown.
+#'
+#' @param init_seats A matrix of seat counts for the regions of each plan
+#' @param num_regions The number of regions in the plan.
+#' @param nseats The total number of seats in the map 
+#' @param seats_range Vector of number of seats a district is allowed to have
+#' @param split_districts_only Whether or not to check that all but the last region are
+#' districts or not. (Allows for the possibility the last region is a district too).
+#' @param num_threads The number of threads to use. Defaults to 1.
+#'
+#' @details Modifications
+#'    - None
+#'
+#' @keywords internal
+#' @noRd
+validate_init_seats_cpp <- function(init_seats, num_regions, nseats, seats_range, split_districts_only, num_threads = 1L) {
+    invisible(.Call(`_redist_validate_init_seats_cpp`, init_seats, num_regions, nseats, seats_range, split_districts_only, num_threads))
+}
+
+#' Get canonically relabeled plans matrix
+#'
+#' Given a matrix of 1-indexed plans (or partial plans) this function 
+#' returns a new plans matrix with all the plans labeled canonically. 
+#' The canonical labelling of a plan is the one where the region of the 
+#' first vertex gets mapped to 1, the region of the next smallest vertex
+#' in a different region than the first gets mapped to 2, and so on. This
+#' is guaranteed to result in the same labelling for any plan where the 
+#' region ids have been permuted. 
+#'
+#'
+#' @param plans_mat A matrix of 1-indexed plans
+#' @param num_regions The number of regions in the plan
+#' @param num_threads The number of threads to use. Defaults to number of machine threads.
+#'
+#' @details Modifications
+#'    - None
+#'
+#' @returns A matrix of canonically labelled plans
+#'
+#' @keywords internal
+#' @noRd
+get_canonical_plan_labelling <- function(plans_mat, num_regions, num_threads = 0L) {
+    .Call(`_redist_get_canonical_plan_labelling`, plans_mat, num_regions, num_threads)
+}
+
+#' Count how many times each plan appears in a plans matrix
+#'
+#' Given a matrix of 1-indexed plans (or partial plans) this function 
+#' returns a list mapping plan vectors as a giant concatenated string to 
+#' the count of how many times the plan appears. 
+#'
+#' If `use_canonical_ordering` is set to true then the plans will be 
+#' reordered using the canonical reordering function 
+#' `get_canonical_plan_labelling`. This guarantees that the same plan
+#' will not be incorrectly counted if there are different permutations 
+#' of its labels. If `use_canonical_ordering` is not set to true then 
+#' it is possible the count will be incorrect because of different 
+#' permutations of the same underlying plan.
+#'
+#'
+#' @param input_plans_mat A matrix of 1-indexed plans
+#' @param num_regions The number of regions in the plan
+#' @param use_canonical_ordering Whether or not to reorder the plans using the 
+#' canonical ordering on plans. 
+#' @param num_threads The number of threads to use. Defaults to number of machine threads.
+#'
+#' @details Modifications
+#'    - None
+#'
+#' @returns A list mapping plans (stored as a string concatenated vector) to 
+#' how many times they appear in the matrix 
+#'
+#' @keywords internal
+#' @noRd
+get_plan_counts <- function(input_plans_mat, num_regions, use_canonical_ordering = TRUE, num_threads = 0L) {
+    .Call(`_redist_get_plan_counts`, input_plans_mat, num_regions, use_canonical_ordering, num_threads)
+}
+
+resample_plans_lowvar <- function(normalized_weights, plans_mat, region_pops_mat, region_sizes_mat, reorder_sizes_mat) {
+    .Call(`_redist_resample_plans_lowvar`, normalized_weights, plans_mat, region_pops_mat, region_sizes_mat, reorder_sizes_mat)
+}
+
+get_log_number_linking_edges <- function(adj_list, counties, constraints, ndists, nseats, num_regions, region_ids) {
+    .Call(`_redist_get_log_number_linking_edges`, adj_list, counties, constraints, ndists, nseats, num_regions, region_ids)
+}
+
+get_merged_log_number_linking_edges <- function(adj_list, counties, constraints, ndists, nseats, num_regions, region_ids, region1_id, region2_id) {
+    .Call(`_redist_get_merged_log_number_linking_edges`, adj_list, counties, constraints, ndists, nseats, num_regions, region_ids, region1_id, region2_id)
+}
+
 plan_joint <- function(m1, m2, pop) {
     .Call(`_redist_plan_joint`, m1, m2, pop)
 }
@@ -149,8 +314,34 @@ k_biggest <- function(x, k = 1L) {
     .Call(`_redist_k_biggest`, x, k)
 }
 
-smc_plans <- function(N, l, counties, pop, n_distr, target, lower, upper, rho, districts, n_drawn, n_steps, constraints, control, verbosity = 1L) {
-    .Call(`_redist_smc_plans`, N, l, counties, pop, n_distr, target, lower, upper, rho, districts, n_drawn, n_steps, constraints, control, verbosity)
+#' Run SMC (optionally with Merge Split steps too)
+#'
+#' Uses smc method with optimal weights and merge split steps to generate a sample of `nsims` plans in `c++` 
+#' 
+#' 
+#' Using the procedure outlined in <PAPER HERE> this function uses Sequential
+#' Monte Carlo (SMC) methods to generate a sample of `nsims` plans
+#'
+#'
+#' @param ndists The number of districts the final plans will have
+#' @param adj_list A 0-indexed adjacency list representing the undirected graph
+#' which represents the underlying map the plans are to be drawn on
+#' @param counties Vector of county labels of each vertex in `adj_list`
+#' @param pop A vector of the population associated with each vertex in `adj_list`
+#' @param target Ideal population of a valid district. This is what deviance is calculated
+#' relative to
+#' @param lower Acceptable lower bounds on a valid district's population
+#' @param upper Acceptable upper bounds on a valid district's population
+#' @param nsims The number of plans (samples) to draw
+#' @param k_param The k parameter from the SMC algorithm, you choose among the top k_param edges
+#' @param control Named list of additional parameters.
+#' @param num_threads The number of threads the threadpool should use
+#' @param verbosity What level of detail to print out while the algorithm is
+#' running <ADD OPTIONS>
+#' @keywords internal
+#' @noRd
+run_redist_smc <- function(nsims, total_seats, ndists, district_seat_sizes, initial_num_regions, adj_list, counties, pop, step_types, target, lower, upper, rho, sampling_space_str, control, constraints, verbosity, diagnostic_level, region_id_mat, region_sizes_mat, log_weights) {
+    .Call(`_redist_run_redist_smc`, nsims, total_seats, ndists, district_seat_sizes, initial_num_regions, adj_list, counties, pop, step_types, target, lower, upper, rho, sampling_space_str, control, constraints, verbosity, diagnostic_level, region_id_mat, region_sizes_mat, log_weights)
 }
 
 splits <- function(dm, community, nd, max_split) {