feat(gnn): close the training feedback loop — StatisticsTracker → Julia

hyperpolymath · claude · hyperpolymath · commit e4bf4c7bc534 · 2026-04-27T02:07:16.000+01:00
Turns ECHIDNA into an online learning system by wiring proof outcomes
from the Rust StatisticsTracker into the Julia GNN ML server so that
accumulated evidence influences future premise ranking.

Three parts:

1. StatisticsTracker::export_records() — exports all (prover, domain)
   stats as StatsSummaryRecord (new serialisable type) so the background
   sync task can push snapshots without holding the write lock.

2. Background GNN training sync (server.rs) — tokio::spawn task wakes
   every 60 s, checks if >= 10 new outcomes have accumulated since the
   last push, then POSTs a snapshot to Julia's new /training/update
   endpoint.  Fire-and-forget: errors are logged at debug level.
   StatisticsTracker is now shared between MetaController (Bayesian
   routing) and the sync task via Arc<RwLock<StatisticsTracker>>.

3. Julia /training/update endpoint (gnn_endpoint.jl) — accepts
   [{ prover, domain, success_rate, … }] records, merges them into
   PROVER_DOMAIN_WEIGHTS, and applies the per-domain confidence as a
   [0.5, 1.0] score multiplier in rank_with_gnn() when the /gnn/rank
   caller provides domain_hints.  GnnRankRequest gains a domain_hints
   field (default empty, so all existing callers are unaffected).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/src/julia/api/gnn_endpoint.jl b/src/julia/api/gnn_endpoint.jl
@@ -31,6 +31,14 @@ using LinearAlgebra
 const GNN_MODEL = Ref{Any}(nothing)
 const GNN_VOCAB = Ref{Any}(nothing)
 
+# Per-(prover, domain) success-rate weights pushed from Rust via
+# POST /training/update.  Format: weights[prover_name][domain] = success_rate.
+# Used to modulate premise scores in rank_with_gnn when domain_hints present.
+const PROVER_DOMAIN_WEIGHTS = Ref{Dict{String,Dict{String,Float64}}}(Dict())
+
+# Running total of training records received since server start.
+const TOTAL_TRAINING_RECORDS = Ref{Int}(0)
+
 """
     load_gnn_model(models_dir::String)
 
@@ -94,23 +102,110 @@ function parse_proof_graph(body)
 end
 
 """
-    rank_with_gnn(g, node_features, goal_idx, premise_indices, config)
+    rank_with_gnn(g, node_features, goal_idx, premise_indices, config, domain_hints)
 
 Run GNN message passing + cross-attention scoring on the parsed graph.
 If a trained model is available, use it. Otherwise, use cosine similarity
 between the goal and premise node features as a fallback.
+
+When `domain_hints` is non-empty and `PROVER_DOMAIN_WEIGHTS` has been
+populated by prior `/training/update` calls, premise scores are modulated
+by the aggregate domain confidence from accumulated proof outcomes.
 """
-function rank_with_gnn(g, node_features, goal_idx, premise_indices, config)
+function rank_with_gnn(g, node_features, goal_idx, premise_indices, config, domain_hints=String[])
     model = GNN_MODEL[]
 
     if model !== nothing
-        # Use trained model for ranking
-        # (Delegate to neural_solver.jl PremiseRanker when available)
-        return rank_with_trained_model(model, g, node_features, goal_idx, premise_indices)
+        scores, indices = rank_with_trained_model(model, g, node_features, goal_idx, premise_indices)
+    else
+        # Fallback: cosine similarity between goal and premise features
+        scores, indices = rank_with_cosine(node_features, goal_idx, premise_indices)
+    end
+
+    # Apply accumulated training weights when the caller provided domain hints.
+    # This is a no-op until /training/update has been called at least once
+    # and the rank request includes non-empty domain_hints.
+    if !isempty(domain_hints) && !isempty(PROVER_DOMAIN_WEIGHTS[])
+        scores = apply_domain_weights(scores, domain_hints)
     end
 
-    # Fallback: cosine similarity between goal and premise features
-    return rank_with_cosine(node_features, goal_idx, premise_indices)
+    return (scores, indices)
+end
+
+"""
+    apply_domain_weights(scores, domain_hints)
+
+Scale premise scores by the mean success rate across all provers for the
+requested domain aspects.  Uses a `[0.5, 1.0]` range so that even low-
+confidence domains retain half the base score rather than collapsing to zero.
+
+When no training evidence exists for the requested domains, scores are
+returned unchanged.
+"""
+function apply_domain_weights(scores::Vector{Float32}, domain_hints::Vector{String})
+    weights = PROVER_DOMAIN_WEIGHTS[]
+    domain_rates = Float64[]
+    for domain in domain_hints
+        for (_, prover_weights) in weights
+            if haskey(prover_weights, domain)
+                push!(domain_rates, prover_weights[domain])
+            end
+        end
+    end
+    isempty(domain_rates) && return scores
+    mean_confidence = Statistics.mean(domain_rates)
+    scale = Float32(0.5 + 0.5 * mean_confidence)
+    return scores .* scale
+end
+
+"""
+    handle_training_update(req::HTTP.Request)
+
+POST /training/update — Receive proof-outcome statistics from the Rust server
+and update per-(prover, domain) success-rate weights used to modulate premise
+ranking scores.
+
+Payload: `{ "records": [{ "prover", "domain", "attempts", "successes",
+                           "timeouts", "failures", "mean_time_ms", "success_rate" }] }`
+
+The Rust `StatisticsTracker` is authoritative; Julia simply mirrors it so that
+the GNN ranking layer can incorporate proof-outcome evidence without a round-trip.
+"""
+function handle_training_update(req::HTTP.Request)
+    try
+        body = JSON3.read(String(req.body))
+        records = get(body, :records, [])
+
+        weights = PROVER_DOMAIN_WEIGHTS[]
+        n = 0
+        for rec in records
+            prover = string(get(rec, :prover, "Unknown"))
+            domain = string(get(rec, :domain, "unspecified"))
+            rate   = Float64(get(rec, :success_rate, 0.0))
+            if !haskey(weights, prover)
+                weights[prover] = Dict{String,Float64}()
+            end
+            weights[prover][domain] = rate
+            n += 1
+        end
+        PROVER_DOMAIN_WEIGHTS[] = weights
+        TOTAL_TRAINING_RECORDS[] += n
+
+        @info "Training update: $n records (total=$(TOTAL_TRAINING_RECORDS[]))"
+
+        return HTTP.Response(200, JSON3.write(Dict(
+            "status"          => "ok",
+            "records_received" => n,
+            "total_records"   => TOTAL_TRAINING_RECORDS[],
+            "weights_updated" => n > 0
+        )))
+    catch e
+        @error "Training update failed" exception=(e, catch_backtrace())
+        return HTTP.Response(500, JSON3.write(Dict(
+            "status" => "error",
+            "error"  => string(e)
+        )))
+    end
 end
 
 """
@@ -190,11 +285,14 @@ function handle_gnn_rank(req::HTTP.Request)
         top_k = get(body, :top_k, 20)
         min_score = get(body, :min_score, 0.05)
         include_embeddings = get(body, :include_embeddings, false)
+        domain_hints = String.(get(body, :domain_hints, String[]))
 
-        # Run GNN ranking
+        # Run GNN ranking; domain_hints allow weight-guided score modulation
+        # when /training/update has populated PROVER_DOMAIN_WEIGHTS.
         (scores, indices) = rank_with_gnn(
             g, node_features, goal_idx, premise_indices,
-            get(body, :config, nothing)
+            get(body, :config, nothing),
+            domain_hints
         )
 
         # Sort by score (descending)
@@ -268,15 +366,17 @@ Call this from the main api_server.jl to enable GNN functionality.
 """
 function register_gnn_routes!(existing_handler)
     @info "Registering GNN endpoints:"
-    @info "  POST /gnn/rank   — Rank premises via GNN"
-    @info "  GET  /gnn/health — GNN model status"
+    @info "  POST /gnn/rank        — Rank premises via GNN"
+    @info "  GET  /gnn/health      — GNN model status"
+    @info "  POST /training/update — Receive proof-outcome stats from Rust"
 
-    # Return a combined handler that dispatches to GNN routes first
     function combined_handler(req::HTTP.Request)
         if req.target == "/gnn/rank" && req.method == "POST"
             return handle_gnn_rank(req)
         elseif req.target == "/gnn/health"
             return handle_gnn_health(req)
+        elseif req.target == "/training/update" && req.method == "POST"
+            return handle_training_update(req)
         else
             return existing_handler(req)
         end
diff --git a/src/rust/gnn/client.rs b/src/rust/gnn/client.rs
@@ -72,6 +72,11 @@ struct GnnRankRequest {
     include_embeddings: bool,
     /// GNN configuration hints for the server
     config: GnnServerHints,
+    /// Optional domain-aspect tags for the goal (e.g. `["arithmetic.factorisation"]`).
+    /// When non-empty, Julia uses accumulated training weights for these domains
+    /// to modulate premise scores.  Empty for backwards-compatible callers.
+    #[serde(default)]
+    domain_hints: Vec<String>,
 }
 
 /// Serialised proof graph for the Julia server.
@@ -355,6 +360,7 @@ impl GnnClient {
                 num_gnn_layers: self.config.num_gnn_layers,
                 use_attention: self.config.use_attention,
             },
+            domain_hints: vec![],
         }
     }
 
diff --git a/src/rust/server.rs b/src/rust/server.rs
@@ -18,6 +18,7 @@ use echidna::agent::meta_controller::{MetaController, Plan};
 use echidna::agent::AgenticGoal;
 use echidna::core::{Goal, ProofState, Tactic, TacticResult, Term};
 use echidna::dispatch::ProverDispatcher;
+use echidna::verification::StatisticsTracker;
 use echidna::{ProverBackend, ProverConfig, ProverKind};
 use reqwest::Client;
 use serde::{Deserialize, Serialize};
@@ -28,7 +29,7 @@ use std::sync::Arc;
 use std::time::Duration;
 use tokio::sync::{Mutex, RwLock};
 use tower_http::cors::CorsLayer;
-use tracing::{info, instrument};
+use tracing::{debug, info, instrument};
 
 /// Application state shared across handlers
 #[derive(Clone)]
@@ -44,6 +45,9 @@ struct AppState {
     /// state accumulates across the server's lifetime.  All handlers that
     /// perform goal-aware dispatch share this single instance.
     meta_controller: Arc<MetaController>,
+    /// Shared proof-outcome statistics used by MetaController for Bayesian
+    /// routing and exported to Julia for GNN online learning.
+    stats: Arc<RwLock<StatisticsTracker>>,
 }
 
 /// A proof session
@@ -81,15 +85,59 @@ pub async fn start_server(port: u16, host: String, enable_cors: bool) -> Result<
         },
     }
 
-    let meta_controller = Arc::new(MetaController::new());
+    // Shared stats tracker — MetaController uses it for Bayesian routing;
+    // the background GNN sync task pushes snapshots to Julia for online learning.
+    let stats = Arc::new(RwLock::new(StatisticsTracker::new()));
+    let meta_controller = Arc::new(MetaController::new().with_stats(stats.clone()));
 
     let state = AppState {
         sessions: Arc::new(RwLock::new(HashMap::new())),
         ml_client,
         ml_api_url,
         meta_controller,
+        stats,
     };
 
+    // GNN training sync — background task that pushes accumulated proof-outcome
+    // stats to Julia's /training/update endpoint every 60 seconds.
+    // Only fires when >= 10 new outcomes have accumulated since the last push,
+    // so idle servers incur no traffic.  Errors are logged at debug level and
+    // do not affect the main server loop (fire-and-forget).
+    {
+        let sync_stats = state.stats.clone();
+        let sync_client = state.ml_client.clone();
+        let sync_url = state.ml_api_url.clone();
+        tokio::spawn(async move {
+            let mut last_push_count: u64 = 0;
+            let mut interval = tokio::time::interval(Duration::from_secs(60));
+            interval.tick().await; // consume the immediate first tick; wait 60s before first push
+            loop {
+                interval.tick().await;
+                let (total, records) = {
+                    let guard = sync_stats.read().await;
+                    (guard.total_attempts(), guard.export_records())
+                };
+                if total < last_push_count + 10 || records.is_empty() {
+                    continue;
+                }
+                last_push_count = total;
+                let url = format!("{}/training/update", sync_url);
+                let payload = json!({ "records": records });
+                match sync_client.post(&url).json(&payload).send().await {
+                    Ok(resp) if resp.status().is_success() => {
+                        info!("GNN training sync: {} records pushed to Julia", records.len());
+                    },
+                    Ok(resp) => {
+                        debug!("GNN training sync: Julia returned {}", resp.status());
+                    },
+                    Err(e) => {
+                        debug!("GNN training sync: Julia unavailable ({})", e);
+                    },
+                }
+            }
+        });
+    }
+
     // Build router
     let mut app = Router::new()
         // Groove discovery endpoint — returns capability manifest for service mesh.
diff --git a/src/rust/verification/mod.rs b/src/rust/verification/mod.rs
@@ -30,4 +30,4 @@ pub use portfolio::{PortfolioConfig, PortfolioResult, PortfolioSolver};
 pub use proof::{
     theorem_identity, Proof, ProofStateRecord, ProofVersion, TacticApplication, TacticStatus,
 };
-pub use statistics::StatisticsTracker;
+pub use statistics::{StatsSummaryRecord, StatisticsTracker};
diff --git a/src/rust/verification/statistics.rs b/src/rust/verification/statistics.rs
@@ -143,6 +143,27 @@ impl ProverDomainStats {
     }
 }
 
+/// Serialisable snapshot of one (prover, domain) stats pair.
+///
+/// Exported by `StatisticsTracker::export_records` and pushed to the Julia
+/// ML server by the background training-sync task so that online weight
+/// updates incorporate accumulated proof evidence.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StatsSummaryRecord {
+    /// Prover name — Debug-format of `ProverKind` (e.g. `"Z3"`, `"Lean"`).
+    pub prover: String,
+    /// Domain tag (e.g. `"arithmetic.factorisation"`).
+    pub domain: String,
+    pub attempts: u64,
+    pub successes: u64,
+    pub timeouts: u64,
+    pub failures: u64,
+    /// Mean proof time over successful attempts (ms). `None` if no successes.
+    pub mean_time_ms: Option<f64>,
+    /// Fraction of successful attempts `[0.0, 1.0]`.
+    pub success_rate: f64,
+}
+
 /// Tracks statistics across all provers and domains
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct StatisticsTracker {
@@ -322,6 +343,34 @@ impl StatisticsTracker {
     pub fn from_json(json: &str) -> Result<Self, serde_json::Error> {
         serde_json::from_str(json)
     }
+
+    /// Export every (prover, domain) entry as a flat `Vec<StatsSummaryRecord>`.
+    ///
+    /// Used by the background GNN training-sync task to push accumulated proof
+    /// evidence to the Julia ML server (`POST /training/update`).  The server
+    /// uses these records to update per-(prover, domain) success-rate weights
+    /// that modulate premise ranking scores.
+    pub fn export_records(&self) -> Vec<StatsSummaryRecord> {
+        self.stats
+            .iter()
+            .map(|(key, stats)| {
+                // Key format produced by `make_key`: "ProverDebug::domain"
+                let mut parts = key.splitn(2, "::");
+                let prover = parts.next().unwrap_or("Unknown").to_string();
+                let domain = parts.next().unwrap_or("unspecified").to_string();
+                StatsSummaryRecord {
+                    prover,
+                    domain,
+                    attempts: stats.attempts,
+                    successes: stats.successes,
+                    timeouts: stats.timeouts,
+                    failures: stats.failures,
+                    mean_time_ms: stats.mean_time_ms(),
+                    success_rate: stats.success_rate(),
+                }
+            })
+            .collect()
+    }
 }
 
 #[cfg(test)]
@@ -446,6 +495,35 @@ mod tests {
         assert_eq!(upper, 1.0);
     }
 
+    #[test]
+    fn test_export_records() {
+        let mut tracker = StatisticsTracker::new();
+        tracker.record_success(ProverKind::Z3, "arithmetic", 100);
+        tracker.record_success(ProverKind::Z3, "arithmetic", 200);
+        tracker.record_failure(ProverKind::Lean, "topology");
+
+        let records = tracker.export_records();
+        assert_eq!(records.len(), 2);
+
+        let z3 = records
+            .iter()
+            .find(|r| r.prover == "Z3" && r.domain == "arithmetic")
+            .expect("Z3::arithmetic record present");
+        assert_eq!(z3.successes, 2);
+        assert_eq!(z3.attempts, 2);
+        assert!((z3.success_rate - 1.0).abs() < 1e-10);
+        assert!(z3.mean_time_ms.is_some());
+
+        let lean = records
+            .iter()
+            .find(|r| r.prover == "Lean" && r.domain == "topology")
+            .expect("Lean::topology record present");
+        assert_eq!(lean.failures, 1);
+        assert_eq!(lean.successes, 0);
+        assert_eq!(lean.success_rate, 0.0);
+        assert!(lean.mean_time_ms.is_none());
+    }
+
     #[test]
     fn test_serialization_roundtrip() {
         let mut tracker = StatisticsTracker::new();
diff --git a/tests/agentic_integration.rs b/tests/agentic_integration.rs

Original file line number	Diff line number	Diff line change
`@@ -72,6 +72,11 @@ struct GnnRankRequest {`
`72`	`72`	`include_embeddings: bool,`
`73`	`73`	`/// GNN configuration hints for the server`
`74`	`74`	`config: GnnServerHints,`
	`75`	``+ /// Optional domain-aspect tags for the goal (e.g. `["arithmetic.factorisation"]`).``
	`76`	`+ /// When non-empty, Julia uses accumulated training weights for these domains`
	`77`	`+ /// to modulate premise scores. Empty for backwards-compatible callers.`
	`78`	`+ #[serde(default)]`
	`79`	`+ domain_hints: Vec<String>,`
`75`	`80`	`}`
`76`	`81`
`77`	`82`	`/// Serialised proof graph for the Julia server.`
`@@ -355,6 +360,7 @@ impl GnnClient {`
`355`	`360`	`num_gnn_layers: self.config.num_gnn_layers,`
`356`	`361`	`use_attention: self.config.use_attention,`
`357`	`362`	`},`
	`363`	`+ domain_hints: vec![],`
`358`	`364`	`}`
`359`	`365`	`}`
`360`	`366`