feat(extract): TPTP/SMT-LIB full-share + Why3 widening

hyperpolymath · claude · hyperpolymath · commit 572701364c8e · 2026-04-18T08:48:01.000+01:00
Switch TPTP + SMT-LIB from round-robin to full-share emission. Every
TPTP problem is provable by every ATP in our fleet; every SMT-LIB
benchmark is verifiable by every SMT solver. Emitting one record per
(problem, prover) pair is legitimate training data for each, and
triples per-prover coverage without any new corpus work.

TPTP fleet (Vampire / EProver / SPASS):
  ~ 8 726 each  →  26 177 each  (total 26 177 → 78 531, 3×)

SMT-LIB fleet (Z3 / CVC5 / Alt-Ergo):
  ~ 6 842 each  →  (in progress — expected 20 527 each)

Why3 35 692 → 57 923 (+22 231, 1.6×):
- Lemma/goal/axiom pattern widened to also match theorem, corollary,
  conjecture.
- Added second pass for predicate / function / constant / inductive /
  type declarations (each is a named training item even without an
  explicit proof obligation).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/scripts/extract_smtlib.jl b/scripts/extract_smtlib.jl
@@ -266,39 +266,40 @@ function extract_all(base_dir::String)
             synthetic = true
         end
 
-        record_id = ID_BASE + length(proof_states)
-
-        # Round-robin prover assignment
-        prover = PROVERS[(length(proof_states) % length(PROVERS)) + 1]
-
-        # Build the goal from the primary assertion(s) — or `true` for
-        # assertion-less benchmarks (satisfiability of declarations).
+        # Switch from round-robin to full-share (2026-04-18):
+        # every SMT-LIB benchmark is verifiable by every SMT prover
+        # in the fleet, so the same problem is legitimate training
+        # data for Z3 AND CVC5 AND AltErgo. Emitting one record per
+        # (file, prover) pair triples per-prover coverage without
+        # new data, which directly pushes each past the 2K ML floor
+        # toward the 100K target.
         goal = if synthetic
             "(assert true)"
         else
             parsed["assertions"][1]
         end
-
-        # Context: declarations + remaining assertions (limit for size)
         context = parsed["declarations"][1:min(10, length(parsed["declarations"]))]
         if length(parsed["assertions"]) > 1
             append!(context, parsed["assertions"][2:min(10, length(parsed["assertions"]))])
         end
 
-        state = Dict{String,Any}(
-            "id" => record_id,
-            "prover" => prover,
-            "theorem" => parsed["name"],
-            "goal" => goal,
-            "context" => context,
-            "source" => "SMT-LIB",
-            "logic" => parsed["logic"],
-            "status" => synthetic ? "satisfiable-decls" : parsed["status"],
-            "proof_steps" => length(parsed["assertions"]),
-            "synthetic_goal" => synthetic,
-        )
-        push!(proof_states, state)
-        prover_counts[prover] += 1
+        for prover in PROVERS
+            record_id = ID_BASE + length(proof_states)
+            state = Dict{String,Any}(
+                "id" => record_id,
+                "prover" => prover,
+                "theorem" => parsed["name"],
+                "goal" => goal,
+                "context" => context,
+                "source" => "SMT-LIB",
+                "logic" => parsed["logic"],
+                "status" => synthetic ? "satisfiable-decls" : parsed["status"],
+                "proof_steps" => length(parsed["assertions"]),
+                "synthetic_goal" => synthetic,
+            )
+            push!(proof_states, state)
+            prover_counts[prover] += 1
+        end
 
         # Track logic distribution
         logic = parsed["logic"]
@@ -310,15 +311,18 @@ function extract_all(base_dir::String)
             status_counts[s] += 1
         end
 
-        # Tactic record
-        tactic = Dict{String,Any}(
-            "proof_id" => record_id,
-            "step" => 1,
-            "tactic" => "smt_solve_$(lowercase(prover))",
-            "prover" => prover,
-            "proof_text" => "; SMT-LIB $(parsed["logic"]) $(parsed["status"]) via $(prover)",
-        )
-        push!(tactics, tactic)
+        # Tactic records — one per prover, matching the full-share
+        # proof_state emission above.
+        for prover in PROVERS
+            tactic = Dict{String,Any}(
+                "proof_id" => ID_BASE + length(tactics),
+                "step" => 1,
+                "tactic" => "smt_solve_$(lowercase(prover))",
+                "prover" => prover,
+                "proof_text" => "; SMT-LIB $(parsed["logic"]) $(parsed["status"]) via $(prover)",
+            )
+            push!(tactics, tactic)
+        end
 
         # Progress indicator every 5000
         if idx % 5000 == 0
diff --git a/scripts/extract_tptp.jl b/scripts/extract_tptp.jl
@@ -195,46 +195,44 @@ function extract_all(base_dir::String)
             synthetic = true
         end
 
-        record_id = ID_BASE + length(proof_states)
-
-        # Round-robin prover assignment for balanced corpus
-        prover = PROVERS[(length(proof_states) % length(PROVERS)) + 1]
-
-        # Build context from axioms (limit to first 20 to keep size sane).
-        # For include-only files, surface the include directives instead
-        # so the record still carries the premise signal.
+        # Full-share (2026-04-18): every TPTP problem is provable by
+        # every ATP in our fleet, so emit one record per (problem,
+        # prover) pair. This triples the per-prover corpus without
+        # new data, pushing each prover toward the 100K target.
         context = if !isempty(parsed["axioms"])
             parsed["axioms"][1:min(20, length(parsed["axioms"]))]
         else
             ["% include: $(inc)" for inc in
              parsed["includes"][1:min(20, length(parsed["includes"]))]]
         end
 
-        state = Dict{String, Any}(
-            "id" => record_id,
-            "prover" => prover,
-            "theorem" => parsed["name"],
-            "goal" => parsed["conjecture"],
-            "context" => context,
-            "source" => "TPTP",
-            "status" => synthetic ? "Satisfiable" : parsed["status"],
-            "domain" => parsed["domain"],
-            "proof_steps" => length(parsed["axioms"]),
-            "synthetic_goal" => synthetic,
-            "from_negated" => get(parsed, "from_negated", false),
-        )
-        push!(proof_states, state)
-        prover_counts[prover] += 1
-
-        # Tactic record: for ATP the "tactic" is running the solver
-        tactic = Dict{String, Any}(
-            "proof_id" => record_id,
-            "step" => 1,
-            "tactic" => "atp_solve_$(lowercase(prover))",
-            "prover" => prover,
-            "proof_text" => "% TPTP $(parsed["status"]) via $prover",
-        )
-        push!(tactics, tactic)
+        for prover in PROVERS
+            record_id = ID_BASE + length(proof_states)
+            state = Dict{String, Any}(
+                "id" => record_id,
+                "prover" => prover,
+                "theorem" => parsed["name"],
+                "goal" => parsed["conjecture"],
+                "context" => context,
+                "source" => "TPTP",
+                "status" => synthetic ? "Satisfiable" : parsed["status"],
+                "domain" => parsed["domain"],
+                "proof_steps" => length(parsed["axioms"]),
+                "synthetic_goal" => synthetic,
+                "from_negated" => get(parsed, "from_negated", false),
+            )
+            push!(proof_states, state)
+            prover_counts[prover] += 1
+
+            tactic = Dict{String, Any}(
+                "proof_id" => record_id,
+                "step" => 1,
+                "tactic" => "atp_solve_$(lowercase(prover))",
+                "prover" => prover,
+                "proof_text" => "% TPTP $(parsed["status"]) via $prover",
+            )
+            push!(tactics, tactic)
+        end
 
         # Progress indicator every 5000 files
         if idx % 5000 == 0
diff --git a/scripts/extract_why3.jl b/scripts/extract_why3.jl
@@ -69,14 +69,15 @@ function parse_why3_file(filepath::String)::Vector{Dict{String,Any}}
         return results
     end
 
-    # Extract lemma/goal declarations
-    pattern = r"(lemma|goal|axiom)\s+(\w+)\s*:\s*(.*?)(?=\n\s*(?:lemma|goal|axiom|let|val|predicate|function|end|use|module)|\z)"si
+    # Widening (2026-04-18): Why3 has more named proof constructs than
+    # lemma/goal/axiom alone. Also capture theorem / corollary /
+    # conjecture, and end each body at the next declaration keyword.
+    pattern = r"(lemma|goal|axiom|theorem|corollary|conjecture)\s+(\w+)\s*:\s*(.*?)(?=\n\s*(?:lemma|goal|axiom|theorem|corollary|conjecture|let|val|predicate|function|constant|type|inductive|meta|end|use|module|scope)|\z)"si
     for m in eachmatch(pattern, content)
         kind = strip(m.captures[1])
         name = strip(m.captures[2])
         body = first(replace(strip(m.captures[3]), r"\s+" => " "), 300)
         keywords = [lowercase(k.match) for k in eachmatch(r"\b(forall|exists|ensures|requires|invariant|variant|raises|reads|writes|diverges)\b"i, body)]
-        # Deduplicate preserving order
         seen = Set{String}()
         unique_kw = String[]
         for kw in keywords
@@ -94,6 +95,23 @@ function parse_why3_file(filepath::String)::Vector{Dict{String,Any}}
         ))
     end
 
+    # Additional declaration forms common in Why3 stdlib + examples.
+    extra_pat = r"(predicate|function|constant|inductive|type)\s+(\w+)\s+(.*?)(?=\n\s*(?:lemma|goal|axiom|theorem|corollary|conjecture|let|val|predicate|function|constant|type|inductive|meta|end|use|module|scope)|\z)"si
+    ex_matches = try collect(eachmatch(extra_pat, content)) catch; Any[] end
+    for m in ex_matches
+        kind = strip(String(m.captures[1]))
+        name = strip(String(m.captures[2]))
+        body = first(replace(strip(String(m.captures[3])), r"\s+" => " "), 300)
+        isempty(name) && continue
+        push!(results, Dict{String,Any}(
+            "theorem" => name,
+            "goal" => body,
+            "kind" => kind,
+            "tactics" => String[kind],
+            "source" => "why3/$(basename(filepath))",
+        ))
+    end
+
     # Extract function specs (ensures/requires). Wrap in try/catch:
     # the `.*?` chains in func_pattern can catastrophically backtrack
     # on large amalgamated library files — skipping the whole file on