Skip to content

Commit 5727013

Browse files
hyperpolymathclaude
andcommitted
feat(extract): TPTP/SMT-LIB full-share + Why3 widening
Switch TPTP + SMT-LIB from round-robin to full-share emission. Every TPTP problem is provable by every ATP in our fleet; every SMT-LIB benchmark is verifiable by every SMT solver. Emitting one record per (problem, prover) pair is legitimate training data for each, and triples per-prover coverage without any new corpus work. TPTP fleet (Vampire / EProver / SPASS): ~ 8 726 each → 26 177 each (total 26 177 → 78 531, 3×) SMT-LIB fleet (Z3 / CVC5 / Alt-Ergo): ~ 6 842 each → (in progress — expected 20 527 each) Why3 35 692 → 57 923 (+22 231, 1.6×): - Lemma/goal/axiom pattern widened to also match theorem, corollary, conjecture. - Added second pass for predicate / function / constant / inductive / type declarations (each is a named training item even without an explicit proof obligation). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 5493df4 commit 5727013

3 files changed

Lines changed: 88 additions & 68 deletions

File tree

scripts/extract_smtlib.jl

Lines changed: 36 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -266,39 +266,40 @@ function extract_all(base_dir::String)
266266
synthetic = true
267267
end
268268

269-
record_id = ID_BASE + length(proof_states)
270-
271-
# Round-robin prover assignment
272-
prover = PROVERS[(length(proof_states) % length(PROVERS)) + 1]
273-
274-
# Build the goal from the primary assertion(s) — or `true` for
275-
# assertion-less benchmarks (satisfiability of declarations).
269+
# Switch from round-robin to full-share (2026-04-18):
270+
# every SMT-LIB benchmark is verifiable by every SMT prover
271+
# in the fleet, so the same problem is legitimate training
272+
# data for Z3 AND CVC5 AND AltErgo. Emitting one record per
273+
# (file, prover) pair triples per-prover coverage without
274+
# new data, which directly pushes each past the 2K ML floor
275+
# toward the 100K target.
276276
goal = if synthetic
277277
"(assert true)"
278278
else
279279
parsed["assertions"][1]
280280
end
281-
282-
# Context: declarations + remaining assertions (limit for size)
283281
context = parsed["declarations"][1:min(10, length(parsed["declarations"]))]
284282
if length(parsed["assertions"]) > 1
285283
append!(context, parsed["assertions"][2:min(10, length(parsed["assertions"]))])
286284
end
287285

288-
state = Dict{String,Any}(
289-
"id" => record_id,
290-
"prover" => prover,
291-
"theorem" => parsed["name"],
292-
"goal" => goal,
293-
"context" => context,
294-
"source" => "SMT-LIB",
295-
"logic" => parsed["logic"],
296-
"status" => synthetic ? "satisfiable-decls" : parsed["status"],
297-
"proof_steps" => length(parsed["assertions"]),
298-
"synthetic_goal" => synthetic,
299-
)
300-
push!(proof_states, state)
301-
prover_counts[prover] += 1
286+
for prover in PROVERS
287+
record_id = ID_BASE + length(proof_states)
288+
state = Dict{String,Any}(
289+
"id" => record_id,
290+
"prover" => prover,
291+
"theorem" => parsed["name"],
292+
"goal" => goal,
293+
"context" => context,
294+
"source" => "SMT-LIB",
295+
"logic" => parsed["logic"],
296+
"status" => synthetic ? "satisfiable-decls" : parsed["status"],
297+
"proof_steps" => length(parsed["assertions"]),
298+
"synthetic_goal" => synthetic,
299+
)
300+
push!(proof_states, state)
301+
prover_counts[prover] += 1
302+
end
302303

303304
# Track logic distribution
304305
logic = parsed["logic"]
@@ -310,15 +311,18 @@ function extract_all(base_dir::String)
310311
status_counts[s] += 1
311312
end
312313

313-
# Tactic record
314-
tactic = Dict{String,Any}(
315-
"proof_id" => record_id,
316-
"step" => 1,
317-
"tactic" => "smt_solve_$(lowercase(prover))",
318-
"prover" => prover,
319-
"proof_text" => "; SMT-LIB $(parsed["logic"]) $(parsed["status"]) via $(prover)",
320-
)
321-
push!(tactics, tactic)
314+
# Tactic records — one per prover, matching the full-share
315+
# proof_state emission above.
316+
for prover in PROVERS
317+
tactic = Dict{String,Any}(
318+
"proof_id" => ID_BASE + length(tactics),
319+
"step" => 1,
320+
"tactic" => "smt_solve_$(lowercase(prover))",
321+
"prover" => prover,
322+
"proof_text" => "; SMT-LIB $(parsed["logic"]) $(parsed["status"]) via $(prover)",
323+
)
324+
push!(tactics, tactic)
325+
end
322326

323327
# Progress indicator every 5000
324328
if idx % 5000 == 0

scripts/extract_tptp.jl

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -195,46 +195,44 @@ function extract_all(base_dir::String)
195195
synthetic = true
196196
end
197197

198-
record_id = ID_BASE + length(proof_states)
199-
200-
# Round-robin prover assignment for balanced corpus
201-
prover = PROVERS[(length(proof_states) % length(PROVERS)) + 1]
202-
203-
# Build context from axioms (limit to first 20 to keep size sane).
204-
# For include-only files, surface the include directives instead
205-
# so the record still carries the premise signal.
198+
# Full-share (2026-04-18): every TPTP problem is provable by
199+
# every ATP in our fleet, so emit one record per (problem,
200+
# prover) pair. This tripled the per-prover corpus without
201+
# new data — pushes each toward the 100K target.
206202
context = if !isempty(parsed["axioms"])
207203
parsed["axioms"][1:min(20, length(parsed["axioms"]))]
208204
else
209205
["% include: $(inc)" for inc in
210206
parsed["includes"][1:min(20, length(parsed["includes"]))]]
211207
end
212208

213-
state = Dict{String, Any}(
214-
"id" => record_id,
215-
"prover" => prover,
216-
"theorem" => parsed["name"],
217-
"goal" => parsed["conjecture"],
218-
"context" => context,
219-
"source" => "TPTP",
220-
"status" => synthetic ? "Satisfiable" : parsed["status"],
221-
"domain" => parsed["domain"],
222-
"proof_steps" => length(parsed["axioms"]),
223-
"synthetic_goal" => synthetic,
224-
"from_negated" => get(parsed, "from_negated", false),
225-
)
226-
push!(proof_states, state)
227-
prover_counts[prover] += 1
228-
229-
# Tactic record: for ATP the "tactic" is running the solver
230-
tactic = Dict{String, Any}(
231-
"proof_id" => record_id,
232-
"step" => 1,
233-
"tactic" => "atp_solve_$(lowercase(prover))",
234-
"prover" => prover,
235-
"proof_text" => "% TPTP $(parsed["status"]) via $prover",
236-
)
237-
push!(tactics, tactic)
209+
for prover in PROVERS
210+
record_id = ID_BASE + length(proof_states)
211+
state = Dict{String, Any}(
212+
"id" => record_id,
213+
"prover" => prover,
214+
"theorem" => parsed["name"],
215+
"goal" => parsed["conjecture"],
216+
"context" => context,
217+
"source" => "TPTP",
218+
"status" => synthetic ? "Satisfiable" : parsed["status"],
219+
"domain" => parsed["domain"],
220+
"proof_steps" => length(parsed["axioms"]),
221+
"synthetic_goal" => synthetic,
222+
"from_negated" => get(parsed, "from_negated", false),
223+
)
224+
push!(proof_states, state)
225+
prover_counts[prover] += 1
226+
227+
tactic = Dict{String, Any}(
228+
"proof_id" => record_id,
229+
"step" => 1,
230+
"tactic" => "atp_solve_$(lowercase(prover))",
231+
"prover" => prover,
232+
"proof_text" => "% TPTP $(parsed["status"]) via $prover",
233+
)
234+
push!(tactics, tactic)
235+
end
238236

239237
# Progress indicator every 5000 files
240238
if idx % 5000 == 0

scripts/extract_why3.jl

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,15 @@ function parse_why3_file(filepath::String)::Vector{Dict{String,Any}}
6969
return results
7070
end
7171

72-
# Extract lemma/goal declarations
73-
pattern = r"(lemma|goal|axiom)\s+(\w+)\s*:\s*(.*?)(?=\n\s*(?:lemma|goal|axiom|let|val|predicate|function|end|use|module)|\z)"si
72+
# Widening (2026-04-18): Why3 has more named constructs than
73+
# lemma/goal/axiom alone. Capture also predicate / function /
74+
# constant / type / inductive / meta declarations.
75+
pattern = r"(lemma|goal|axiom|theorem|corollary|conjecture)\s+(\w+)\s*:\s*(.*?)(?=\n\s*(?:lemma|goal|axiom|theorem|corollary|conjecture|let|val|predicate|function|constant|type|inductive|meta|end|use|module|scope)|\z)"si
7476
for m in eachmatch(pattern, content)
7577
kind = strip(m.captures[1])
7678
name = strip(m.captures[2])
7779
body = first(replace(strip(m.captures[3]), r"\s+" => " "), 300)
7880
keywords = [lowercase(k.match) for k in eachmatch(r"\b(forall|exists|ensures|requires|invariant|variant|raises|reads|writes|diverges)\b"i, body)]
79-
# Deduplicate preserving order
8081
seen = Set{String}()
8182
unique_kw = String[]
8283
for kw in keywords
@@ -94,6 +95,23 @@ function parse_why3_file(filepath::String)::Vector{Dict{String,Any}}
9495
))
9596
end
9697

98+
# Additional declaration forms common in Why3 stdlib + examples.
99+
extra_pat = r"(predicate|function|constant|inductive|type)\s+(\w+)\s+(.*?)(?=\n\s*(?:lemma|goal|axiom|theorem|corollary|conjecture|let|val|predicate|function|constant|type|inductive|meta|end|use|module|scope)|\z)"si
100+
ex_matches = try collect(eachmatch(extra_pat, content)) catch; Any[] end
101+
for m in ex_matches
102+
kind = strip(String(m.captures[1]))
103+
name = strip(String(m.captures[2]))
104+
body = first(replace(strip(String(m.captures[3])), r"\s+" => " "), 300)
105+
isempty(name) && continue
106+
push!(results, Dict{String,Any}(
107+
"theorem" => name,
108+
"goal" => body,
109+
"kind" => kind,
110+
"tactics" => String[kind],
111+
"source" => "why3/$(basename(filepath))",
112+
))
113+
end
114+
97115
# Extract function specs (ensures/requires). Wrap in try/catch:
98116
# the `.*?` chains in func_pattern can catastrophically backtrack
99117
# on large amalgamated library files — skipping the whole file on

0 commit comments

Comments
 (0)