Skip to content

Commit 6cd39bb

Browse files
authored
[Benchmark] Add support for SciDocBench benchmark (#1511)
* [Benchmark] Add support for SciDocBench benchmark
  - Add SciDocBench dataset class with multi-page image handling
  - Support three evaluation methods: json_match, judge, exec_match
  - Add reasoning verification as secondary scoring pass
  - Register default judge model (gpt-4o-mini) in run.py
* [Fix] SciDocBench: fix isort lint and add parallel judge evaluation with checkpoint resume
  - Fix import ordering to pass isort pre-commit hook
  - Use track_progress_rich for parallel judge/reasoning calls
  - Add pkl checkpoint for resumable evaluation on interruption
1 parent a9343a1 commit 6cd39bb

3 files changed

Lines changed: 486 additions & 1 deletion

File tree

run.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -317,6 +317,8 @@ def get_judge_kwargs(dataset_name, dataset_type, args):
317317
judge_kwargs['model'] = 'gpt-4o-mini'
318318
elif listinstr(['MaCBench'], dataset_name):
319319
judge_kwargs['model'] = 'gpt-4o-mini'
320+
elif listinstr(['SciDocBench'], dataset_name):
321+
judge_kwargs['model'] = 'gpt-4o-mini'
320322

321323
if args.use_verifier:
322324
judge_kwargs['use_verifier'] = True

vlmeval/dataset/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,7 @@
107107
from .refspatialbench import RefSpatialBench
108108
from .robospatialbench import RoboSpatialBench
109109
from .sarena import SArena
110+
from .scidocbench import SciDocBench
110111
from .sfebench import SFE
111112
from .SGI_Bench_1_0.deep_research import SGI_Bench_Deep_Research
112113
from .SGI_Bench_1_0.dry_experiment import SGI_Bench_Dry_Experiment
@@ -290,7 +291,8 @@ def evaluate(self, eval_file, **judge_kwargs):
290291
UniSVG, SArena, VLMsAreBiased, MMESCIDataset, CoreCognition, GroundingME,
291292
FoxBench, VTCBench, Asclepius, PlotQA, ChartX, ChartBench, ChartCapDataset, WorldVQA, PuzzleVQA, VisualPuzzles, # noqa: E501
292293
MMSafetyBenchDataset, MSSBenchDataset, SIUODataset, SIUOGenDataset, SIUOMCQDataset, M3oralBenchDataset, # noqa: E501
293-
Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning, MMOral_OPG_OPEN, MMOral_OPG_CLOSED # noqa: E501
294+
Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning, MMOral_OPG_OPEN, MMOral_OPG_CLOSED, # noqa: E501
295+
SciDocBench,
294296
]
295297

296298
# add by EASI team

0 commit comments

Comments
 (0)