Skip to content

Commit 9ecbe2b

Browse files
chore: Option to run multiple instance ids by name (#854)
# Motivation There currently isn't a way to evaluate multiple instance IDs by name on SWE-bench. # Content Adds one more argument, `instance_ids`, to `run_eval_command`, `run_eval` and `get_swe_bench_examples`. # Testing <!-- How was the change tested? --> # Please check the following before marking your PR as ready for review - [ ] I have added tests for my changes - [ ] I have updated the documentation or added new documentation as needed
1 parent e81996d commit 9ecbe2b

File tree

3 files changed

+23
-7
lines changed

3 files changed

+23
-7
lines changed

codegen-examples/examples/swebench_agent_run/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,7 @@
3232
--length INTEGER The number of examples to process.
3333
--instance-id TEXT The instance ID of the example to process.
3434
--repo TEXT The repo to use.
35+
--instance-ids LIST_OF_STRINGS The instance IDs of the examples to process.
36+
Example: --instance-ids <instance_id1>,<instance_id2>,...
3537
--help Show this message and exit.
3638
```

codegen-examples/examples/swebench_agent_run/run_eval.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -278,13 +278,14 @@ async def run_eval(
278278
repo: str | None = None,
279279
num_workers: int = 2,
280280
model: str = "claude-3-7-sonnet-latest",
281+
instance_ids: list[str] = [],
281282
):
282283
run_id = use_existing_preds or str(uuid.uuid4())
283284
print(f"Run ID: {run_id}")
284285
predictions_dir = PREDS_DNAME / f"results_{run_id}"
285286

286287
dataset_enum = DATASET_DICT[dataset]
287-
examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo)
288+
examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo, instance_ids=instance_ids)
288289

289290
try:
290291
if use_existing_preds is None:
@@ -352,6 +353,12 @@ async def run_eval(
352353
raise
353354

354355

356+
def list_of_strings(value: str) -> list[str]:
    """Parse a comma-separated string into a list of non-empty strings.

    Meant for comma-separated command-line values such as
    ``--instance-ids <instance_id1>,<instance_id2>,...``.

    Args:
        value: The raw comma-separated text. May be empty.

    Returns:
        The individual entries in their original order, with surrounding
        whitespace removed. Blank entries (from an empty input, doubled
        commas, or a trailing comma) are dropped, so an empty or
        whitespace-only input yields an empty list.
    """
    # Strip each entry and skip blanks so inputs like "a, b" or "a,b," do not
    # produce padded or empty IDs.
    return [entry for part in value.split(",") if (entry := part.strip())]
360+
361+
355362
@click.command()
356363
@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified", "lite_small", "lite_medium", "lite_large"]), default="lite")
357364
@click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
@@ -363,10 +370,15 @@ async def run_eval(
363370
"--num-workers", help="The number of workers to use. This is the number of examples that will be processed concurrently. A large number may lead to rate limiting issues.", type=int, default=5
364371
)
365372
@click.option("--model", help="The model to use.", type=str, default="claude-3-7-sonnet-latest")
366-
def run_eval_command(dataset, use_existing_preds, length, instance_id, local, repo, num_workers, model):
373+
@click.option("--instance-ids", help="The instance IDs of the examples to process. Example: --instance-ids <instance_id1>,<instance_id2>,...", type=list_of_strings, default="")
374+
def run_eval_command(dataset, use_existing_preds, length, instance_id, local, repo, num_workers, model, instance_ids):
367375
print(f"Repo: {repo}")
368376
print(f"Model: {model}")
369-
asyncio.run(run_eval(dataset=dataset, use_existing_preds=use_existing_preds, length=length, instance_id=instance_id, local=local, repo=repo, num_workers=num_workers, model=model))
377+
asyncio.run(
378+
run_eval(
379+
dataset=dataset, use_existing_preds=use_existing_preds, length=length, instance_id=instance_id, local=local, repo=repo, num_workers=num_workers, model=model, instance_ids=instance_ids
380+
)
381+
)
370382

371383

372384
if __name__ == "__main__":

src/codegen/extensions/swebench/utils.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,25 +73,27 @@ def get_swe_bench_examples(
7373
split: Literal["train", "dev", "test"] = "test",
7474
length: int | None = None,
7575
instance_id: str | None = None,
76+
instance_ids: list[str] = [],
7677
repo: str | None = None,
7778
) -> list[SweBenchExample]:
7879
"""Fetch examples from the SWE-bench dataset using the datasets library.
7980
8081
Args:
8182
dataset: The dataset to use ("lite", "full", or "verified")
8283
split: The dataset split to use
83-
offset: Starting index for examples
8484
length: Number of examples to fetch
8585
instance_id: Optional specific instance ID to fetch
86+
instance_ids: Optional list of instance IDs to fetch
87+
repo: Optional specific repo to fetch
8688
8789
Returns:
8890
List of SweBenchExample objects
8991
"""
90-
# Convert string dataset name to enum
91-
9292
# Load the dataset with caching enabled
93-
instance_ids = []
9493
if isinstance(dataset, SWEBenchLiteSubset):
94+
if instance_ids:
95+
msg = "instance_ids is not supported for lite subsets. Please pass a list of instance IDs instead."
96+
raise ValueError(msg)
9597
swe_bench_dataset = load_dataset(SWEBenchDataset.LITE.value, download_mode="reuse_dataset_if_exists")
9698
instance_ids = LITE_SUBSETS[dataset]
9799
else:

0 commit comments

Comments
 (0)