chore: Option to run multiple instance ids by name (#854)

jemeza-codegen · web-flow · commit 9ecbe2bbf7ef · 2025-03-17T10:37:44.000-07:00
# Motivation

There is currently no way to evaluate multiple instance IDs by name on
SWE-bench.

# Content

Adds one more argument, `instance_ids`, to `run_eval_command`, `run_eval`, and
`get_swe_bench_examples`.

# Testing

<!-- How was the change tested? -->

# Please check the following before marking your PR as ready for review

- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as
needed
diff --git a/codegen-examples/examples/swebench_agent_run/README.md b/codegen-examples/examples/swebench_agent_run/README.md
@@ -32,5 +32,7 @@
   --length INTEGER                The number of examples to process.
   --instance-id TEXT              The instance ID of the example to process.
   --repo TEXT                     The repo to use.
+  --instance-ids LIST_OF_STRINGS  The instance IDs of the examples to process.
+                                  Example: --instance-ids <instance_id1>,<instance_id2>,...
   --help                          Show this message and exit.
   ```
diff --git a/codegen-examples/examples/swebench_agent_run/run_eval.py b/codegen-examples/examples/swebench_agent_run/run_eval.py
@@ -278,13 +278,14 @@ async def run_eval(
     repo: str | None = None,
     num_workers: int = 2,
     model: str = "claude-3-7-sonnet-latest",
+    instance_ids: list[str] = [],
 ):
     run_id = use_existing_preds or str(uuid.uuid4())
     print(f"Run ID: {run_id}")
     predictions_dir = PREDS_DNAME / f"results_{run_id}"
 
     dataset_enum = DATASET_DICT[dataset]
-    examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo)
+    examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo, instance_ids=instance_ids)
 
     try:
         if use_existing_preds is None:
@@ -352,6 +353,12 @@ async def run_eval(
         raise
 
 
+def list_of_strings(value: str) -> list[str]:
+    if value == "":
+        return []
+    return value.split(",")
+
+
 @click.command()
 @click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified", "lite_small", "lite_medium", "lite_large"]), default="lite")
 @click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
@@ -363,10 +370,15 @@ async def run_eval(
     "--num-workers", help="The number of workers to use. This is the number of examples that will be processed concurrently. A large number may lead to rate limiting issues.", type=int, default=5
 )
 @click.option("--model", help="The model to use.", type=str, default="claude-3-7-sonnet-latest")
-def run_eval_command(dataset, use_existing_preds, length, instance_id, local, repo, num_workers, model):
+@click.option("--instance-ids", help="The instance IDs of the examples to process. Example: --instance-ids <instance_id1>,<instance_id2>,...", type=list_of_strings, default="")
+def run_eval_command(dataset, use_existing_preds, length, instance_id, local, repo, num_workers, model, instance_ids):
     print(f"Repo: {repo}")
     print(f"Model: {model}")
-    asyncio.run(run_eval(dataset=dataset, use_existing_preds=use_existing_preds, length=length, instance_id=instance_id, local=local, repo=repo, num_workers=num_workers, model=model))
+    asyncio.run(
+        run_eval(
+            dataset=dataset, use_existing_preds=use_existing_preds, length=length, instance_id=instance_id, local=local, repo=repo, num_workers=num_workers, model=model, instance_ids=instance_ids
+        )
+    )
 
 
 if __name__ == "__main__":
diff --git a/src/codegen/extensions/swebench/utils.py b/src/codegen/extensions/swebench/utils.py
@@ -73,25 +73,27 @@ def get_swe_bench_examples(
     split: Literal["train", "dev", "test"] = "test",
     length: int | None = None,
     instance_id: str | None = None,
+    instance_ids: list[str] = [],
     repo: str | None = None,
 ) -> list[SweBenchExample]:
     """Fetch examples from the SWE-bench dataset using the datasets library.
 
     Args:
         dataset: The dataset to use ("lite", "full", or "verified")
         split: The dataset split to use
-        offset: Starting index for examples
         length: Number of examples to fetch
         instance_id: Optional specific instance ID to fetch
+        instance_ids: Optional list of instance IDs to fetch
+        repo: Optional specific repo to fetch
 
     Returns:
         List of SweBenchExample objects
     """
-    # Convert string dataset name to enum
-
     # Load the dataset with caching enabled
-    instance_ids = []
     if isinstance(dataset, SWEBenchLiteSubset):
+        if instance_ids:
+            msg = "instance_ids is not supported for lite subsets. Please pass a list of instance IDs instead."
+            raise ValueError(msg)
         swe_bench_dataset = load_dataset(SWEBenchDataset.LITE.value, download_mode="reuse_dataset_if_exists")
         instance_ids = LITE_SUBSETS[dataset]
     else: