diff --git a/codegen-examples/examples/swebench_agent_run/README.md b/codegen-examples/examples/swebench_agent_run/README.md index 277578f16..ddbf86814 100644 --- a/codegen-examples/examples/swebench_agent_run/README.md +++ b/codegen-examples/examples/swebench_agent_run/README.md @@ -32,5 +32,7 @@ --length INTEGER The number of examples to process. --instance-id TEXT The instance ID of the example to process. --repo TEXT The repo to use. + --instance-ids LIST_OF_STRINGS The instance IDs of the examples to process. + Example: --instance-ids <instance_id1>,<instance_id2>,... --help Show this message and exit. ``` diff --git a/codegen-examples/examples/swebench_agent_run/run_eval.py b/codegen-examples/examples/swebench_agent_run/run_eval.py index b1567f8f6..98d300855 100644 --- a/codegen-examples/examples/swebench_agent_run/run_eval.py +++ b/codegen-examples/examples/swebench_agent_run/run_eval.py @@ -278,13 +278,14 @@ async def run_eval( repo: str | None = None, num_workers: int = 2, model: str = "claude-3-7-sonnet-latest", + instance_ids: list[str] = [], ): run_id = use_existing_preds or str(uuid.uuid4()) print(f"Run ID: {run_id}") predictions_dir = PREDS_DNAME / f"results_{run_id}" dataset_enum = DATASET_DICT[dataset] - examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo) + examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo, instance_ids=instance_ids) try: if use_existing_preds is None: @@ -352,6 +353,12 @@ async def run_eval( raise +def list_of_strings(value: str) -> list[str]: + if value == "": + return [] + return value.split(",") + + @click.command() @click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified", "lite_small", "lite_medium", "lite_large"]), default="lite") @click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None) @@ -363,10 +370,15 @@ async def run_eval( 
"--num-workers", help="The number of workers to use. This is the number of examples that will be processed concurrently. A large number may lead to rate limiting issues.", type=int, default=5 ) @click.option("--model", help="The model to use.", type=str, default="claude-3-7-sonnet-latest") -def run_eval_command(dataset, use_existing_preds, length, instance_id, local, repo, num_workers, model): +@click.option("--instance-ids", help="The instance IDs of the examples to process. Example: --instance-ids ,,...", type=list_of_strings, default="") +def run_eval_command(dataset, use_existing_preds, length, instance_id, local, repo, num_workers, model, instance_ids): print(f"Repo: {repo}") print(f"Model: {model}") - asyncio.run(run_eval(dataset=dataset, use_existing_preds=use_existing_preds, length=length, instance_id=instance_id, local=local, repo=repo, num_workers=num_workers, model=model)) + asyncio.run( + run_eval( + dataset=dataset, use_existing_preds=use_existing_preds, length=length, instance_id=instance_id, local=local, repo=repo, num_workers=num_workers, model=model, instance_ids=instance_ids + ) + ) if __name__ == "__main__": diff --git a/src/codegen/extensions/swebench/utils.py b/src/codegen/extensions/swebench/utils.py index d977dce4e..c5054b2d0 100644 --- a/src/codegen/extensions/swebench/utils.py +++ b/src/codegen/extensions/swebench/utils.py @@ -73,6 +73,7 @@ def get_swe_bench_examples( split: Literal["train", "dev", "test"] = "test", length: int | None = None, instance_id: str | None = None, + instance_ids: list[str] = [], repo: str | None = None, ) -> list[SweBenchExample]: """Fetch examples from the SWE-bench dataset using the datasets library. 
@@ -80,18 +81,19 @@ def get_swe_bench_examples( Args: dataset: The dataset to use ("lite", "full", or "verified") split: The dataset split to use - offset: Starting index for examples length: Number of examples to fetch instance_id: Optional specific instance ID to fetch + instance_ids: Optional list of instance IDs to fetch + repo: Optional specific repo to fetch Returns: List of SweBenchExample objects """ - # Convert string dataset name to enum - # Load the dataset with caching enabled - instance_ids = [] if isinstance(dataset, SWEBenchLiteSubset): + if instance_ids: + msg = "instance_ids is not supported for lite subsets. Please pass a list of instance IDs instead." + raise ValueError(msg) swe_bench_dataset = load_dataset(SWEBenchDataset.LITE.value, download_mode="reuse_dataset_if_exists") instance_ids = LITE_SUBSETS[dataset] else: