diff --git a/docs/building-with-codegen/parsing-codebases.mdx b/docs/building-with-codegen/parsing-codebases.mdx index ab4b7b4fa..e21a7e017 100644 --- a/docs/building-with-codegen/parsing-codebases.mdx +++ b/docs/building-with-codegen/parsing-codebases.mdx @@ -7,9 +7,9 @@ iconType: "solid" The primary entrypoint to programs leveraging Codegen is the [Codebase](../api-reference/core/Codebase) class. -Construct a Codebase by passing in a path to a local `git` repository. - +## Local Codebases +Construct a Codebase by passing in a path to a local `git` repository. ```python from codegen import Codebase @@ -20,13 +20,46 @@ codebase = Codebase("path/to/repository") # Parse from current directory codebase = Codebase("./") ``` -This will automatically infer the programming language of the codebase and parse all files in the codebase. -The initial parse may take a few minutes for large codebases. This pre-computation enables constant-time operations afterward. [Learn more here.](/introduction/how-it-works) + + This will automatically infer the programming language of the codebase and + parse all files in the codebase. + + + + The initial parse may take a few minutes for large codebases. This + pre-computation enables constant-time operations afterward. [Learn more + here.](/introduction/how-it-works) + + +## Remote Repositories + +To fetch and parse a repository directly from GitHub, use the `fetch_codebase` function. 
def fetch_codebase(repo_name: str, *, tmp_dir: str | None = None, shallow: bool = True, commit_hash: str | None = None) -> Codebase:
    """Fetch a GitHub repository and return a Codebase built from it.

    The repository URL is derived as ``https://github.com/{repo_name}.git`` and
    the target path as ``{tmp_dir}/{repo}``; both are handed to
    ``LocalRepoOperator.create_from_commit``. The project language is detected
    with ``determine_project_language`` on the target path.

    Args:
        repo_name (str): The name of the repository in format "owner/repo".
            Exactly one "/" with a non-empty owner and repo is required.
        tmp_dir (Optional[str]): The directory to clone the repo into. A leading
            "~" is expanded to the user's home directory. Defaults to /tmp/codegen
        shallow (bool): Whether to do a shallow clone. Defaults to True.
            NOTE(review): this flag is currently not forwarded to
            ``LocalRepoOperator.create_from_commit``, so it has no effect in
            this function -- confirm the intended wiring.
        commit_hash (Optional[str]): The specific commit hash to clone. Defaults to HEAD

    Returns:
        Codebase: A Codebase instance initialized with the cloned repository

    Raises:
        ValueError: If ``repo_name`` is not in "owner/repo" format.
        Exception: Any error raised while cloning or initializing the codebase
            is logged and re-raised unchanged.

    Example:
        ```python
        import graph_sitter
        import logging
        # Enable logging to see progress
        logging.basicConfig(level=logging.INFO)
        # Clone a repository to default location (/tmp/codegen)
        codebase = graph_sitter.fetch_codebase('facebook/react')
        # Or specify a custom directory
        codebase = graph_sitter.fetch_codebase('facebook/react', tmp_dir='~/my_repos')
        # Or clone a specific commit
        codebase = graph_sitter.fetch_codebase('facebook/react', commit_hash='abc123')
        ```
    """
    logger.info("Fetching codebase for %s", repo_name)

    # Parse repo name: require exactly "owner/repo". Splitting and checking the
    # part count turns inputs such as "a/b/c", "/repo" or "owner/" into a clear
    # ValueError instead of an unpacking error or a bogus clone path.
    parts = repo_name.split("/")
    if len(parts) != 2 or not all(parts):
        raise ValueError("repo_name must be in format 'owner/repo'")
    repo = parts[1]

    # Setup temp directory. expanduser() makes "~/..." paths (as used in the
    # docstring example) resolve to the home directory rather than creating a
    # literal "~" directory relative to the current working directory.
    tmp_dir = DEFAULT_CODEGEN_DIR if tmp_dir is None else os.path.expanduser(tmp_dir)
    os.makedirs(tmp_dir, exist_ok=True)
    logger.info("Using directory: %s", tmp_dir)

    # Setup repo path and URL
    repo_path = os.path.join(tmp_dir, repo)
    repo_url = f"https://github.com/{repo_name}.git"
    logger.info("Will clone %s to %s", repo_url, repo_path)

    try:
        # Use LocalRepoOperator to fetch the repository
        logger.info("Cloning repository...")
        repo_operator = LocalRepoOperator.create_from_commit(
            repo_path=repo_path,
            # NOTE(review): hard-coded; nothing in this function looks up the
            # repository's real default branch afterwards -- confirm this is
            # acceptable for repositories whose default branch is not "main".
            default_branch="main",
            commit=commit_hash or "HEAD",
            url=repo_url,
        )
        logger.info("Clone completed successfully")

        # Initialize and return codebase with proper context
        logger.info("Initializing Codebase...")
        project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path))
        codebase = Codebase(projects=[project], config=DefaultConfig)
        logger.info("Codebase initialization complete")
        return codebase
    except Exception as e:
        # Top-level boundary: record the failure, then let the caller see the
        # original exception.
        logger.error("Failed to initialize codebase: %s", e)
        raise