diff --git a/docs/building-with-codegen/parsing-codebases.mdx b/docs/building-with-codegen/parsing-codebases.mdx
index ab4b7b4fa..e21a7e017 100644
--- a/docs/building-with-codegen/parsing-codebases.mdx
+++ b/docs/building-with-codegen/parsing-codebases.mdx
@@ -7,9 +7,9 @@ iconType: "solid"
The primary entrypoint to programs leveraging Codegen is the [Codebase](../api-reference/core/Codebase) class.
-Construct a Codebase by passing in a path to a local `git` repository.
-
+## Local Codebases
+Construct a Codebase by passing in a path to a local `git` repository.
```python
from codegen import Codebase
@@ -20,13 +20,46 @@ codebase = Codebase("path/to/repository")
# Parse from current directory
codebase = Codebase("./")
```
-This will automatically infer the programming language of the codebase and parse all files in the codebase.
-The initial parse may take a few minutes for large codebases. This pre-computation enables constant-time operations afterward. [Learn more here.](/introduction/how-it-works)
+
+ This will automatically infer the programming language of the codebase and
+ parse all files in the codebase.
+
+
+
+ The initial parse may take a few minutes for large codebases. This
+ pre-computation enables constant-time operations afterward. [Learn more
+ here.](/introduction/how-it-works)
+
+
+## Remote Repositories
+
+To fetch and parse a repository directly from GitHub, use the `fetch_codebase` function.
+
+```python
+import codegen
+
+# Fetch and parse a repository (cloned to /tmp/codegen/fastapi by default)
+codebase = codegen.fetch_codebase('fastapi/fastapi')
+
+# Customize temp directory, clone depth, or specific commit
+codebase = codegen.fetch_codebase(
+ 'fastapi/fastapi',
+ tmp_dir='/custom/temp/dir', # Optional: custom temp directory
+ shallow=False, # Optional: full clone instead of shallow
+ commit_hash='fe513719ea98abade167d8a89e92f600d9d8f0e5' # Optional: specific commit
+)
+```
+
+
+ Remote repositories are cloned to `/tmp/codegen/{repo}` (the repository part of
+ `owner/repo`) by default. The clone is shallow by default for better performance.
+
## Supported Languages
Codegen currently supports:
+
- [Python](/api-reference/python)
- [TypeScript/JavaScript](/api-reference/javascript)
- [React/JSX](/building-with-codegen/react-and-jsx)
diff --git a/src/graph_sitter/fetch_codebase.py b/src/graph_sitter/fetch_codebase.py
new file mode 100644
index 000000000..994f9147d
--- /dev/null
+++ b/src/graph_sitter/fetch_codebase.py
@@ -0,0 +1,75 @@
+import logging
+import os
+
+from codegen_git.repo_operator.local_repo_operator import LocalRepoOperator
+from graph_sitter.codebase.config import DefaultConfig, ProjectConfig
+from graph_sitter.core.codebase import Codebase
+from graph_sitter.utils import determine_project_language
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_CODEGEN_DIR = "/tmp/codegen"  # parent directory for clones when fetch_codebase gets no tmp_dir
+
+
+def fetch_codebase(repo_name: str, *, tmp_dir: str | None = None, shallow: bool = True, commit_hash: str | None = None) -> Codebase:
+    """Fetch a repository from GitHub and return it parsed as a Codebase.
+
+    Args:
+        repo_name (str): The name of the repository in format "owner/repo"
+        tmp_dir (Optional[str]): Parent directory for the clone ("~" is expanded); the repo lands in <tmp_dir>/<repo>. Defaults to /tmp/codegen
+        shallow (bool): Whether to do a shallow clone. Defaults to True.
+            NOTE(review): this flag is not forwarded to the clone call below - confirm intent.
+        commit_hash (Optional[str]): The specific commit hash to clone. Defaults to HEAD
+    Returns:
+        Codebase: A Codebase instance initialized with the cloned repository
+    Raises:
+        ValueError: If repo_name is not exactly "owner/repo" with both parts non-empty.
+    Example:
+        ```python
+        import graph_sitter
+        import logging
+        # Enable logging to see progress
+        logging.basicConfig(level=logging.INFO)
+        # Clone a repository to default location (/tmp/codegen)
+        codebase = graph_sitter.fetch_codebase('facebook/react')
+        # Or specify a custom directory
+        codebase = graph_sitter.fetch_codebase('facebook/react', tmp_dir='~/my_repos')
+        # Or clone a specific commit
+        codebase = graph_sitter.fetch_codebase('facebook/react', commit_hash='abc123')
+        ```
+    """
+    logger.info(f"Fetching codebase for {repo_name}")
+
+    # Require exactly one "/" so malformed names fail with a clear message, not an unpacking error.
+    owner, sep, repo = repo_name.partition("/")
+    if not (sep and owner and repo) or "/" in repo:
+        raise ValueError("repo_name must be in format 'owner/repo'")
+
+    # Expand "~" so paths like "~/my_repos" do not create a literal "~" directory.
+    tmp_dir = os.path.expanduser(tmp_dir if tmp_dir is not None else DEFAULT_CODEGEN_DIR)
+    os.makedirs(tmp_dir, exist_ok=True)
+    logger.info(f"Using directory: {tmp_dir}")
+
+    repo_path = os.path.join(tmp_dir, repo)
+    repo_url = f"https://github.com/{repo_name}.git"
+    logger.info(f"Will clone {repo_url} to {repo_path}")
+
+    try:
+        logger.info("Cloning repository...")
+        repo_operator = LocalRepoOperator.create_from_commit(
+            repo_path=repo_path,
+            default_branch="main",  # We'll get the actual default branch after clone
+            commit=commit_hash or "HEAD",
+            url=repo_url,
+        )
+        logger.info("Clone completed successfully")
+
+        logger.info("Initializing Codebase...")
+        project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path))
+        codebase = Codebase(projects=[project], config=DefaultConfig)
+        logger.info("Codebase initialization complete")
+        return codebase
+    except Exception as e:
+        # logger.exception records the traceback, which logger.error would drop.
+        logger.exception(f"Failed to initialize codebase: {e}")
+        raise