From eb4073e012f06a728be7ab69612fdf0035fc76a4 Mon Sep 17 00:00:00 2001 From: jayhack Date: Wed, 22 Jan 2025 16:29:40 -0800 Subject: [PATCH 1/7] no tests --- .../parsing-codebases.mdx | 45 ++++++++++-- src/graph_sitter/fetch_codebase.py | 70 +++++++++++++++++++ 2 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 src/graph_sitter/fetch_codebase.py diff --git a/docs/building-with-codegen/parsing-codebases.mdx b/docs/building-with-codegen/parsing-codebases.mdx index ab4b7b4fa..13f05086c 100644 --- a/docs/building-with-codegen/parsing-codebases.mdx +++ b/docs/building-with-codegen/parsing-codebases.mdx @@ -7,9 +7,9 @@ iconType: "solid" The primary entrypoint to programs leveraging Codegen is the [Codebase](../api-reference/core/Codebase) class. -Construct a Codebase by passing in a path to a local `git` repository. - +## Local Codebases +Construct a Codebase by passing in a path to a local `git` repository. ```python from codegen import Codebase @@ -20,13 +20,50 @@ codebase = Codebase("path/to/repository") # Parse from current directory codebase = Codebase("./") ``` -This will automatically infer the programming language of the codebase and parse all files in the codebase. -The initial parse may take a few minutes for large codebases. This pre-computation enables constant-time operations afterward. [Learn more here.](/introduction/how-it-works) + + This will automatically infer the programming language of the codebase and + parse all files in the codebase. + + + + The initial parse may take a few minutes for large codebases. This + pre-computation enables constant-time operations afterward. [Learn more + here.](/introduction/how-it-works) + + + + This will automatically infer the programming language of the codebase and + parse all files in the codebase. + + +## Remote Repositories + +To fetch and parse a repository directly from GitHub, use the `fetch_codebase` function. + +```python +import codegen + +# Fetch and parse a repository (defaults to /tmp/codegen/{repo_name}) +codebase = codegen.fetch_codebase('facebook/react') + +# Customize temp directory and clone depth +codebase = codegen.fetch_codebase( + 'facebook/react', + tmp_dir='/custom/temp/dir', # Optional: custom temp directory + shallow=False # Optional: full clone instead of shallow +) +``` + + + Remote repositories are cloned to a temporary directory by default. The clone + is shallow by default for better performance. + ## Supported Languages Codegen currently supports: + - [Python](/api-reference/python) - [TypeScript/JavaScript](/api-reference/javascript) - [React/JSX](/building-with-codegen/react-and-jsx) diff --git a/src/graph_sitter/fetch_codebase.py b/src/graph_sitter/fetch_codebase.py new file mode 100644 index 000000000..32e6451a0 --- /dev/null +++ b/src/graph_sitter/fetch_codebase.py @@ -0,0 +1,70 @@ +import logging +import os +from codegen_git.repo_operator.local_repo_operator import LocalRepoOperator +from graph_sitter.core.codebase import Codebase + +logger = logging.getLogger(__name__) + +DEFAULT_CODEGEN_DIR = "/tmp/codegen" + + +def fetch_codebase(repo_name: str, *, tmp_dir: str | None = None, shallow: bool = True, commit_hash: str | None = None) -> Codebase: + """Fetches a codebase from GitHub and returns a Codebase instance. + Args: + repo_name (str): The name of the repository in format "owner/repo" + tmp_dir (Optional[str]): The directory to clone the repo into. Defaults to /tmp/codegen + shallow (bool): Whether to do a shallow clone. Defaults to True + commit_hash (Optional[str]): The specific commit hash to clone. Defaults to HEAD + Returns: + Codebase: A Codebase instance initialized with the cloned repository + Example: + ```python + import graph_sitter + import logging + # Enable logging to see progress + logging.basicConfig(level=logging.INFO) + # Clone a repository to default location (/tmp/codegen) + codebase = graph_sitter.fetch_codebase('facebook/react') + # Or specify a custom directory + codebase = graph_sitter.fetch_codebase('facebook/react', tmp_dir='~/my_repos') + # Or clone a specific commit + codebase = graph_sitter.fetch_codebase('facebook/react', commit_hash='abc123') + ``` + """ + logger.info(f"Fetching codebase for {repo_name}") + + # Parse repo name + if "/" not in repo_name: + raise ValueError("repo_name must be in format 'owner/repo'") + owner, repo = repo_name.split("/") + + # Setup temp directory + if tmp_dir is None: + tmp_dir = DEFAULT_CODEGEN_DIR + os.makedirs(tmp_dir, exist_ok=True) + logger.info(f"Using directory: {tmp_dir}") + + # Setup repo path and URL + repo_path = os.path.join(tmp_dir, repo) + repo_url = f"https://github.com/{repo_name}.git" + logger.info(f"Will clone {repo_url} to {repo_path}") + + try: + # Use LocalRepoOperator to fetch the repository + logger.info("Cloning repository...") + LocalRepoOperator.create_from_commit( + repo_path=repo_path, + default_branch="main", # We'll get the actual default branch after clone + commit=commit_hash or "HEAD", + url=repo_url + ) + logger.info("Clone completed successfully") + + # Initialize and return codebase + logger.info("Initializing Codebase...") + codebase = Codebase(repo_path) + logger.info("Codebase initialization complete") + return codebase + except Exception as e: + logger.error(f"Failed to initialize codebase: {e}") + raise \ No newline at end of file From f7605163e4bb1e545167edc51b812e9da29d1bac Mon Sep 17 00:00:00 2001 From: jayhack Date: Wed, 22 Jan 2025 16:39:46 -0800 Subject: [PATCH 2/7] added commit_hash --- .../parsing-codebases.mdx | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/docs/building-with-codegen/parsing-codebases.mdx b/docs/building-with-codegen/parsing-codebases.mdx index 13f05086c..e21a7e017 100644 --- a/docs/building-with-codegen/parsing-codebases.mdx +++ b/docs/building-with-codegen/parsing-codebases.mdx @@ -32,11 +32,6 @@ codebase = Codebase("./") here.](/introduction/how-it-works) - - This will automatically infer the programming language of the codebase and - parse all files in the codebase. - - ## Remote Repositories To fetch and parse a repository directly from GitHub, use the `fetch_codebase` function. @@ -45,19 +40,20 @@ To fetch and parse a repository directly from GitHub, use the `fetch_codebase` f import codegen # Fetch and parse a repository (defaults to /tmp/codegen/{repo_name}) -codebase = codegen.fetch_codebase('facebook/react') +codebase = codegen.fetch_codebase('fastapi/fastapi') -# Customize temp directory and clone depth +# Customize temp directory, clone depth, or specific commit codebase = codegen.fetch_codebase( - 'facebook/react', + 'fastapi/fastapi', tmp_dir='/custom/temp/dir', # Optional: custom temp directory - shallow=False # Optional: full clone instead of shallow + shallow=False, # Optional: full clone instead of shallow + commit_hash='fe513719ea98abade167d8a89e92f600d9d8f0e5' # Optional: specific commit ) ``` - Remote repositories are cloned to a temporary directory by default. The clone - is shallow by default for better performance. + Remote repositories are cloned to the `/tmp/codegen/{repo_name}` directory by + default. The clone is shallow by default for better performance. ## Supported Languages From 28c400e6b172ba76ca15c035454ea0605506f858 Mon Sep 17 00:00:00 2001 From: jayhack Date: Wed, 22 Jan 2025 16:45:11 -0800 Subject: [PATCH 3/7] . --- src/graph_sitter/fetch_codebase.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/graph_sitter/fetch_codebase.py b/src/graph_sitter/fetch_codebase.py index 32e6451a0..fc0859485 100644 --- a/src/graph_sitter/fetch_codebase.py +++ b/src/graph_sitter/fetch_codebase.py @@ -1,5 +1,8 @@ import logging + +# ramp import os + from codegen_git.repo_operator.local_repo_operator import LocalRepoOperator from graph_sitter.core.codebase import Codebase @@ -10,6 +13,7 @@ def fetch_codebase(repo_name: str, *, tmp_dir: str | None = None, shallow: bool = True, commit_hash: str | None = None) -> Codebase: """Fetches a codebase from GitHub and returns a Codebase instance. + Args: repo_name (str): The name of the repository in format "owner/repo" tmp_dir (Optional[str]): The directory to clone the repo into. Defaults to /tmp/codegen @@ -56,7 +60,7 @@ def fetch_codebase(repo_name: str, *, tmp_dir: str | None = None, shallow: bool repo_path=repo_path, default_branch="main", # We'll get the actual default branch after clone commit=commit_hash or "HEAD", - url=repo_url + url=repo_url, ) logger.info("Clone completed successfully") @@ -67,4 +71,4 @@ def fetch_codebase(repo_name: str, *, tmp_dir: str | None = None, shallow: bool return codebase except Exception as e: logger.error(f"Failed to initialize codebase: {e}") - raise \ No newline at end of file + raise From 39eb0405b353b340dd156deb0d2928a515e0ca76 Mon Sep 17 00:00:00 2001 From: jayhack Date: Wed, 22 Jan 2025 16:45:38 -0800 Subject: [PATCH 4/7] . --- src/graph_sitter/fetch_codebase.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/graph_sitter/fetch_codebase.py b/src/graph_sitter/fetch_codebase.py index fc0859485..4b63700b8 100644 --- a/src/graph_sitter/fetch_codebase.py +++ b/src/graph_sitter/fetch_codebase.py @@ -1,6 +1,4 @@ import logging - -# ramp import os from codegen_git.repo_operator.local_repo_operator import LocalRepoOperator From 500e63b5cd1ec00d06c2889235052335c916a148 Mon Sep 17 00:00:00 2001 From: jayhack Date: Wed, 22 Jan 2025 16:48:59 -0800 Subject: [PATCH 5/7] . --- jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog diff --git a/jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog b/jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog new file mode 100644 index 000000000..e69de29bb From f1589ca8e76d7a2a658ad635a7d4e0ee49c0aeaa Mon Sep 17 00:00:00 2001 From: jayhack Date: Wed, 22 Jan 2025 16:54:27 -0800 Subject: [PATCH 6/7] . --- src/graph_sitter/fetch_codebase.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/graph_sitter/fetch_codebase.py b/src/graph_sitter/fetch_codebase.py index 4b63700b8..994f9147d 100644 --- a/src/graph_sitter/fetch_codebase.py +++ b/src/graph_sitter/fetch_codebase.py @@ -2,7 +2,9 @@ import os from codegen_git.repo_operator.local_repo_operator import LocalRepoOperator +from graph_sitter.codebase.config import DefaultConfig, ProjectConfig from graph_sitter.core.codebase import Codebase +from graph_sitter.utils import determine_project_language logger = logging.getLogger(__name__) @@ -54,7 +56,7 @@ def fetch_codebase(repo_name: str, *, tmp_dir: str | None = None, shallow: bool try: # Use LocalRepoOperator to fetch the repository logger.info("Cloning repository...") - LocalRepoOperator.create_from_commit( + repo_operator = LocalRepoOperator.create_from_commit( repo_path=repo_path, default_branch="main", # We'll get the actual default branch after clone commit=commit_hash or "HEAD", @@ -62,9 +64,10 @@ def fetch_codebase(repo_name: str, *, tmp_dir: str | None = None, shallow: bool ) logger.info("Clone completed successfully") - # Initialize and return codebase + # Initialize and return codebase with proper context logger.info("Initializing Codebase...") - codebase = Codebase(repo_path) + project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path)) + codebase = Codebase(projects=[project], config=DefaultConfig) logger.info("Codebase initialization complete") return codebase except Exception as e: From c47598eba4b8185657c86aa5b976f2f467850185 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Wed, 22 Jan 2025 16:58:27 -0800 Subject: [PATCH 7/7] Delete jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog --- jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog diff --git a/jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog b/jayhack-cg-10349-support-codegenfetch_codebaseposthogposthog deleted file mode 100644 index e69de29bb..000000000