From ceeb41a2a92944deeddb779dec0fc24eb85ca7c2 Mon Sep 17 00:00:00 2001 From: codegen-bot Date: Sun, 26 Jan 2025 17:37:03 -0800 Subject: [PATCH 1/3] . --- docs/building-with-codegen/symbol-api.mdx | 2 +- docs/mint.json | 1 + docs/tutorials/training-data.mdx | 233 ++++++++++++++++++ src/codegen/cli/api/endpoints.py | 20 ++ src/codegen/cli/commands/init/render.py | 3 +- .../cli/workspace/initialize_workspace.py | 13 + 6 files changed, 269 insertions(+), 3 deletions(-) create mode 100644 docs/tutorials/training-data.mdx diff --git a/docs/building-with-codegen/symbol-api.mdx b/docs/building-with-codegen/symbol-api.mdx index c39ae90bb..8545fb5fe 100644 --- a/docs/building-with-codegen/symbol-api.mdx +++ b/docs/building-with-codegen/symbol-api.mdx @@ -38,7 +38,7 @@ All symbols share common APIs for manipulation: - [symbol.source](/api-reference/core/Symbol#source) - [symbol.docstring](/api-reference/core/Symbol#docstring) - Edit operations - - [symbol.set_docstring](/api-reference/core/Symbol#add_comment) + - [symbol.set_docstring](/api-reference/core/Symbol#set-docstring) - [symbol.move_to_file](/api-reference/core/Symbol#move-to-file) (see [Moving Symbols](/building-with-codegen/moving-symbols)) - Graph relations (See [Usages and Dependencies](/building-with-codegen/dependencies-and-usages)) - [symbol.usages](/api-reference/core/Symbol#usages) diff --git a/docs/mint.json b/docs/mint.json index df6519df0..74d27f063 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -75,6 +75,7 @@ "tutorials/modularity", "tutorials/deleting-dead-code", "tutorials/increase-type-coverage", + "tutorials/training-data", "tutorials/manage-feature-flags", "tutorials/managing-typescript-exports", "tutorials/converting-default-exports", diff --git a/docs/tutorials/training-data.mdx b/docs/tutorials/training-data.mdx new file mode 100644 index 000000000..b988750d9 --- /dev/null +++ b/docs/tutorials/training-data.mdx @@ -0,0 +1,233 @@ +--- +title: "Generating Training Data for LLMs" +sidebarTitle: "Training Data" +description: "Learn how to generate training data for large language models using Codegen" +icon: "network-wired" +iconType: "solid" +--- + +This guide demonstrates how to use Codegen to generate high-quality training data for large language models (LLMs) by extracting function implementations along with their dependencies and usages. This approach is similar to [word2vec](https://www.tensorflow.org/text/tutorials/word2vec) or [node2vec](https://snap.stanford.edu/node2vec/) - given the context of a function, learn to predict the function's implementation. + +View the full code in our [examples repository](https://github.com/codegen-sh/codegen-examples/blob/main/generate_training_data/run.py) + +## Overview + +The process involves three main steps: + +1. Finding all functions in the codebase +2. Extracting their implementations, dependencies, and usages +3. Generating structured training data + +Let's walk through each step using Codegen. + +## Step 1: Finding Functions and Their Context + +First, we will do a "graph expansion" for each function - grab the function's source, as well as the full source of all usages of the function and all dependencies. 
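At a glance, this expansion just walks each function's `dependencies` and `usages` collections. Here is a minimal sketch of that idea (it assumes a codebase already parsed with `Codebase.from_repo`, as shown later in Step 3, and only prints counts rather than building the full training records):

```python
from codegen import Codebase

# Parse a repository (any Git repo URL works here).
codebase = Codebase.from_repo("fastapi/fastapi")

# For each function, peek at the two sides of its graph context:
# the symbols it depends on and the places it is used.
for function in list(codebase.functions)[:5]:
    print(function.filepath)
    print(f"  dependencies: {len(function.dependencies)}")
    print(f"  usages:       {len(function.usages)}")
```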
+ +See [dependencies and usages](/building-with-codegen/dependencies-and-usages) to learn more about navigating the code graph + +First, let's import the types we need from Codegen: + +```python +import codegen +from codegen import Codebase +from codegen.sdk.core.external_module import ExternalModule +from codegen.sdk.core.import_resolution import Import +from codegen.sdk.core.symbol import Symbol +``` + +Here's how we get the full context for each function: + +```python +def get_function_context(function) -> dict: + """Get the implementation, dependencies, and usages of a function.""" + context = { + "implementation": {"source": function.source, "filepath": function.filepath}, + "dependencies": [], + "usages": [], + } + + # Add dependencies + for dep in function.dependencies: + # Hop through imports to find the root symbol source + if isinstance(dep, Import): + dep = hop_through_imports(dep) + + context["dependencies"].append({"source": dep.source, "filepath": dep.filepath}) + + # Add usages + for usage in function.usages: + context["usages"].append({ + "source": usage.usage_symbol.source, + "filepath": usage.usage_symbol.filepath, + }) + + return context +``` + +Notice how we use `hop_through_imports` to resolve dependencies. When working with imports, symbols can be re-exported multiple times. For example, a helper function might be imported and re-exported through several files before being used. We need to follow this chain to find the actual implementation: + +```python +def hop_through_imports(imp: Import) -> Symbol | ExternalModule: + """Finds the root symbol for an import.""" + if isinstance(imp.imported_symbol, Import): + return hop_through_imports(imp.imported_symbol) + return imp.imported_symbol +``` + +This creates a structured representation of each function's context: + +```json +{ + "implementation": { + "source": "def process_data(input: str) -> dict: ...", + "filepath": "src/data_processor.py" + }, + "dependencies": [ + { + "source": "def validate_input(data: str) -> bool: ...", + "filepath": "src/validators.py" + } + ], + "usages": [ + { + "source": "result = process_data(user_input)", + "filepath": "src/api.py" + } + ] +} +``` + +## Step 2: Processing the Codebase + +Next, we process all functions in the codebase to generate our training data: + +```python +def run(codebase: Codebase): + """Generate training data using a node2vec-like approach for code embeddings.""" + # Track all function contexts + training_data = { + "functions": [], + "metadata": { + "total_functions": len(codebase.functions), + "total_processed": 0, + "avg_dependencies": 0, + "avg_usages": 0, + }, + } + + # Process each function in the codebase + for function in codebase.functions: + # Skip if function is too small + if len(function.source.split("\n")) < 2: + continue + + # Get function context + context = get_function_context(function) + + # Only keep functions with enough context + if len(context["dependencies"]) + len(context["usages"]) > 0: + training_data["functions"].append(context) + + # Update metadata + training_data["metadata"]["total_processed"] = len(training_data["functions"]) + if training_data["functions"]: + training_data["metadata"]["avg_dependencies"] = sum( + len(f["dependencies"]) for f in training_data["functions"] + ) / len(training_data["functions"]) + training_data["metadata"]["avg_usages"] = sum( + len(f["usages"]) for f in training_data["functions"] + ) / len(training_data["functions"]) + + return training_data +``` + +## Step 3: Running the Generator + +Finally, we can 
run our training data generator on any codebase. + +See [parsing codebases](/building-with-codegen/parsing-codebases) to learn more + +```python +if __name__ == "__main__": + print("Initializing codebase...") + codebase = Codebase.from_repo("fastapi/fastapi") + + print("Generating training data...") + training_data = run(codebase) + + print("Saving training data...") + with open("training_data.json", "w") as f: + json.dump(training_data, f, indent=2) + print("Training data saved to training_data.json") +``` + +This will: +1. Load the target codebase +2. Process all functions +3. Save the structured training data to a JSON file + + + You can use any Git repository as your source codebase by passing the repo URL + to [Codebase.from_repo(...)](/api-reference/core/codebase#from-repo). + + +## Using the Training Data + +The generated data can be used to train LLMs in several ways: + +1. **Masked Function Prediction**: Hide a function's implementation and predict it from dependencies and usages +2. **Code Embeddings**: Generate embeddings that capture semantic relationships between functions +3. **Dependency Prediction**: Learn to predict which functions are likely to be dependencies +4. **Usage Pattern Learning**: Train models to understand common usage patterns + +For example, to create a masked prediction task: + +```python +def create_training_example(function_data): + """Create a masked prediction example from function data.""" + return { + "context": { + "dependencies": function_data["dependencies"], + "usages": function_data["usages"] + }, + "target": function_data["implementation"] + } + +# Create training examples +examples = [create_training_example(f) for f in training_data["functions"]] +``` + +## Best Practices + +1. **Filter Small Functions**: Skip trivial functions that won't provide meaningful training data: +```python +if len(function.source.split("\n")) < 2: + continue +``` + +2. **Ensure Sufficient Context**: Only use functions with dependencies or usages: +```python +if len(context["dependencies"]) + len(context["usages"]) > 0: + training_data["functions"].append(context) +``` + +3. **Track Metadata**: Keep statistics about your training data: +```python +training_data["metadata"] = { + "total_functions": len(codebase.functions), + "total_processed": len(training_data["functions"]), + "avg_dependencies": average_dependencies, + "avg_usages": average_usages +} +``` + +4. **Handle Import Chains**: Follow import chains to find root implementations: +```python +def hop_through_imports(imp: Import) -> Symbol | ExternalModule: + if isinstance(imp.imported_symbol, Import): + return hop_through_imports(imp.imported_symbol) + return imp.imported_symbol +``` + +By following these guidelines, you can generate high-quality training data for your LLM projects while maintaining code quality and consistency. 
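As a final, optional step, you might serialize the masked-prediction examples into plain prompt/completion pairs for fine-tuning. The sketch below shows one possible format (the `to_prompt_completion` helper and the `fine_tune_pairs.jsonl` filename are illustrative; adapt them to whatever your training framework expects). It reuses `create_training_example` from the section above:

```python
import json

def to_prompt_completion(example: dict) -> dict:
    """Render one masked-prediction example as a prompt/completion pair."""
    deps = "\n\n".join(d["source"] for d in example["context"]["dependencies"])
    uses = "\n\n".join(u["source"] for u in example["context"]["usages"])
    prompt = (
        "Predict the implementation of the function from its context.\n\n"
        f"# Dependencies\n{deps}\n\n"
        f"# Usages\n{uses}\n\n"
        "# Implementation\n"
    )
    return {"prompt": prompt, "completion": example["target"]["source"]}

# Load the data produced by the generator above.
with open("training_data.json") as f:
    training_data = json.load(f)

examples = [create_training_example(fn) for fn in training_data["functions"]]
pairs = [to_prompt_completion(example) for example in examples]

with open("fine_tune_pairs.jsonl", "w") as out:
    for pair in pairs:
        out.write(json.dumps(pair) + "\n")
```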
\ No newline at end of file diff --git a/src/codegen/cli/api/endpoints.py b/src/codegen/cli/api/endpoints.py index c44e0b81a..10c30e044 100644 --- a/src/codegen/cli/api/endpoints.py +++ b/src/codegen/cli/api/endpoints.py @@ -9,3 +9,23 @@ LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-lookup.modal.run" RUN_ON_PR_ENDPOINT = f"https://{MODAL_PREFIX}--cli-run-on-pull-request.modal.run" PR_LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-pr-lookup.modal.run" + +# Base URLs +CODEGEN_API_URL = "https://api.codegen.sh" +CODEGEN_WEB_URL = "https://codegen.sh" + +# API endpoints +CODEGEN_API_DOCS = f"{CODEGEN_API_URL}/docs" +CODEGEN_API_EXAMPLES = f"{CODEGEN_API_URL}/examples" +CODEGEN_API_CODEMOD = f"{CODEGEN_API_URL}/codemod" +CODEGEN_API_CODEMOD_DEPLOY = f"{CODEGEN_API_URL}/codemod/deploy" +CODEGEN_API_CODEMOD_DEPLOY_STATUS = f"{CODEGEN_API_URL}/codemod/deploy/status" +CODEGEN_API_CODEMOD_DEPLOY_CANCEL = f"{CODEGEN_API_URL}/codemod/deploy/cancel" +CODEGEN_API_CODEMOD_DEPLOY_LOGS = f"{CODEGEN_API_URL}/codemod/deploy/logs" + +# Web URLs +CODEGEN_WEB_PLAYGROUND = f"{CODEGEN_WEB_URL}/playground" +CODEGEN_WEB_DOCS = f"{CODEGEN_WEB_URL}/docs" + +# System prompt URL +CODEGEN_SYSTEM_PROMPT_URL = "https://gist.githubusercontent.com/jayhack/15681a2ceaccd726f19e6fdb3a44738b/raw/17c08054e3931b3b7fdf424458269c9e607541e8/codegen-system-prompt.txt" diff --git a/src/codegen/cli/commands/init/render.py b/src/codegen/cli/commands/init/render.py index 27b02749a..665db8246 100644 --- a/src/codegen/cli/commands/init/render.py +++ b/src/codegen/cli/commands/init/render.py @@ -6,5 +6,4 @@ def get_success_message(codegen_dir: Path, docs_dir: Path, examples_dir: Path) - return """📁 .codegen configuration folder created: [dim]config.toml[/dim] Project configuration [dim]codemods/[/dim] Your codemod implementations - [dim]jupyter/[/dim] Notebooks for codebase exploration - [dim]prompts/[/dim] AI system prompts (gitignored)""" + [dim]codegen-system-prompt.txt[/dim] AI system prompt (gitignored)""" diff --git a/src/codegen/cli/workspace/initialize_workspace.py b/src/codegen/cli/workspace/initialize_workspace.py index eac71b5c3..bfcdbcf78 100644 --- a/src/codegen/cli/workspace/initialize_workspace.py +++ b/src/codegen/cli/workspace/initialize_workspace.py @@ -2,6 +2,7 @@ from contextlib import nullcontext from pathlib import Path +import requests import rich import toml from rich.status import Status @@ -78,6 +79,7 @@ def initialize_codegen( CONFIG_PATH = CODEGEN_FOLDER / "config.toml" JUPYTER_DIR = CODEGEN_FOLDER / "jupyter" CODEMODS_DIR = CODEGEN_FOLDER / "codemods" + SYSTEM_PROMPT_PATH = CODEGEN_FOLDER / "codegen-system-prompt.txt" # If status is a string, create a new spinner context = create_spinner(f" {status} folders...") if isinstance(status, str) else nullcontext() @@ -91,6 +93,16 @@ def initialize_codegen( JUPYTER_DIR.mkdir(parents=True, exist_ok=True) CODEMODS_DIR.mkdir(parents=True, exist_ok=True) + # Download system prompt + try: + from codegen.cli.api.endpoints import CODEGEN_SYSTEM_PROMPT_URL + + response = requests.get(CODEGEN_SYSTEM_PROMPT_URL) + response.raise_for_status() + SYSTEM_PROMPT_PATH.write_text(response.text) + except Exception as e: + rich.print(f"[yellow]Warning: Could not download system prompt: {e}[/yellow]") + if not repo: rich.print("No git repository found. 
Please run this command in a git repository.") else: @@ -152,6 +164,7 @@ def modify_gitignore(codegen_folder: Path): "examples/", "prompts/", "jupyter/", + "codegen-system-prompt.txt", # Add system prompt to gitignore "", "# Python cache files", "__pycache__/", From 5a4ddd7e70e0964215569a0ab7d531c99a13b980 Mon Sep 17 00:00:00 2001 From: codegen-bot Date: Sun, 26 Jan 2025 17:38:32 -0800 Subject: [PATCH 2/3] . --- docs/tutorials/training-data.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/tutorials/training-data.mdx b/docs/tutorials/training-data.mdx index b988750d9..6cd9c72d1 100644 --- a/docs/tutorials/training-data.mdx +++ b/docs/tutorials/training-data.mdx @@ -10,6 +10,8 @@ This guide demonstrates how to use Codegen to generate high-quality training dat View the full code in our [examples repository](https://github.com/codegen-sh/codegen-examples/blob/main/generate_training_data/run.py) +This example works with both Python and Typescript repositories without modification + ## Overview The process involves three main steps: From 5102ce5e3bef499aed261dcc256915c8740a966f Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Sun, 26 Jan 2025 17:44:01 -0800 Subject: [PATCH 3/3] Update endpoints.py --- src/codegen/cli/api/endpoints.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/codegen/cli/api/endpoints.py b/src/codegen/cli/api/endpoints.py index 10c30e044..f8b56513b 100644 --- a/src/codegen/cli/api/endpoints.py +++ b/src/codegen/cli/api/endpoints.py @@ -9,23 +9,4 @@ LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-lookup.modal.run" RUN_ON_PR_ENDPOINT = f"https://{MODAL_PREFIX}--cli-run-on-pull-request.modal.run" PR_LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-pr-lookup.modal.run" - -# Base URLs -CODEGEN_API_URL = "https://api.codegen.sh" -CODEGEN_WEB_URL = "https://codegen.sh" - -# API endpoints -CODEGEN_API_DOCS = f"{CODEGEN_API_URL}/docs" -CODEGEN_API_EXAMPLES = f"{CODEGEN_API_URL}/examples" -CODEGEN_API_CODEMOD = f"{CODEGEN_API_URL}/codemod" -CODEGEN_API_CODEMOD_DEPLOY = f"{CODEGEN_API_URL}/codemod/deploy" -CODEGEN_API_CODEMOD_DEPLOY_STATUS = f"{CODEGEN_API_URL}/codemod/deploy/status" -CODEGEN_API_CODEMOD_DEPLOY_CANCEL = f"{CODEGEN_API_URL}/codemod/deploy/cancel" -CODEGEN_API_CODEMOD_DEPLOY_LOGS = f"{CODEGEN_API_URL}/codemod/deploy/logs" - -# Web URLs -CODEGEN_WEB_PLAYGROUND = f"{CODEGEN_WEB_URL}/playground" -CODEGEN_WEB_DOCS = f"{CODEGEN_WEB_URL}/docs" - -# System prompt URL CODEGEN_SYSTEM_PROMPT_URL = "https://gist.githubusercontent.com/jayhack/15681a2ceaccd726f19e6fdb3a44738b/raw/17c08054e3931b3b7fdf424458269c9e607541e8/codegen-system-prompt.txt"