feat: fetches system-prompt + guide (#111)

jayhack · codegen-bot · web-flow · commit 0a79e0402992 · 2025-01-26T17:55:43.000-08:00
# Motivation
<!-- Why is this change necessary? -->

# Content
<!-- Please include a summary of the change -->
# Testing
<!-- How was the change tested? -->
# Please check the following before marking your PR as ready for review

- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as
needed
- [ ] I have read and agree to the [Contributor License
Agreement](../CLA.md)

---------

Co-authored-by: codegen-bot <team+codegenbot@codegen.sh>
diff --git a/docs/building-with-codegen/symbol-api.mdx b/docs/building-with-codegen/symbol-api.mdx
@@ -38,7 +38,7 @@ All symbols share common APIs for manipulation:
   - [symbol.source](/api-reference/core/Symbol#source)
   - [symbol.docstring](/api-reference/core/Symbol#docstring)
 - Edit operations
-  - [symbol.set_docstring](/api-reference/core/Symbol#add_comment)
+  - [symbol.set_docstring](/api-reference/core/Symbol#set-docstring)
   - [symbol.move_to_file](/api-reference/core/Symbol#move-to-file) (see [Moving Symbols](/building-with-codegen/moving-symbols))
 - Graph relations (See [Usages and Dependencies](/building-with-codegen/dependencies-and-usages))
   - [symbol.usages](/api-reference/core/Symbol#usages)
diff --git a/docs/mint.json b/docs/mint.json
@@ -75,6 +75,7 @@
 				"tutorials/modularity",
 				"tutorials/deleting-dead-code",
 				"tutorials/increase-type-coverage",
+				"tutorials/training-data",
 				"tutorials/manage-feature-flags",
 				"tutorials/managing-typescript-exports",
 				"tutorials/converting-default-exports",
diff --git a/docs/tutorials/training-data.mdx b/docs/tutorials/training-data.mdx
@@ -0,0 +1,235 @@
+---
+title: "Generating Training Data for LLMs"
+sidebarTitle: "Training Data"
+description: "Learn how to generate training data for large language models using Codegen"
+icon: "network-wired"
+iconType: "solid"
+---
+
+This guide demonstrates how to use Codegen to generate high-quality training data for large language models (LLMs) by extracting function implementations along with their dependencies and usages. This approach is similar to [word2vec](https://www.tensorflow.org/text/tutorials/word2vec) or [node2vec](https://snap.stanford.edu/node2vec/) - given the context of a function, learn to predict the function's implementation.
+
+<Info>View the full code in our [examples repository](https://github.com/codegen-sh/codegen-examples/blob/main/generate_training_data/run.py)</Info>
+
+<Tip>This example works with both Python and TypeScript repositories without modification</Tip>
+
+## Overview
+
+The process involves three main steps:
+
+1. Finding all functions in the codebase
+2. Extracting their implementations, dependencies, and usages
+3. Generating structured training data
+
+Let's walk through each step using Codegen.
+
+## Step 1: Finding Functions and Their Context
+
+First, we will do a "graph expansion" for each function - grab the function's source, as well as the full source of all usages of the function and all dependencies.
+
+<Info>See [dependencies and usages](/building-with-codegen/dependencies-and-usages) to learn more about navigating the code graph</Info>
+
+Start by importing the types we need from Codegen:
+
+```python
+import codegen
+from codegen import Codebase
+from codegen.sdk.core.external_module import ExternalModule
+from codegen.sdk.core.import_resolution import Import
+from codegen.sdk.core.symbol import Symbol
+```
+
+Here's how we get the full context for each function:
+
+```python
+def get_function_context(function) -> dict:
+    """Collect a function's source together with its dependencies and usages."""
+    # The function itself
+    implementation = {"source": function.source, "filepath": function.filepath}
+
+    # Everything the function relies on. Imports are followed to the symbol they
+    # ultimately point at, so we record the root symbol rather than the import.
+    dependencies = []
+    for dep in function.dependencies:
+        root = hop_through_imports(dep) if isinstance(dep, Import) else dep
+        dependencies.append({"source": root.source, "filepath": root.filepath})
+
+    # Every usage of the function, recorded via the usage's `usage_symbol`
+    # (its source and the file it lives in).
+    usages = [
+        {"source": usage.usage_symbol.source, "filepath": usage.usage_symbol.filepath}
+        for usage in function.usages
+    ]
+
+    return {
+        "implementation": implementation,
+        "dependencies": dependencies,
+        "usages": usages,
+    }
+```
+
+Notice how we use `hop_through_imports` to resolve dependencies. When working with imports, symbols can be re-exported multiple times. For example, a helper function might be imported and re-exported through several files before being used. We need to follow this chain to find the actual implementation:
+
+```python
+def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
+    """Follow a chain of re-exporting imports and return the first target that is not an Import."""
+    if isinstance(imp.imported_symbol, Import):  # target is itself an import: keep following
+        return hop_through_imports(imp.imported_symbol)
+    return imp.imported_symbol  # reached a non-import target
+```
+
+This creates a structured representation of each function's context:
+
+```json
+{
+  "implementation": {
+    "source": "def process_data(input: str) -> dict: ...",
+    "filepath": "src/data_processor.py"
+  },
+  "dependencies": [
+    {
+      "source": "def validate_input(data: str) -> bool: ...",
+      "filepath": "src/validators.py"
+    }
+  ],
+  "usages": [
+    {
+      "source": "result = process_data(user_input)",
+      "filepath": "src/api.py"
+    }
+  ]
+}
+```
+
+## Step 2: Processing the Codebase
+
+Next, we process all functions in the codebase to generate our training data:
+
+```python
+def run(codebase: Codebase):
+    """Build the training set: one context record per function that passes the filters, plus summary metadata."""
+    # Output structure: the kept function contexts and statistics about them
+    training_data = {
+        "functions": [],
+        "metadata": {
+            "total_functions": len(codebase.functions),
+            "total_processed": 0,  # replaced below with the number of functions kept
+            "avg_dependencies": 0,
+            "avg_usages": 0,
+        },
+    }
+
+    # Process each function in the codebase
+    for function in codebase.functions:
+        # Skip functions whose source fits on a single line
+        if len(function.source.split("\n")) < 2:
+            continue
+
+        # Gather the function's implementation, dependencies, and usages
+        context = get_function_context(function)
+
+        # Keep only functions that have at least one dependency or usage
+        if len(context["dependencies"]) + len(context["usages"]) > 0:
+            training_data["functions"].append(context)
+
+    # Update metadata; the averages keep their default of 0 when nothing was kept
+    training_data["metadata"]["total_processed"] = len(training_data["functions"])
+    if training_data["functions"]:
+        training_data["metadata"]["avg_dependencies"] = sum(
+            len(f["dependencies"]) for f in training_data["functions"]
+        ) / len(training_data["functions"])
+        training_data["metadata"]["avg_usages"] = sum(
+            len(f["usages"]) for f in training_data["functions"]
+        ) / len(training_data["functions"])
+
+    return training_data
+```
+
+## Step 3: Running the Generator
+
+Finally, we can run our training data generator on any codebase.
+
+<Note>See [parsing codebases](/building-with-codegen/parsing-codebases) to learn more</Note>
+
+```python
+if __name__ == "__main__":
+    import json  # used by json.dump below; not among the imports shown in Step 1
+
+    # Load the target repository, build the training data, then write it to disk
+    print("Initializing codebase...")
+    codebase = Codebase.from_repo("fastapi/fastapi")
+    print("Generating training data...")
+    training_data = run(codebase)
+    with open("training_data.json", "w") as f:
+        json.dump(training_data, f, indent=2)
+    print("Training data saved to training_data.json")
+```
+
+This will:
+1. Load the target codebase
+2. Process all functions
+3. Save the structured training data to a JSON file
+
+<Tip>
+  You can use any Git repository as your source codebase by passing the repo URL
+  to [Codebase.from_repo(...)](/api-reference/core/codebase#from-repo).
+</Tip>
+
+## Using the Training Data
+
+The generated data can be used to train LLMs in several ways:
+
+1. **Masked Function Prediction**: Hide a function's implementation and predict it from dependencies and usages
+2. **Code Embeddings**: Generate embeddings that capture semantic relationships between functions
+3. **Dependency Prediction**: Learn to predict which functions are likely to be dependencies
+4. **Usage Pattern Learning**: Train models to understand common usage patterns
+
+For example, to create a masked prediction task:
+
+```python
+def create_training_example(function_data):
+    """Build a masked-prediction sample: surrounding context in, implementation out."""
+    # What the model is allowed to see: the function's neighbours in the code graph
+    visible_context = {
+        "dependencies": function_data["dependencies"],
+        "usages": function_data["usages"],
+    }
+    # What the model must predict: the hidden implementation
+    return {"context": visible_context, "target": function_data["implementation"]}
+
+# Create training examples
+examples = [create_training_example(f) for f in training_data["functions"]]
+```
+
+## Best Practices
+
+1. **Filter Small Functions**: Skip trivial functions that won't provide meaningful training data:
+```python
+if len(function.source.split("\n")) < 2:
+    continue
+```
+
+2. **Ensure Sufficient Context**: Only use functions with dependencies or usages:
+```python
+if len(context["dependencies"]) + len(context["usages"]) > 0:
+    training_data["functions"].append(context)
+```
+
+3. **Track Metadata**: Keep statistics about your training data:
+```python
+training_data["metadata"] = {
+    "total_functions": len(codebase.functions),
+    "total_processed": len(training_data["functions"]),
+    "avg_dependencies": average_dependencies,
+    "avg_usages": average_usages
+}
+```
+
+4. **Handle Import Chains**: Follow import chains to find root implementations:
+```python
+def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
+    if isinstance(imp.imported_symbol, Import):  # target is itself an import: keep following
+        return hop_through_imports(imp.imported_symbol)
+    return imp.imported_symbol  # reached a non-import target
+```
+
+By following these guidelines, you can generate high-quality training data for your LLM projects while maintaining code quality and consistency. 
diff --git a/src/codegen/cli/api/endpoints.py b/src/codegen/cli/api/endpoints.py
@@ -9,3 +9,4 @@
 LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-lookup.modal.run"
 RUN_ON_PR_ENDPOINT = f"https://{MODAL_PREFIX}--cli-run-on-pull-request.modal.run"
 PR_LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-pr-lookup.modal.run"
+CODEGEN_SYSTEM_PROMPT_URL = "https://gist.githubusercontent.com/jayhack/15681a2ceaccd726f19e6fdb3a44738b/raw/17c08054e3931b3b7fdf424458269c9e607541e8/codegen-system-prompt.txt"
diff --git a/src/codegen/cli/commands/init/render.py b/src/codegen/cli/commands/init/render.py
@@ -6,5 +6,4 @@ def get_success_message(codegen_dir: Path, docs_dir: Path, examples_dir: Path) -
     return """📁 .codegen configuration folder created:
    [dim]config.toml[/dim]  Project configuration
    [dim]codemods/[/dim]    Your codemod implementations
-   [dim]jupyter/[/dim]     Notebooks for codebase exploration
-   [dim]prompts/[/dim]     AI system prompts (gitignored)"""
+   [dim]codegen-system-prompt.txt[/dim]     AI system prompt (gitignored)"""
diff --git a/src/codegen/cli/workspace/initialize_workspace.py b/src/codegen/cli/workspace/initialize_workspace.py
@@ -2,6 +2,7 @@
 from contextlib import nullcontext
 from pathlib import Path
 
+import requests
 import rich
 import toml
 from rich.status import Status
@@ -78,6 +79,7 @@ def initialize_codegen(
     CONFIG_PATH = CODEGEN_FOLDER / "config.toml"
     JUPYTER_DIR = CODEGEN_FOLDER / "jupyter"
     CODEMODS_DIR = CODEGEN_FOLDER / "codemods"
+    SYSTEM_PROMPT_PATH = CODEGEN_FOLDER / "codegen-system-prompt.txt"
 
     # If status is a string, create a new spinner
     context = create_spinner(f"   {status} folders...") if isinstance(status, str) else nullcontext()
@@ -91,6 +93,16 @@ def initialize_codegen(
         JUPYTER_DIR.mkdir(parents=True, exist_ok=True)
         CODEMODS_DIR.mkdir(parents=True, exist_ok=True)
 
+        # Download system prompt
+        try:
+            from codegen.cli.api.endpoints import CODEGEN_SYSTEM_PROMPT_URL
+
+            response = requests.get(CODEGEN_SYSTEM_PROMPT_URL)
+            response.raise_for_status()
+            SYSTEM_PROMPT_PATH.write_text(response.text)
+        except Exception as e:
+            rich.print(f"[yellow]Warning: Could not download system prompt: {e}[/yellow]")
+
         if not repo:
             rich.print("No git repository found. Please run this command in a git repository.")
         else:
@@ -152,6 +164,7 @@ def modify_gitignore(codegen_folder: Path):
         "examples/",
         "prompts/",
         "jupyter/",
+        "codegen-system-prompt.txt",  # Add system prompt to gitignore
         "",
         "# Python cache files",
         "__pycache__/",