diff --git a/.codegen/.gitignore b/.codegen/.gitignore deleted file mode 100644 index 77d89d205..000000000 --- a/.codegen/.gitignore +++ /dev/null @@ -1,15 +0,0 @@ -# Codegen -docs/ -examples/ -prompts/ -jupyter/ -.venv/ -.env -codegen-system-prompt.txt - -# Python cache files -**/__pycache__/ -*.py[cod] -*$py.class -*.txt -*.pyc diff --git a/.codegen/codemods/no_link_backticks/no_link_backticks.py b/.codegen/codemods/no_link_backticks/no_link_backticks.py deleted file mode 100644 index e8cda5323..000000000 --- a/.codegen/codemods/no_link_backticks/no_link_backticks.py +++ /dev/null @@ -1,44 +0,0 @@ -import codegen -from codegen import Codebase - - -@codegen.function(name="no-link-backticks", subdirectories=["test/unit"]) -def run(codebase: Codebase): - import re - - # Define the pattern for Markdown links with backticks in the link text - link_pattern = re.compile(r"\[([^\]]*`[^\]]*`[^\]]*)\]\(([^)]+)\)") - - # Iterate over all .mdx files in the codebase - for file in codebase.files(extensions=["mdx"]): - if file.extension == ".mdx": - print(f"Processing {file.path}") - new_content = file.content - - # Find all markdown links with backticks in link text - matches = link_pattern.finditer(new_content) - - for match in matches: - # Original link text with backticks - original_text = match.group(1) - - # Remove backticks from the link text - new_text = original_text.replace("`", "") - - # Replace the link in content - new_content = new_content.replace(match.group(0), f"[{new_text}]({match.group(2)})") - - # Update file content if changes were made - if new_content != file.content: - file.edit(new_content) - - # Commit all changes - codebase.commit() - - -if __name__ == "__main__": - print("Parsing codebase...") - codebase = Codebase("./") - - print("Running function...") - codegen.run(run) diff --git a/.codegen/codemods/test_language/test_language.py b/.codegen/codemods/test_language/test_language.py deleted file mode 100644 index 19ae4c0bd..000000000 --- a/.codegen/codemods/test_language/test_language.py +++ /dev/null @@ -1,19 +0,0 @@ -import codegen -from codegen.sdk.core.codebase import Codebase -from codegen.shared.enums.programming_language import ProgrammingLanguage - - -@codegen.function("test-language", subdirectories=["src/codegen/cli"], language=ProgrammingLanguage.PYTHON) -def run(codebase: Codebase): - file = codebase.get_file("src/codegen/cli/errors.py") - print(f"File: {file.path}") - for s in file.symbols: - print(s.name) - - -if __name__ == "__main__": - print("Parsing codebase...") - codebase = Codebase("./") - - print("Running...") - run(codebase) diff --git a/.codegen/codemods/update_loggers/update_loggers.py b/.codegen/codemods/update_loggers/update_loggers.py deleted file mode 100644 index 74edee3e1..000000000 --- a/.codegen/codemods/update_loggers/update_loggers.py +++ /dev/null @@ -1,18 +0,0 @@ -import codegen -from codegen.sdk.core.codebase import PyCodebaseType - - -@codegen.function("update-loggers") -def run(codebase: PyCodebaseType) -> None: - """Updates all loggers in src/codegen to use the new get_logger function.""" - for file in codebase.files: - if not str(file.filepath).startswith("src/codegen/"): - continue - - if file.get_import("logging") is None: - continue - - if (logger := file.get_global_var("logger")) and logger.value.source == "logging.getLogger(__name__)": - print(f"Updating logger in {file.filepath}") - logger.set_value("get_logger(__name__)") - file.add_import_from_import_string("\nfrom codegen.shared.logging.get_logger import get_logger") diff 
--git a/.github/actions/setup-oss-repos/action.yml b/.github/actions/setup-oss-repos/action.yml index 4ec25be83..c7951d599 100644 --- a/.github/actions/setup-oss-repos/action.yml +++ b/.github/actions/setup-oss-repos/action.yml @@ -1,23 +1,8 @@ -# yaml-language-server: $schema=https://json.schemastore.org/github-action.json -name: "Setup OSS repos" -description: "Setup OSS repos" -# TODO: add size filter +name: "Setup OSS repos (disabled)" +description: "OSS repos setup has been disabled" runs: using: "composite" steps: - - name: Cache oss-repos - id: cache-oss-repos - uses: actions/cache@v4 - with: - path: oss_repos - key: ${{ runner.os }}-repo-cache-2-${{hashFiles('codegen-backend/codegen_tests/graph_sitter/codemod/repos/open_source/*.json')}} - - name: Populate oss-repos if the cache is empty - if: steps.cache-oss-repos.outputs.cache-hit != 'true' + - name: Skip OSS repos setup shell: bash - run: | - uv run --frozen python -m tests.shared.codemod.commands clone-repos --clean-cache - env: - GITHUB_WORKSPACE: $GITHUB_WORKSPACE - - name: Verify cache contents - shell: bash - run: ls -la $GITHUB_WORKSPACE/oss_repos/ + run: echo "OSS repos setup is disabled" diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 153fe1d9b..483a1a0a2 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -37,8 +37,9 @@ jobs: - run: uv run --frozen pre-commit run --show-diff-on-failure --color=always --all-files --source ${{ github.event.pull_request.base.sha || github.event.before }} --origin ${{ github.event.pull_request.head.sha || github.event.after }} shell: bash - - uses: stefanzweifel/git-auto-commit-action@v5 - if: ${{ always() && env.REPO_SCOPED_TOKEN && github.event_name == 'pull_request' }} - with: - commit_message: "Automated pre-commit update" - push_options: "--no-verify" + # Temporarily disabled to prevent infinite loop with version updates + # - uses: stefanzweifel/git-auto-commit-action@v5 + # if: ${{ always() && env.REPO_SCOPED_TOKEN && github.event_name == 'pull_request' }} + # with: + # commit_message: "Automated pre-commit update" + # push_options: "--no-verify" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c2d87b75f..b982d045f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -26,22 +26,8 @@ permissions: jobs: build: - name: Build 3.${{ matrix.python }} ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ - ubuntu-latest, - ubuntu-22.04-arm, # https://github.com/actions/partner-runner-images/issues/37 https://github.com/orgs/community/discussions/148648#discussioncomment-12099554 - macos-latest, - macos-14-large - ] - python: [ - 12, - 13, - ] - + name: Build Pure Python Wheel + runs-on: ubuntu-latest steps: - name: Github context env: @@ -58,11 +44,10 @@ jobs: uses: astral-sh/setup-uv@v5.4 id: setup-uv with: - enable-cache: false + enable-cache: true prune-cache: false - python-version: 3.${{ matrix.python }} + python-version: "3.12" # Use single Python version for building version: '0.5.24' - cache-suffix: 3.${{ matrix.python }} - name: Fetch tags if: ${{ inputs.release-tag || startsWith(github.ref, 'refs/tags/') }} @@ -70,16 +55,14 @@ jobs: git branch git fetch --depth=1 origin +refs/tags/*:refs/tags/* - # TODO: add cbuildwheel cache - name: Build wheel - uses: pypa/cibuildwheel@v2.23.3 - env: - CIBW_BUILD: "*cp3${{ matrix.python }}*" + run: | + uv build --wheel --out-dir dist/ - uses: actions/upload-artifact@v4 
with: - name: wheels-${{ matrix.os }}-3.${{ matrix.python }} - path: ./wheelhouse/*.whl + name: wheels + path: ./dist/*.whl release: if: ${{ inputs.release-tag || startsWith(github.ref, 'refs/tags/') }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 95936543e..02c884cd3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -99,76 +99,7 @@ jobs: env: GITHUB_WORKSPACE: $GITHUB_WORKSPACE - parse-tests: - needs: access-check - if: contains(github.event.pull_request.labels.*.name, 'parse-tests') || github.event_name == 'push' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest-32 - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - - - name: Setup environment - uses: ./.github/actions/setup-environment - - - name: Cache oss-repos - uses: ./.github/actions/setup-oss-repos - - - name: Install yarn and pnpm - run: | - npm install -g yarn & - npm install -g pnpm - - name: Test with pytest - timeout-minutes: 15 - env: - GITHUB_WORKSPACE: $GITHUB_WORKSPACE - run: | - uv run pytest \ - -n auto \ - -o junit_suite_name="${{github.job}}" \ - tests/integration/codemod/test_parse.py - - - uses: ./.github/actions/report - with: - flag: no-flag - codecov_token: ${{ secrets.CODECOV_TOKEN }} - - - name: Notify parse tests failure - uses: slackapi/slack-github-action@v2.1.0 - if: failure() && github.event_name == 'push' && false - with: - webhook: ${{ secrets.SLACK_WEBHOOK_URL }} - webhook-type: incoming-webhook - payload: | - { - "blocks": [ - { - "type": "header", - "text": { - "type": "plain_text", - "text": "❌ Parse Tests Failed", - "emoji": true - } - }, - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": "*Branch:* ${{ github.ref_name }}\n*Triggered by:* <${{ github.server_url }}/${{ github.actor }}|@${{ github.actor }}>\n\n*Details:*\n• <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View workflow run>" - } - }, - { - "type": "context", - "elements": [ - { - "type": "mrkdwn", - "text": "Failed at " - } - ] - } - ] - } integration-tests: needs: access-check diff --git a/.gitignore b/.gitignore index 2c38ccae0..00f68e676 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ alembic_versions_backup /.nvmrc **/build/test-results/test/TEST*.xml src/codegen/sdk/__init__.py +src/codegen/_version.py src/**/*.html .ccache/ uv-*.tar.gz diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eeea3f677..dda073d87 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,6 @@ default_language_version: python: python3.13 repos: - - repo: https://github.com/ComPWA/taplo-pre-commit rev: v0.9.3 hooks: @@ -24,12 +23,7 @@ repos: - id: biome-check language: node additional_dependencies: ["@biomejs/biome@1.9.4"] - exclude: (src/codemods/eval)|(tests/unit/skills/snapshots)|(tests/unit/codegen/sdk/output)|(tests/integration/verified_codemods)|(docs/samples) - - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.6 - hooks: - - id: cython-lint - - id: double-quote-cython-strings + exclude: (src/codemods/eval)|(tests/unit/skills/snapshots)|(tests/unit/codegen/sdk/output)|(tests/integration/verified_codemods)|(docs/) - repo: https://github.com/kynan/nbstripout rev: 0.8.1 @@ -88,13 +82,13 @@ repos: args: ["--frozen", "--all-packages", "--all-extras"] - repo: https://github.com/hukkin/mdformat - rev: 0.7.22 # Use the ref you want to point at + rev: 0.7.22 # Use the ref you want to point at hooks: - - id: mdformat - 
language: python - # Optionally add plugins - additional_dependencies: - - mdformat-gfm - - mdformat-ruff - - mdformat-config - - mdformat-pyproject + - id: mdformat + language: python + # Optionally add plugins + additional_dependencies: + - mdformat-gfm + - mdformat-ruff + - mdformat-config + - mdformat-pyproject diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index a71cfdd77..000000000 --- a/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG PYTHON_VERSION=3.13 -ARG CODEGEN_BOT_GHE_TOKEN="" -FROM ghcr.io/astral-sh/uv:python${PYTHON_VERSION}-bookworm-slim AS base_uv -ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy -ENV GITHUB_WORKSPACE=/workspace -## Change the working directory to the `codegen-sdk` directory -FROM base_uv AS install-tools -RUN apt-get update && apt-get install -y build-essential curl git -RUN curl -fsSL https://deb.nodesource.com/setup_23.x -o nodesource_setup.sh -RUN bash nodesource_setup.sh -RUN apt-get update && apt-get install -y jq nodejs -RUN corepack enable -RUN --mount=type=cache,target=/root/.cache/uv uv pip install --system coverage -RUN --mount=type=cache,target=/root/.cache/uv uv tool install codecov-cli --python 3.10 -RUN --mount=type=cache,target=/root/.cache/uv uv tool install pre-commit --with pre-commit-uv -WORKDIR /codegen-sdk -ENTRYPOINT [ "uv", "run", "--frozen", "/bin/bash"] -FROM install-tools AS base-image -## Install dependencies -RUN --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,source=uv.lock,target=uv.lock \ - --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ - --mount=type=bind,source=hatch.toml,target=hatch.toml \ - uv sync --frozen --no-install-workspace --all-extras -ADD . /codegen-sdk -## Sync the project -RUN --mount=type=cache,target=/root/.cache/uv \ - uv sync --frozen --all-extras -FROM base-image AS pre-commit -RUN --mount=type=cache,target=/root/.cache/uv \ - --mount=type=cache,target=/root/.cache/pre-commit \ - uv run pre-commit install-hooks -FROM base-image AS extra-repos -ARG CODEGEN_BOT_GHE_TOKEN="" -RUN uv run python -m tests.shared.codemod.commands clone-repos --clean-cache --extra-repos --token ${CODEGEN_BOT_GHE_TOKEN} -FROM base-image AS oss-repos -ARG CODEGEN_BOT_GHE_TOKEN="" -RUN uv run python -m tests.shared.codemod.commands clone-repos --clean-cache --token ${CODEGEN_BOT_GHE_TOKEN} diff --git a/architecture/1. plumbing/file-discovery.md b/architecture/1. plumbing/file-discovery.md deleted file mode 100644 index f4c3998d0..000000000 --- a/architecture/1. plumbing/file-discovery.md +++ /dev/null @@ -1,19 +0,0 @@ -# File Discovery - -The file discovery process is responsible for identifying and organizing all relevant files in a project that need to be processed by the SDK. - -## Initialization - -- We take in either a list of projects or a path to a filesystem. -- If we get a path, we'll detect the programming language, initialize the git client based on the path and get a Project - -## File discovery - -- We discover files using the git client so we can respect gitignored files -- We then filter files based on the language and the project configuration - - If specified, we filter by subdirectories - - We also filter by file extensions - -## Next Step - -After file discovery is complete, the files are passed to the [Tree-sitter Parsing](../parsing/tree-sitter.md) phase, where each file is parsed into a concrete syntax tree. diff --git a/architecture/2. parsing/A. Tree Sitter.md b/architecture/2. parsing/A. 
Tree Sitter.md deleted file mode 100644 index 3500b65fd..000000000 --- a/architecture/2. parsing/A. Tree Sitter.md +++ /dev/null @@ -1,33 +0,0 @@ -# Tree-sitter Parsing - -Tree-sitter is used as the primary parsing engine for converting source code into concrete syntax trees. Tree-sitter supports two modes of operation: - -```python -def my_function(): - pass -``` - -Tree-sitter parses this as the following: - -``` -module [0, 0] - [3, 0] - function_definition [0, 0] - [1, 8] - name: identifier [0, 4] - [0, 15] - parameters: parameters [0, 15] - [0, 17] - body: block [1, 4] - [1, 8] - pass_statement [1, 4] - [1, 8] -``` - -- A CST mode which includes syntax nodes (for example, the `def` keyword, spaces, or parentheses). The syntax nodes are "anonymous" and don't have any semantic meaning. - - You don't see these nodes in the tree-sitter output, but they are there. -- An AST mode where we only focus on the semantic nodes (for example, the `my_function` identifier, and the `pass` statement). These are 'named nodes' and have semantic meaning. - - This is different from field names (like 'body'). Field names say nothing about the node itself; they indicate what role the child node ('block') plays in the parent node ('function_definition'). - -## Implementation Details - -- We construct a mapping between file type and the tree-sitter grammar -- For each file given to us (via git), we parse it using the appropriate grammar - -## Next Step - -Once the concrete syntax trees are built, they are transformed into our abstract syntax tree representation in the [AST Construction](./B.%20AST%20Construction.md) phase. diff --git a/architecture/2. parsing/B. AST Construction.md b/architecture/2. parsing/B. AST Construction.md deleted file mode 100644 index 06a1cd48c..000000000 --- a/architecture/2. parsing/B. AST Construction.md +++ /dev/null @@ -1,77 +0,0 @@ -# AST Construction - -The tree-sitter CST/AST is powerful, but it focuses on syntax highlighting rather than semantic meaning. -For example, take decorators: - -```python -@decorator -def my_function(): - pass -``` - -``` -module [0, 0] - [3, 0] - decorated_definition [0, 0] - [2, 8] - decorator [0, 0] - [0, 10] - identifier [0, 1] - [0, 10] - definition: function_definition [1, 0] - [2, 8] - name: identifier [1, 4] - [1, 15] - parameters: parameters [1, 15] - [1, 17] - body: block [2, 4] - [2, 8] - pass_statement [2, 4] - [2, 8] - -``` - -You can see the decorated_definition node has a decorator and a definition. This makes sense for syntax highlighting - the decorator is highlighted separately from the function definition. - -However, this is not useful for semantic analysis. We need to know that the decorator is decorating the function definition - there is a single function definition which may contain multiple decorators. A minimal sketch of this folding is shown below.
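To make the folding concrete, here is a minimal sketch of turning a `decorated_definition` CST node into one semantic function node that owns its decorators. The class and field names are illustrative stand-ins, not the SDK's actual types:

```python
from dataclasses import dataclass, field


@dataclass
class CSTNode:
    """A simplified stand-in for a tree-sitter CST node."""

    type: str
    text: str = ""
    children: list["CSTNode"] = field(default_factory=list)


@dataclass
class FunctionDefinition:
    """Semantic node: a single function that owns all of its decorators."""

    name: str
    decorators: list[str]


def parse_decorated_definition(node: CSTNode) -> FunctionDefinition:
    # Collect every decorator child, then descend into the wrapped
    # function_definition so the result is one semantic node.
    decorators = [c.text for c in node.children if c.type == "decorator"]
    func = next(c for c in node.children if c.type == "function_definition")
    name = next(c for c in func.children if c.type == "identifier").text
    return FunctionDefinition(name=name, decorators=decorators)


# The CST shape mirrors the tree-sitter output above.
cst = CSTNode(
    "decorated_definition",
    children=[
        CSTNode("decorator", text="@decorator"),
        CSTNode("function_definition", children=[CSTNode("identifier", text="my_function")]),
    ],
)
print(parse_decorated_definition(cst))
# FunctionDefinition(name='my_function', decorators=['@decorator'])
```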
-The mismatch becomes even more visible when we consider function call chains: - -```python -a().b().c().d() -``` - -``` -module [0, 0] - [2, 0] - expression_statement [0, 0] - [0, 15] - call [0, 0] - [0, 15] - function: attribute [0, 0] - [0, 13] - object: call [0, 0] - [0, 11] - function: attribute [0, 0] - [0, 9] - object: call [0, 0] - [0, 7] - function: attribute [0, 0] - [0, 5] - object: call [0, 0] - [0, 3] - function: identifier [0, 0] - [0, 1] - arguments: argument_list [0, 1] - [0, 3] - attribute: identifier [0, 4] - [0, 5] - arguments: argument_list [0, 5] - [0, 7] - attribute: identifier [0, 8] - [0, 9] - arguments: argument_list [0, 9] - [0, 11] - attribute: identifier [0, 12] - [0, 13] - arguments: argument_list [0, 13] - [0, 15] -``` - -You can see that the chain of calls is represented as a deeply nested structure. This is not useful for semantic analysis or performing edits on these nodes. Therefore, when parsing we need to build an AST that is more useful for semantic analysis. - -## Implementation - -- For each file, we parse a file-specific AST -- We offer two modes of parsing: - - Pattern based parsing: It maps a particular node type to a semantic node type. For example, we broadly map all identifiers to the `Name` node type. - - Custom parsing: It takes a CST and builds a custom node type. For example, we can turn a decorated_definition node into a function_definition node with decorators. This involves careful arranging of the CST nodes into a new structure. - -## Pattern based parsing - -To do this, we need to build a mapping between the tree-sitter node types and our semantic node types. These mappings are language specific and stored in node_classes. They are processed by parser.py at runtime. We can access these via many functions - child_by_field_name, \_parse_expression, etc. These methods both wrap the tree-sitter methods and parse the tree-sitter node into our semantic node. - -## Custom parsing - -These are more complex and require more work. Most symbols (classes, functions, etc), imports, exports, and other complex constructs are parsed using custom parsing. - -## Statement parsing - -Statements have another layer of complexity. They are essentially pattern based but the mapping and logic is defined directly in the parser.py file. - -## Next Step - -After the AST is constructed, the system moves on to [Directory Parsing](./C.%20Directory%20Parsing.md) to build a hierarchical representation of the codebase's directory structure. diff --git a/architecture/2. parsing/C. Directory Parsing.md b/architecture/2. parsing/C. Directory Parsing.md deleted file mode 100644 index f25de2e29..000000000 --- a/architecture/2. parsing/C. Directory Parsing.md +++ /dev/null @@ -1,50 +0,0 @@ -# Directory Parsing - -The Directory Parsing system is responsible for creating and maintaining a hierarchical representation of the codebase's directory structure in memory. Directories do not hold references to the files themselves; instead, they hold the names of the files and perform a dynamic lookup when needed. - -In addition to providing a more cohesive API for listing directory files, the Directory API is also used for [TSConfig](../3.%20imports-exports/C.%20TSConfig.md)-based [Import Resolution](../3.%20imports-exports/A.%20Imports.md). - -## Core Components - -The Directory Tree is constructed during the initial build_graph step in codebase_context.py, and is recreated from scratch on every re-sync. A rough sketch of the name-based design is shown below.
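As a rough illustration of this name-based design (the class and method names below are a hypothetical sketch, not the SDK's real API):

```python
from dataclasses import dataclass, field
from typing import Callable, Optional


@dataclass
class Directory:
    """Holds child *names* only; file objects are looked up on demand."""

    path: str
    file_names: list[str] = field(default_factory=list)
    subdirectories: dict[str, "Directory"] = field(default_factory=dict)

    def _add_file(self, name: str) -> None:
        # Store only the name, never a reference to the file object.
        self.file_names.append(name)

    def get_directory(self, name: str, create_on_missing: bool = False) -> Optional["Directory"]:
        # Create missing subdirectories on demand, mirroring create_on_missing=True.
        if name not in self.subdirectories and create_on_missing:
            self.subdirectories[name] = Directory(path=f"{self.path}/{name}")
        return self.subdirectories.get(name)

    def files(self, lookup: Callable[[str], object]) -> list:
        # Dynamic lookup: names are resolved at access time, so the
        # directory never holds a stale reference after a re-sync.
        return [lookup(f"{self.path}/{name}") for name in self.file_names]


root = Directory("src")
root.get_directory("codegen", create_on_missing=True)._add_file("cli.py")
print(root.get_directory("codegen").files(lambda p: f"<File {p}>"))
# ['<File src/codegen/cli.py>']
```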
More details on the construction process are below: - -## Directory Tree Construction - -The directory tree is built through the following process: - -1. The `build_directory_tree` method in `CodebaseContext` is called during graph initialization or when the codebase structure changes. -1. The method iterates through all files in the repository, creating directory objects for each directory path encountered. -1. For each file, it adds the file to its parent directory using the `_add_file` method. -1. Directories are created recursively as needed using the `get_directory` method with `create_on_missing=True`. - -## Directory Representation - -The `Directory` class provides a rich interface for working with directories: - -- **Hierarchy Navigation**: Access parent directories and subdirectories -- **File Access**: Retrieve files by name or extension -- **Symbol Access**: Find symbols (classes, functions, etc.) within files in the directory -- **Directory Operations**: Rename, remove, or update directories - -Each `Directory` instance maintains: - -- A reference to its parent directory -- Lists of files and subdirectories -- Methods to recursively traverse the directory tree - -## File Representation - -Files are represented by the `File` class and its subclasses: - -- `File`: Base class for all files, supporting basic operations like reading and writing content -- `SourceFile`: Specialized class for source code files that can be parsed into an AST - -Files maintain references to: - -- Their parent directory -- Their content (loaded dynamically to preserve the source of truth) -- For source files, the parsed AST and symbols - -## Next Step - -After the directory structure is parsed, the system can perform [Import Resolution](../3.%20imports-exports/A.%20Imports.md) to analyze module dependencies and resolve symbols across files. diff --git a/architecture/3. imports-exports/A. Imports.md b/architecture/3. imports-exports/A. Imports.md deleted file mode 100644 index cca5951ab..000000000 --- a/architecture/3. imports-exports/A. Imports.md +++ /dev/null @@ -1,60 +0,0 @@ -# Import Resolution - -Import resolution follows AST construction in the code analysis pipeline. It identifies dependencies between modules and builds a graph of relationships across the codebase. - -> NOTE: This is an actively evolving part of Codegen SDK, so some details here may be incomplete, outdated, or incorrect. - -## Purpose - -The import resolution system serves these purposes: - -1. **Dependency Tracking**: Maps relationships between files by resolving import statements. -1. **Symbol Resolution**: Connects imported symbols to their definitions. -1. **Module Graph Construction**: Builds a directed graph of module dependencies. -1. **(WIP) Cross-Language Support**: Provides implementations for different programming languages. - -## Core Components - -### ImportResolution Class - -The `ImportResolution` class represents the outcome of resolving an import statement. It contains: - -- The source file containing the imported symbol -- The specific symbol being imported (if applicable) -- Whether the import references an entire file/module - -### Import Base Class - -The `Import` class is the foundation for language-specific import implementations.
It: - -- Stores metadata about the import (module path, symbol name, alias) -- Provides the abstract `resolve_import()` method -- Adds symbol resolution edges to the codebase graph - -### Language-Specific Implementations - -#### Python Import Resolution - -The `PyImport` class extends the base `Import` class with Python-specific logic: - -- Handles relative imports -- Supports module imports, named imports, and wildcard imports -- Resolves imports using configurable resolution paths and `sys.path` -- Handles special cases like `__init__.py` files - -#### TypeScript Import Resolution - -The `TSImport` class implements TypeScript-specific resolution: - -- Supports named imports, default imports, and namespace imports -- Handles type imports and dynamic imports -- Resolves imports using TSConfig path mappings -- Supports file extension resolution - -## Implementation - -After files and directories are parsed, we loop through all import nodes and perform `add_symbol_resolution_edge`. This then invokes the language-specific `resolve_import` method that converts the import statement into a resolvable `ImportResolution` object (or None if the import cannot be resolved). This import symbol and the `ImportResolution` object are then used to add a symbol resolution edge to the graph, where it can then be used in future steps to resolve symbols. - -## Next Step - -After import resolution, the system performs [Export Analysis](./B.%20Exports.md) and handles [TSConfig Support](./C.%20TSConfig.md) for TypeScript projects. This is followed by [Type Analysis](../4.%20type-analysis/A.%20Type%20Analysis.md). diff --git a/architecture/3. imports-exports/B. Exports.md b/architecture/3. imports-exports/B. Exports.md deleted file mode 100644 index 0e42c98c4..000000000 --- a/architecture/3. imports-exports/B. Exports.md +++ /dev/null @@ -1,75 +0,0 @@ -# Export Analysis - -Some languages contain additional metadata on "exported" symbols, specifying which symbols are made available to other modules. Export analysis follows import resolution in the code analysis pipeline. It identifies and processes exported symbols from modules, enabling the system to track what each module makes available to others. - -## Core Components - -### Export Base Class - -The `Export` class serves as the foundation for language-specific export implementations. It: - -- Stores metadata about the export (symbol name, is default, etc.) -- Tracks the relationship between the export and its declared symbol -- Adds export edges to the codebase graph - -### TypeScript Export Implementation - -The `TSExport` class implements TypeScript-specific export handling: - -- Supports various export styles (named exports, default exports, re-exports) -- Handles export declarations with and without values -- Processes wildcard exports (`export * from 'module'`) -- Manages export statements with multiple exports - -#### Export Types and Symbol Resolution - -The TypeScript implementation handles several types of exports: - -1. **Declaration Exports** - - - Function declarations (including generators) - - Class declarations - - Interface declarations - - Type alias declarations - - Enum declarations - - Namespace declarations - - Variable/constant declarations - -1. **Value Exports** - - - Object literals with property exports - - Arrow functions and function expressions - - Classes and class expressions - - Assignment expressions - - Primitive values and expressions - -1.
**Special Export Forms** - - - Wildcard exports (`export * from 'module'`) - - Named re-exports (`export { name as alias } from 'module'`) - - Default exports with various value types - -#### Symbol Tracking and Dependencies - -The export system: - -- Maintains relationships between exported symbols and their declarations -- Validates export names match their declared symbols -- Tracks dependencies through the codebase graph -- Handles complex scenarios like: - - Shorthand property exports in objects - - Nested function and class declarations - - Re-exports from other modules - -#### Integration with Type System - -Exports are tightly integrated with the type system: - -- Exported type declarations are properly tracked -- Symbol resolution considers both value and type exports -- Re-exports preserve type information -- Export edges in the codebase graph maintain type relationships - -## Next Step - -After export analysis is complete, for TypeScript projects, the system processes [TSConfig Support](./C.%20TSConfig.md) configurations. Then it moves on to [Type Analysis](../4.%20type-analysis/A.%20Type%20Analysis.md) to build a complete understanding of types and symbols. diff --git a/architecture/3. imports-exports/C. TSConfig.md b/architecture/3. imports-exports/C. TSConfig.md deleted file mode 100644 index b2362a7c8..000000000 --- a/architecture/3. imports-exports/C. TSConfig.md +++ /dev/null @@ -1,81 +0,0 @@ -# TSConfig Support - -TSConfig support is a critical component for TypeScript projects in the import resolution system. It processes TypeScript configuration files (tsconfig.json) to correctly resolve module paths and dependencies. - -## Purpose - -The TSConfig support system serves these purposes: - -1. **Path Mapping**: Resolves custom module path aliases defined in the tsconfig.json file. -1. **Base URL Resolution**: Handles non-relative module imports using the baseUrl configuration. -1. **Project References**: Manages dependencies between TypeScript projects using the references field. -1. **Directory Structure**: Respects rootDir and outDir settings for maintaining proper directory structures. - -## Core Components - -### TSConfig Class - -The `TSConfig` class represents a parsed TypeScript configuration file. It: - -- Parses and stores the configuration settings from tsconfig.json -- Handles inheritance through the "extends" field -- Provides methods for translating between import paths and absolute file paths -- Caches computed values for performance optimization - -## Configuration Processing - -### Configuration Inheritance - -TSConfig files can extend other configuration files through the "extends" field: - -1. Base configurations are loaded and parsed first -1. Child configurations inherit and can override settings from their parent -1. Path mappings, base URLs, and other settings are merged appropriately - -### Path Mapping Resolution - -The system processes the "paths" field in tsconfig.json to create a mapping between import aliases and file paths: - -1. Path patterns are normalized (removing wildcards, trailing slashes) -1. Relative paths are converted to absolute paths -1. Mappings are stored for efficient lookup during import resolution - -### Project References - -The "references" field defines dependencies between TypeScript projects: - -1. Referenced projects are identified and loaded -1. Their configurations are analyzed to determine import paths -1. 
Import resolution can cross project boundaries using these references - -## Import Resolution Process - -### Path Translation - -When resolving an import path in TypeScript: - -1. Check if the path matches any path alias in the tsconfig.json -1. If a match is found, translate the path according to the mapping -1. Apply baseUrl resolution for non-relative imports -1. Handle project references for cross-project imports - -### Optimization Techniques - -The system employs several optimizations: - -1. Caching computed values to avoid redundant processing -1. Early path checking for common patterns (e.g., paths starting with "@" or "~") -1. Hierarchical resolution that respects the configuration inheritance chain - -## Integration with Import Resolution - -The TSConfig support integrates with the broader import resolution system: - -1. Each TypeScript file is associated with its nearest tsconfig.json -1. Import statements are processed using the file's associated configuration -1. Path mappings are applied during the module resolution process -1. Project references are considered when resolving imports across project boundaries - -## Next Step - -After TSConfig processing is complete, the system proceeds to [Type Analysis](../4.%20type-analysis/A.%20Type%20Analysis.md) where it builds a complete understanding of types, symbols, and their relationships. diff --git a/architecture/4. type-analysis/A. Type Analysis.md b/architecture/4. type-analysis/A. Type Analysis.md deleted file mode 100644 index 9f2d9c28c..000000000 --- a/architecture/4. type-analysis/A. Type Analysis.md +++ /dev/null @@ -1,25 +0,0 @@ -# Type Analysis - -The type analysis system builds a complete understanding of types and symbols across the codebase. - -## Basic flow - -- Discover names that need to be resolved -- Resolve names -- Convert resolutions into graph edges - -## The resolution stack - -To accomplish this, we have an in-house computation engine, the ResolutionStack. Each stack frame contains a reference to its parent frame. However, a parent can have multiple child frames (e.g., union types). - -When we resolve types on a node, we call resolved_type_frames to get the resolved types. Once we know what goes in the next frame, we call with_resolution_frame to construct the next frame. This is a generator that yields the next frame until we've resolved all the types. resolved_type_frames is a property that caches a list of the generated frames. -Therefore, once you have computed type resolution on a node, you don't need to recompute it. That way, we can start at arbitrary nodes without performance overhead. - -This is similar to how others implement incremental computation engines, with a few weaknesses: - -- There is only 1 query in the query engine -- Partial cache invalidation isn't implemented - -## Next Step - -After understanding the type analysis system overview, let's look at how we [walk the syntax tree](./B.%20Tree%20Walking.md) to analyze code structure. diff --git a/architecture/4. type-analysis/B. Tree Walking.md b/architecture/4. type-analysis/B. Tree Walking.md deleted file mode 100644 index c0c777dc4..000000000 --- a/architecture/4. type-analysis/B. Tree Walking.md +++ /dev/null @@ -1,49 +0,0 @@ -# Tree Walking - -To compute dependencies, we have to walk the entire AST for every file.
-At a high level, the procedure is pretty simple: - -```python -def compute_dependencies(self): - for child in self.children: - child.compute_dependencies() -``` - -We start at the root node and walk the tree until we have computed all dependencies. - -## Usage Kind identification - -We have to identify the kind of usage for each node. This is done by looking at the parent node and the child node. - -```python -def foo() -> c: - c() -``` - -We will classify the usage kind of the `c` callsite differently from the return type. - -```python -class PyFunction(...): - ... - - def _compute_dependencies(self, usage_kind: UsageKind): - self.return_type._compute_dependencies(UsageKind.RETURN_TYPE) - self.body._compute_dependencies(UsageKind.BODY) -``` - -By default, we just pass the usage kind to the children. - -## Resolvable Nodes - -At no step in the process described so far have we actually computed any dependencies. That's because there are some special nodes ("Resolvables") that do the heavy lifting. All of the tree walking is just to identify these nodes and the context they are used in. Resolvables are anything inheriting from `Resolvable`: - -- [Name Resolution](./C.%20Name%20Resolution.md) -- [Chained Attributes](./D.%20Chained%20Attributes.md) -- [Function Calls](./E.%20Function%20Calls.md) -- [Subscript Expression](./G.%20Subscript%20Expression.md) - -These are all processed using [Type Analysis](./A.%20Type%20Analysis.md) to get the definition of the node. They are then converted into [Graph Edges](./H.%20Graph%20Edges.md) and added to the graph. - -## Next Step - -After understanding how we walk the tree, let's look at how we [resolve names](./C.%20Name%20Resolution.md) in the code. diff --git a/architecture/4. type-analysis/C. Name Resolution.md b/architecture/4. type-analysis/C. Name Resolution.md deleted file mode 100644 index bd6516708..000000000 --- a/architecture/4. type-analysis/C. Name Resolution.md +++ /dev/null @@ -1,70 +0,0 @@ -# Name Resolution - -The name resolution system handles symbol references, scoping rules, and name binding across the codebase. - -## What's in a name? - -A name is a `Name` node. It is just a string of text. -For example, `foo` is a name. - -```python -from my_module import foo - -foo() -``` - -Tree-sitter parses this into: - -``` -module [0, 0] - [2, 0] - import_from_statement [0, 0] - [0, 25] - module_name: dotted_name [0, 5] - [0, 14] - identifier [0, 5] - [0, 14] - name: dotted_name [0, 22] - [0, 25] - identifier [0, 22] - [0, 25] - expression_statement [1, 0] - [1, 5] - call [1, 0] - [1, 5] - function: identifier [1, 0] - [1, 3] - arguments: argument_list [1, 3] - [1, 5] -``` - -We can map the identifier nodes to `Name` nodes. -You'll see there are actually 3 name nodes here: `foo`, `my_module`, and `foo`. - -- `my_module` is the module name. -- `foo` is the name imported from the module. -- `foo` is the name of the function being called. - -## Name Resolution - -Name resolution is the process of resolving a name to its definition. To do this, all we need to do is: - -1. Get the name we're looking for. (e.g. `foo`) -1. Find the scope we're looking in. (in this case, the global file scope) -1. Recursively search the scope for the name (which will return the node corresponding to `from my_module import foo`). -1. Use the type engine to get the definition of the name (which will return the function definition). - -## Scoping - -```python -# Local vs global scope -from my_module import foo, bar, fuzz - - -def outer(): - def foo(): ...
- - foo() - bar() - fuzz() - - def fuzz(): ... -``` - -If we wanted to resolve `foo` in this case, we would start at the name foo, then check its parent recursively until we arrive at the function outer. We would then check for the name foo and find there is a nested function with that name. We would then return the function definition. -However, if we wanted to resolve `bar`, we would check for the name bar and find there is no nested function, variable, or parameter with that name. We would then return the import statement. -Finally, for fuzz, when we check for the name fuzz, we would find there is a nested function with that name, but it is defined after the call to `fuzz()`. We would then return the import. - -## Next Step - -These simple cases let us build up to more complex cases. [Chained Attributes](./D.%20Chained%20Attributes.md) covers how we handle method and property access chains. diff --git a/architecture/4. type-analysis/D. Chained Attributes.md b/architecture/4. type-analysis/D. Chained Attributes.md deleted file mode 100644 index 57a3b941c..000000000 --- a/architecture/4. type-analysis/D. Chained Attributes.md +++ /dev/null @@ -1,89 +0,0 @@ -# Chained Attributes - -```python -class Foo: - def foo(self): ... - - -a = Foo() -a.foo() -``` - -A core capability is being able to determine that `a.foo()` is a usage of `foo` in the `Foo` class. -To do this, we must first understand how tree-sitter parses the code. - -``` -module [0, 0] - [5, 0] - class_definition [0, 0] - [2, 11] - name: identifier [0, 6] - [0, 9] - body: block [1, 4] - [2, 11] - function_definition [1, 4] - [2, 11] - name: identifier [1, 8] - [1, 11] - parameters: parameters [1, 11] - [1, 17] - identifier [1, 12] - [1, 16] - body: block [2, 8] - [2, 11] - expression_statement [2, 8] - [2, 11] - ellipsis [2, 8] - [2, 11] - expression_statement [3, 0] - [3, 9] - assignment [3, 0] - [3, 9] - left: identifier [3, 0] - [3, 1] - right: call [3, 4] - [3, 9] - function: identifier [3, 4] - [3, 7] - arguments: argument_list [3, 7] - [3, 9] - expression_statement [4, 0] - [4, 7] - call [4, 0] - [4, 7] - function: attribute [4, 0] - [4, 5] - object: identifier [4, 0] - [4, 1] - attribute: identifier [4, 2] - [4, 5] - arguments: argument_list [4, 5] - [4, 7] -``` - -If we look at this parse tree, we can see that the `a.foo()` call has a name of type attribute. The object of the call is an identifier for `a`, and `foo` is an attribute of the identifier for `a`. TypeScript has a similar structure. These are the core building blocks of chained attributes. -Chained attributes contain 2 parts: - -1. The object: `a` -1. The attribute: `foo` - -All we must do to resolve the definition of `a.foo` is: - -1. Find the definition of the object `a` (the class `Foo`) -1. Get the attribute (`foo`) on the resolved object (`Foo`) (the function `foo`) -1. Resolve the attribute to its original definition (in this case, the function `foo`) - -## Step 1: Resolve the object - -We can resolve the object by calling resolved_types to get potential types of the object. -If it is a name (like `a`) we can use name resolution to get the definition of the name. -If it is another chained attribute, we can recursively resolve the chained attribute. -If the original type is a union, we can operate on multiple types and return all the possible results. - -## Step 2: Get the attribute - -We can get the attribute by calling resolve_attribute on the resolved object. Nodes which implement this inherit from `HasAttribute`.
Examples include: - -- Class -- File -- Type aliases -- Enums - -## Step 3: Resolve the attribute - -Finally, we can resolve the attribute by calling resolved_types on the attribute. This is particularly useful for class attributes like the following: - -```python -def fuzz(): ... - - -class Foo: - foo = fuzz - - -a = Foo() -a.foo() -``` - -We can resolve the attribute by calling resolved_types on the attribute to go from the attribute (foo) to the underlying resolved type (fuzz). - -## Next Step - -After handling chained attributes, the system moves on to [Function Calls](./E.%20Function%20Calls.md) analysis for handling function and method invocations. diff --git a/architecture/4. type-analysis/E. Function Calls.md b/architecture/4. type-analysis/E. Function Calls.md deleted file mode 100644 index d4db8cd6b..000000000 --- a/architecture/4. type-analysis/E. Function Calls.md +++ /dev/null @@ -1,64 +0,0 @@ -# Function Call - -At first glance, function calls are simple. We can resolve the function call by looking up the function name in the current scope. - -However, there are some complexities to consider. - -## Constructors - -In Python, we can call a class definition as if it were a function. This is known as a constructor. - -```python -class Foo: - def __init__(self): ... - - -a = Foo() -``` - -This changes the behavior of the function call relative to the bare name: the name resolves to Foo (the class definition), but the constructor call resolves to the `__init__` function definition. - -## Imports - -```typescript -require('foo') -``` - -In this case, we need to resolve the import statement to the module definition. - -## Return Types - -```python -class Foo: - def foo(self) -> int: - return 1 - - -class Bar: - def bar(self) -> Foo: ... - - -a = Bar() -a.bar().foo() -``` - -In this case, we need to resolve the return type of the function to the type of the return value. However, the function definition is not the same as the return type. This means we now have 3 different things going on with function calls: - -1. Resolving the function definition -1. Resolving the return type -1. Computing what this function call depends on (both the function definition and the arguments passed to the function) - -## Generics - -```python -def foo[T](a: list[T]) -> T: ... - - -foo([1, 2, 3]) -``` - -Generics depend on the types of the arguments to the function. We need to resolve the types of the arguments to the function to determine the type of the generic. [Generics](./F.%20Generics.md) covers how we handle generics. - -## Next Step - -After understanding function calls, let's look at how we handle [Generics](./F.%20Generics.md) in the type system. diff --git a/architecture/4. type-analysis/F. Generics.md b/architecture/4. type-analysis/F. Generics.md deleted file mode 100644 index 46df52bfc..000000000 --- a/architecture/4. type-analysis/F. Generics.md +++ /dev/null @@ -1,7 +0,0 @@ -# Generics Analysis - -TODO - -## Next Step - -After generics analysis, the system handles [Subscript Expressions](./G.%20Subscript%20Expression.md) for array and dictionary access. diff --git a/architecture/4. type-analysis/G. Subscript Expression.md b/architecture/4. type-analysis/G. Subscript Expression.md deleted file mode 100644 index e2bb1a80a..000000000 --- a/architecture/4. type-analysis/G. 
Subscript Expression.md +++ /dev/null @@ -1,7 +0,0 @@ -# Subscript Expression - -TODO - -## Next Step - -After handling subscript expressions, the system builds [Graph Edges](./H.%20Graph%20Edges.md) to represent relationships between types and symbols. diff --git a/architecture/4. type-analysis/H. Graph Edges.md b/architecture/4. type-analysis/H. Graph Edges.md deleted file mode 100644 index 46efd3c46..000000000 --- a/architecture/4. type-analysis/H. Graph Edges.md +++ /dev/null @@ -1,59 +0,0 @@ -# Graph Edges - -The SDK contains a graph of nodes and edges. -Nodes are the core of the graph and represent the symbols in the codebase. Examples include: - -- Symbols: Classes, functions, Assignments, etc. -- Imports, Exports -- Files -- Parameters, Attributes - -Edges connect these nodes; each edge contains 4 elements: - -- Source: The node that the edge is coming from -- Target: The node that the edge is going to -- Type: The type of the edge -- Metadata: Additional information about the edge - -## Edge Types - -We have 4 types of [edges](../src/codegen/sdk/enums.py#L10): - -- IMPORT_SYMBOL_RESOLUTION: An edge from an import to a symbol -- EXPORT: An edge from a symbol to an export -- SUBCLASS: An edge from a symbol to a subclass -- SYMBOL_USAGE: An edge from a symbol to a usage - -The only edges that are used in almost every API are SYMBOL_USAGE edges. They are also the only ones that have additional metadata. - -## Edge construction order - -To compute the graph, we follow a specific order: - -1. Import edges are added first - - This is completely independent of the type engine -1. Symbol edges are added next - - these may export symbols that are imported from other files. - - This is almost entirely independent of the type engine -1. Subclass edges are added next - - these may reference symbols that are imported or exported from other files. - - This is fully dependent on the type engine -1. Usage edges are added last - - they reference symbols that are imported or exported from other files - - This is fully dependent on the type engine - - Subclass edges are computed beforehand as a performance optimization - -## Usages - -SYMBOL_USAGE edges contain additional [metadata](../src/codegen/sdk/core/dataclasses/usage.py): - -- match: The exact match of the usage -- usage_symbol: The symbol this object is used in. Derived from the match object -- usage_type: How this symbol was used. Derived from the resolution stack -- imported_by: The import that imported this symbol. Derived from the resolution stack -- kind: Where this symbol was used (e.g., in a type parameter or in the body of the class). Derived from the compute dependencies function - -You may notice these edges are actually between the usage symbol and the match object, but the match object is not on the graph. In effect, we have constructed triple edges: - -- They are technically edges between the usage symbol and the symbol contained in the match object -- The edge metadata contains the match object - -## Next Step - -After constructing the type graph, the system moves on to [Edit Operations](../5.%20performing-edits/A.%20Edit%20Operations.md) where it can safely modify code while preserving type relationships. diff --git a/architecture/5. performing-edits/A. Transactions.md b/architecture/5. performing-edits/A. Transactions.md deleted file mode 100644 index c27c7e65f..000000000 --- a/architecture/5. performing-edits/A. Transactions.md +++ /dev/null @@ -1,54 +0,0 @@ -# Transactions - -Transactions represent atomic changes to files in the codebase.
Each transaction defines a specific modification that can be queued, validated, and executed. - -## Transaction Types - -The transaction system is built around a base `Transaction` class with specialized subclasses: - -### Content Transactions - -- **RemoveTransaction**: Removes content between specified byte positions -- **InsertTransaction**: Inserts new content at a specified byte position -- **EditTransaction**: Replaces content between specified byte positions - -### File Transactions - -- **FileAddTransaction**: Creates a new file -- **FileRenameTransaction**: Renames an existing file -- **FileRemoveTransaction**: Deletes a file - -## Transaction Priority - -Transactions are executed in a specific order defined by the `TransactionPriority` enum: - -1. **Remove** (highest priority) -1. **Edit** -1. **Insert** -1. **FileAdd** -1. **FileRename** -1. **FileRemove** - -This ordering ensures that content is removed before editing or inserting, and that all content operations happen before file operations. - -## Key Concepts - -### Byte-Level Operations - -All content transactions operate at the byte level rather than on lines or characters. This provides precise control over modifications and allows transactions to work with any file type, regardless of encoding or line ending conventions. - -### Content Generation - -Transactions support both static content (direct strings) and dynamic content (generated at execution time). This flexibility allows for complex transformations where the new content depends on the state of the codebase at execution time. - -Most content transactions use static content, but dynamic content is supported for rare cases where the new content depends on the state of other transactions. One common example is handling whitespace during add and remove transactions. - -### File Operations - -File transactions are used to create, rename, and delete files. - -> NOTE: Most file transactions such as `FileAddTransaction` are no-ops (i.e., they skip the Transaction Manager) and are instead applied immediately once the `create_file` API is called. This allows created files to be immediately available for edit and use. The reason file operations are still added to the Transaction Manager is to help with optimizing graph re-parse and diff generation (keeping track of which files exist and which no longer do). - -## Next Step - -Transactions are managed by the [Transaction Manager](./B.%20Transaction%20Manager.md) to ensure consistency and atomicity. diff --git a/architecture/5. performing-edits/B. Transaction Manager.md b/architecture/5. performing-edits/B. Transaction Manager.md deleted file mode 100644 index 4ed78a750..000000000 --- a/architecture/5. performing-edits/B. Transaction Manager.md +++ /dev/null @@ -1,93 +0,0 @@ -# Transaction Manager - -The Transaction Manager coordinates the execution of transactions across multiple files, handles conflict resolution, and enforces resource limits. - -## High-level Concept - -Since all node operations are on byte positions of the original file, multiple operations that change the total byte length of the file will result in offset errors and broken code.
- -Consider this example: - -``` -Original: FooBar -Operations: Remove "Foo" (bytes 0-3), Insert "Hello" (bytes 0-5) - Remove "Bar" (bytes 3-6), Insert "World" (bytes 3-7) -``` - -If these operations were applied in order, the result would be: - -``` -Result: FooBar -Operation: Remove "Foo" (bytes 0-3), Insert "Hello" (bytes 0-5) -Result: HelloBar -Operation: Remove "Bar" (bytes 3-6), Insert "World" (bytes 3-7) -Result: HelWorldar -``` - -Resulting in an invalid output. - -⭐ The key with TransactionManager is that it queues up all transactions in a given Codemod run, then applies all of them ***backwards*** from the last byte range to the first. Given the same example as above but applied backwards: - -``` -Result: FooBar -Operation: Remove "Bar" (bytes 3-6), Insert "World" (bytes 3-7) -Result: FooWorld -Operation: Remove "Foo" (bytes 0-3), Insert "Hello" (bytes 0-5) -Result: HelloWorld -``` - -TransactionManager also performs some additional operations such as detecting conflicts and coordinating (some basic) conflict resolution. Overall, the core responsibilities are as follows: - -1. **Transaction Queueing**: Maintains a queue of pending transactions organized by file -1. **Conflict Resolution**: Detects and resolves conflicts between transactions -1. **Transaction Execution**: Applies transactions in the correct order -1. **Resource Management**: Enforces limits on transaction count and execution time -1. **Change Tracking**: Generates diffs for applied changes - -## Sorting Transactions - -Before execution, transactions are sorted based on (in this priority): - -1. Position in the file (higher byte positions first) -1. Transaction type (following the priority order) -1. User-defined priority -1. Creation order - -This sorting ensures that transactions are applied in a deterministic order that minimizes conflicts. Larger byte ranges are always edited first, removals happen before insertions, and older transactions are applied before newer ones. - -## Conflict Resolution - -### Conflict Types - -The manager identifies several types of conflicts: - -1. **Overlapping Transactions**: Multiple transactions affecting the same byte range -1. **Contained Transactions**: One transaction completely contained within another -1. **Adjacent Transactions**: Transactions affecting adjacent byte ranges - -In its current implementation, TransactionManager only handles Contained Transactions that are trivially solvable (if a remove transaction completely overlaps with another remove transaction, only the larger one is kept). - -## Resource Management - -The Transaction Manager enforces two types of limits: - -1. **Transaction Count**: Optional maximum number of transactions -1. **Execution Time**: Optional time limit for transaction processing - -These limits prevent excessive resource usage and allow for early termination of long-running operations. - -## Commit Process - -The commit process applies queued transactions to the codebase: - -1. Transactions are sorted according to priority rules -1. Files are processed one by one -1. For each file, transactions are executed in order -1. Diffs are collected for each modified file -1. The queue is cleared after successful commit - -The diffs are later used during resync to efficiently update the codebase graph as changes occur. See [Incremental Computation](../6.%20incremental-computation/A.%20Overview.md) for more details.
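As a minimal sketch of the backwards-application idea (illustrative only; the real TransactionManager also handles per-file queueing, conflict detection, and resource limits):

```python
from dataclasses import dataclass


@dataclass
class EditTransaction:
    start: int  # byte offset where the replaced region begins
    end: int  # byte offset where the replaced region ends (exclusive)
    new_content: str


def apply_transactions(source: str, transactions: list[EditTransaction]) -> str:
    # Apply from the highest byte offset to the lowest so that earlier
    # offsets remain valid while later regions are rewritten.
    for t in sorted(transactions, key=lambda t: t.start, reverse=True):
        source = source[: t.start] + t.new_content + source[t.end :]
    return source


edits = [
    EditTransaction(0, 3, "Hello"),  # remove "Foo", insert "Hello"
    EditTransaction(3, 6, "World"),  # remove "Bar", insert "World"
]
print(apply_transactions("FooBar", edits))  # -> "HelloWorld"
```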
- -## Next Step - -After managing transactions, the system handles [Incremental Computation](../6.%20incremental-computation/A.%20Overview.md) to efficiently update the codebase graph as changes occur. diff --git a/architecture/6. incremental-computation/A. Overview.md b/architecture/6. incremental-computation/A. Overview.md deleted file mode 100644 index 741cb426f..000000000 --- a/architecture/6. incremental-computation/A. Overview.md +++ /dev/null @@ -1,47 +0,0 @@ -# Incremental Computation - -After we perform some changes to the codebase, we may need to recompute the codebase graph. -This is not a trivial task, because we need to be able to recompute the codebase graph incrementally and efficiently. - -## Use Cases - -### 1. Repeated Moves - -```python -# file1.py -def foo(): - return bar() - - -def bar(): - return 42 -``` - -Let's move symbol `bar` to `file2.py` - -```python -# file2.py -def bar(): - return 42 -``` - -Then we move symbol `foo` to `file3.py` - -```python -# file3.py -from file2 import bar - - -def foo(): - return bar() -``` - -You'll notice we have added an import from file2, not file1. This means that before we can move foo to file3, we need to sync the graph to reflect the changes in file2. - -### 2. Branching - -If we want to check out a different branch, we need to update the baseline state to the git commit of the new branch and recompute the codebase graph. - -## Next Step - -After understanding the overview of incremental computation, let's look at how we [detect changes](./B.%20Change%20Detection.md) in the codebase. diff --git a/architecture/6. incremental-computation/B. Change Detection.md b/architecture/6. incremental-computation/B. Change Detection.md deleted file mode 100644 index ca3322762..000000000 --- a/architecture/6. incremental-computation/B. Change Detection.md +++ /dev/null @@ -1,58 +0,0 @@ -# Change Detection - -## Lifecycle of an operation on the codebase graph - -Changes will go through 4 states. By default, we do not apply changes to the codebase graph, only to the filesystem. - -### Pending transactions - -After calling an edit or other transaction method, the changes are stored in a pending transaction. Pending transactions will be committed as described in the previous chapter. - -### Pending syncs - -After a transaction is committed, the file is marked as a pending sync. This means the filesystem state has been updated, but the codebase graph has not been updated yet. - -### Applied syncs - -When we sync the graph, we apply all the pending syncs and clear them. The codebase graph is updated to reflect the changes. We track all the applied syncs in the codebase graph. - -### Saved/baseline state - -Finally, we can set the baseline state to a git commit. This is the state we target when we reset the codebase graph. When we check out branches, we update the baseline state. - -## Change Detection - -When we sync or build the graph, first we build a list of all files in 3 categories: - -- Removed files -- Added files -- Files to reparse - -For example, if we move a file, it will be in both the added and removed files. -If we add a file, it will be in the added files even if we performed edits on it later. - -## Codebase.commit logic - -We follow this logic: - -1. Commit all pending transactions -1. Write all buffered files to the disk -1. Store this to pending changes (usually we will skip the remaining steps if we commit without syncing the graph) -1. Build list of removed, added and modified files from pending changes -1.
For removed files, we need to remove all the edges that point to the file. -1. For added files, we need to add all the edges that point to the file. -1. For modified files, we remove all the edges that point to the file and add all the edges that point to the new file. This is complicated since edges may pass through the modified file and need to be intelligently updated. -1. Mark all pending changes as applied - -## Reset logic - -Reset is just the inverse of commit. We need to: - -1. Cancel all pending transactions -1. Restore file state to that of the target git commit -1. Clear all pending changes to the graph -1. Reverse all applied syncs to the graph - -## Next Step - -After detecting changes, the system performs [Graph Recomputation](./C.%20Graph%20Recomputation.md) to update the dependency graph efficiently. diff --git a/architecture/6. incremental-computation/C. Graph Recomputation.md b/architecture/6. incremental-computation/C. Graph Recomputation.md deleted file mode 100644 index 2e2f378ee..000000000 --- a/architecture/6. incremental-computation/C. Graph Recomputation.md +++ /dev/null @@ -1,40 +0,0 @@ -# Graph Recomputation - -## Node Reparsing - -Some limitations we encounter are: - -- It is non-trivial to update tree-sitter nodes, and the SDK has no method to do this. -- Therefore, all existing nodes are invalidated and need to be recomputed every time filesystem state changes. - -Therefore, to recompute the graph, we must first have the filesystem state updated. Then we can remove all nodes in the modified files and create new nodes in the modified files. - -## Edge Recomputation - -- Nodes may either use (out edges) or be used by (in edges) other nodes. - - Recomputing the out-edges is straightforward: we just need to reparse the file and compute dependencies again. - - Recomputing the in-edges is more difficult. - - The basic algorithm of any incremental computation engine is to: - - Detect what changed - - Update that query with the new data - - If the output of the query changed, we need to update all the queries that depend on that query. - -### Detecting what changed - -A difficulty is that the nodes are completely refreshed for updated files. Therefore, by default this includes all nodes in updated files. - -### Updating the query - -To do this, we: - -- Wipe the entire cache of the query engine -- Remove all existing out edges of the node -- Recompute dependencies of that node - -### Update what changed - -This part has not been fully implemented yet. Currently, we update all the nodes that are descendants of the changed node and all the nodes in the file. - -## Next Step - -After graph recomputation, the system is ready for the next set of operations. The cycle continues with [File Discovery](../plumbing/file-discovery.md) for any new changes. diff --git a/architecture/architecture.md b/architecture/architecture.md deleted file mode 100644 index dd044e4dc..000000000 --- a/architecture/architecture.md +++ /dev/null @@ -1,113 +0,0 @@ -# Architecture of the Codegen SDK - -This is a technical document explaining the architecture of the Codegen SDK.
- -## Purpose of the SDK - -This SDK is designed to accomplish a large set of use cases in one tool: - -- Parsing large, enterprise-scale codebases -- Making syntax-aware changes to code while respecting original formatting -- Being user-friendly and easy to use -- Able to quickly execute large-scale refactorings against a codebase -- Supporting multiple languages with common abstractions -- Aware of both project structure (tsconfig.json, pyproject.toml, etc.) and language-specific structure (imports, etc.) -- Able to perform type resolution -- Responding to changes to the codebase and updating the graph - -### Performance - -A key problem is performance. We must be able to quickly respond to user requests on enterprise codebases (e.g., renaming a symbol). However, we don't know what those requests are in advance, and the scope of these requests can be quite massive (they may iterate over a large number of symbols and their usages). To respond to these problems, we introduced Codegen Cloud. We split operations into two parts: - -- A "parse" step that builds up a graph of the codebase - - This can take a long time to complete, but it only needs to be done once - - This computes the entire graph of the codebase -- A "run" step that performs operations on the codebase - - This can be done quickly, but it needs to be done many times - - This uses the graph to perform operations on the codebase - -This allows us to perform operations on the codebase without having to parse it every time. - -## Existing Solutions - -To accomplish these goals, we can look at existing classes of solutions: - -### Language Server Architecture - -The immediate question is: why not use a language server? They share many of the same goals as codegen, but do not address several of ours: - -- Language servers can handle many of these same use cases, but they are not as performant as we need. -- Generally, language servers compute their results lazily. This doesn't work for us because we need to perform a large number of operations on the codebase. -- While the LSP protocol is powerful, it is not designed to be scriptable the way codegen is. -- In Python, many of the language servers are an amalgamation of many different tools and libraries. None are very good at refactoring or offer the comprehensive set of features that codegen does. - -Generally, language servers parse codebases in response to user actions. This is not a good fit for us because we need to perform a large number of operations on the codebase without knowing which symbols are being changed or queried. - -### Compiler Architecture - -Many of the same goals can be accomplished with a compiler. However, compilers are not as user-friendly as we need: - -- They do not generally offer easy-to-use APIs -- They do not focus on refactoring code after parsing -- They generally don't handle graph updates -- They aren't common or complete in Python/TypeScript - -Generally, compilers build up knowledge of the entire codebase in a single pass. This is a much better fit for our use case. - -## Architecture - -The codegen SDK combines aspects of both systems to accomplish our goals. -At a high level, our architecture is: - -1. We discover files to parse - -## Processing Steps - -The SDK processes code through several distinct steps: - -1. \[File Discovery\](./1. plumbing/file-discovery.md) - - - Project structure analysis - - File system traversal - -1. \[Tree-sitter Parsing\](./2. parsing/A. 
Tree Sitter.md) - - - Initial syntax tree construction - - Language-specific parsing rules - - Error recovery - -1. \[AST Construction\](./2. parsing/B. AST Construction.md) - - - Abstract syntax tree building - - Node type assignment - - Syntax validation - -1. \[Import & Export Resolution\](./3. imports-exports/A. Imports.md) - - - Module dependency analysis - - \[Export Analysis\](./3. imports-exports/B. Exports.md) - - \[TSConfig Support\](./3. imports-exports/C. TSConfig.md) - - Path resolution - -1. \[Type Analysis\](./4. type-analysis/A. Type Analysis.md) - - - \[Type Analysis\](./4. type-analysis/A. Type Analysis.md) - - \[Tree Walking\](./4. type-analysis/B. Tree Walking.md) - - \[Name Resolution\](./4. type-analysis/C. Name Resolution.md) - - \[Chained Attributes\](./4. type-analysis/D. Chained Attributes.md) - - \[Function Calls\](./4. type-analysis/E. Function Calls.md) - - \[Generics\](./4. type-analysis/F. Generics.md) - - \[Subscript Expression\](./4. type-analysis/G. Subscript Expression.md) - - \[Graph Edges\](./4. type-analysis/H. Graph Edges.md) - -1. \[Performing Edits\](./5. performing-edits/A. Edit Operations.md) - - - \[Transaction Manager\](./5. performing-edits/B. Transaction Manager.md) - - Change validation - - Format preservation - -1. \[Incremental Computation\](./6. incremental-computation/A. Overview.md) - - - \[Detecting Changes\](./6. incremental-computation/B. Change Detection.md) - - \[Recomputing Graph\](./6. incremental-computation/C. Graph Recomputation.md) - - Cache invalidation diff --git a/architecture/external/dependency-manager.md b/architecture/external/dependency-manager.md deleted file mode 100644 index ed8e42a3d..000000000 --- a/architecture/external/dependency-manager.md +++ /dev/null @@ -1,100 +0,0 @@ -# Dependency Manager - -> WARNING: Dependency manager is an experimental feature designed for Codegen Cloud! The current implementation WILL delete any existing `node_modules` folder! - -## Motivation - -A future goal of Codegen is to support resolving symbols directly from dependencies, instead of falling back to `ExternalModule`s. (In fact, some experimental Codegen features such as [Type Engine](./type-engine.md) already parse and use 3rd party dependencies from `node_modules`) - -This requires us to pull and install dependencies from a repository's `package.json`. However, simply installing dependencies from `package.json` is not enough, as many projects require internal dependencies that use custom NPM registries. Others require custom post-install scripts that may not run on our codemod environments. - -Dependency Manager is an experimental solution to this problem. It creates a shadow tree of `package.json` files that includes all core dependencies and settings from the repository's original `package.json` without any custom registries or potentially problematic settings. - -> NOTE: Currently, this is only implemented for TypeScript projects. - -## Implementation - -Given this example codebase structure: - -``` -repo/ -├── package.json -├── node_modules/ -├── src/ -│ ├── frontend/ -│ │ └── package.json -│ └── backend/ -│ └── package.json -└── tests/ - └── package.json -``` - -Dependency Manager first deletes any existing `node_modules` folder in the user's repository. After this step, Dependency Manager initializes itself to use the correct version of NPM, Yarn, or PNPM for the user's repository. - -Dependency Manager then creates a "shadow copy" of the repository's original `package.json` file. 
This shadow copy is used to later revert any changes made by Codegen before running codemods. With these steps, the codebase structure now looks like this: - -``` -repo/ -├── package.json -├── package.json.gs_internal.bak -├── src/ -│ ├── frontend/ -│ │ └── package.json -│ │ └── package.json.gs_internal.bak -│ └── backend/ -│ └── package.json -│ └── package.json.gs_internal.bak -└── tests/ - └── package.json - └── package.json.gs_internal.bak -``` - -Next, Dependency Manager iterates through all the `package.json` files and creates a "clean" version of each file. This "clean" version only includes a subset of information from the original, including: - -- Name -- Version -- Package Manager Details -- Workspaces - -Most importantly, this step iterates through `dependencies` and `devDependencies` of each `package.json` file and validates them against the npm registry. If a package is not found, it is added to a list of invalid dependencies and removed from the `package.json` file. - -After this step, the codebase structure now looks like this: - -``` -repo/ -├── package.json (modified) -├── package.json.gs_internal.bak -├── src/ -│ ├── frontend/ -│ │ └── package.json (modified) -│ │ └── package.json.gs_internal.bak -│ └── backend/ -│ └── package.json (modified) -│ └── package.json.gs_internal.bak -└── tests/ - └── package.json (modified) - └── package.json.gs_internal.bak -``` - -After the shadow and cleaning steps, Dependency Manager proceeds to install the user's dependencies through NPM, Yarn, or PNPM, depending on the detected installer type. Finally, Dependency Manager restores the original `package.json` files and removes the shadow copies. - -The final codebase structure looks like this: - -``` -repo/ -├── package.json -├── node_modules/ -├── src/ -│ ├── frontend/ -│ │ └── package.json -│ └── backend/ -│ └── package.json -└── tests/ - └── package.json -``` - -If all goes well, Dependency Manager will have successfully installed the user's dependencies and prepared the codebase for codemods. - -## Next Step - -The dependency manager works closely with the [Type Engine](./type-engine.md) to ensure type compatibility across dependencies. diff --git a/architecture/external/type-engine.md b/architecture/external/type-engine.md deleted file mode 100644 index 42b96f643..000000000 --- a/architecture/external/type-engine.md +++ /dev/null @@ -1,25 +0,0 @@ -# Type Engine - -Type Engine is an experimental feature of Codegen that leverages the [TypeScript Compiler API](https://github.com/microsoft/TypeScript/wiki/Using-the-Compiler-API) to provide deeper insight into a user's codebase (such as resolving return types). - -> NOTE: Currently, this is only implemented for TypeScript projects. - -There are currently two experimental implementations of TypeScript's Type Engine: an external process-based implementation and a V8-based implementation. - -## Implementation (External Process) - -During codebase parsing, the Type Engine spawns a type inference subprocess (defined in `src/codegen/sdk/typescript/external/typescript_analyzer/run_full.ts`) that concurrently parses the codebase with the TypeScript API to resolve return types. The final analyzer output is placed in `/tmp/typescript-analysis.json` and is read in by Codegen to resolve return types. - -## Implementation (V8) - -The V8-based implementation is much more flexible and powerful in comparison but is currently not as stable. 
It uses the [PyMiniRacer](https://github.com/sqreen/py_mini_racer) package to spawn a V8-based JavaScript engine that can parse the codebase with the TypeScript API to resolve return types. - -The entirety of `src/codegen/sdk/typescript/external/typescript_analyzer` is compiled down using [Rollup.js](https://rollupjs.org/) into a single `index.js` file. A couple of patches are applied to the engine source to remove `require` and `export` statements, which are not supported by MiniRacer. - -Then, the entire `index.js` file is loaded into the MiniRacer context. To work around file read limitations with V8, an in-memory shadow filesystem is created that mimics the user's repository's filesystem. These are defined in `fsi.ts` (`FileSystemInterface`) and `fs_proxy.ts` (`ProxyFileSystem`). The TypeScript Compiler then uses the custom `ProxyFileSystem.readFile` function instead of the traditional `fs.readFile`. - -Once the analyzer is initialized and the codebase is parsed, the entire TypeScript Compiler API is available in the MiniRacer context. The analyzer can then be used to resolve return types for any function in the codebase or to parse the codebase and generate a full type analysis. - -## Next Step - -The type engine works in conjunction with the [Dependency Manager](./dependency-manager.md) to ensure type safety across project dependencies. diff --git a/codegen-examples/CONTRIBUTING.md b/codegen-examples/CONTRIBUTING.md deleted file mode 100644 index 752b5d6aa..000000000 --- a/codegen-examples/CONTRIBUTING.md +++ /dev/null @@ -1,19 +0,0 @@ -# Contributing to Codegen Examples - -Thank you for your interest in contributing to `codegen-examples`! This document outlines the process and guidelines for contributing. - -## Contributor License Agreement - -By contributing to Codegen Examples, you agree that: - -1. Your contributions will be licensed under the project's license. -1. You have the right to license your contribution under the project's license. -1. You grant Codegen a perpetual, worldwide, non-exclusive, royalty-free license to use your contribution. - -## Pull Request Process - -1. Fork the repository and create your branch from `main`. -1. Ensure your code passes all tests. -1. Update documentation as needed. -1. Submit a pull request to the `main` branch. -1. Include a clear description of your changes in the PR. diff --git a/codegen-examples/LICENSE b/codegen-examples/LICENSE deleted file mode 100644 index 261eeb9e9..000000000 --- a/codegen-examples/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. 
- - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/codegen-examples/README.md b/codegen-examples/README.md deleted file mode 100644 index 3e430024c..000000000 --- a/codegen-examples/README.md +++ /dev/null @@ -1,60 +0,0 @@ -# Codegen Examples - -[](https://docs.codegen.com) - -This is a collection of examples using [Codegen](https://codegen.com). You can use these examples to learn how to use Codegen and build custom code transformations. 
- -## Setup - -We recommend using [`uv`](https://github.com/astral-sh/uv) with Python 3.13 for the best experience. - -To install Codegen, please follow the [official installation guide](https://docs.codegen.com/introduction/installation). Once Codegen is installed, use these steps to run the examples in this repository: - -Install the Codegen CLI globally - -```bash -uv tool install codegen -``` - -Initialize Codegen in your project - -```bash -codegen init -``` - -Activate the virtual environment - -```bash -source .codegen/.venv/bin/activate -``` - -Your environment is now ready to run example codemods. - -### IDE Configuration (Optional) - -To configure your IDE for optimal use with Codegen, follow our [IDE setup guide](https://docs.codegen.com/introduction/ide-usage#configuring-your-ide-interpreter). - -## Examples - -Within the examples folder, each subdirectory contains a self-contained example with: - -- An explanation of the transformation (`README.md`) -- A Codegen script that performs the transformation (`run.py`) -- Sample code to transform, if not using a repository (`input_repo/`) - -To see a transformation, simply run the `run.py` script within the desired directory. - -## Learn More - -- [Documentation](https://docs.codegen.com) -- [Getting Started Guide](https://docs.codegen.com/introduction/getting-started) -- [Tutorials](https://docs.codegen.com/tutorials/at-a-glance) -- [API Reference](https://docs.codegen.com/api-reference) - -## Contributing - -Have a useful example to share? We'd love to include it! Please see our [Contributing Guide](CONTRIBUTING.md) for instructions. - -## License - -The [Apache 2.0 license](LICENSE). diff --git a/codegen-examples/STRUCTURE.md b/codegen-examples/STRUCTURE.md deleted file mode 100644 index f4695135d..000000000 --- a/codegen-examples/STRUCTURE.md +++ /dev/null @@ -1,180 +0,0 @@ -# Structuring Codegen Examples - -This guide explains how to structure examples for the Codegen library. A well-structured example helps both humans and AI understand the code's purpose and how to use it effectively. - -## Core Principles - -1. **Single Responsibility**: Each example should demonstrate one clear use case -1. **Self-Contained**: Examples should work independently with minimal setup -1. **Clear Structure**: Follow a consistent file organization pattern -1. **Good Documentation**: Include README.md with clear explanations and examples - -## Standard File Structure - -``` -example-name/ -├── README.md # Documentation and usage examples -├── run.py # Main implementation -└── input_repo/ # (Optional) Sample code for transformation -``` - -## Code Organization in `run.py` - -Your `run.py` should follow this structure, demonstrated well in the `generate_training_data` example: - -1. **Imports at the top** - - ```python - import codegen - from codegen import Codebase - from codegen.sdk.core import Function - # ... other imports - ``` - -1. **Utility functions with clear docstrings** - - ```python - def hop_through_imports(imp: Import) -> Symbol | ExternalModule: - """Finds the root symbol for an import""" - # Implementation... - ``` - -1. **Main Codegen function with decorator** - - ```python - @codegen.function("your-function-name") - def run(codebase: Codebase): - """Clear docstring explaining what the function does. - - Include: - 1. Purpose of the function - 2. Key steps or transformations - 3. Expected output - """ - # Implementation... - ``` - -1. 
**Entry point at bottom** - - ```python - if __name__ == "__main__": - # Initialize codebase - # Run transformation - # Save/display results - ``` - -## Working with Codebases - -Prefer using public repositories for examples when possible. However, sometimes you need a specific code structure to demonstrate a concept clearly. Here's how to handle both cases: - -```python -# Preferred: Use a well-known public repo that demonstrates the concept well -codebase = Codebase.from_repo("fastapi/fastapi") - -# Alternative: Create a minimal example repo when you need specific code structure -# 1. Create an input_repo/ directory in your example -# 2. Add minimal code that clearly demonstrates the transformation -codebase = Codebase("./input_repo") -``` - -For example: - -``` -example-name/ -├── README.md -├── run.py -└── input_repo/ # Your minimal example code - ├── app.py - └── utils.py -``` - -Choose between these approaches based on: - -1. Can you find a public repo that clearly shows the concept? -1. Is the transformation specific enough that a custom example would be clearer? -1. Would a minimal example be more educational than a complex real-world one? - -## Best Practices - -1. **Function Decorator** - - - Always use `@codegen.function()` with a descriptive name - - Name should match the example's purpose - -1. **Utility Functions** - - - Break down complex logic into smaller, focused functions - - Each utility should demonstrate one clear concept - - Include type hints and docstrings - -1. **Main Function** - - - Name it `run()` for consistency - - Include comprehensive docstring explaining the transformation - - Return meaningful data that can be used programmatically - -1. **Entry Point** - - - Include a `__name__ == "__main__"` block - - Show both initialization and execution - - Add progress messages for better UX - -1. **Error Handling** - - - Include appropriate error handling for common cases - - Provide clear error messages - -## Example Reference Implementation - -The `generate_training_data` example demonstrates these principles well: - -```python -# Focused utility function -def get_function_context(function) -> dict: - """Get the implementation, dependencies, and usages of a function.""" - # Clear, focused implementation... - - -# Main transformation with decorator -@codegen.function("generate-training-data") -def run(codebase: Codebase): - """Generate training data using a node2vec-like approach... - - This codemod: - 1. Finds all functions... - 2. For each function... - 3. Outputs structured JSON... - """ - # Clear implementation with good structure... - - -# Clean entry point -if __name__ == "__main__": - print("Initializing codebase...") - codebase = Codebase.from_repo("fastapi/fastapi") - run(codebase) - # ... rest of execution -``` - -## Documentation Requirements - -Every example should include: - -1. **README.md** - - Clear explanation of purpose - - Explains key syntax and program function - - Code examples showing the transformation (before/after) - - If using `input_repo/`, explain its structure and contents - - Output format (if applicable) - - Setup and running instructions - -## Testing Your Example - -Before submitting: - -1. Test with a fresh environment -1. Verify all dependencies are listed -1. Ensure the example runs with minimal setup -1. Check that documentation is clear and accurate - -Remember: Your example might be used by both humans and AI to understand Codegen's capabilities. Clear structure and documentation help everyone use your code effectively. 
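For a fully runnable starting point, here is a minimal end-to-end `run.py` sketch following the structure above. It is illustrative only: it assumes the `codegen` imports shown earlier and a `codebase.get_symbols()` accessor, and the "list symbols" transformation is a stand-in for your real codemod.

```python
import codegen
from codegen import Codebase


@codegen.function("list-symbols")
def run(codebase: Codebase):
    """Lists every symbol in the codebase.

    1. Iterates over all symbols in the parsed codebase
    2. Prints each symbol's name
    3. Returns the names for programmatic use
    """
    names = [symbol.name for symbol in codebase.get_symbols()]
    for name in names:
        print(name)
    return names


if __name__ == "__main__":
    print("Initializing codebase...")
    codebase = Codebase("./input_repo")
    print("Running transformation...")
    results = run(codebase)
    print(f"Found {len(results)} symbols")
```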
diff --git a/codegen-examples/examples/ai_impact_analysis/README.md b/codegen-examples/examples/ai_impact_analysis/README.md deleted file mode 100644 index e34e1a8af..000000000 --- a/codegen-examples/examples/ai_impact_analysis/README.md +++ /dev/null @@ -1,124 +0,0 @@ -# AI Impact Analysis - -This script analyzes a codebase to measure and report the impact of AI-generated code contributions. It provides detailed insights about AI vs human contributions, helping teams understand the role of AI in their development process. - -## Features - -- **Repository Analysis**: Automatically detects and analyzes git repositories: - - - Uses current directory if it's a git repo - - - Searches parent directories for a git repo - - - Falls back to cloning a specified repository if needed - - ```python - # Basic repository setup - repo_path = os.getcwd() - repo_config = RepoConfig.from_repo_path(repo_path) - repo_operator = RepoOperator(repo_config=repo_config) - project = ProjectConfig.from_repo_operator(repo_operator=repo_operator, programming_language=ProgrammingLanguage.PYTHON) - codebase = Codebase(projects=[project]) - ``` - -- **Comprehensive Statistics**: - - - Total number of commits and AI vs human contribution percentages - - Files with significant AI contribution (>50%) - - AI-touched symbols and their impact - - Detailed contributor breakdown (human and AI contributors) - - ```python - # Run the analysis - ai_authors = ["github-actions[bot]", "dependabot[bot]"] - results = analyze_ai_impact(codebase, ai_authors) - - # Access statistics - stats = results["stats"] - print(f"Total commits: {stats['total_commits']}") - print(f"AI commits: {stats['ai_commits']} ({stats['ai_percentage']:.1f}%)") - print(f"Files with >50% AI: {stats['ai_file_count']} of {stats['total_file_count']}") - - # View contributors - for author, count in results["contributors"]: - is_ai = any(ai_name in author for ai_name in ai_authors) - print(f"{'🤖' if is_ai else '👤'} {author}: {count} commits") - ``` - -- **High-Impact Code Detection**: - - - Identifies AI-written code that is heavily used by other parts of the codebase - - Shows dependency relationships for AI-contributed code - - ```python - # Access high-impact AI symbols - for symbol in results["high_impact_symbols"]: - print(f"Symbol: {symbol['name']} ({symbol['filepath']})") - print(f"Used by {symbol['usage_count']} other symbols") - print(f"Last edited by: {symbol['last_editor']}") - - # View top AI-contributed files - for file_path, percentage in stats["top_ai_files"]: - print(f"{file_path}: {percentage:.1f}% AI contribution") - ``` - -- **Detailed Attribution**: - - - Maps symbols to git history - - Tracks last editor and complete editor history for each symbol - - Flags AI-authored symbols - - ```python - # Get attribution information for a specific symbol - symbol = codebase.get_symbol("path/to/file.py:MyClass.my_method") - - # Access attribution data - print(f"Last editor: {symbol.last_editor}") - print(f"Editor history: {symbol.editor_history}") - print(f"AI authored: {symbol.is_ai_authored}") - - # Find all AI-authored symbols - ai_symbols = [s for s in codebase.get_symbols() if s.is_ai_authored] - for symbol in ai_symbols: - print(f"AI symbol: {symbol.name}") - ``` - -## Output - -The script generates: - -1. Console output with summary statistics -1. Detailed analysis in `ai_impact_analysis.json` -1. Attribution information added to codebase symbols - -## Usage - -```bash -python run.py -``` - -The script will automatically: - -1. 
Initialize and analyze the codebase -1. Process git history -1. Generate attribution information -1. Output detailed statistics - -You can also visualize the AI impact analysis results using a dashboard. For setup and usage instructions, please see the documentation in the `/dashboard` subdirectory. - -## Symbol Attribution - -After running the analysis, symbols in the codebase will have the following attribution information: - -- `symbol.last_editor`: The last person who edited the symbol -- `symbol.editor_history`: List of all editors who have touched the symbol -- `symbol.is_ai_authored`: Boolean indicating if the symbol was authored by AI - -## Learn More - -- [Attributions](https://docs.codegen.com/tutorials/attributions) -- [Codegen Documentation](https://docs.codegen.com) - -## Contributing - -Feel free to submit issues and enhancement requests! diff --git a/codegen-examples/examples/ai_impact_analysis/dashboard/README.md b/codegen-examples/examples/ai_impact_analysis/dashboard/README.md deleted file mode 100644 index cde758b55..000000000 --- a/codegen-examples/examples/ai_impact_analysis/dashboard/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# AI Impact Analysis Dashboard - -A web dashboard for visualizing AI-generated code contributions in your codebase. This dashboard provides detailed insights about AI vs human contributions, helping you understand the role of AI in a codebase's development process. - -## Setup - -### Backend - -1. Install dependencies: - -```bash -uv venv -source .venv/bin/activate -uv pip install modal codegen fastapi -``` - -2. Deploy or serve the Modal endpoint: - -```bash -modal serve backend/api.py -``` - -```bash -modal deploy backend/api.py -``` - -### Frontend - -1. Install dependencies: - -```bash -cd frontend -npm install -``` - -2. Update the API endpoint: - Edit the fetch URL on line 29 in `components/repo-analysis-dashboard.tsx` to point to your Modal endpoint: - -```ts - fetch(`[your-modal-deployment-url]/analyze?repo_full_name=${repoFullName}`, { - method: 'POST', - }) -``` - -3. Start the development server: - -```bash -npm run dev -``` - -## Usage - -1. Visit the dashboard in your browser (default: http://localhost:3000) -1. Enter a GitHub repository name (format: username/repo) -1. Click "Analyze Repo" to generate insights - -The dashboard will display: - -- Summary statistics of AI contributions -- Monthly contribution timeline -- Top files with AI contributions -- High-impact AI-authored symbols -- Contributor breakdown visualization - -## Architecture - -- **Backend**: Modal-deployed FastAPI service that: - - - Clones and analyzes repositories - - Processes git history - - Calculates AI impact metrics - - Returns structured analysis data - -- **Frontend**: Next.js application with: - - - Interactive charts - - Visualized AI impact metrics - -## Learn More - -- [AI Impact Analysis Documentation](https://docs.codegen.com/tutorials/attributions) -- [Codegen Documentation](https://docs.codegen.com) - -## Contributing - -Feel free to submit issues and enhancement requests!
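As a quick smoke test before wiring up the frontend, you can call the analyze endpoint directly. The snippet below is a hedged sketch: the base URL is a hypothetical placeholder for your own Modal deployment, and it assumes the `requests` package is installed.

```python
import requests

# Hypothetical placeholder; replace with your Modal deployment URL.
BASE_URL = "https://your-modal-app.modal.run"

# The backend accepts the repository name as a query parameter.
response = requests.post(f"{BASE_URL}/analyze", params={"repo_full_name": "fastapi/fastapi"})
response.raise_for_status()

# The response mirrors the analysis results, including summary statistics.
print(response.json()["stats"])
```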
diff --git a/codegen-examples/examples/ai_impact_analysis/dashboard/backend/api.py b/codegen-examples/examples/ai_impact_analysis/dashboard/backend/api.py deleted file mode 100644 index ddb08115d..000000000 --- a/codegen-examples/examples/ai_impact_analysis/dashboard/backend/api.py +++ /dev/null @@ -1,54 +0,0 @@ -from codegen import Codebase -from codegen.extensions.attribution.main import ( - add_attribution_to_symbols, - analyze_ai_impact, -) -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -import modal - -image = modal.Image.debian_slim().apt_install("git").pip_install("codegen", "fastapi", "intervaltree", "pygit2", "requests") - -app = modal.App(name="ai-impact-analysis", image=image) - -fastapi_app = FastAPI() - -fastapi_app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -@fastapi_app.post("/analyze") -async def analyze(repo_full_name: str): - codebase = Codebase.from_repo(repo_full_name=repo_full_name, language="python", full_history=True) - - print("🤖 Analyzing AI impact on codebase...") - - ai_authors = [ - "renovate[bot]", - "dependabot[bot]", - "github-actions[bot]", - "devin-ai-integration[bot]", - ] - - results = analyze_ai_impact(codebase, ai_authors) - - print("\n🏷️ Adding attribution information to symbols...") - add_attribution_to_symbols(codebase, ai_authors) - print("✅ Attribution information added to symbols") - - return results - - -@app.function(image=image) -@modal.asgi_app() -def fastapi_modal_app(): - return fastapi_app - - -if __name__ == "__main__": - app.deploy("ai-impact-analysis") diff --git a/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/favicon.ico b/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/favicon.ico deleted file mode 100644 index fd8587746..000000000 Binary files a/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/favicon.ico and /dev/null differ diff --git a/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/globals.css b/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/globals.css deleted file mode 100644 index 1535f872d..000000000 --- a/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/globals.css +++ /dev/null @@ -1,76 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; - -@layer base { - :root { - --background: 0 0% 100%; - --foreground: 222.2 84% 4.9%; - - --card: 0 0% 100%; - --card-foreground: 222.2 84% 4.9%; - - --popover: 0 0% 100%; - --popover-foreground: 222.2 84% 4.9%; - - --primary: 221.2 83.2% 53.3%; - --primary-foreground: 210 40% 98%; - - --secondary: 210 40% 96.1%; - --secondary-foreground: 222.2 47.4% 11.2%; - - --muted: 210 40% 96.1%; - --muted-foreground: 215.4 16.3% 46.9%; - - --accent: 210 40% 96.1%; - --accent-foreground: 222.2 47.4% 11.2%; - - --destructive: 0 84.2% 60.2%; - --destructive-foreground: 210 40% 98%; - - --border: 214.3 31.8% 91.4%; - --input: 214.3 31.8% 91.4%; - --ring: 221.2 83.2% 53.3%; - - --radius: 0.5rem; - } - - .dark { - --background: 222.2 84% 4.9%; - --foreground: 210 40% 98%; - - --card: 222.2 84% 4.9%; - --card-foreground: 210 40% 98%; - - --popover: 222.2 84% 4.9%; - --popover-foreground: 210 40% 98%; - - --primary: 217.2 91.2% 59.8%; - --primary-foreground: 222.2 47.4% 11.2%; - - --secondary: 217.2 32.6% 17.5%; - --secondary-foreground: 210 40% 98%; - - --muted: 217.2 32.6% 17.5%; - --muted-foreground: 215 20.2% 65.1%; - - --accent: 217.2 32.6% 
17.5%; - --accent-foreground: 210 40% 98%; - - --destructive: 0 62.8% 30.6%; - --destructive-foreground: 210 40% 98%; - - --border: 217.2 32.6% 17.5%; - --input: 217.2 32.6% 17.5%; - --ring: 224.3 76.3% 48%; - } -} - -@layer base { - * { - @apply border-border; - } - body { - @apply bg-background text-foreground; - } -} diff --git a/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/layout.tsx b/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/layout.tsx deleted file mode 100644 index 264632940..000000000 --- a/codegen-examples/examples/ai_impact_analysis/dashboard/frontend/app/layout.tsx +++ /dev/null @@ -1,34 +0,0 @@ -import type { Metadata } from "next"; -import { Inter } from "next/font/google"; -import type React from "react"; -import "./globals.css"; -import { ThemeProvider } from "@/components/theme-provider"; - -const inter = Inter({ subsets: ["latin"] }); - -export const metadata: Metadata = { - title: "AI Code Impact Analysis", -}; - -export default function RootLayout({ - children, -}: { - children: React.ReactNode; -}) { - return ( - -
[The remainder of the deleted frontend sources is garbled in this diff. The recoverable UI copy from `layout.tsx` and `components/repo-analysis-dashboard.tsx` includes the tagline "Analyze AI-generated code contributions in your repository", the loading message "This may take a few seconds...", and stat-card captions for AI commit share ("{stats.ai_percentage.toFixed(1)}% of total commits"), files with >50% AI contribution, AI-authored symbol share, and "AI-written symbols with high usage".]