Skip to content

Commit 4b7b7f9

Browse files
committed
feat: add python support
1 parent cf65929 commit 4b7b7f9

File tree

14 files changed

+524
-56
lines changed

14 files changed

+524
-56
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,9 @@ dramatically reducing token usage and cognitive load.
9494
Language support requires writing [Tree-sitter queries](https://github.com/st3v3nmw/sourcerer-mcp/blob/main/internal/parser/go.go)
9595
to identify functions, classes, interfaces, and other code structures for each language.
9696

97-
**Supported:** Go, Markdown
97+
**Supported:** Go, Markdown, Python
9898

99-
**Planned:** Python, TypeScript, JavaScript
99+
**Planned:** TypeScript, JavaScript
100100

101101
## Contributing
102102

cmd/sourcerer/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
v0.3.1
1+
v0.4.0

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ require (
1313
github.com/tree-sitter-grammars/tree-sitter-markdown v0.5.0
1414
github.com/tree-sitter/go-tree-sitter v0.25.0
1515
github.com/tree-sitter/tree-sitter-go v0.23.4
16+
github.com/tree-sitter/tree-sitter-python v0.23.6
1617
)
1718

1819
require (

internal/analyzer/languages.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ type Language string
1212
const (
1313
Go Language = "go"
1414
Markdown Language = "markdown"
15+
Python Language = "python"
1516
UnknownLang Language = "unknown"
1617
)
1718

@@ -75,5 +76,14 @@ func init() {
7576
[]string{".md", ".markdown"},
7677
func(workspaceRoot string) (*parser.Parser, error) {
7778
return parser.NewMarkdownParser(workspaceRoot)
78-
})
79+
},
80+
)
81+
82+
languages.register(
83+
Python,
84+
[]string{".py"},
85+
func(workspaceRoot string) (*parser.Parser, error) {
86+
return parser.NewPythonParser(workspaceRoot)
87+
},
88+
)
7989
}

internal/parser/go.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,11 @@ var GoSpec = &LanguageSpec{
3939
NameQuery: `(const_declaration (const_spec name: (identifier) @name))`,
4040
},
4141
},
42-
CommentTypes: []string{"comment"},
43-
IgnoreTypes: []string{
44-
"package_clause", // pollutes results with single-line matches
42+
FoldIntoNextNode: []string{"comment"},
43+
SkipTypes: []string{
44+
// These pollute results
45+
"package_clause",
46+
"import_declaration",
4547
},
4648
FileTypeRules: []FileTypeRule{
4749
{Pattern: "**/*_test.go", Type: FileTypeTests},

internal/parser/go_test.go

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,6 @@ func (s *GoParserTestSuite) TestFunctionParsing() {
3838
startLine: 1,
3939
endLine: 1,
4040
},
41-
{
42-
name: "Imports Hashing",
43-
path: "44983311c5db2e3",
44-
summary: "import (",
45-
source: `import (
46-
"context"
47-
"fmt"
48-
)`,
49-
startLine: 5,
50-
endLine: 8,
51-
},
5241
{
5342
name: "Simple Function",
5443
path: "SimpleFunction",
@@ -513,6 +502,15 @@ var (
513502
startLine: 63,
514503
endLine: 68,
515504
},
505+
{
506+
name: "Another Multi Var Declaration",
507+
path: "b9303a3de4b66c8b",
508+
summary: "var x, y string",
509+
source: `// Another multi var declaration
510+
var x, y string`,
511+
startLine: 70,
512+
endLine: 71,
513+
},
516514
{
517515
name: "Single Variable",
518516
path: "DefaultConfig",
@@ -522,8 +520,8 @@ var DefaultConfig = BasicStruct{
522520
Field1: "default",
523521
Field2: 42,
524522
}`,
525-
startLine: 70,
526-
endLine: 74,
523+
startLine: 73,
524+
endLine: 77,
527525
},
528526
}
529527

internal/parser/markdown.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77

88
var MarkdownSpec = &LanguageSpec{
99
ExtractChildrenIn: []string{"section"},
10-
IgnoreTypes: []string{
10+
SkipTypes: []string{
1111
// Headings are organizational markers, not containers.
1212
"atx_heading", "setext_heading",
1313
// We're chunking by section so lower level nodes don't get their own chunks

internal/parser/parser.go

Lines changed: 47 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -57,23 +57,35 @@ func (c *Chunk) ID() string {
5757
}
5858

5959
// newChunk creates a new Chunk from related tree-sitter nodes
60-
func newChunk(
60+
func (p *Parser) newChunk(
6161
node *tree_sitter.Node,
6262
source []byte,
6363
path string,
6464
usedPaths map[string]bool,
6565
fileType FileType,
66-
comments []*tree_sitter.Node,
66+
folded []*tree_sitter.Node,
67+
extractor *NamedChunkExtractor,
6768
) *Chunk {
6869
finalPath := resolvePath(path, usedPaths)
69-
startPos, startByte, endPos, endByte := calculateChunkBounds(node, comments)
70-
nodeText := node.Utf8Text(source)
70+
startPos, startByte, endPos, endByte := calculateChunkBounds(node, folded)
71+
72+
// Determine which node to use for the summary
73+
summaryNode := node
74+
if extractor != nil && extractor.SummaryNodeQuery != "" {
75+
// Use the existing executeQuery method to find the summary node
76+
matches, err := p.executeQuery(extractor.SummaryNodeQuery, node, source)
77+
if err == nil && len(matches) > 0 {
78+
summaryNode = matches[0]
79+
}
80+
}
81+
82+
summaryText := summaryNode.Utf8Text(source)
7183
fullText := source[startByte:endByte]
7284

7385
return &Chunk{
7486
Path: finalPath,
7587
Type: string(fileType),
76-
Summary: summarize(nodeText),
88+
Summary: summarize(summaryText),
7789
Source: string(fullText),
7890
StartLine: startPos.Row + 1,
7991
StartColumn: startPos.Column + 1,
@@ -101,8 +113,8 @@ func resolvePath(path string, usedPaths map[string]bool) string {
101113
}
102114

103115
// calculateChunkBounds determines the start and end positions for a chunk,
104-
// extending to include any preceding comments
105-
func calculateChunkBounds(node *tree_sitter.Node, comments []*tree_sitter.Node) (
116+
// extending to include any preceding folded nodes
117+
func calculateChunkBounds(node *tree_sitter.Node, folded []*tree_sitter.Node) (
106118
startPos tree_sitter.Point, startByte uint,
107119
endPos tree_sitter.Point, endByte uint,
108120
) {
@@ -111,10 +123,10 @@ func calculateChunkBounds(node *tree_sitter.Node, comments []*tree_sitter.Node)
111123
endPos = node.EndPosition()
112124
endByte = node.EndByte()
113125

114-
if len(comments) > 0 {
115-
firstComment := comments[0]
116-
startPos = firstComment.StartPosition()
117-
startByte = firstComment.StartByte()
126+
if len(folded) > 0 {
127+
firstFolded := folded[0]
128+
startPos = firstFolded.StartPosition()
129+
startByte = firstFolded.StartByte()
118130
}
119131

120132
return startPos, startByte, endPos, endByte
@@ -147,15 +159,16 @@ func summarize(source string) string {
147159
type LanguageSpec struct {
148160
NamedChunks map[string]NamedChunkExtractor // node types that can be extracted by name
149161
ExtractChildrenIn []string // node types whose children should be recursively processed
150-
CommentTypes []string // node types that represent comments
151-
IgnoreTypes []string // node types to completely skip
162+
FoldIntoNextNode []string // node types to fold into next node, e.g., comments
163+
SkipTypes []string // node types to completely skip
152164
FileTypeRules []FileTypeRule // language-specific file type classification rules
153165
}
154166

155167
// NamedChunkExtractor defines tree-sitter queries for extracting named code entities
156168
type NamedChunkExtractor struct {
157-
NameQuery string // query to extract the entity name
158-
ParentNameQuery string // optional query to extract parent entity name for hierarchical paths
169+
NameQuery string // query to extract the entity name
170+
ParentNameQuery string // optional query to extract parent entity name for hierarchical paths
171+
SummaryNodeQuery string // optional query to extract a specific node for the summary instead of the main node
159172
}
160173

161174
// FileTypeRule defines a pattern-based rule for classifying file types
@@ -246,7 +259,6 @@ func (p *Parser) classifyFileType(filePath string) FileType {
246259
}
247260

248261
// extractChunks recursively extracts semantic chunks from an AST node.
249-
// Comments are collected and folded into the next non-comment chunk to improve context.
250262
func (p *Parser) extractChunks(
251263
node *tree_sitter.Node,
252264
source []byte,
@@ -255,31 +267,31 @@ func (p *Parser) extractChunks(
255267
) []*Chunk {
256268
var chunks []*Chunk
257269
usedPaths := map[string]bool{}
258-
var comments []*tree_sitter.Node
270+
var folded []*tree_sitter.Node
259271

260272
for i := uint(0); i < node.ChildCount(); i++ {
261273
child := node.Child(i)
262274
kind := child.Kind()
263275

264-
if slices.Contains(p.spec.IgnoreTypes, kind) {
265-
// Process any preceding comments as standalone chunks
266-
for _, comment := range comments {
267-
chunks = append(chunks, p.extractNode(comment, source, usedPaths, fileType, nil))
276+
if slices.Contains(p.spec.SkipTypes, kind) {
277+
// Flush pending folded nodes as standalone chunks; a skipped node cannot absorb them
278+
for _, foldedNode := range folded {
279+
chunks = append(chunks, p.extractNode(foldedNode, source, usedPaths, fileType, nil))
268280
}
269-
comments = nil
281+
folded = nil
270282

271283
continue
272284
}
273285

274-
if slices.Contains(p.spec.CommentTypes, kind) {
275-
comments = append(comments, child)
286+
if slices.Contains(p.spec.FoldIntoNextNode, kind) {
287+
folded = append(folded, child)
276288
continue
277289
}
278290

279-
// Process code nodes & fold comments, if any
280-
chunk, path := p.createChunkFromNode(child, source, parentPath, fileType, usedPaths, comments)
291+
// Process code nodes & folded nodes, if any
292+
chunk, path := p.createChunkFromNode(child, source, parentPath, fileType, usedPaths, folded)
281293
chunks = append(chunks, chunk)
282-
comments = nil
294+
folded = nil
283295

284296
// Recursively process children if specified
285297
if slices.Contains(p.spec.ExtractChildrenIn, kind) {
@@ -288,9 +300,9 @@ func (p *Parser) extractChunks(
288300
}
289301
}
290302

291-
// Process any remaining comments as standalone chunks
292-
for _, comment := range comments {
293-
chunks = append(chunks, p.extractNode(comment, source, usedPaths, fileType, nil))
303+
// Process any remaining folded nodes as standalone chunks
304+
for _, foldedNode := range folded {
305+
chunks = append(chunks, p.extractNode(foldedNode, source, usedPaths, fileType, nil))
294306
}
295307

296308
return chunks
@@ -303,21 +315,21 @@ func (p *Parser) createChunkFromNode(
303315
parentPath string,
304316
fileType FileType,
305317
usedPaths map[string]bool,
306-
comments []*tree_sitter.Node,
318+
folded []*tree_sitter.Node,
307319
) (*Chunk, string) {
308320
kind := node.Kind()
309321
extractor, exists := p.spec.NamedChunks[kind]
310322

311323
if exists {
312324
chunkPath, err := p.buildChunkPath(extractor, node, source, parentPath)
313325
if err == nil {
314-
chunk := newChunk(node, source, chunkPath, usedPaths, fileType, comments)
326+
chunk := p.newChunk(node, source, chunkPath, usedPaths, fileType, folded, &extractor)
315327
return chunk, chunkPath
316328
}
317329
}
318330

319331
// No named extractor or building chunk path failed, use content-hash
320-
return p.extractNode(node, source, usedPaths, fileType, comments), parentPath
332+
return p.extractNode(node, source, usedPaths, fileType, folded), parentPath
321333
}
322334

323335
// extractNode creates a chunk from a node using content-based hashing for the path
@@ -326,12 +338,12 @@ func (p *Parser) extractNode(
326338
source []byte,
327339
usedPaths map[string]bool,
328340
fileType FileType,
329-
comments []*tree_sitter.Node,
341+
folded []*tree_sitter.Node,
330342
) *Chunk {
331343
nodeSource := node.Utf8Text(source)
332344
hash := fmt.Sprintf("%x", xxhash.Sum64String(nodeSource))
333345

334-
return newChunk(node, source, hash, usedPaths, fileType, comments)
346+
return p.newChunk(node, source, hash, usedPaths, fileType, folded, nil)
335347
}
336348

337349
// buildChunkPath constructs a hierarchical path for a named chunk using tree-sitter queries

internal/parser/python.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package parser
2+
3+
import (
4+
tree_sitter "github.com/tree-sitter/go-tree-sitter"
5+
tree_sitter_python "github.com/tree-sitter/tree-sitter-python/bindings/go"
6+
)
7+
8+
var PythonSpec = &LanguageSpec{
9+
NamedChunks: map[string]NamedChunkExtractor{
10+
"function_definition": {
11+
NameQuery: `(function_definition name: (identifier) @name)`,
12+
},
13+
"class_definition": {
14+
NameQuery: `(class_definition name: (identifier) @name)`,
15+
},
16+
"method_definition": {
17+
NameQuery: `
18+
(class_definition
19+
body: (block
20+
(function_definition name: (identifier) @name)))`,
21+
},
22+
"decorated_definition": {
23+
NameQuery: `(decorated_definition definition: [
24+
(function_definition name: (identifier) @name)
25+
(class_definition name: (identifier) @name)
26+
])`,
27+
SummaryNodeQuery: `(decorated_definition definition: [
28+
(function_definition) @summary
29+
(class_definition) @summary
30+
])`,
31+
},
32+
},
33+
FoldIntoNextNode: []string{"comment"},
34+
SkipTypes: []string{
35+
// These pollute results
36+
"import_statement",
37+
},
38+
FileTypeRules: []FileTypeRule{
39+
{Pattern: "**/test*.py", Type: FileTypeTests},
40+
{Pattern: "**/*_test.py", Type: FileTypeTests},
41+
{Pattern: "**/__pycache__/**", Type: FileTypeIgnore},
42+
{Pattern: "**/venv/**", Type: FileTypeIgnore},
43+
{Pattern: "**/.venv/**", Type: FileTypeIgnore},
44+
{Pattern: "**/env/**", Type: FileTypeIgnore},
45+
{Pattern: "**/.env/**", Type: FileTypeIgnore},
46+
{Pattern: "**/site-packages/**", Type: FileTypeIgnore},
47+
},
48+
}
49+
50+
func NewPythonParser(workspaceRoot string) (*Parser, error) {
51+
parser := tree_sitter.NewParser()
52+
parser.SetLanguage(tree_sitter.NewLanguage(tree_sitter_python.Language()))
53+
54+
return &Parser{
55+
workspaceRoot: workspaceRoot,
56+
parser: parser,
57+
spec: PythonSpec,
58+
}, nil
59+
}

0 commit comments

Comments
 (0)