blob: 4b573d96254c81871f42074cae11fc6f9079682f [file] [log] [blame]
// Package onstart provides codebase analysis used to inform the initial system prompt.
2package onstart
3
4import (
5 "bufio"
6 "cmp"
7 "context"
8 "fmt"
9 "io"
10 "os"
11 "os/exec"
12 "path/filepath"
13 "slices"
14 "strings"
15
16 "golang.org/x/sync/errgroup"
17)
18
// Codebase is the result of analyzing a repository: per-extension file
// counts plus the paths of files that fall into a few special categories.
// All paths are stored as they were reported by the file listing.
type Codebase struct {
	// ExtensionCounts maps a file extension to the number of files that have it.
	ExtensionCounts map[string]int

	// TotalFiles is the number of files that were analyzed.
	TotalFiles int

	// BuildFiles lists the paths of build and configuration files.
	BuildFiles []string

	// DocumentationFiles lists the paths of documentation files.
	DocumentationFiles []string

	// GuidanceFiles lists the paths of files that give context and guidance to LLMs.
	GuidanceFiles []string

	// InjectFiles lists the paths of critical guidance files (such as
	// DEAR_LLM.md, claude.md, and cursorrules). These are meant to be
	// injected into the system prompt, where they are most visible.
	InjectFiles []string

	// InjectFileContents maps each inject file path to that file's contents,
	// so that template rendering does not need to read the file again.
	InjectFileContents map[string]string
}
38
39// AnalyzeCodebase walks the codebase and analyzes the paths it finds.
40func AnalyzeCodebase(ctx context.Context, repoPath string) (*Codebase, error) {
41 // TODO: do a filesystem walk instead?
42 // There's a balance: git ls-files skips node_modules etc,
43 // but some guidance files might be locally .gitignored.
44 cmd := exec.Command("git", "ls-files")
45 cmd.Dir = repoPath
46
47 r, w := io.Pipe() // stream and scan rather than buffer
48 cmd.Stdout = w
49
50 err := cmd.Start()
51 if err != nil {
52 return nil, err
53 }
54
55 extCounts := make(map[string]int)
56 var buildFiles []string
57 var documentationFiles []string
58 var guidanceFiles []string
59 var injectFiles []string
60 injectFileContents := make(map[string]string)
61 var totalFiles int
62
63 eg, _ := errgroup.WithContext(ctx)
64
65 eg.Go(func() error {
66 defer r.Close()
67
68 scanner := bufio.NewScanner(r)
69 for scanner.Scan() {
70 file := scanner.Text()
71 file = strings.TrimSpace(file)
72 if file == "" {
73 continue
74 }
75 totalFiles++
76 ext := strings.ToLower(filepath.Ext(file))
77 ext = cmp.Or(ext, "<no-extension>")
78 extCounts[ext]++
79
80 fileCategory := categorizeFile(file)
81 // fmt.Println(file, "->", fileCategory)
82 switch fileCategory {
83 case "build":
84 buildFiles = append(buildFiles, file)
85 case "documentation":
86 documentationFiles = append(documentationFiles, file)
87 case "guidance":
88 guidanceFiles = append(guidanceFiles, file)
89 case "inject":
90 injectFiles = append(injectFiles, file)
91 }
92 }
93 return scanner.Err()
94 })
95
96 // Wait for the command to complete
97 eg.Go(func() error {
98 err := cmd.Wait()
99 if err != nil {
100 w.CloseWithError(err)
101 } else {
102 w.Close()
103 }
104 return err
105 })
106
107 if err := eg.Wait(); err != nil {
108 return nil, err
109 }
110
111 // Read content of inject files
112 for _, filePath := range injectFiles {
113 absPath := filepath.Join(repoPath, filePath)
114 content, err := os.ReadFile(absPath)
115 if err != nil {
116 fmt.Printf("Warning: Failed to read inject file %s: %v\n", filePath, err)
117 continue
118 }
119 injectFileContents[filePath] = string(content)
120 }
121
122 return &Codebase{
123 ExtensionCounts: extCounts,
124 TotalFiles: totalFiles,
125 BuildFiles: buildFiles,
126 DocumentationFiles: documentationFiles,
127 GuidanceFiles: guidanceFiles,
128 InjectFiles: injectFiles,
129 InjectFileContents: injectFileContents,
130 }, nil
131}
132
// categorizeFile classifies a repository-relative path (as printed by
// git ls-files) as "inject", "build", "documentation", or "guidance".
// It returns "" when the path fits none of those categories.
//
// Matching is case-insensitive and the first rule that applies wins:
//
//   - inject: a file directly in the repository root (no "/" in the path)
//     named claude.*.md / claude.md, starting with "dear_llm", or
//     containing "cursorrules".
//   - build: a file name starting with "makefile", or a path ending in
//     ".vscode/tasks.json".
//   - documentation: a file name starting with "readme" or "contributing".
//   - guidance: a claude.*.md / claude.md file located in a subdirectory.
func categorizeFile(path string) string {
	base := strings.ToLower(filepath.Base(path))

	// git ls-files reports paths relative to the repository root, so a path
	// without a separator names a file that sits directly in the root.
	atRoot := !strings.Contains(path, "/")
	claudeDoc := strings.HasPrefix(base, "claude.") && strings.HasSuffix(base, ".md")

	switch {
	case atRoot && (claudeDoc ||
		strings.HasPrefix(base, "dear_llm") ||
		strings.Contains(base, "cursorrules")):
		// Critical guidance that belongs in the system prompt itself.
		return "inject"

	case strings.HasPrefix(base, "makefile"),
		strings.HasSuffix(strings.ToLower(path), ".vscode/tasks.json"):
		return "build"

	case strings.HasPrefix(base, "readme"),
		strings.HasPrefix(base, "contributing"):
		return "documentation"

	case claudeDoc && !atRoot:
		// Helpful, but not critical enough to inject.
		return "guidance"

	default:
		return ""
	}
}
175
176// TopExtensions returns the top 5 most common file extensions in the codebase
177func (c *Codebase) TopExtensions() []string {
178 type extCount struct {
179 ext string
180 count int
181 }
182 pairs := make([]extCount, 0, len(c.ExtensionCounts))
183 for ext, count := range c.ExtensionCounts {
184 pairs = append(pairs, extCount{ext, count})
185 }
186
187 // Sort by count (descending), then by extension (ascending)
188 slices.SortFunc(pairs, func(a, b extCount) int {
189 return cmp.Or(
190 -cmp.Compare(a.count, b.count),
191 cmp.Compare(a.ext, b.ext),
192 )
193 })
194
195 const nTop = 5
196 count := min(nTop, len(pairs))
197 result := make([]string, count)
198 for i := range count {
199 result[i] = fmt.Sprintf("%v: %v (%0.0f%%)", pairs[i].ext, pairs[i].count, 100*float64(pairs[i].count)/float64(c.TotalFiles))
200 }
201
202 return result
203}