blob: e63b38075f6dc50b1ea4ae591eaf1528f1d0e6ae [file] [log] [blame]
Josh Bleecher Snydera997be62025-05-07 22:52:46 +00001// Package onstart provides codebase analysis used to inform the initial system prompt.
2package onstart
3
4import (
5 "bufio"
Marc-Antoine Ruelde19aca2025-06-08 13:20:43 -04006 "bytes"
Josh Bleecher Snydera997be62025-05-07 22:52:46 +00007 "cmp"
8 "context"
9 "fmt"
10 "io"
11 "os"
12 "os/exec"
13 "path/filepath"
14 "slices"
15 "strings"
16
17 "golang.org/x/sync/errgroup"
18)
19
// Codebase is the result of analyzing a repository: per-extension file counts
// plus the paths of files that fall into one of the recognized categories.
// All paths are relative to the repository root.
type Codebase struct {
	// ExtensionCounts maps a file extension to the number of files carrying it.
	ExtensionCounts map[string]int

	// TotalFiles is the number of files that were analyzed.
	TotalFiles int

	// BuildFiles lists build and configuration files.
	BuildFiles []string

	// DocumentationFiles lists general documentation files.
	DocumentationFiles []string

	// GuidanceFiles lists files that give context and guidance to LLMs.
	GuidanceFiles []string

	// InjectFiles lists the critical guidance files (such as DEAR_LLM.md,
	// claude.md, and cursorrules) that are injected into the system prompt
	// for highest visibility.
	InjectFiles []string

	// InjectFileContents maps each inject file path to its contents, so that
	// template rendering does not need an extra file read.
	InjectFileContents map[string]string
}
39
40// AnalyzeCodebase walks the codebase and analyzes the paths it finds.
41func AnalyzeCodebase(ctx context.Context, repoPath string) (*Codebase, error) {
42 // TODO: do a filesystem walk instead?
43 // There's a balance: git ls-files skips node_modules etc,
44 // but some guidance files might be locally .gitignored.
Marc-Antoine Ruelde19aca2025-06-08 13:20:43 -040045 cmd := exec.Command("git", "ls-files", "-z")
Josh Bleecher Snydera997be62025-05-07 22:52:46 +000046 cmd.Dir = repoPath
47
48 r, w := io.Pipe() // stream and scan rather than buffer
49 cmd.Stdout = w
50
51 err := cmd.Start()
52 if err != nil {
53 return nil, err
54 }
55
56 extCounts := make(map[string]int)
57 var buildFiles []string
58 var documentationFiles []string
59 var guidanceFiles []string
60 var injectFiles []string
61 injectFileContents := make(map[string]string)
62 var totalFiles int
63
64 eg, _ := errgroup.WithContext(ctx)
65
66 eg.Go(func() error {
67 defer r.Close()
68
69 scanner := bufio.NewScanner(r)
Marc-Antoine Ruelde19aca2025-06-08 13:20:43 -040070 scanner.Split(scanZero)
Josh Bleecher Snydera997be62025-05-07 22:52:46 +000071 for scanner.Scan() {
72 file := scanner.Text()
73 file = strings.TrimSpace(file)
74 if file == "" {
75 continue
76 }
77 totalFiles++
78 ext := strings.ToLower(filepath.Ext(file))
79 ext = cmp.Or(ext, "<no-extension>")
80 extCounts[ext]++
81
82 fileCategory := categorizeFile(file)
83 // fmt.Println(file, "->", fileCategory)
84 switch fileCategory {
85 case "build":
86 buildFiles = append(buildFiles, file)
87 case "documentation":
88 documentationFiles = append(documentationFiles, file)
89 case "guidance":
90 guidanceFiles = append(guidanceFiles, file)
91 case "inject":
92 injectFiles = append(injectFiles, file)
93 }
94 }
95 return scanner.Err()
96 })
97
98 // Wait for the command to complete
99 eg.Go(func() error {
100 err := cmd.Wait()
101 if err != nil {
102 w.CloseWithError(err)
103 } else {
104 w.Close()
105 }
106 return err
107 })
108
109 if err := eg.Wait(); err != nil {
110 return nil, err
111 }
112
113 // Read content of inject files
114 for _, filePath := range injectFiles {
115 absPath := filepath.Join(repoPath, filePath)
116 content, err := os.ReadFile(absPath)
117 if err != nil {
118 fmt.Printf("Warning: Failed to read inject file %s: %v\n", filePath, err)
119 continue
120 }
121 injectFileContents[filePath] = string(content)
122 }
123
124 return &Codebase{
125 ExtensionCounts: extCounts,
126 TotalFiles: totalFiles,
127 BuildFiles: buildFiles,
128 DocumentationFiles: documentationFiles,
129 GuidanceFiles: guidanceFiles,
130 InjectFiles: injectFiles,
131 InjectFileContents: injectFileContents,
132 }, nil
133}
134
// categorizeFile classifies path as "build", "documentation", "guidance", or
// "inject", and returns "" when it fits none of them.
//
// The path parameter is relative to the repository root, as returned by
// git ls-files. Apart from the GitHub Copilot instructions file, which must
// match exactly, all name checks are case-insensitive. The rules are tried in
// the order inject, build, documentation, guidance; the first match wins.
func categorizeFile(path string) string {
	base := strings.ToLower(filepath.Base(path))
	lowered := strings.ToLower(path)

	// claude.*.md and agent.*.md are injected when they sit in the repository
	// root and count as plain guidance anywhere else.
	agentDoc := strings.HasSuffix(base, ".md") &&
		(strings.HasPrefix(base, "claude.") || strings.HasPrefix(base, "agent."))

	// git ls-files reports paths relative to the repository root, so a path
	// without a separator names a file directly in the root.
	inRoot := !strings.Contains(path, "/")

	switch {
	case inRoot && (agentDoc || strings.HasPrefix(base, "dear_llm") || strings.Contains(base, "cursorrules")):
		// Critical guidance files that get injected into the system prompt.
		return "inject"
	case path == ".github/copilot-instructions.md":
		// GitHub Copilot: https://code.visualstudio.com/docs/copilot/copilot-customization
		return "inject"
	case strings.HasPrefix(base, "makefile"), strings.HasSuffix(lowered, ".vscode/tasks.json"):
		// Build and configuration files.
		return "build"
	case strings.HasPrefix(base, "readme"), strings.HasPrefix(base, "contributing"):
		// General documentation files.
		return "documentation"
	case agentDoc:
		// Guidance that is not critical enough to inject.
		return "guidance"
	default:
		return ""
	}
}
184
185// TopExtensions returns the top 5 most common file extensions in the codebase
186func (c *Codebase) TopExtensions() []string {
187 type extCount struct {
188 ext string
189 count int
190 }
191 pairs := make([]extCount, 0, len(c.ExtensionCounts))
192 for ext, count := range c.ExtensionCounts {
193 pairs = append(pairs, extCount{ext, count})
194 }
195
196 // Sort by count (descending), then by extension (ascending)
197 slices.SortFunc(pairs, func(a, b extCount) int {
198 return cmp.Or(
199 -cmp.Compare(a.count, b.count),
200 cmp.Compare(a.ext, b.ext),
201 )
202 })
203
204 const nTop = 5
205 count := min(nTop, len(pairs))
206 result := make([]string, count)
207 for i := range count {
208 result[i] = fmt.Sprintf("%v: %v (%0.0f%%)", pairs[i].ext, pairs[i].count, 100*float64(pairs[i].count)/float64(c.TotalFiles))
209 }
210
211 return result
212}
Marc-Antoine Ruelde19aca2025-06-08 13:20:43 -0400213
// scanZero is a split function for a bufio.Scanner that yields NUL-terminated
// tokens with the terminating NUL byte removed. Trailing data that is not
// NUL-terminated is returned as a final token once the input has ended; empty
// input at EOF produces no token.
func scanZero(data []byte, atEOF bool) (advance int, token []byte, err error) {
	switch nul := bytes.IndexByte(data, 0); {
	case nul >= 0:
		// A complete token: consume it together with its terminator.
		return nul + 1, data[:nul], nil
	case atEOF && len(data) > 0:
		// The input ended without a terminator; hand back what is left.
		return len(data), data, nil
	default:
		// Either nothing is left or the token is still incomplete:
		// ask for more data.
		return 0, nil, nil
	}
}