blob: 7ffade03f6dd6d44e9e90e808c499fc973640470 [file] [log] [blame]
Josh Bleecher Snydera997be62025-05-07 22:52:46 +00001// Package onstart provides codebase analysis used to inform the initial system prompt.
2package onstart
3
4import (
5 "bufio"
6 "cmp"
7 "context"
8 "fmt"
9 "io"
10 "os"
11 "os/exec"
12 "path/filepath"
13 "slices"
14 "strings"
15
16 "golang.org/x/sync/errgroup"
17)
18
// Codebase contains metadata about the codebase, as gathered by AnalyzeCodebase.
// All paths are relative to the repository root, in the form printed by git ls-files.
type Codebase struct {
	// ExtensionCounts tracks the number of files with each extension.
	// Keys are lowercased extensions including the leading dot (e.g. ".go");
	// files without an extension are counted under "<no-extension>".
	ExtensionCounts map[string]int
	// Total number of files analyzed (non-empty path lines seen).
	TotalFiles int
	// BuildFiles contains paths to build and configuration files
	// (makefiles and .vscode/tasks.json).
	BuildFiles []string
	// DocumentationFiles contains paths to documentation files
	// (readme and contributing files).
	DocumentationFiles []string
	// GuidanceFiles contains paths to files that provide context and guidance to LLMs
	// but are not injected into the system prompt.
	GuidanceFiles []string
	// InjectFiles contains paths to critical guidance files (like DEAR_LLM.md, claude.md, and cursorrules)
	// that need to be injected into the system prompt for highest visibility
	InjectFiles []string
	// InjectFileContents maps paths to file contents for critical inject files
	// to avoid requiring an extra file read during template rendering.
	// An inject file that could not be read has no entry here.
	InjectFileContents map[string]string
}
38
39// AnalyzeCodebase walks the codebase and analyzes the paths it finds.
40func AnalyzeCodebase(ctx context.Context, repoPath string) (*Codebase, error) {
41 // TODO: do a filesystem walk instead?
42 // There's a balance: git ls-files skips node_modules etc,
43 // but some guidance files might be locally .gitignored.
44 cmd := exec.Command("git", "ls-files")
45 cmd.Dir = repoPath
46
47 r, w := io.Pipe() // stream and scan rather than buffer
48 cmd.Stdout = w
49
50 err := cmd.Start()
51 if err != nil {
52 return nil, err
53 }
54
55 extCounts := make(map[string]int)
56 var buildFiles []string
57 var documentationFiles []string
58 var guidanceFiles []string
59 var injectFiles []string
60 injectFileContents := make(map[string]string)
61 var totalFiles int
62
63 eg, _ := errgroup.WithContext(ctx)
64
65 eg.Go(func() error {
66 defer r.Close()
67
68 scanner := bufio.NewScanner(r)
69 for scanner.Scan() {
70 file := scanner.Text()
71 file = strings.TrimSpace(file)
72 if file == "" {
73 continue
74 }
75 totalFiles++
76 ext := strings.ToLower(filepath.Ext(file))
77 ext = cmp.Or(ext, "<no-extension>")
78 extCounts[ext]++
79
80 fileCategory := categorizeFile(file)
81 // fmt.Println(file, "->", fileCategory)
82 switch fileCategory {
83 case "build":
84 buildFiles = append(buildFiles, file)
85 case "documentation":
86 documentationFiles = append(documentationFiles, file)
87 case "guidance":
88 guidanceFiles = append(guidanceFiles, file)
89 case "inject":
90 injectFiles = append(injectFiles, file)
91 }
92 }
93 return scanner.Err()
94 })
95
96 // Wait for the command to complete
97 eg.Go(func() error {
98 err := cmd.Wait()
99 if err != nil {
100 w.CloseWithError(err)
101 } else {
102 w.Close()
103 }
104 return err
105 })
106
107 if err := eg.Wait(); err != nil {
108 return nil, err
109 }
110
111 // Read content of inject files
112 for _, filePath := range injectFiles {
113 absPath := filepath.Join(repoPath, filePath)
114 content, err := os.ReadFile(absPath)
115 if err != nil {
116 fmt.Printf("Warning: Failed to read inject file %s: %v\n", filePath, err)
117 continue
118 }
119 injectFileContents[filePath] = string(content)
120 }
121
122 return &Codebase{
123 ExtensionCounts: extCounts,
124 TotalFiles: totalFiles,
125 BuildFiles: buildFiles,
126 DocumentationFiles: documentationFiles,
127 GuidanceFiles: guidanceFiles,
128 InjectFiles: injectFiles,
129 InjectFileContents: injectFileContents,
130 }, nil
131}
132
// categorizeFile classifies a path into one of four categories: "inject",
// "build", "documentation", or "guidance". It returns an empty string when the
// path fits none of them.
//
// The path parameter is relative to the repository root, as returned by
// git ls-files. Categories are tested in the order listed above and the first
// match wins.
func categorizeFile(path string) string {
	lowerPath := strings.ToLower(path)
	name := strings.ToLower(filepath.Base(path))

	// claude.*.md / agent.*.md style files (including plain claude.md and agent.md).
	agentDoc := strings.HasSuffix(name, ".md") &&
		(strings.HasPrefix(name, "claude.") || strings.HasPrefix(name, "agent."))

	// git ls-files paths are repo-relative, so a path without a separator
	// names a file directly in the repository root.
	atRepoRoot := !strings.Contains(path, "/")

	switch {
	// Critical guidance files in the repo root are injected into the system prompt.
	case atRepoRoot && (agentDoc ||
		strings.HasPrefix(name, "dear_llm") ||
		strings.Contains(name, "cursorrules")):
		return "inject"

	// GitHub Copilot: https://code.visualstudio.com/docs/copilot/copilot-customization
	case path == ".github/copilot-instructions.md":
		return "inject"

	// Build and configuration files.
	case strings.HasPrefix(name, "makefile"),
		strings.HasSuffix(lowerPath, ".vscode/tasks.json"):
		return "build"

	// General documentation files.
	case strings.HasPrefix(name, "readme"),
		strings.HasPrefix(name, "contributing"):
		return "documentation"

	// claude/agent markdown files outside the repo root: useful guidance,
	// but not critical enough to inject.
	case agentDoc:
		return "guidance"
	}

	return ""
}
182
183// TopExtensions returns the top 5 most common file extensions in the codebase
184func (c *Codebase) TopExtensions() []string {
185 type extCount struct {
186 ext string
187 count int
188 }
189 pairs := make([]extCount, 0, len(c.ExtensionCounts))
190 for ext, count := range c.ExtensionCounts {
191 pairs = append(pairs, extCount{ext, count})
192 }
193
194 // Sort by count (descending), then by extension (ascending)
195 slices.SortFunc(pairs, func(a, b extCount) int {
196 return cmp.Or(
197 -cmp.Compare(a.count, b.count),
198 cmp.Compare(a.ext, b.ext),
199 )
200 })
201
202 const nTop = 5
203 count := min(nTop, len(pairs))
204 result := make([]string, count)
205 for i := range count {
206 result[i] = fmt.Sprintf("%v: %v (%0.0f%%)", pairs[i].ext, pairs[i].count, 100*float64(pairs[i].count)/float64(c.TotalFiles))
207 }
208
209 return result
210}