claudetool/onstart: add codebase analysis tool and basic memory support
This is a preliminary approach. Big high level questions include:
* should we keep the multichoice tool prompting?
* should we push the list of quidance files or respond with them during codereview?
* should we use the list of docs and build files at all?
* are there other files we should hint (e.g. editor settings, something from aider, etc.)?
We should probably also blog about dear_llm.md to stop the endless proliferation of new files.
Co-Authored-By: sketch <hello@sketch.dev>
diff --git a/claudetool/onstart/analyze.go b/claudetool/onstart/analyze.go
new file mode 100644
index 0000000..4b573d9
--- /dev/null
+++ b/claudetool/onstart/analyze.go
@@ -0,0 +1,203 @@
+// Package onstart provides codebase analysis used to inform the initial system prompt.
+package onstart
+
+import (
+ "bufio"
+ "cmp"
+ "context"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "slices"
+ "strings"
+
+ "golang.org/x/sync/errgroup"
+)
+
+// Codebase contains metadata about the codebase.
+type Codebase struct {
+ // ExtensionCounts tracks the number of files with each extension
+ ExtensionCounts map[string]int
+ // Total number of files analyzed
+ TotalFiles int
+ // BuildFiles contains paths to build and configuration files
+ BuildFiles []string
+ // DocumentationFiles contains paths to documentation files
+ DocumentationFiles []string
+ // GuidanceFiles contains paths to files that provide context and guidance to LLMs
+ GuidanceFiles []string
+ // InjectFiles contains paths to critical guidance files (like DEAR_LLM.md, claude.md, and cursorrules)
+ // that need to be injected into the system prompt for highest visibility
+ InjectFiles []string
+ // InjectFileContents maps paths to file contents for critical inject files
+ // to avoid requiring an extra file read during template rendering
+ InjectFileContents map[string]string
+}
+
+// AnalyzeCodebase walks the codebase and analyzes the paths it finds.
+func AnalyzeCodebase(ctx context.Context, repoPath string) (*Codebase, error) {
+ // TODO: do a filesystem walk instead?
+ // There's a balance: git ls-files skips node_modules etc,
+ // but some guidance files might be locally .gitignored.
+ cmd := exec.Command("git", "ls-files")
+ cmd.Dir = repoPath
+
+ r, w := io.Pipe() // stream and scan rather than buffer
+ cmd.Stdout = w
+
+ err := cmd.Start()
+ if err != nil {
+ return nil, err
+ }
+
+ extCounts := make(map[string]int)
+ var buildFiles []string
+ var documentationFiles []string
+ var guidanceFiles []string
+ var injectFiles []string
+ injectFileContents := make(map[string]string)
+ var totalFiles int
+
+ eg, _ := errgroup.WithContext(ctx)
+
+ eg.Go(func() error {
+ defer r.Close()
+
+ scanner := bufio.NewScanner(r)
+ for scanner.Scan() {
+ file := scanner.Text()
+ file = strings.TrimSpace(file)
+ if file == "" {
+ continue
+ }
+ totalFiles++
+ ext := strings.ToLower(filepath.Ext(file))
+ ext = cmp.Or(ext, "<no-extension>")
+ extCounts[ext]++
+
+ fileCategory := categorizeFile(file)
+ // fmt.Println(file, "->", fileCategory)
+ switch fileCategory {
+ case "build":
+ buildFiles = append(buildFiles, file)
+ case "documentation":
+ documentationFiles = append(documentationFiles, file)
+ case "guidance":
+ guidanceFiles = append(guidanceFiles, file)
+ case "inject":
+ injectFiles = append(injectFiles, file)
+ }
+ }
+ return scanner.Err()
+ })
+
+ // Wait for the command to complete
+ eg.Go(func() error {
+ err := cmd.Wait()
+ if err != nil {
+ w.CloseWithError(err)
+ } else {
+ w.Close()
+ }
+ return err
+ })
+
+ if err := eg.Wait(); err != nil {
+ return nil, err
+ }
+
+ // Read content of inject files
+ for _, filePath := range injectFiles {
+ absPath := filepath.Join(repoPath, filePath)
+ content, err := os.ReadFile(absPath)
+ if err != nil {
+ fmt.Printf("Warning: Failed to read inject file %s: %v\n", filePath, err)
+ continue
+ }
+ injectFileContents[filePath] = string(content)
+ }
+
+ return &Codebase{
+ ExtensionCounts: extCounts,
+ TotalFiles: totalFiles,
+ BuildFiles: buildFiles,
+ DocumentationFiles: documentationFiles,
+ GuidanceFiles: guidanceFiles,
+ InjectFiles: injectFiles,
+ InjectFileContents: injectFileContents,
+ }, nil
+}
+
+// categorizeFile categorizes a file into one of four categories: build, documentation, guidance, or inject.
+// Returns an empty string if the file doesn't belong to any of these categories.
+// categorizeFile categorizes a file into one of four categories: build, documentation, guidance, or inject.
+// Returns an empty string if the file doesn't belong to any of these categories.
+// The path parameter is relative to the repository root as returned by git ls-files.
+func categorizeFile(path string) string {
+ filename := filepath.Base(path)
+ lowerPath := strings.ToLower(path)
+ lowerFilename := strings.ToLower(filename)
+
+ // InjectFiles - critical guidance files that should be injected into the system prompt
+ // These are repository root files only - files directly in the repo root, not in subdirectories
+ // Since git ls-files returns paths relative to repo root, we just need to check for absence of path separators
+ isRepoRootFile := !strings.Contains(path, "/")
+ if isRepoRootFile {
+ if (strings.HasPrefix(lowerFilename, "claude.") && strings.HasSuffix(lowerFilename, ".md")) ||
+ strings.HasPrefix(lowerFilename, "dear_llm") ||
+ strings.Contains(lowerFilename, "cursorrules") {
+ return "inject"
+ }
+ }
+
+ // BuildFiles - build and configuration files
+ if strings.HasPrefix(lowerFilename, "makefile") ||
+ strings.HasSuffix(lowerPath, ".vscode/tasks.json") {
+ return "build"
+ }
+
+ // DocumentationFiles - general documentation files
+ if strings.HasPrefix(lowerFilename, "readme") ||
+ strings.HasPrefix(lowerFilename, "contributing") {
+ return "documentation"
+ }
+
+ // GuidanceFiles - other files that provide guidance but aren't critical enough to inject
+ // Non-root directory claude.md files, and other guidance files
+ if !isRepoRootFile && strings.HasPrefix(lowerFilename, "claude.") && strings.HasSuffix(lowerFilename, ".md") {
+ return "guidance"
+ }
+
+ return ""
+}
+
+// TopExtensions returns the top 5 most common file extensions in the codebase
+func (c *Codebase) TopExtensions() []string {
+ type extCount struct {
+ ext string
+ count int
+ }
+ pairs := make([]extCount, 0, len(c.ExtensionCounts))
+ for ext, count := range c.ExtensionCounts {
+ pairs = append(pairs, extCount{ext, count})
+ }
+
+ // Sort by count (descending), then by extension (ascending)
+ slices.SortFunc(pairs, func(a, b extCount) int {
+ return cmp.Or(
+ -cmp.Compare(a.count, b.count),
+ cmp.Compare(a.ext, b.ext),
+ )
+ })
+
+ const nTop = 5
+ count := min(nTop, len(pairs))
+ result := make([]string, count)
+ for i := range count {
+ result[i] = fmt.Sprintf("%v: %v (%0.0f%%)", pairs[i].ext, pairs[i].count, 100*float64(pairs[i].count)/float64(c.TotalFiles))
+ }
+
+ return result
+}