claudetool: add "related files" detection to codereview tool
diff --git a/claudetool/differential.go b/claudetool/differential.go
index 014fc13..710a09d 100644
--- a/claudetool/differential.go
+++ b/claudetool/differential.go
@@ -2,6 +2,7 @@
import (
"bytes"
+ "cmp"
"context"
"encoding/json"
"fmt"
@@ -12,8 +13,10 @@
"os"
"os/exec"
"path/filepath"
+ "runtime"
"slices"
"strings"
+ "sync"
"time"
"golang.org/x/tools/go/packages"
@@ -78,7 +81,20 @@
}
allPkgList := slices.Collect(maps.Keys(allPkgs))
- var msgs []string
+ var errorMessages []string // problems we want the model to address
+ var infoMessages []string // info the model should consider
+
+ // Find potentially related files that should also be considered
+ // TODO: add some caching here, since this depends only on the initial commit and the changed files, not the details of the changes
+ relatedFiles, err := r.findRelatedFiles(ctx, changedFiles)
+ if err != nil {
+ slog.DebugContext(ctx, "CodeReviewer.Run: failed to find related files", "err", err)
+ } else {
+ relatedMsg := r.formatRelatedFiles(relatedFiles)
+ if relatedMsg != "" {
+ infoMessages = append(infoMessages, relatedMsg)
+ }
+ }
testMsg, err := r.checkTests(ctx, allPkgList)
if err != nil {
@@ -86,7 +102,7 @@
return "", err
}
if testMsg != "" {
- msgs = append(msgs, testMsg)
+ errorMessages = append(errorMessages, testMsg)
}
goplsMsg, err := r.checkGopls(ctx, changedFiles) // includes vet checks
@@ -95,17 +111,24 @@
return "", err
}
if goplsMsg != "" {
- msgs = append(msgs, goplsMsg)
+ errorMessages = append(errorMessages, goplsMsg)
}
- if len(msgs) == 0 {
- slog.DebugContext(ctx, "CodeReviewer.Run: no issues found")
- return "OK", nil
+ buf := new(strings.Builder)
+ if len(infoMessages) > 0 {
+ buf.WriteString("# Info\n\n")
+ buf.WriteString(strings.Join(infoMessages, "\n\n"))
+ buf.WriteString("\n\n")
}
-
- msgs = append(msgs, "Please fix before proceeding.")
- slog.DebugContext(ctx, "CodeReviewer.Run: found issues", "issues", msgs)
- return strings.Join(msgs, "\n\n"), nil
+ if len(errorMessages) > 0 {
+ buf.WriteString("# Errors\n\n")
+ buf.WriteString(strings.Join(errorMessages, "\n\n"))
+ buf.WriteString("\n\nPlease fix before proceeding.\n")
+ }
+ if buf.Len() == 0 {
+ buf.WriteString("OK")
+ }
+ return buf.String(), nil
}
func (r *CodeReviewer) initializeInitialCommitWorktree(ctx context.Context) error {
@@ -845,3 +868,161 @@
return buf.String()
}
+
+// RelatedFile represents a file historically related to the changed files
+type RelatedFile struct {
+ Path string // Path to the file
+ Correlation float64 // Correlation score (0.0-1.0)
+}
+
+// findRelatedFiles identifies files that are historically related to the changed files
+// by analyzing git commit history for co-occurrences.
+func (r *CodeReviewer) findRelatedFiles(ctx context.Context, changedFiles []string) ([]RelatedFile, error) {
+ commits, err := r.getCommitsTouchingFiles(ctx, changedFiles)
+ if err != nil {
+ return nil, fmt.Errorf("failed to get commits touching files: %w", err)
+ }
+ if len(commits) == 0 {
+ return nil, nil
+ }
+
+ relChanged := make(map[string]bool, len(changedFiles))
+ for _, file := range changedFiles {
+ rel, err := filepath.Rel(r.repoRoot, file)
+ if err != nil {
+ return nil, fmt.Errorf("failed to get relative path for %s: %w", file, err)
+ }
+ relChanged[rel] = true
+ }
+
+ historyFiles := make(map[string]int)
+ var historyMu sync.Mutex
+
+ maxWorkers := runtime.GOMAXPROCS(0)
+ semaphore := make(chan bool, maxWorkers)
+ var wg sync.WaitGroup
+
+ for _, commit := range commits {
+ wg.Add(1)
+ semaphore <- true // acquire
+
+ go func(commit string) {
+ defer wg.Done()
+ defer func() { <-semaphore }() // release
+ commitFiles, err := r.getFilesInCommit(ctx, commit)
+ if err != nil {
+ slog.WarnContext(ctx, "Failed to get files in commit", "commit", commit, "err", err)
+ return
+ }
+ incr := 0
+ for _, file := range commitFiles {
+ if relChanged[file] {
+ incr++
+ }
+ }
+ if incr == 0 {
+ return
+ }
+ historyMu.Lock()
+ defer historyMu.Unlock()
+ for _, file := range commitFiles {
+ historyFiles[file] += incr
+ }
+ }(commit)
+ }
+ wg.Wait()
+
+ // normalize
+ maxCount := 0
+ for _, count := range historyFiles {
+ maxCount = max(maxCount, count)
+ }
+ if maxCount == 0 {
+ return nil, nil
+ }
+
+ var relatedFiles []RelatedFile
+ for file, count := range historyFiles {
+ if relChanged[file] {
+ // Don't include inputs in the output.
+ continue
+ }
+ correlation := float64(count) / float64(maxCount)
+ // Require min correlation to avoid noise
+ if correlation >= 0.1 {
+ relatedFiles = append(relatedFiles, RelatedFile{Path: file, Correlation: correlation})
+ }
+ }
+
+ // Highest correlation first
+ slices.SortFunc(relatedFiles, func(a, b RelatedFile) int {
+ return -1 * cmp.Compare(a.Correlation, b.Correlation)
+ })
+
+ // Limit to 1 correlated file per input file.
+ // (Arbitrary limit, to be adjusted.)
+ maxFiles := len(changedFiles)
+ if len(relatedFiles) > maxFiles {
+ relatedFiles = relatedFiles[:maxFiles]
+ }
+
+ // TODO: add an LLM in the mix here (like the keyword search tool) to do a filtering pass,
+ // and then increase the strength of the wording in the relatedFiles message.
+
+ return relatedFiles, nil
+}
+
+// getCommitsTouchingFiles returns all commits that touch any of the specified files
+func (r *CodeReviewer) getCommitsTouchingFiles(ctx context.Context, files []string) ([]string, error) {
+ if len(files) == 0 {
+ return nil, nil
+ }
+ fileArgs := append([]string{"rev-list", "--all", "--"}, files...)
+ cmd := exec.CommandContext(ctx, "git", fileArgs...)
+ cmd.Dir = r.repoRoot
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ return nil, fmt.Errorf("failed to get commits: %w\n%s", err, out)
+ }
+ return nonEmptyTrimmedLines(out), nil
+}
+
+// getFilesInCommit returns all files changed in a specific commit
+func (r *CodeReviewer) getFilesInCommit(ctx context.Context, commit string) ([]string, error) {
+ cmd := exec.CommandContext(ctx, "git", "diff-tree", "--no-commit-id", "--name-only", "-r", commit)
+ cmd.Dir = r.repoRoot
+ out, err := cmd.CombinedOutput()
+ if err != nil {
+ return nil, fmt.Errorf("failed to get files in commit: %w\n%s", err, out)
+ }
+ return nonEmptyTrimmedLines(out), nil
+}
+
+func nonEmptyTrimmedLines(b []byte) []string {
+ var lines []string
+ for line := range strings.Lines(string(b)) {
+ line = strings.TrimSpace(line)
+ if line != "" {
+ lines = append(lines, line)
+ }
+ }
+ return lines
+}
+
+// formatRelatedFiles formats the related files list into a human-readable message
+func (r *CodeReviewer) formatRelatedFiles(files []RelatedFile) string {
+ if len(files) == 0 {
+ return ""
+ }
+
+ buf := new(strings.Builder)
+
+ fmt.Fprintf(buf, "Potentially related files:\n\n")
+
+ for _, file := range files {
+ fmt.Fprintf(buf, "- %s (%0.0f%%)\n", file.Path, 100*file.Correlation)
+ }
+
+ fmt.Fprintf(buf, "\nThese files have historically changed with the files you have modified. Consider whether they require updates as well.\n")
+ return buf.String()
+}
diff --git a/claudetool/testdata/add_skipped_test.txtar b/claudetool/testdata/add_skipped_test.txtar
index 95000cc..099ebdd 100644
--- a/claudetool/testdata/add_skipped_test.txtar
+++ b/claudetool/testdata/add_skipped_test.txtar
@@ -19,6 +19,8 @@
Add skipped test
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (2cdfdc5009e364add9a263909472f85eb08480ed) and HEAD:
1: sketch.dev.TestP: New test is skipped
diff --git a/claudetool/testdata/empty_testdir_add_file.txtar b/claudetool/testdata/empty_testdir_add_file.txtar
index b754f3d..7c2be2d 100644
--- a/claudetool/testdata/empty_testdir_add_file.txtar
+++ b/claudetool/testdata/empty_testdir_add_file.txtar
@@ -29,6 +29,8 @@
Add non-empty file to testdata
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (cdc9ec6dfb8669c11d8aa0df49c72112627784dc) and HEAD:
1: sketch.dev.TestEmptyTestdata: Was passing, now failing
diff --git a/claudetool/testdata/gopls_issues.txtar b/claudetool/testdata/gopls_issues.txtar
index 56f1557..c0250ac 100644
--- a/claudetool/testdata/gopls_issues.txtar
+++ b/claudetool/testdata/gopls_issues.txtar
@@ -15,6 +15,8 @@
Add file with gopls issues
-- .run_test --
+# Errors
+
Gopls check issues detected:
1. /PATH/TO/REPO/p.go:5:2-22: unreachable code
diff --git a/claudetool/testdata/gopls_new_issue_with_existing.txtar b/claudetool/testdata/gopls_new_issue_with_existing.txtar
index d47fb02..594ea93 100644
--- a/claudetool/testdata/gopls_new_issue_with_existing.txtar
+++ b/claudetool/testdata/gopls_new_issue_with_existing.txtar
@@ -26,6 +26,8 @@
Add a new function with a new gopls issue
-- .run_test --
+# Errors
+
Gopls check issues detected:
1. /PATH/TO/REPO/p.go:9:2-31: unreachable code
diff --git a/claudetool/testdata/mark_test_skipped.txtar b/claudetool/testdata/mark_test_skipped.txtar
index bf887e3..d834ae4 100644
--- a/claudetool/testdata/mark_test_skipped.txtar
+++ b/claudetool/testdata/mark_test_skipped.txtar
@@ -24,6 +24,8 @@
Skip test
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (851ba0628e0e2d93e27e168d6af336f3e4d375c7) and HEAD:
1: sketch.dev.TestP: Was passing, now skipped
diff --git a/claudetool/testdata/multi_commit_review.txtar b/claudetool/testdata/multi_commit_review.txtar
index 2286dee..cbf2464 100644
--- a/claudetool/testdata/multi_commit_review.txtar
+++ b/claudetool/testdata/multi_commit_review.txtar
@@ -36,6 +36,8 @@
Add yet another file
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (a4fd3c2166b35e02a2cde466880e45aea6e6212e) and HEAD:
1: sketch.dev.TestX: New test is failing
diff --git a/claudetool/testdata/new_test_build_error.txtar b/claudetool/testdata/new_test_build_error.txtar
index 2fe1d34..2405c49 100644
--- a/claudetool/testdata/new_test_build_error.txtar
+++ b/claudetool/testdata/new_test_build_error.txtar
@@ -19,6 +19,8 @@
Add test with build error
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (cf75ed70e940ce33a17a70b1894ad053b67731c0) and HEAD:
1: sketch.dev: Previously had no tests, now has build/vet errors
diff --git a/claudetool/testdata/no_tests_to_failing_tests.txtar b/claudetool/testdata/no_tests_to_failing_tests.txtar
index b0cc956..6161586 100644
--- a/claudetool/testdata/no_tests_to_failing_tests.txtar
+++ b/claudetool/testdata/no_tests_to_failing_tests.txtar
@@ -19,6 +19,8 @@
Add failing test
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (cf75ed70e940ce33a17a70b1894ad053b67731c0) and HEAD:
1: sketch.dev.TestP: New test is failing
diff --git a/claudetool/testdata/passing_to_build_error.txtar b/claudetool/testdata/passing_to_build_error.txtar
index 9558862..6c7d1d0 100644
--- a/claudetool/testdata/passing_to_build_error.txtar
+++ b/claudetool/testdata/passing_to_build_error.txtar
@@ -24,6 +24,8 @@
Break test with build error
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (149798f10aaf06bc4700bb62d46ed6d8d88e35ad) and HEAD:
1: sketch.dev: Was passing, now has build/vet errors
diff --git a/claudetool/testdata/passing_to_failing.txtar b/claudetool/testdata/passing_to_failing.txtar
index 31a0556..50afae7 100644
--- a/claudetool/testdata/passing_to_failing.txtar
+++ b/claudetool/testdata/passing_to_failing.txtar
@@ -24,6 +24,8 @@
Break test
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (149798f10aaf06bc4700bb62d46ed6d8d88e35ad) and HEAD:
1: sketch.dev.TestP: Was passing, now failing
diff --git a/claudetool/testdata/passing_to_failing_subdir.txtar b/claudetool/testdata/passing_to_failing_subdir.txtar
index 7649554..1211912 100644
--- a/claudetool/testdata/passing_to_failing_subdir.txtar
+++ b/claudetool/testdata/passing_to_failing_subdir.txtar
@@ -24,6 +24,8 @@
Break test
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (8c6c71d0beac5e27b87dfc3f2fd9d274b62c3d3a) and HEAD:
1: sketch.dev/foo.TestP: Was passing, now failing
diff --git a/claudetool/testdata/related_files_cooccurrence.txtar b/claudetool/testdata/related_files_cooccurrence.txtar
new file mode 100644
index 0000000..b15f325
--- /dev/null
+++ b/claudetool/testdata/related_files_cooccurrence.txtar
@@ -0,0 +1,103 @@
+Tests related files identification based on historical co-occurrence
+
+-- a.go --
+package main
+
+func a() {}
+
+-- b.go --
+package main
+
+func b() {}
+
+-- c.go --
+package main
+
+func c() {}
+
+-- p.go --
+package p
+
+func d() {}
+
+-- .commit --
+Add functions to a.go and b.go
+
+-- a.go --
+package main
+
+func a() {
+ // Update 1
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 1
+}
+
+-- .commit --
+Add functions to a.go and b.go again
+
+-- a.go --
+package main
+
+func a() {
+ // Update 2
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 2
+}
+
+-- .commit --
+Add functions to a.go and c.go
+
+-- a.go --
+package main
+
+func a() {
+ // Update 3
+}
+
+-- c.go --
+package main
+
+func c() {
+ // Update 1
+}
+
+-- .commit --
+Update file a.go only
+
+-- a.go --
+package main
+
+func a() {
+ x := 42 // new gopls issue to view mixed info/error lines
+}
+
+-- .run_test --
+# Info
+
+Potentially related files:
+
+- p.go (33%)
+
+These files have historically changed with the files you have modified. Consider whether they require updates as well.
+
+
+# Errors
+
+Gopls check issues detected:
+
+1. /PATH/TO/REPO/a.go:4:5-6: declared and not used: x
+
+IMPORTANT: Only fix new gopls check issues in parts of the code that you have already edited. Do not change existing code that was not part of your current edits.
+
+
+Please fix before proceeding.
diff --git a/claudetool/testdata/skipped_to_build_error.txtar b/claudetool/testdata/skipped_to_build_error.txtar
index f648572..671b53b 100644
--- a/claudetool/testdata/skipped_to_build_error.txtar
+++ b/claudetool/testdata/skipped_to_build_error.txtar
@@ -25,6 +25,8 @@
Change skipped test to test with build error
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (3635353dfefb2fa652728318567209ccfc4aba42) and HEAD:
1: sketch.dev: Was passing, now has build/vet errors
diff --git a/claudetool/testdata/skipped_to_failing.txtar b/claudetool/testdata/skipped_to_failing.txtar
index 50348d2..98935e8 100644
--- a/claudetool/testdata/skipped_to_failing.txtar
+++ b/claudetool/testdata/skipped_to_failing.txtar
@@ -25,6 +25,8 @@
Change skipped test to failing test
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (3635353dfefb2fa652728318567209ccfc4aba42) and HEAD:
1: sketch.dev.TestP: Was skipped, now failing
diff --git a/claudetool/testdata/vet_error_test.txtar b/claudetool/testdata/vet_error_test.txtar
index 8c9dc64..0163a81 100644
--- a/claudetool/testdata/vet_error_test.txtar
+++ b/claudetool/testdata/vet_error_test.txtar
@@ -21,6 +21,8 @@
Add vet error
-- .run_test --
+# Errors
+
Test regressions detected between initial commit (656e4d573cb77a30ebce29ebb974533ece282946) and HEAD:
1: sketch.dev: Previously had no tests, now has build/vet errors