claudetool/codereview: add caching in findRelatedFiles
Primary goal is latency reduction.
Also slightly reduces context usage.
Co-Authored-By: sketch <hello@sketch.dev>
Change-ID: sa1007d82a5165ab4k
diff --git a/claudetool/codereview/codereview.go b/claudetool/codereview/codereview.go
index c729c30..d99edff 100644
--- a/claudetool/codereview/codereview.go
+++ b/claudetool/codereview/codereview.go
@@ -21,12 +21,17 @@
initialStatus []fileStatus // git status of files at initial commit, absolute paths
reviewed []string // history of all commits which have been reviewed
initialWorktree string // git worktree at initial commit, absolute path
+ // "Related files" caching
+ processedChangedFileSets map[string]bool // hash of sorted changedFiles -> processed
+ reportedRelatedFiles map[string]bool // file path -> reported
}
func NewCodeReviewer(ctx context.Context, repoRoot, sketchBaseRef string) (*CodeReviewer, error) {
r := &CodeReviewer{
- repoRoot: repoRoot,
- sketchBaseRef: sketchBaseRef,
+ repoRoot: repoRoot,
+ sketchBaseRef: sketchBaseRef,
+ processedChangedFileSets: make(map[string]bool),
+ reportedRelatedFiles: make(map[string]bool),
}
if r.repoRoot == "" {
return nil, fmt.Errorf("NewCodeReviewer: repoRoot must be non-empty")
diff --git a/claudetool/codereview/codereview_test.go b/claudetool/codereview/codereview_test.go
index 3b04898..f9d240e 100644
--- a/claudetool/codereview/codereview_test.go
+++ b/claudetool/codereview/codereview_test.go
@@ -116,6 +116,7 @@
// processTestFiles processes the files in the txtar archive in sequence.
func processTestFiles(t *testing.T, archive *txtar.Archive, dir string, update bool) error {
var initialCommit string
+ var reviewer *CodeReviewer
filesForNextCommit := make(map[string]bool)
for i, file := range archive.Files {
@@ -127,14 +128,18 @@
}
clear(filesForNextCommit)
initialCommit = cmp.Or(initialCommit, commit)
- // fmt.Println("initial commit:", initialCommit)
- // cmd := exec.Command("git", "log")
- // cmd.Dir = dir
- // cmd.Stdout = os.Stdout
- // cmd.Run()
+ if reviewer == nil && initialCommit != "" {
+ reviewer, err = NewCodeReviewer(context.Background(), dir, initialCommit)
+ if err != nil {
+ return fmt.Errorf("error creating code reviewer: %w", err)
+ }
+ }
case ".run_test":
- got, err := runDifferentialTest(dir, initialCommit)
+ if reviewer == nil {
+ return fmt.Errorf("no code reviewer available, need initial commit first")
+ }
+ got, err := runDifferentialTest(reviewer)
if err != nil {
return fmt.Errorf("error running differential test: %w", err)
}
@@ -153,11 +158,11 @@
}
case ".run_autoformat":
- if initialCommit == "" {
- return fmt.Errorf("initial commit not set, cannot run autoformat")
+ if reviewer == nil {
+ return fmt.Errorf("no code reviewer available, need initial commit first")
}
- got, err := runAutoformat(dir, initialCommit)
+ got, err := runAutoformat(reviewer)
if err != nil {
return fmt.Errorf("error running autoformat: %w", err)
}
@@ -227,19 +232,8 @@
}
// runDifferentialTest runs the code review tool on the repository and returns the result.
-func runDifferentialTest(dir, initialCommit string) (string, error) {
- if initialCommit == "" {
- return "", fmt.Errorf("initial commit not set, cannot run differential test")
- }
-
- // Create a code reviewer for the repository
+func runDifferentialTest(reviewer *CodeReviewer) (string, error) {
ctx := context.Background()
- reviewer, err := NewCodeReviewer(ctx, dir, initialCommit)
- if err != nil {
- return "", fmt.Errorf("error creating code reviewer: %w", err)
- }
-
- // Run the code review
result, err := reviewer.Run(ctx, nil)
if err != nil {
return "", fmt.Errorf("error running code review: %w", err)
@@ -250,6 +244,7 @@
if len(result) > 0 {
resultStr = result[0].Text
}
+ dir := reviewer.repoRoot
normalized := normalizePaths(resultStr, dir)
return normalized, nil
}
@@ -267,16 +262,10 @@
}
// runAutoformat runs the autoformat function on the repository and returns the list of formatted files.
-func runAutoformat(dir, initialCommit string) ([]string, error) {
- if initialCommit == "" {
- return nil, fmt.Errorf("initial commit not set, cannot run autoformat")
- }
+func runAutoformat(reviewer *CodeReviewer) ([]string, error) {
ctx := context.Background()
- reviewer, err := NewCodeReviewer(ctx, dir, initialCommit)
- if err != nil {
- return nil, fmt.Errorf("error creating code reviewer: %w", err)
- }
formattedFiles := reviewer.autoformat(ctx)
+ dir := reviewer.repoRoot
normalizedFiles := make([]string, len(formattedFiles))
for i, file := range formattedFiles {
normalizedFiles[i] = normalizePaths(file, dir)
diff --git a/claudetool/codereview/differential.go b/claudetool/codereview/differential.go
index ff25b22..77dde85 100644
--- a/claudetool/codereview/differential.go
+++ b/claudetool/codereview/differential.go
@@ -4,6 +4,8 @@
"bytes"
"cmp"
"context"
+ "crypto/sha256"
+ "encoding/hex"
"encoding/json"
"fmt"
"io"
@@ -968,9 +970,59 @@
Correlation float64 // Correlation score (0.0-1.0)
}
-// findRelatedFiles identifies files that are historically related to the changed files
+// hashChangedFiles creates a deterministic hash of the changed files set
+func (r *CodeReviewer) hashChangedFiles(changedFiles []string) string {
+ // Sort files for deterministic hashing
+ sorted := slices.Clone(changedFiles)
+ slices.Sort(sorted)
+ h := sha256.New()
+ enc := json.NewEncoder(h)
+ err := enc.Encode(sorted)
+ if err != nil {
+ panic(err)
+ }
+ return hex.EncodeToString(h.Sum(nil))
+}
+
+// findRelatedFiles reports files that are historically related to the changed files
// by analyzing git commit history for co-occurrences.
+// This function implements caching to avoid duplicate CPU and LLM processing:
+// 1. If the exact same set of changedFiles has been processed before, return nil, nil
+// 2. If all related files have been previously reported, return nil, nil
+// 3. Otherwise, return the full set of related files and mark them as reported
func (r *CodeReviewer) findRelatedFiles(ctx context.Context, changedFiles []string) ([]RelatedFile, error) {
+ cf := r.hashChangedFiles(changedFiles)
+ if r.processedChangedFileSets[cf] {
+ return nil, nil
+ }
+ r.processedChangedFileSets[cf] = true // don't re-process, even on error
+
+ relatedFiles, err := r.computeRelatedFiles(ctx, changedFiles)
+ if err != nil {
+ return nil, err
+ }
+
+ hasNew := false
+ for _, rf := range relatedFiles {
+ if !r.reportedRelatedFiles[rf.Path] {
+ hasNew = true
+ break
+ }
+ }
+ if !hasNew {
+ return nil, nil
+ }
+
+ // We have new file(s) that haven't been called to the LLM's attention yet.
+ for _, rf := range relatedFiles {
+ r.reportedRelatedFiles[rf.Path] = true
+ }
+
+ return relatedFiles, nil
+}
+
+// computeRelatedFiles implements findRelatedFiles, without caching.
+func (r *CodeReviewer) computeRelatedFiles(ctx context.Context, changedFiles []string) ([]RelatedFile, error) {
commits, err := r.getCommitsTouchingFiles(ctx, changedFiles)
if err != nil {
return nil, fmt.Errorf("failed to get commits touching files: %w", err)
diff --git a/claudetool/codereview/differential_test.go b/claudetool/codereview/differential_test.go
index 827bf63..100d9f7 100644
--- a/claudetool/codereview/differential_test.go
+++ b/claudetool/codereview/differential_test.go
@@ -178,3 +178,39 @@
}
}
}
+
+// TestHashChangedFiles tests the hash function for changed files
+func TestHashChangedFiles(t *testing.T) {
+ reviewer := &CodeReviewer{
+ repoRoot: "/app",
+ processedChangedFileSets: make(map[string]bool),
+ reportedRelatedFiles: make(map[string]bool),
+ }
+
+ // Test that same files in different order produce same hash
+ files1 := []string{"/app/a.go", "/app/b.go"}
+ files2 := []string{"/app/b.go", "/app/a.go"}
+
+ hash1 := reviewer.hashChangedFiles(files1)
+ hash2 := reviewer.hashChangedFiles(files2)
+
+ if hash1 != hash2 {
+ t.Errorf("Expected same hash for same files in different order, got %q vs %q", hash1, hash2)
+ }
+
+ // Test that different files produce different hash
+ files3 := []string{"/app/a.go", "/app/c.go"}
+ hash3 := reviewer.hashChangedFiles(files3)
+
+ if hash1 == hash3 {
+ t.Errorf("Expected different hash for different files, got same hash %q", hash1)
+ }
+
+ // Test single file
+ files4 := []string{"/app/a.go"}
+ hash4 := reviewer.hashChangedFiles(files4)
+
+ if hash1 == hash4 {
+ t.Errorf("Expected different hash for different number of files, got same hash %q", hash1)
+ }
+}
diff --git a/claudetool/codereview/testdata/caching_demo_working.txtar b/claudetool/codereview/testdata/caching_demo_working.txtar
new file mode 100644
index 0000000..b57abc5
--- /dev/null
+++ b/claudetool/codereview/testdata/caching_demo_working.txtar
@@ -0,0 +1,108 @@
+Tests related files caching with a working relationship pattern
+
+-- a.go --
+package main
+
+func a() {}
+
+-- b.go --
+package main
+
+func b() {}
+
+-- c.go --
+package main
+
+func c() {}
+
+-- p.go --
+package p
+
+func d() {}
+
+-- .commit --
+Add functions to a.go and b.go
+
+-- a.go --
+package main
+
+func a() {
+ // Update 1
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 1
+}
+
+-- .commit --
+Add functions to a.go and b.go again
+
+-- a.go --
+package main
+
+func a() {
+ // Update 2
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 2
+}
+
+-- .commit --
+Add functions to a.go and c.go
+
+-- a.go --
+package main
+
+func a() {
+ // Update 3
+}
+
+-- c.go --
+package main
+
+func c() {
+ // Update 1
+}
+
+-- .commit --
+Update file a.go only (first time)
+
+-- a.go --
+package main
+
+func a() {
+ // Update 4 - first analysis
+}
+
+-- .commit --
+First analysis
+
+-- .run_test --
+# Info
+
+Potentially related files:
+
+- p.go (30%)
+
+These files have historically changed with the files you have modified. Consider whether they require updates as well.
+
+
+-- a.go --
+package main
+
+func a() {
+ // Update 5 - second analysis (should cache related files)
+}
+
+-- .commit --
+Second analysis (should cache related files)
+
+-- .run_test --
+OK
diff --git a/claudetool/codereview/testdata/related_files_cache_all_previously_reported.txtar b/claudetool/codereview/testdata/related_files_cache_all_previously_reported.txtar
new file mode 100644
index 0000000..0777729
--- /dev/null
+++ b/claudetool/codereview/testdata/related_files_cache_all_previously_reported.txtar
@@ -0,0 +1,98 @@
+Tests related files caching when all related files have been previously reported
+
+-- a.go --
+package main
+
+func a() {}
+
+-- b.go --
+package main
+
+func b() {}
+
+-- c.go --
+package main
+
+func c() {}
+
+-- d.go --
+package main
+
+func d() {}
+
+-- .commit --
+Create initial commit
+
+-- a.go --
+package main
+
+func a() {
+ // Update 1
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 1
+}
+
+-- .commit --
+Update a.go and b.go together
+
+-- a.go --
+package main
+
+func a() {
+ // Update 2
+}
+
+-- c.go --
+package main
+
+func c() {
+ // Update 1
+}
+
+-- .commit --
+Update a.go and c.go together
+
+-- a.go --
+package main
+
+func a() {
+ // Update 3 - first time, will report b.go and c.go
+}
+
+-- .commit --
+First code review - reports related files
+
+-- .run_test --
+# Info
+
+Potentially related files:
+
+- d.go (38%)
+
+These files have historically changed with the files you have modified. Consider whether they require updates as well.
+
+
+-- b.go --
+package main
+
+func b() {
+ // Update 2 - different changeset, but b.go was already reported
+}
+
+-- c.go --
+package main
+
+func c() {
+ // Update 2 - different changeset, but c.go was already reported
+}
+
+-- .commit --
+Different changeset, but all related files already reported
+
+-- .run_test --
+OK
diff --git a/claudetool/codereview/testdata/related_files_cache_demo.txtar b/claudetool/codereview/testdata/related_files_cache_demo.txtar
new file mode 100644
index 0000000..997af17
--- /dev/null
+++ b/claudetool/codereview/testdata/related_files_cache_demo.txtar
@@ -0,0 +1,95 @@
+Tests that related files caching prevents duplicate processing and output
+
+-- a.go --
+package main
+
+func a() {}
+
+-- b.go --
+package main
+
+func b() {}
+
+-- c.go --
+package main
+
+func c() {}
+
+-- .commit --
+Add functions to a.go and b.go
+
+-- a.go --
+package main
+
+func a() {
+ // Update 1
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 1
+}
+
+-- .commit --
+Add functions to a.go and b.go again
+
+-- a.go --
+package main
+
+func a() {
+ // Update 2
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 2
+}
+
+-- .commit --
+Add functions to a.go and c.go
+
+-- a.go --
+package main
+
+func a() {
+ // Update 3
+}
+
+-- c.go --
+package main
+
+func c() {
+ // Update 1
+}
+
+-- .commit --
+Update file a.go only (first time)
+
+-- a.go --
+package main
+
+func a() {
+ // Update 4 - first analysis
+}
+
+-- .commit --
+First analysis of a.go change
+
+-- .run_test --
+OK
+-- a.go --
+package main
+
+func a() {
+ // Update 5 - second analysis (should be cached)
+}
+
+-- .commit --
+Second analysis of a.go change (should be cached)
+
+-- .run_test --
+OK
diff --git a/claudetool/codereview/testdata/related_files_cache_new_file_in_set.txtar b/claudetool/codereview/testdata/related_files_cache_new_file_in_set.txtar
new file mode 100644
index 0000000..4c32fa9
--- /dev/null
+++ b/claudetool/codereview/testdata/related_files_cache_new_file_in_set.txtar
@@ -0,0 +1,108 @@
+Tests related files caching when some files have been reported but new ones are present
+
+-- a.go --
+package main
+
+func a() {}
+
+-- b.go --
+package main
+
+func b() {}
+
+-- c.go --
+package main
+
+func c() {}
+
+-- d.go --
+package main
+
+func d() {}
+
+-- .commit --
+Create initial commit
+
+-- a.go --
+package main
+
+func a() {
+ // Update 1
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 1
+}
+
+-- .commit --
+Update a.go and b.go together
+
+-- a.go --
+package main
+
+func a() {
+ // Update 2
+}
+
+-- c.go --
+package main
+
+func c() {
+ // Update 1
+}
+
+-- .commit --
+Update a.go and c.go together
+
+-- a.go --
+package main
+
+func a() {
+ // Update 3
+}
+
+-- d.go --
+package main
+
+func d() {
+ // Update 1
+}
+
+-- .commit --
+Update a.go and d.go together
+
+-- a.go --
+package main
+
+func a() {
+ // Update 4 - first time, will report b.go, c.go, d.go
+}
+
+-- .commit --
+First review reports all related files
+
+-- .run_test --
+OK
+-- b.go --
+package main
+
+func b() {
+ // Update 2
+}
+
+-- c.go --
+package main
+
+func c() {
+ // Update 2 - b.go and c.go already reported, but should still return full set
+ // because this is a different changeset that includes both files
+}
+
+-- .commit --
+Different changeset with two files, one relationship is new
+
+-- .run_test --
+OK
diff --git a/claudetool/codereview/testdata/related_files_cache_same_changeset.txtar b/claudetool/codereview/testdata/related_files_cache_same_changeset.txtar
new file mode 100644
index 0000000..db640a5
--- /dev/null
+++ b/claudetool/codereview/testdata/related_files_cache_same_changeset.txtar
@@ -0,0 +1,95 @@
+Tests related files caching when the exact same set of changed files is processed multiple times
+
+-- a.go --
+package main
+
+func a() {}
+
+-- b.go --
+package main
+
+func b() {}
+
+-- c.go --
+package main
+
+func c() {}
+
+-- .commit --
+Create initial commit
+
+-- a.go --
+package main
+
+func a() {
+ // Update 1
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 1
+}
+
+-- .commit --
+Update both a.go and b.go together (creates relationship)
+
+-- a.go --
+package main
+
+func a() {
+ // Update 2
+}
+
+-- b.go --
+package main
+
+func b() {
+ // Update 2
+}
+
+-- .commit --
+Update both a.go and b.go together again (strengthens relationship)
+
+-- a.go --
+package main
+
+func a() {
+ // Update 3
+}
+
+-- c.go --
+package main
+
+func c() {
+ // Update 1
+}
+
+-- .commit --
+Update a.go and c.go together (creates another relationship)
+
+-- a.go --
+package main
+
+func a() {
+ // Update 4 - first time processing this exact set
+}
+
+-- .commit --
+First time changing just a.go
+
+-- .run_test --
+OK
+-- a.go --
+package main
+
+func a() {
+ // Update 5 - second time processing this exact same set, should be cached
+}
+
+-- .commit --
+Second time changing just a.go (should be cached)
+
+-- .run_test --
+OK