package patchkit
2
3import (
4 "fmt"
5 "go/scanner"
6 "go/token"
7 "slices"
8 "strings"
9 "unicode"
10
11 "sketch.dev/claudetool/editbuf"
12)
13
// A Spec specifies a single patch: replace the Len bytes at byte offset Off with New.
type Spec struct {
	Off int    // Byte offset to apply the replacement
	Len int    // Length of the replacement
	Src string // Original string (for debugging)
	Old string // Search string
	New string // Replacement string
}

// Unique generates a patch spec to apply op, given a unique occurrence of needle in haystack and replacement text replace.
// It reports the number of matches found for needle in haystack: 0, 1, or 2 (for any value > 1).
func Unique(haystack, needle, replace string) (*Spec, int) {
	at := strings.Index(haystack, needle)
	if at < 0 {
		// Needle does not occur at all.
		return nil, 0
	}
	if strings.Contains(haystack[at+len(needle):], needle) {
		// Ambiguous: needle occurs at least twice.
		return nil, 2
	}
	return &Spec{
		Off: at,
		Len: len(needle),
		Src: haystack,
		Old: needle,
		New: replace,
	}, 1
}
42
43// minimize reduces the size of the patch by removing any shared prefix and suffix.
44func (s *Spec) minimize() {
45 pre := commonPrefixLen(s.Old, s.New)
46 s.Off += pre
47 s.Len -= pre
48 s.Old = s.Old[pre:]
49 s.New = s.New[pre:]
50 suf := commonSuffixLen(s.Old, s.New)
51 s.Len -= suf
52 s.Old = s.Old[:len(s.Old)-suf]
53 s.New = s.New[:len(s.New)-suf]
54}
55
// ApplyToEditBuf applies the patch to the given edit buffer.
// The spec is minimized first, so the replacement recorded in buf
// spans only the bytes that actually change.
func (s *Spec) ApplyToEditBuf(buf *editbuf.Buffer) {
	s.minimize()
	buf.Replace(s.Off, s.Off+s.Len, s.New)
}
61
// UniqueDedent is Unique, but with flexibility around consistent whitespace prefix changes.
// Unlike Unique, which returns a count of matches,
// UniqueDedent returns a boolean indicating whether a unique match was found.
// It is for LLMs that have a hard time reliably reproducing uniform whitespace prefixes.
// For example, they may generate 8 spaces instead of 6 for all relevant lines.
// UniqueDedent adjusts the needle's whitespace prefix to match the haystack's
// and then replaces the unique instance of needle in haystack with replacement.
func UniqueDedent(haystack, needle, replace string) (*Spec, bool) {
	// TODO: this all definitely admits of some optimization
	haystackLines := slices.Collect(strings.Lines(haystack))
	needleLines := slices.Collect(strings.Lines(needle))
	// Find the unique run of haystack lines equal to the needle lines
	// once leading/trailing whitespace is ignored.
	match := uniqueTrimmedLineMatch(haystackLines, needleLines)
	if match == -1 {
		return nil, false
	}
	// We now systematically adjust needle's whitespace prefix to match haystack.
	// The first line gets special treatment, because its leading whitespace is irrelevant,
	// and models often skip past it (or part of it).
	if len(needleLines) == 0 {
		return nil, false
	}
	// First line: cut leading whitespace and make corresponding fixes to replacement.
	// The leading whitespace will come out in the wash in Unique.
	// We need to make corresponding fixes to the replacement.
	nl0 := needleLines[0]
	noWS := strings.TrimLeftFunc(nl0, unicode.IsSpace)
	ws0, _ := strings.CutSuffix(nl0, noWS) // can't fail: noWS is a suffix of nl0 by construction
	rest, ok := strings.CutPrefix(replace, ws0)
	if ok {
		// Adjust needle and replacement in tandem.
		nl0 = noWS
		replace = rest
	}
	// Calculate common whitespace prefixes for the rest.
	// Note: for a one-line needle, needleLines[1:] is empty, the swap loop
	// below touches only line 0, and these prefixes go unused.
	haystackPrefix := commonWhitespacePrefix(haystackLines[match : match+len(needleLines)])
	needlePrefix := commonWhitespacePrefix(needleLines[1:])
	nbuf := new(strings.Builder)
	for i, line := range needleLines {
		if i == 0 {
			nbuf.WriteString(nl0)
			continue
		}
		// Allow empty (newline-only) lines not to be prefixed.
		if strings.TrimRight(line, "\n\r") == "" {
			nbuf.WriteString(line)
			continue
		}
		// Swap in haystackPrefix for needlePrefix.
		// NOTE(review): this assumes needlePrefix is a genuine prefix of every
		// non-blank needle line; commonWhitespacePrefix is expected to
		// guarantee that — confirm for whitespace-only (non-empty) lines.
		nbuf.WriteString(haystackPrefix)
		nbuf.WriteString(line[len(needlePrefix):])
	}
	// Do a replacement with our new-and-improved needle.
	// Unique re-verifies uniqueness, so a botched rewrite fails safe (false).
	needle = nbuf.String()
	spec, count := Unique(haystack, needle, replace)
	if count != 1 {
		return nil, false
	}
	return spec, true
}
121
// tok is a single scanned Go token plus its source position.
type tok struct {
	pos token.Position // position of the token within the scanned code
	tok token.Token    // token kind (IDENT, INT, ADD, ...)
	lit string         // literal text, if any (identifiers, literals, comments)
}

// String renders the token for debugging, e.g. + or IDENT("x").
func (t tok) String() string {
	if t.lit == "" {
		// Was fmt.Sprintf("%s", t.tok): a redundant Sprintf around a
		// value that already has a String method (staticcheck S1025).
		return t.tok.String()
	}
	return fmt.Sprintf("%s(%q)", t.tok, t.lit)
}

// tokenize scans code as Go source and returns its tokens, including comments.
// It reports false if scanning produces any error, in which case code is not
// valid Go (or not Go at all).
func tokenize(code string) ([]tok, bool) {
	var s scanner.Scanner
	fset := token.NewFileSet()
	file := fset.AddFile("", fset.Base(), len(code))
	s.Init(file, []byte(code), nil, scanner.ScanComments)
	var tokens []tok
	for {
		pos, t, lit := s.Scan()
		if s.ErrorCount > 0 {
			return nil, false // invalid Go code (or not Go code at all)
		}
		if t == token.EOF {
			return tokens, true
		}
		tokens = append(tokens, tok{pos: fset.PositionFor(pos, false), tok: t, lit: lit})
	}
}
152
153func tokensEqual(a, b []tok) bool {
154 if len(a) != len(b) {
155 return false
156 }
157 for i := range a {
158 at, bt := a[i], b[i]
159 // positions are expected to differ
160 if at.tok != bt.tok || at.lit != bt.lit {
161 return false
162 }
163 }
164 return true
165}
166
167func tokensUniqueMatch(haystack, needle []tok) int {
168 // TODO: optimize
169 match := -1
170 for i := range haystack {
171 rest := haystack[i:]
172 if len(rest) < len(needle) {
173 break
174 }
175 rest = rest[:len(needle)]
176 if !tokensEqual(rest, needle) {
177 continue
178 }
179 if match != -1 {
180 return -1 // multiple matches
181 }
182 match = i
183 }
184 return match
185}
186
// UniqueGoTokens is Unique, but with flexibility around all insignificant whitespace.
// Like UniqueDedent, it returns a boolean indicating whether a unique match was found.
// It is safe (enough) because it ensures that the needle alterations occurs only in places
// where whitespace is not semantically significant.
// In practice, this appears safe.
func UniqueGoTokens(haystack, needle, replace string) (*Spec, bool) {
	// Both sides must scan cleanly as Go for a token-level comparison to be meaningful.
	nt, ok := tokenize(needle)
	if !ok {
		return nil, false
	}
	ht, ok := tokenize(haystack)
	if !ok {
		return nil, false
	}
	match := tokensUniqueMatch(ht, nt)
	if match == -1 {
		return nil, false
	}
	matchEnd := match + len(nt) - 1 // index of the last matched token within ht
	start := ht[match].pos.Offset
	// Re-derive the needle as the exact haystack bytes spanning the matched tokens,
	// so Unique below operates on text that literally occurs in haystack.
	needle = haystack[start:]
	// NOTE(review): when the match runs through the final token, the needle
	// extends to the very end of haystack, including any trailing whitespace —
	// presumably acceptable, but worth confirming.
	if matchEnd+1 < len(ht) {
		// todo: handle match at very end of file
		end := ht[matchEnd+1].pos.Offset
		needle = needle[:end-start]
	}
	// OK, declare this very fuzzy match to be our new needle.
	// Unique re-verifies uniqueness on the raw text, so this fails safe.
	spec, count := Unique(haystack, needle, replace)
	if count != 1 {
		return nil, false
	}
	return spec, true
}
220
// UniqueInValidGo is Unique, but with flexibility around all leading and trailing whitespace.
// Like UniqueDedent, it returns a boolean indicating whether a unique match was found.
// It is safe (enough) because it ensures that the needle alterations occurs only in places
// where whitespace is not semantically significant.
// In practice, this appears safe.
func UniqueInValidGo(haystack, needle, replace string) (*Spec, bool) {
	haystackLines := slices.Collect(strings.Lines(haystack))
	needleLines := slices.Collect(strings.Lines(needle))
	// Find the unique whitespace-insensitive line-level match.
	matchStart := uniqueTrimmedLineMatch(haystackLines, needleLines)
	if matchStart == -1 {
		return nil, false
	}
	// Extend needle and replace in tandem (trailing newline, first-line prefix)
	// so they align better with the matched haystack lines.
	needle, replace = improveNeedle(haystack, needle, replace, matchStart)
	// NOTE(review): if needle still lacks a trailing newline here (e.g. the
	// match ends at EOF with no final newline), matchEnd is one line short
	// and the strings.Join below drops the last matched line — TODO confirm.
	matchEnd := matchStart + strings.Count(needle, "\n")
	// Ensure that none of the lines that we fuzzy-matched involve a multiline comment or string literal.
	var s scanner.Scanner
	fset := token.NewFileSet()
	file := fset.AddFile("", fset.Base(), len(haystack))
	s.Init(file, []byte(haystack), nil, scanner.ScanComments)
	for {
		// Note: tok here shadows the package-level tok type; only the
		// token.Token value is needed in this loop.
		pos, tok, lit := s.Scan()
		if s.ErrorCount > 0 {
			return nil, false // invalid Go code (or not Go code at all)
		}
		if tok == token.EOF {
			break
		}
		// Auto-inserted semicolons carry a "\n" literal but are not
		// genuinely multiline tokens, so skip them along with any token
		// whose literal stays on one line.
		if tok == token.SEMICOLON || !strings.Contains(lit, "\n") {
			continue
		}
		// In a token that spans multiple lines,
		// so not perfectly matching whitespace might be unsafe.
		p := fset.Position(pos)
		tokenStart := p.Line - 1 // 1-based to 0-based
		tokenEnd := tokenStart + strings.Count(lit, "\n")
		// Check whether [matchStart, matchEnd] overlaps [tokenStart, tokenEnd]
		// TODO: think more about edge conditions here. Any off-by-one errors?
		// For example, leading whitespace and trailing whitespace
		// on this token's lines are not semantically significant.
		if tokenStart <= matchEnd && matchStart <= tokenEnd {
			return nil, false // this token overlaps the range we're replacing, not safe
		}
	}

	// TODO: restore this sanity check? it's mildly expensive and i've never seen it fail.
	// replaced := strings.Join(haystackLines[:matchStart], "") + replacement + strings.Join(haystackLines[matchEnd:], "")
	// _, err := format.Source([]byte(replaced))
	// if err != nil {
	// 	return nil, false
	// }

	// OK, declare this very fuzzy match to be our new needle.
	needle = strings.Join(haystackLines[matchStart:matchEnd], "")
	spec, count := Unique(haystack, needle, replace)
	if count != 1 {
		return nil, false
	}
	return spec, true
}
281
282// UniqueTrim is Unique, but with flexibility to shrink old/replace in tandem.
283func UniqueTrim(haystack, needle, replace string) (*Spec, bool) {
284 // LLMs appear to particularly struggle with the first line of a patch.
285 // If that first line is replicated in replace,
286 // and removing it yields a unique match,
287 // we can remove that line entirely from both.
288 n0, nRest, nOK := strings.Cut(needle, "\n")
289 r0, rRest, rOK := strings.Cut(replace, "\n")
290 if !nOK || !rOK || n0 != r0 {
291 return nil, false
292 }
293 spec, count := Unique(haystack, nRest, rRest)
294 if count != 1 {
295 return nil, false
296 }
297 return spec, true
298}
299
// uniqueTrimmedLineMatch returns the index of the first line in haystack that matches needle,
// when ignoring leading and trailing whitespace.
// uniqueTrimmedLineMatch returns -1 if there is no unique match.
func uniqueTrimmedLineMatch(haystackLines, needleLines []string) int {
	// TODO: optimize
	trim := func(lines []string) []string {
		out := make([]string, len(lines))
		for i, line := range lines {
			out[i] = strings.TrimSpace(line)
		}
		return out
	}
	hay, ndl := trim(haystackLines), trim(needleLines)
	found := -1
	for i := range hay {
		if len(hay)-i < len(ndl) {
			break // too few lines remain for a full match
		}
		if !slices.Equal(hay[i:i+len(ndl)], ndl) {
			continue
		}
		if found >= 0 {
			return -1 // a second match: not unique
		}
		found = i
	}
	return found
}
324
// trimSpaceAll returns a new slice whose elements are those of x with
// leading and trailing whitespace removed. The input is left untouched.
func trimSpaceAll(x []string) []string {
	out := make([]string, 0, len(x))
	for _, s := range x {
		out = append(out, strings.TrimSpace(s))
	}
	return out
}
332
333// improveNeedle adjusts both needle and replacement in tandem to better match haystack.
334// Note that we adjust search and replace together.
335func improveNeedle(haystack string, needle, replacement string, matchLine int) (string, string) {
336 // TODO: we make new slices too much
337 needleLines := slices.Collect(strings.Lines(needle))
338 if len(needleLines) == 0 {
339 return needle, replacement
340 }
341 haystackLines := slices.Collect(strings.Lines(haystack))
342 if matchLine+len(needleLines) > len(haystackLines) {
343 // should be impossible, but just in case
344 return needle, replacement
345 }
346 // Add trailing last-line newline if needed to better match haystack.
347 if !strings.HasSuffix(needle, "\n") && strings.HasSuffix(haystackLines[matchLine+len(needleLines)-1], "\n") {
348 needle += "\n"
349 replacement += "\n"
350 }
351 // Add leading first-line prefix if needed to better match haystack.
352 rest, ok := strings.CutSuffix(haystackLines[matchLine], needleLines[0])
353 if ok {
354 needle = rest + needle
355 replacement = rest + replacement
356 }
357 return needle, replacement
358}
359
360func isNonSpace(r rune) bool {
361 return !unicode.IsSpace(r)
362}
363
364func whitespacePrefix(s string) string {
365 firstNonSpace := strings.IndexFunc(s, isNonSpace)
366 return s[:max(0, firstNonSpace)] // map -1 for "not found" onto 0
367}
368
369// commonWhitespacePrefix returns the longest common whitespace prefix of the elements of x, somewhat flexibly.
370func commonWhitespacePrefix(x []string) string {
371 var pre string
372 for i, s := range x {
373 if i == 0 {
374 pre = s
375 continue
376 }
377 // ignore line endings for the moment
378 // (this is just for prefixes)
379 s = strings.TrimRight(s, "\n\r")
380 if s == "" {
381 continue
382 }
383 n := commonPrefixLen(pre, whitespacePrefix(s))
384 if n == 0 {
385 return ""
386 }
387 pre = pre[:n]
388 }
389 pre = strings.TrimRightFunc(pre, isNonSpace)
390 return pre
391}
392
393// commonPrefixLen returns the length of the common prefix of two strings.
394// TODO: optimize, see e.g. https://go-review.googlesource.com/c/go/+/408116
395func commonPrefixLen(a, b string) int {
396 shortest := min(len(a), len(b))
397 for i := range shortest {
398 if a[i] != b[i] {
399 return i
400 }
401 }
402 return shortest
403}
404
// commonSuffixLen returns the length of the common suffix of two strings.
// TODO: optimize
func commonSuffixLen(a, b string) int {
	limit := min(len(a), len(b))
	i := 0
	for ; i < limit; i++ {
		// Compare bytes from the tail inward; stop at the first mismatch.
		if a[len(a)-1-i] != b[len(b)-1-i] {
			break
		}
	}
	return i
}