Initial commit

commit: 2e463fb649fcff14d4025ddb91f630a98e7da526 [log] [tgz]
author: Earl Lee <earl.lee@sketch.dev> Thu Apr 17 11:22:22 2025 -0700
committer: Earl Lee <earl.lee@sketch.dev> Thu Apr 17 11:35:33 2025 -0700
tree: 0e86854d80d2759a913870655f13226c31f9d30c
diff --git a/claudetool/patchkit/patchkit.go b/claudetool/patchkit/patchkit.go
new file mode 100644
index 0000000..c7235e4
--- /dev/null
+++ b/claudetool/patchkit/patchkit.go

@@ -0,0 +1,415 @@
+package patchkit
+
+import (
+	"fmt"
+	"go/scanner"
+	"go/token"
+	"slices"
+	"strings"
+	"unicode"
+
+	"sketch.dev/claudetool/editbuf"
+)
+
+// A Spec specifies a single patch.
+type Spec struct {
+	Off int    // Byte offset to apply the replacement
+	Len int    // Length of the replacement
+	Src string // Original string (for debugging)
+	Old string // Search string
+	New string // Replacement string
+}
+
+// Unique generates a patch spec to apply op, given a unique occurrence of needle in haystack and replacement text replace.
+// It reports the number of matches found for needle in haystack: 0, 1, or 2 (for any value > 1).
+func Unique(haystack, needle, replace string) (*Spec, int) {
+	prefix, rest, ok := strings.Cut(haystack, needle)
+	if !ok {
+		return nil, 0
+	}
+	if strings.Contains(rest, needle) {
+		return nil, 2
+	}
+	s := &Spec{
+		Off: len(prefix),
+		Len: len(needle),
+		Src: haystack,
+		Old: needle,
+		New: replace,
+	}
+	return s, 1
+}
+
+// minimize reduces the size of the patch by removing any shared prefix and suffix.
+func (s *Spec) minimize() {
+	pre := commonPrefixLen(s.Old, s.New)
+	s.Off += pre
+	s.Len -= pre
+	s.Old = s.Old[pre:]
+	s.New = s.New[pre:]
+	suf := commonSuffixLen(s.Old, s.New)
+	s.Len -= suf
+	s.Old = s.Old[:len(s.Old)-suf]
+	s.New = s.New[:len(s.New)-suf]
+}
+
+// ApplyToEditBuf applies the patch to the given edit buffer.
+func (s *Spec) ApplyToEditBuf(buf *editbuf.Buffer) {
+	s.minimize()
+	buf.Replace(s.Off, s.Off+s.Len, s.New)
+}
+
+// UniqueDedent is Unique, but with flexibility around consistent whitespace prefix changes.
+// Unlike Unique, which returns a count of matches,
+// UniqueDedent returns a boolean indicating whether a unique match was found.
+// It is for LLMs that have a hard time reliably reproducing uniform whitespace prefixes.
+// For example, they may generate 8 spaces instead of 6 for all relevant lines.
+// UniqueDedent adjusts the needle's whitespace prefix to match the haystack's
+// and then replaces the unique instance of needle in haystack with replacement.
+func UniqueDedent(haystack, needle, replace string) (*Spec, bool) {
+	// TODO: this all definitely admits of some optimization
+	haystackLines := slices.Collect(strings.Lines(haystack))
+	needleLines := slices.Collect(strings.Lines(needle))
+	match := uniqueTrimmedLineMatch(haystackLines, needleLines)
+	if match == -1 {
+		return nil, false
+	}
+	// We now systematically adjust needle's whitespace prefix to match haystack.
+	// The first line gets special treatment, because its leading whitespace is irrelevant,
+	// and models often skip past it (or part of it).
+	if len(needleLines) == 0 {
+		return nil, false
+	}
+	// First line: cut leading whitespace and make corresponding fixes to replacement.
+	// The leading whitespace will come out in the wash in Unique.
+	// We need to make corresponding fixes to the replacement.
+	nl0 := needleLines[0]
+	noWS := strings.TrimLeftFunc(nl0, unicode.IsSpace)
+	ws0, _ := strings.CutSuffix(nl0, noWS) // can't fail
+	rest, ok := strings.CutPrefix(replace, ws0)
+	if ok {
+		// Adjust needle and replacement in tandem.
+		nl0 = noWS
+		replace = rest
+	}
+	// Calculate common whitespace prefixes for the rest.
+	haystackPrefix := commonWhitespacePrefix(haystackLines[match : match+len(needleLines)])
+	needlePrefix := commonWhitespacePrefix(needleLines[1:])
+	nbuf := new(strings.Builder)
+	for i, line := range needleLines {
+		if i == 0 {
+			nbuf.WriteString(nl0)
+			continue
+		}
+		// Allow empty (newline-only) lines not to be prefixed.
+		if strings.TrimRight(line, "\n\r") == "" {
+			nbuf.WriteString(line)
+			continue
+		}
+		// Swap in haystackPrefix for needlePrefix.
+		nbuf.WriteString(haystackPrefix)
+		nbuf.WriteString(line[len(needlePrefix):])
+	}
+	// Do a replacement with our new-and-improved needle.
+	needle = nbuf.String()
+	spec, count := Unique(haystack, needle, replace)
+	if count != 1 {
+		return nil, false
+	}
+	return spec, true
+}
+
+type tok struct {
+	pos token.Position
+	tok token.Token
+	lit string
+}
+
+func (t tok) String() string {
+	if t.lit == "" {
+		return fmt.Sprintf("%s", t.tok)
+	}
+	return fmt.Sprintf("%s(%q)", t.tok, t.lit)
+}
+
+func tokenize(code string) ([]tok, bool) {
+	var s scanner.Scanner
+	fset := token.NewFileSet()
+	file := fset.AddFile("", fset.Base(), len(code))
+	s.Init(file, []byte(code), nil, scanner.ScanComments)
+	var tokens []tok
+	for {
+		pos, t, lit := s.Scan()
+		if s.ErrorCount > 0 {
+			return nil, false // invalid Go code (or not Go code at all)
+		}
+		if t == token.EOF {
+			return tokens, true
+		}
+		tokens = append(tokens, tok{pos: fset.PositionFor(pos, false), tok: t, lit: lit})
+	}
+}
+
+func tokensEqual(a, b []tok) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		at, bt := a[i], b[i]
+		// positions are expected to differ
+		if at.tok != bt.tok || at.lit != bt.lit {
+			return false
+		}
+	}
+	return true
+}
+
+func tokensUniqueMatch(haystack, needle []tok) int {
+	// TODO: optimize
+	match := -1
+	for i := range haystack {
+		rest := haystack[i:]
+		if len(rest) < len(needle) {
+			break
+		}
+		rest = rest[:len(needle)]
+		if !tokensEqual(rest, needle) {
+			continue
+		}
+		if match != -1 {
+			return -1 // multiple matches
+		}
+		match = i
+	}
+	return match
+}
+
+// UniqueGoTokens is Unique, but with flexibility around all insignificant whitespace.
+// Like UniqueDedent, it returns a boolean indicating whether a unique match was found.
+// It is safe (enough) because it ensures that the needle alterations occurs only in places
+// where whitespace is not semantically significant.
+// In practice, this appears safe.
+func UniqueGoTokens(haystack, needle, replace string) (*Spec, bool) {
+	nt, ok := tokenize(needle)
+	if !ok {
+		return nil, false
+	}
+	ht, ok := tokenize(haystack)
+	if !ok {
+		return nil, false
+	}
+	match := tokensUniqueMatch(ht, nt)
+	if match == -1 {
+		return nil, false
+	}
+	matchEnd := match + len(nt) - 1
+	start := ht[match].pos.Offset
+	needle = haystack[start:]
+	if matchEnd+1 < len(ht) {
+		// todo: handle match at very end of file
+		end := ht[matchEnd+1].pos.Offset
+		needle = needle[:end-start]
+	}
+	// OK, declare this very fuzzy match to be our new needle.
+	spec, count := Unique(haystack, needle, replace)
+	if count != 1 {
+		return nil, false
+	}
+	return spec, true
+}
+
+// UniqueInValidGo is Unique, but with flexibility around all leading and trailing whitespace.
+// Like UniqueDedent, it returns a boolean indicating whether a unique match was found.
+// It is safe (enough) because it ensures that the needle alterations occurs only in places
+// where whitespace is not semantically significant.
+// In practice, this appears safe.
+func UniqueInValidGo(haystack, needle, replace string) (*Spec, bool) {
+	haystackLines := slices.Collect(strings.Lines(haystack))
+	needleLines := slices.Collect(strings.Lines(needle))
+	matchStart := uniqueTrimmedLineMatch(haystackLines, needleLines)
+	if matchStart == -1 {
+		return nil, false
+	}
+	needle, replace = improveNeedle(haystack, needle, replace, matchStart)
+	matchEnd := matchStart + strings.Count(needle, "\n")
+	// Ensure that none of the lines that we fuzzy-matched involve a multiline comment or string literal.
+	var s scanner.Scanner
+	fset := token.NewFileSet()
+	file := fset.AddFile("", fset.Base(), len(haystack))
+	s.Init(file, []byte(haystack), nil, scanner.ScanComments)
+	for {
+		pos, tok, lit := s.Scan()
+		if s.ErrorCount > 0 {
+			return nil, false // invalid Go code (or not Go code at all)
+		}
+		if tok == token.EOF {
+			break
+		}
+		if tok == token.SEMICOLON || !strings.Contains(lit, "\n") {
+			continue
+		}
+		// In a token that spans multiple lines,
+		// so not perfectly matching whitespace might be unsafe.
+		p := fset.Position(pos)
+		tokenStart := p.Line - 1 // 1-based to 0-based
+		tokenEnd := tokenStart + strings.Count(lit, "\n")
+		// Check whether [matchStart, matchEnd] overlaps [tokenStart, tokenEnd]
+		// TODO: think more about edge conditions here. Any off-by-one errors?
+		// For example, leading whitespace and trailing whitespace
+		// on this token's lines are not semantically significant.
+		if tokenStart <= matchEnd && matchStart <= tokenEnd {
+			// if tokenStart <= matchStart && tokenEnd <= tokenEnd {}
+			return nil, false // this token overlaps the range we're replacing, not safe
+		}
+	}
+
+	// TODO: restore this sanity check? it's mildly expensive and i've never seen it fail.
+	// replaced := strings.Join(haystackLines[:matchStart], "") + replacement + strings.Join(haystackLines[matchEnd:], "")
+	// _, err := format.Source([]byte(replaced))
+	// if err != nil {
+	//     return nil, false
+	// }
+
+	// OK, declare this very fuzzy match to be our new needle.
+	needle = strings.Join(haystackLines[matchStart:matchEnd], "")
+	spec, count := Unique(haystack, needle, replace)
+	if count != 1 {
+		return nil, false
+	}
+	return spec, true
+}
+
+// UniqueTrim is Unique, but with flexibility to shrink old/replace in tandem.
+func UniqueTrim(haystack, needle, replace string) (*Spec, bool) {
+	// LLMs appear to particularly struggle with the first line of a patch.
+	// If that first line is replicated in replace,
+	// and removing it yields a unique match,
+	// we can remove that line entirely from both.
+	n0, nRest, nOK := strings.Cut(needle, "\n")
+	r0, rRest, rOK := strings.Cut(replace, "\n")
+	if !nOK || !rOK || n0 != r0 {
+		return nil, false
+	}
+	spec, count := Unique(haystack, nRest, rRest)
+	if count != 1 {
+		return nil, false
+	}
+	return spec, true
+}
+
+// uniqueTrimmedLineMatch returns the index of the first line in haystack that matches needle,
+// when ignoring leading and trailing whitespace.
+// uniqueTrimmedLineMatch returns -1 if there is no unique match.
+func uniqueTrimmedLineMatch(haystackLines, needleLines []string) int {
+	// TODO: optimize
+	trimmedHaystackLines := trimSpaceAll(haystackLines)
+	trimmedNeedleLines := trimSpaceAll(needleLines)
+	match := -1
+	for i := range trimmedHaystackLines {
+		rest := trimmedHaystackLines[i:]
+		if len(rest) < len(trimmedNeedleLines) {
+			break
+		}
+		rest = rest[:len(trimmedNeedleLines)]
+		if !slices.Equal(rest, trimmedNeedleLines) {
+			continue
+		}
+		if match != -1 {
+			return -1 // multiple matches
+		}
+		match = i
+	}
+	return match
+}
+
+func trimSpaceAll(x []string) []string {
+	trimmed := make([]string, len(x))
+	for i, s := range x {
+		trimmed[i] = strings.TrimSpace(s)
+	}
+	return trimmed
+}
+
+// improveNeedle adjusts both needle and replacement in tandem to better match haystack.
+// Note that we adjust search and replace together.
+func improveNeedle(haystack string, needle, replacement string, matchLine int) (string, string) {
+	// TODO: we make new slices too much
+	needleLines := slices.Collect(strings.Lines(needle))
+	if len(needleLines) == 0 {
+		return needle, replacement
+	}
+	haystackLines := slices.Collect(strings.Lines(haystack))
+	if matchLine+len(needleLines) > len(haystackLines) {
+		// should be impossible, but just in case
+		return needle, replacement
+	}
+	// Add trailing last-line newline if needed to better match haystack.
+	if !strings.HasSuffix(needle, "\n") && strings.HasSuffix(haystackLines[matchLine+len(needleLines)-1], "\n") {
+		needle += "\n"
+		replacement += "\n"
+	}
+	// Add leading first-line prefix if needed to better match haystack.
+	rest, ok := strings.CutSuffix(haystackLines[matchLine], needleLines[0])
+	if ok {
+		needle = rest + needle
+		replacement = rest + replacement
+	}
+	return needle, replacement
+}
+
+func isNonSpace(r rune) bool {
+	return !unicode.IsSpace(r)
+}
+
+func whitespacePrefix(s string) string {
+	firstNonSpace := strings.IndexFunc(s, isNonSpace)
+	return s[:max(0, firstNonSpace)] // map -1 for "not found" onto 0
+}
+
+// commonWhitespacePrefix returns the longest common whitespace prefix of the elements of x, somewhat flexibly.
+func commonWhitespacePrefix(x []string) string {
+	var pre string
+	for i, s := range x {
+		if i == 0 {
+			pre = s
+			continue
+		}
+		// ignore line endings for the moment
+		// (this is just for prefixes)
+		s = strings.TrimRight(s, "\n\r")
+		if s == "" {
+			continue
+		}
+		n := commonPrefixLen(pre, whitespacePrefix(s))
+		if n == 0 {
+			return ""
+		}
+		pre = pre[:n]
+	}
+	pre = strings.TrimRightFunc(pre, isNonSpace)
+	return pre
+}
+
+// commonPrefixLen returns the length of the common prefix of two strings.
+// TODO: optimize, see e.g. https://go-review.googlesource.com/c/go/+/408116
+func commonPrefixLen(a, b string) int {
+	shortest := min(len(a), len(b))
+	for i := range shortest {
+		if a[i] != b[i] {
+			return i
+		}
+	}
+	return shortest
+}
+
+// commonSuffixLen returns the length of the common suffix of two strings.
+// TODO: optimize
+func commonSuffixLen(a, b string) int {
+	shortest := min(len(a), len(b))
+	for i := 0; i < shortest; i++ {
+		if a[len(a)-i-1] != b[len(b)-i-1] {
+			return i
+		}
+	}
+	return shortest
+}
commit	2e463fb649fcff14d4025ddb91f630a98e7da526	[log] [tgz]
author	Earl Lee <earl.lee@sketch.dev>	Thu Apr 17 11:22:22 2025 -0700
committer	Earl Lee <earl.lee@sketch.dev>	Thu Apr 17 11:35:33 2025 -0700
tree	0e86854d80d2759a913870655f13226c31f9d30c