package patchkit
2
3import (
4 "fmt"
5 "go/scanner"
6 "go/token"
7 "slices"
8 "strings"
9 "unicode"
10
11 "sketch.dev/claudetool/editbuf"
12)
13
// A Spec specifies a single patch: replace the Len bytes at byte offset Off with New.
type Spec struct {
	Off int    // Byte offset to apply the replacement
	Len int    // Length of the replacement
	Src string // Original string (for debugging)
	Old string // Search string
	New string // Replacement string
}

// Unique generates a patch spec to apply op, given a unique occurrence of needle in haystack and replacement text replace.
// It reports the number of matches found for needle in haystack: 0, 1, or 2 (for any value > 1).
func Unique(haystack, needle, replace string) (*Spec, int) {
	at := strings.Index(haystack, needle)
	if at < 0 {
		// Needle does not occur at all.
		return nil, 0
	}
	if strings.Contains(haystack[at+len(needle):], needle) {
		// Ambiguous: needle occurs at least twice.
		return nil, 2
	}
	return &Spec{
		Off: at,
		Len: len(needle),
		Src: haystack,
		Old: needle,
		New: replace,
	}, 1
}
42
43// minimize reduces the size of the patch by removing any shared prefix and suffix.
44func (s *Spec) minimize() {
45 pre := commonPrefixLen(s.Old, s.New)
46 s.Off += pre
47 s.Len -= pre
48 s.Old = s.Old[pre:]
49 s.New = s.New[pre:]
50 suf := commonSuffixLen(s.Old, s.New)
51 s.Len -= suf
52 s.Old = s.Old[:len(s.Old)-suf]
53 s.New = s.New[:len(s.New)-suf]
54}
55
// ApplyToEditBuf applies the patch to the given edit buffer.
// The spec is minimized first, so the replacement recorded in buf
// spans only the bytes that actually change.
func (s *Spec) ApplyToEditBuf(buf *editbuf.Buffer) {
	s.minimize()
	buf.Replace(s.Off, s.Off+s.Len, s.New)
}
61
// UniqueDedent is Unique, but with flexibility around consistent whitespace prefix changes.
// Unlike Unique, which returns a count of matches,
// UniqueDedent returns a boolean indicating whether a unique match was found.
// It is for LLMs that have a hard time reliably reproducing uniform whitespace prefixes.
// For example, they may generate 8 spaces instead of 6 for all relevant lines.
// UniqueDedent adjusts the needle's whitespace prefix to match the haystack's
// and then replaces the unique instance of needle in haystack with replacement.
func UniqueDedent(haystack, needle, replace string) (*Spec, bool) {
	// TODO: this all definitely admits of some optimization
	haystackLines := slices.Collect(strings.Lines(haystack))
	needleLines := slices.Collect(strings.Lines(needle))
	// Find the unique run of haystack lines equal to the needle lines
	// once leading/trailing whitespace is ignored.
	match := uniqueTrimmedLineMatch(haystackLines, needleLines)
	if match == -1 {
		return nil, false
	}
	// We now systematically adjust needle's whitespace prefix to match haystack.
	// The first line gets special treatment, because its leading whitespace is irrelevant,
	// and models often skip past it (or part of it).
	if len(needleLines) == 0 {
		return nil, false
	}
	// First line: cut leading whitespace and make corresponding fixes to replacement.
	// The leading whitespace will come out in the wash in Unique.
	// We need to make corresponding fixes to the replacement.
	nl0 := needleLines[0]
	noWS := strings.TrimLeftFunc(nl0, unicode.IsSpace)
	ws0, _ := strings.CutSuffix(nl0, noWS) // can't fail: noWS is a suffix of nl0 by construction
	rest, ok := strings.CutPrefix(replace, ws0)
	if ok {
		// Adjust needle and replacement in tandem.
		nl0 = noWS
		replace = rest
	}
	// Calculate common whitespace prefixes for the rest.
	// Note: for a one-line needle, needleLines[1:] is empty, the swap loop
	// below touches only line 0, and these prefixes go unused.
	haystackPrefix := commonWhitespacePrefix(haystackLines[match : match+len(needleLines)])
	needlePrefix := commonWhitespacePrefix(needleLines[1:])
	nbuf := new(strings.Builder)
	for i, line := range needleLines {
		if i == 0 {
			nbuf.WriteString(nl0)
			continue
		}
		// Allow empty (newline-only) lines not to be prefixed.
		if strings.TrimRight(line, "\n\r") == "" {
			nbuf.WriteString(line)
			continue
		}
		// Swap in haystackPrefix for needlePrefix.
		// NOTE(review): this assumes needlePrefix is a genuine prefix of every
		// non-blank needle line; commonWhitespacePrefix is expected to
		// guarantee that — confirm for whitespace-only (non-empty) lines.
		nbuf.WriteString(haystackPrefix)
		nbuf.WriteString(line[len(needlePrefix):])
	}
	// Do a replacement with our new-and-improved needle.
	// Unique re-verifies uniqueness, so a botched rewrite fails safe (false).
	needle = nbuf.String()
	spec, count := Unique(haystack, needle, replace)
	if count != 1 {
		return nil, false
	}
	return spec, true
}
121
// tok is a single scanned Go token plus its source position.
type tok struct {
	pos token.Position // position of the token within the scanned code
	tok token.Token    // token kind (IDENT, INT, ADD, ...)
	lit string         // literal text, if any (identifiers, literals, comments)
}

// String renders the token for debugging, e.g. + or IDENT("x").
func (t tok) String() string {
	if t.lit == "" {
		// Was fmt.Sprintf("%s", t.tok): a redundant Sprintf around a
		// value that already has a String method (staticcheck S1025).
		return t.tok.String()
	}
	return fmt.Sprintf("%s(%q)", t.tok, t.lit)
}

// tokenize scans code as Go source and returns its tokens, including comments.
// It reports false if scanning produces any error, in which case code is not
// valid Go (or not Go at all).
func tokenize(code string) ([]tok, bool) {
	var s scanner.Scanner
	fset := token.NewFileSet()
	file := fset.AddFile("", fset.Base(), len(code))
	s.Init(file, []byte(code), nil, scanner.ScanComments)
	var tokens []tok
	for {
		pos, t, lit := s.Scan()
		if s.ErrorCount > 0 {
			return nil, false // invalid Go code (or not Go code at all)
		}
		if t == token.EOF {
			return tokens, true
		}
		tokens = append(tokens, tok{pos: fset.PositionFor(pos, false), tok: t, lit: lit})
	}
}
152
153func tokensEqual(a, b []tok) bool {
154 if len(a) != len(b) {
155 return false
156 }
157 for i := range a {
158 at, bt := a[i], b[i]
159 // positions are expected to differ
160 if at.tok != bt.tok || at.lit != bt.lit {
161 return false
162 }
163 }
164 return true
165}
166
167func tokensUniqueMatch(haystack, needle []tok) int {
168 // TODO: optimize
169 match := -1
170 for i := range haystack {
171 rest := haystack[i:]
172 if len(rest) < len(needle) {
173 break
174 }
175 rest = rest[:len(needle)]
176 if !tokensEqual(rest, needle) {
177 continue
178 }
179 if match != -1 {
180 return -1 // multiple matches
181 }
182 match = i
183 }
184 return match
185}
186
// UniqueGoTokens is Unique, but with flexibility around all insignificant whitespace.
// Like UniqueDedent, it returns a boolean indicating whether a unique match was found.
// It is safe (enough) because it ensures that the needle alterations occurs only in places
// where whitespace is not semantically significant.
// In practice, this appears safe.
func UniqueGoTokens(haystack, needle, replace string) (*Spec, bool) {
	// Both sides must scan cleanly as Go for a token-level comparison to be meaningful.
	nt, ok := tokenize(needle)
	if !ok {
		return nil, false
	}
	ht, ok := tokenize(haystack)
	if !ok {
		return nil, false
	}
	match := tokensUniqueMatch(ht, nt)
	if match == -1 {
		return nil, false
	}
	matchEnd := match + len(nt) - 1 // index of the last matched token within ht
	start := ht[match].pos.Offset
	// Re-derive the needle as the exact haystack bytes spanning the matched tokens,
	// so Unique below operates on text that literally occurs in haystack.
	needle = haystack[start:]
	// NOTE(review): when the match runs through the final token, the needle
	// extends to the very end of haystack, including any trailing whitespace —
	// presumably acceptable, but worth confirming.
	if matchEnd+1 < len(ht) {
		// todo: handle match at very end of file
		end := ht[matchEnd+1].pos.Offset
		needle = needle[:end-start]
	}
	// OK, declare this very fuzzy match to be our new needle.
	// Unique re-verifies uniqueness on the raw text, so this fails safe.
	spec, count := Unique(haystack, needle, replace)
	if count != 1 {
		return nil, false
	}
	return spec, true
}
220
// UniqueInValidGo is Unique, but with flexibility around all leading and trailing whitespace.
// Like UniqueDedent, it returns a boolean indicating whether a unique match was found.
// It is safe (enough) because it ensures that the needle alterations occurs only in places
// where whitespace is not semantically significant.
// In practice, this appears safe.
func UniqueInValidGo(haystack, needle, replace string) (*Spec, bool) {
	haystackLines := slices.Collect(strings.Lines(haystack))
	needleLines := slices.Collect(strings.Lines(needle))
	// Find the unique whitespace-insensitive line-level match.
	matchStart := uniqueTrimmedLineMatch(haystackLines, needleLines)
	if matchStart == -1 {
		return nil, false
	}
	// Extend needle and replace in tandem (trailing newline, first-line prefix)
	// so they align better with the matched haystack lines.
	needle, replace = improveNeedle(haystack, needle, replace, matchStart)
	// NOTE(review): if needle still lacks a trailing newline here (e.g. the
	// match ends at EOF with no final newline), matchEnd is one line short
	// and the strings.Join below drops the last matched line — TODO confirm.
	matchEnd := matchStart + strings.Count(needle, "\n")
	// Ensure that none of the lines that we fuzzy-matched involve a multiline comment or string literal.
	var s scanner.Scanner
	fset := token.NewFileSet()
	file := fset.AddFile("", fset.Base(), len(haystack))
	s.Init(file, []byte(haystack), nil, scanner.ScanComments)
	for {
		// Note: tok here shadows the package-level tok type; only the
		// token.Token value is needed in this loop.
		pos, tok, lit := s.Scan()
		if s.ErrorCount > 0 {
			return nil, false // invalid Go code (or not Go code at all)
		}
		if tok == token.EOF {
			break
		}
		// Auto-inserted semicolons carry a "\n" literal but are not
		// genuinely multiline tokens, so skip them along with any token
		// whose literal stays on one line.
		if tok == token.SEMICOLON || !strings.Contains(lit, "\n") {
			continue
		}
		// In a token that spans multiple lines,
		// so not perfectly matching whitespace might be unsafe.
		p := fset.Position(pos)
		tokenStart := p.Line - 1 // 1-based to 0-based
		tokenEnd := tokenStart + strings.Count(lit, "\n")
		// Check whether [matchStart, matchEnd] overlaps [tokenStart, tokenEnd]
		// TODO: think more about edge conditions here. Any off-by-one errors?
		// For example, leading whitespace and trailing whitespace
		// on this token's lines are not semantically significant.
		if tokenStart <= matchEnd && matchStart <= tokenEnd {
			return nil, false // this token overlaps the range we're replacing, not safe
		}
	}

	// TODO: restore this sanity check? it's mildly expensive and i've never seen it fail.
	// replaced := strings.Join(haystackLines[:matchStart], "") + replacement + strings.Join(haystackLines[matchEnd:], "")
	// _, err := format.Source([]byte(replaced))
	// if err != nil {
	// 	return nil, false
	// }

	// OK, declare this very fuzzy match to be our new needle.
	needle = strings.Join(haystackLines[matchStart:matchEnd], "")
	spec, count := Unique(haystack, needle, replace)
	if count != 1 {
		return nil, false
	}
	return spec, true
}
281
282// UniqueTrim is Unique, but with flexibility to shrink old/replace in tandem.
283func UniqueTrim(haystack, needle, replace string) (*Spec, bool) {
284 // LLMs appear to particularly struggle with the first line of a patch.
285 // If that first line is replicated in replace,
286 // and removing it yields a unique match,
287 // we can remove that line entirely from both.
288 n0, nRest, nOK := strings.Cut(needle, "\n")
289 r0, rRest, rOK := strings.Cut(replace, "\n")
290 if !nOK || !rOK || n0 != r0 {
291 return nil, false
292 }
293 spec, count := Unique(haystack, nRest, rRest)
294 if count != 1 {
295 return nil, false
296 }
297 return spec, true
298}
299
// uniqueTrimmedLineMatch returns the index of the first line in haystack that matches needle,
// when ignoring leading and trailing whitespace.
// uniqueTrimmedLineMatch returns -1 if there is no unique match.
func uniqueTrimmedLineMatch(haystackLines, needleLines []string) int {
	// TODO: optimize
	trim := func(lines []string) []string {
		out := make([]string, len(lines))
		for i, line := range lines {
			out[i] = strings.TrimSpace(line)
		}
		return out
	}
	hay, ndl := trim(haystackLines), trim(needleLines)
	found := -1
	for i := range hay {
		if len(hay)-i < len(ndl) {
			break // too few lines remain for a full match
		}
		if !slices.Equal(hay[i:i+len(ndl)], ndl) {
			continue
		}
		if found >= 0 {
			return -1 // a second match: not unique
		}
		found = i
	}
	return found
}
324
// trimSpaceAll returns a new slice whose elements are those of x with
// leading and trailing whitespace removed. The input is left untouched.
func trimSpaceAll(x []string) []string {
	out := make([]string, 0, len(x))
	for _, s := range x {
		out = append(out, strings.TrimSpace(s))
	}
	return out
}
332
333// improveNeedle adjusts both needle and replacement in tandem to better match haystack.
334// Note that we adjust search and replace together.
335func improveNeedle(haystack string, needle, replacement string, matchLine int) (string, string) {
336 // TODO: we make new slices too much
337 needleLines := slices.Collect(strings.Lines(needle))
338 if len(needleLines) == 0 {
339 return needle, replacement
340 }
341 haystackLines := slices.Collect(strings.Lines(haystack))
342 if matchLine+len(needleLines) > len(haystackLines) {
343 // should be impossible, but just in case
344 return needle, replacement
345 }
346 // Add trailing last-line newline if needed to better match haystack.
347 if !strings.HasSuffix(needle, "\n") && strings.HasSuffix(haystackLines[matchLine+len(needleLines)-1], "\n") {
348 needle += "\n"
349 replacement += "\n"
350 }
351 // Add leading first-line prefix if needed to better match haystack.
352 rest, ok := strings.CutSuffix(haystackLines[matchLine], needleLines[0])
353 if ok {
354 needle = rest + needle
355 replacement = rest + replacement
356 }
357 return needle, replacement
358}
359
360func isNonSpace(r rune) bool {
361 return !unicode.IsSpace(r)
362}
363
364func whitespacePrefix(s string) string {
365 firstNonSpace := strings.IndexFunc(s, isNonSpace)
366 return s[:max(0, firstNonSpace)] // map -1 for "not found" onto 0
367}
368
369// commonWhitespacePrefix returns the longest common whitespace prefix of the elements of x, somewhat flexibly.
370func commonWhitespacePrefix(x []string) string {
371 var pre string
372 for i, s := range x {
373 if i == 0 {
374 pre = s
375 continue
376 }
377 // ignore line endings for the moment
378 // (this is just for prefixes)
379 s = strings.TrimRight(s, "\n\r")
380 if s == "" {
381 continue
382 }
383 n := commonPrefixLen(pre, whitespacePrefix(s))
384 if n == 0 {
385 return ""
386 }
387 pre = pre[:n]
388 }
389 pre = strings.TrimRightFunc(pre, isNonSpace)
390 return pre
391}
392
393// commonPrefixLen returns the length of the common prefix of two strings.
394// TODO: optimize, see e.g. https://go-review.googlesource.com/c/go/+/408116
395func commonPrefixLen(a, b string) int {
396 shortest := min(len(a), len(b))
397 for i := range shortest {
398 if a[i] != b[i] {
399 return i
400 }
401 }
402 return shortest
403}
404
// commonSuffixLen returns the length of the common suffix of two strings.
// TODO: optimize
func commonSuffixLen(a, b string) int {
	limit := min(len(a), len(b))
	i := 0
	for ; i < limit; i++ {
		// Compare bytes from the tail inward; stop at the first mismatch.
		if a[len(a)-1-i] != b[len(b)-1-i] {
			break
		}
	}
	return i
}