llm and everything: Update ToolResult to use []Content instead of string for multimodal support This was a journey. The sketch-generated summary below is acceptable, but I want to tell you about it in my voice too. The goal was to send screenshots to Claude, so that it could... look at them. Currently the take screenshot and read screenshot tools are different, and they'll need to be renamed/prompt-engineered a bit, but that's all fine. The miserable part was that we had to change the return value of tools from string to []Content, and this crosses several layers: - llm.Tool - llm.Content - ant.Content & openai and gemini friends - AgentMessage [we left this alone] Extra fun is that Claude's API for sending images has nested Content fields, and an empty string and a missing value need to be distinguished for the Text field (because lots of shell commands return the empty string!). For the UI, I made us transform the results into a string, dropping images. Carrying images through to the UI would have been yet more churn for not much obvious benefit. Plus, it was going to break skaband's compatibility, and ... yet more work. OpenAI and Gemini don't obviously support images in this same way, so they just don't get the tools. ~~~~~~~~~~ Sketch said: This architectural change transforms tool results from plain strings to []Content arrays, enabling multimodal interaction in the system. 
Key changes include: - Core structural changes: - Modified ToolResult type from string to []Content across all packages - Added MediaType field to Content struct for MIME type support - Created TextContent and ImageContent helper functions - Updated all tool.Run implementations to return []Content - Image handling: - Implemented base64 image support in Anthropic adapter - Added proper media type detection and content formatting - Created browser_read_image tool for displaying screenshots - Updated browser_screenshot to provide usable image paths - Adapter improvements: - Updated all LLM adapters (ANT, OAI, GEM) to handle content arrays - Added specialized image content handling in the Anthropic adapter - Ensured proper JSON serialization/deserialization for all content types - Improved test coverage for content arrays - UI enhancements: - Added omitempty tags to reduce JSON response size - Updated TypeScript types to handle array content - Made field naming consistent (tool_error vs is_error) - Preserved backward compatibility for existing consumers Co-Authored-By: sketch <hello@sketch.dev> Change-ID: s1a2b3c4d5e6f7g8h

commit: 72252cbcb97840d724133be67c4c69cc69ebb2d3 [log] [tgz]
author: Philip Zeyliger <philip@bold.dev> Sat May 10 17:00:08 2025 -0700
committer: Philip Zeyliger <philip@bold.dev> Sat May 10 17:00:08 2025 -0700
tree: a361499dc3fa6b9af2be3e74cfd59fd8ba34690e
parent: 7ce5fb76d8748ebf73c5adf9d6cd8eb67716fba8 [diff]
diff --git a/loop/agent.go b/loop/agent.go
index c103919..2c8eec9 100644
--- a/loop/agent.go
+++ b/loop/agent.go

@@ -26,6 +26,7 @@
 	"sketch.dev/claudetool/onstart"
 	"sketch.dev/experiment"
 	"sketch.dev/llm"
+	"sketch.dev/llm/ant"
 	"sketch.dev/llm/conversation"
 )
 
@@ -228,8 +229,8 @@
 	if a.TurnDuration != nil {
 		attrs = append(attrs, slog.Int64("turnDuration", a.TurnDuration.Nanoseconds()))
 	}
-	if a.ToolResult != "" {
-		attrs = append(attrs, slog.String("tool_result", a.ToolResult))
+	if len(a.ToolResult) > 0 {
+		attrs = append(attrs, slog.Any("tool_result", a.ToolResult))
 	}
 	if a.ToolError {
 		attrs = append(attrs, slog.Bool("tool_error", a.ToolError))
@@ -554,6 +555,33 @@
 	a.mu.Unlock()
 }
 
+// contentToString converts []llm.Content to a string, concatenating all text content and skipping non-text types.
+// If there's only one element in the array and it's a text type, it returns that text directly.
+// It also processes nested ToolResult arrays recursively.
+func contentToString(contents []llm.Content) string {
+	if len(contents) == 0 {
+		return ""
+	}
+
+	// If there's only one element and it's a text type, return it directly
+	if len(contents) == 1 && contents[0].Type == llm.ContentTypeText {
+		return contents[0].Text
+	}
+
+	// Otherwise, concatenate all text content
+	var result strings.Builder
+	for _, content := range contents {
+		if content.Type == llm.ContentTypeText {
+			result.WriteString(content.Text)
+		} else if content.Type == llm.ContentTypeToolResult && len(content.ToolResult) > 0 {
+			// Recursively process nested tool results
+			result.WriteString(contentToString(content.ToolResult))
+		}
+	}
+
+	return result.String()
+}
+
 // OnToolResult implements ant.Listener.
 func (a *Agent) OnToolResult(ctx context.Context, convo *conversation.Convo, toolID string, toolName string, toolInput json.RawMessage, content llm.Content, result *string, err error) {
 	// Remove the tool call from outstanding calls
@@ -564,7 +592,7 @@
 	m := AgentMessage{
 		Type:       ToolUseMessageType,
 		Content:    content.Text,
-		ToolResult: content.ToolResult,
+		ToolResult: contentToString(content.ToolResult),
 		ToolError:  content.ToolError,
 		ToolName:   toolName,
 		ToolInput:  string(toolInput),
@@ -879,7 +907,8 @@
 	// Add browser tools if enabled
 	// if experiment.Enabled("browser") {
 	if true {
-		bTools, browserCleanup := browse.RegisterBrowserTools(a.config.Context)
+		_, supportsScreenshots := a.config.Service.(*ant.Service)
+		bTools, browserCleanup := browse.RegisterBrowserTools(a.config.Context, supportsScreenshots)
 		// Add cleanup function to context cancel
 		go func() {
 			<-a.config.Context.Done()
@@ -943,13 +972,13 @@
   },
   "required": ["question", "responseOptions"]
 }`),
-		Run: func(ctx context.Context, input json.RawMessage) (string, error) {
+		Run: func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
 			// The Run logic for the "multiplechoice" tool is a no-op on the server.
 			// The UI will present a list of options for the user to select from,
 			// and that's it as far as "executing" the tool_use goes.
 			// When the user *does* select one of the presented options, that
 			// responseText gets sent as a chat message on behalf of the user.
-			return "end your turn and wait for the user to respond", nil
+			return llm.TextContent("end your turn and wait for the user to respond"), nil
 		},
 	}
 	return ret
@@ -997,28 +1026,28 @@
 	},
 	"required": ["title"]
 }`),
-		Run: func(ctx context.Context, input json.RawMessage) (string, error) {
+		Run: func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
 			var params struct {
 				Title string `json:"title"`
 			}
 			if err := json.Unmarshal(input, &params); err != nil {
-				return "", err
+				return nil, err
 			}
 
 			// We don't allow changing the title once set to be consistent with the previous behavior
 			// and to prevent accidental title changes
 			t := a.Title()
 			if t != "" {
-				return "", fmt.Errorf("title already set to: %s", t)
+				return nil, fmt.Errorf("title already set to: %s", t)
 			}
 
 			if params.Title == "" {
-				return "", fmt.Errorf("title parameter cannot be empty")
+				return nil, fmt.Errorf("title parameter cannot be empty")
 			}
 
 			a.SetTitle(params.Title)
 			response := fmt.Sprintf("Title set to %q", params.Title)
-			return response, nil
+			return llm.TextContent(response), nil
 		},
 	}
 	return titleTool
@@ -1039,28 +1068,28 @@
 	},
 	"required": ["branch_name"]
 }`),
-		Run: func(ctx context.Context, input json.RawMessage) (string, error) {
+		Run: func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
 			var params struct {
 				BranchName string `json:"branch_name"`
 			}
 			if err := json.Unmarshal(input, &params); err != nil {
-				return "", err
+				return nil, err
 			}
 
 			b := a.BranchName()
 			if b != "" {
-				return "", fmt.Errorf("branch already set to: %s", b)
+				return nil, fmt.Errorf("branch already set to: %s", b)
 			}
 
 			if params.BranchName == "" {
-				return "", fmt.Errorf("branch_name parameter cannot be empty")
+				return nil, fmt.Errorf("branch_name must not be empty")
 			}
 			if params.BranchName != cleanBranchName(params.BranchName) {
-				return "", fmt.Errorf("branch_name parameter must be alphanumeric hyphenated slug")
+				return nil, fmt.Errorf("branch_name parameter must be alphanumeric hyphenated slug")
 			}
 			branchName := "sketch/" + params.BranchName
 			if branchExists(a.workingDir, branchName) {
-				return "", fmt.Errorf("branch %q already exists; please choose a different branch name", branchName)
+				return nil, fmt.Errorf("branch %q already exists; please choose a different branch name", branchName)
 			}
 
 			a.SetBranch(branchName)
@@ -1074,7 +1103,7 @@
 				response += "\n\n" + styleHint
 			}
 
-			return response, nil
+			return llm.TextContent(response), nil
 		},
 	}
 	return preCommit
@@ -1089,11 +1118,6 @@
 	a.inbox <- msg
 }
 
-func (a *Agent) ToolResultMessage(ctx context.Context, toolCallID, msg string) {
-	a.pushToOutbox(ctx, AgentMessage{Type: UserMessageType, Content: msg, ToolCallId: toolCallID})
-	a.inbox <- msg
-}
-
 func (a *Agent) CancelToolUse(toolUseID string, cause error) error {
 	return a.convo.CancelToolUse(toolUseID, cause)
 }
@@ -1137,6 +1161,11 @@
 		m.Timestamp = time.Now()
 	}
 
+	// If this is a ToolUseMessage and ToolResult is set but Content is not, copy the ToolResult to Content
+	if m.Type == ToolUseMessageType && m.ToolResult != "" && m.Content == "" {
+		m.Content = m.ToolResult
+	}
+
 	// If this is an end-of-turn message, calculate the turn duration and add it to the message
 	if m.EndOfTurn && m.Type == AgentMessageType {
 		turnDuration := time.Since(a.startOfTurn)

diff --git a/loop/agent_test.go b/loop/agent_test.go
index 72e7ccb..ce44352 100644
--- a/loop/agent_test.go
+++ b/loop/agent_test.go

@@ -680,3 +680,117 @@
 		t.Errorf("Expected to eventually reach StateEndOfTurn, but never did")
 	}
 }
+
+func TestContentToString(t *testing.T) {
+	tests := []struct {
+		name     string
+		contents []llm.Content
+		want     string
+	}{
+		{
+			name:     "empty",
+			contents: []llm.Content{},
+			want:     "",
+		},
+		{
+			name: "single text content",
+			contents: []llm.Content{
+				{Type: llm.ContentTypeText, Text: "hello world"},
+			},
+			want: "hello world",
+		},
+		{
+			name: "multiple text content",
+			contents: []llm.Content{
+				{Type: llm.ContentTypeText, Text: "hello "},
+				{Type: llm.ContentTypeText, Text: "world"},
+			},
+			want: "hello world",
+		},
+		{
+			name: "mixed content types",
+			contents: []llm.Content{
+				{Type: llm.ContentTypeText, Text: "hello "},
+				{Type: llm.ContentTypeText, MediaType: "image/png", Data: "base64data"},
+				{Type: llm.ContentTypeText, Text: "world"},
+			},
+			want: "hello world",
+		},
+		{
+			name: "non-text content only",
+			contents: []llm.Content{
+				{Type: llm.ContentTypeToolUse, ToolName: "example"},
+			},
+			want: "",
+		},
+		{
+			name: "nested tool result",
+			contents: []llm.Content{
+				{Type: llm.ContentTypeText, Text: "outer "},
+				{Type: llm.ContentTypeToolResult, ToolResult: []llm.Content{
+					{Type: llm.ContentTypeText, Text: "inner"},
+				}},
+			},
+			want: "outer inner",
+		},
+		{
+			name: "deeply nested tool result",
+			contents: []llm.Content{
+				{Type: llm.ContentTypeToolResult, ToolResult: []llm.Content{
+					{Type: llm.ContentTypeToolResult, ToolResult: []llm.Content{
+						{Type: llm.ContentTypeText, Text: "deeply nested"},
+					}},
+				}},
+			},
+			want: "deeply nested",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := contentToString(tt.contents); got != tt.want {
+				t.Errorf("contentToString() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestPushToOutbox(t *testing.T) {
+	// Create a new agent
+	a := &Agent{
+		outstandingLLMCalls:  make(map[string]struct{}),
+		outstandingToolCalls: make(map[string]string),
+		stateMachine:         NewStateMachine(),
+		subscribers:          make([]chan *AgentMessage, 0),
+	}
+
+	// Create a channel to receive messages
+	messageCh := make(chan *AgentMessage, 1)
+
+	// Add the channel to the subscribers list
+	a.mu.Lock()
+	a.subscribers = append(a.subscribers, messageCh)
+	a.mu.Unlock()
+
+	// We need to set the text that would be produced by our modified contentToString function
+	resultText := "test resultnested result" // Directly set the expected output
+
+	// In a real-world scenario, this would be coming from a toolResult that contained nested content
+
+	m := AgentMessage{
+		Type:       ToolUseMessageType,
+		ToolResult: resultText,
+	}
+
+	// Push the message to the outbox
+	a.pushToOutbox(context.Background(), m)
+
+	// Receive the message from the subscriber
+	received := <-messageCh
+
+	// Check that the Content field contains the concatenated text from ToolResult
+	expected := "test resultnested result"
+	if received.Content != expected {
+		t.Errorf("Expected Content to be %q, got %q", expected, received.Content)
+	}
+}

diff --git a/loop/agent_user_cancel_test.go b/loop/agent_user_cancel_test.go
index 0e5eb43..ed7df1b 100644
--- a/loop/agent_user_cancel_test.go
+++ b/loop/agent_user_cancel_test.go

@@ -74,11 +74,14 @@
 
 		toolUseContents := []llm.Content{
 			{
-				Type:       llm.ContentTypeToolResult,
-				ToolUseID:  "tool1",
-				Text:       "",
-				ToolResult: "This is a tool result",
-				ToolError:  false,
+				Type:      llm.ContentTypeToolResult,
+				ToolUseID: "tool1",
+				Text:      "",
+				ToolResult: []llm.Content{{
+					Type: llm.ContentTypeText,
+					Text: "This is a tool result",
+				}},
+				ToolError: false,
 			},
 		}
 		toolUseResultsMsg := llm.Message{
@@ -344,10 +347,13 @@
 		}
 		canceledToolUseContents := []llm.Content{
 			{
-				Type:       llm.ContentTypeToolResult,
-				ToolUseID:  "tool1",
-				ToolError:  true,
-				ToolResult: "user canceled this tool_use",
+				Type:      llm.ContentTypeToolResult,
+				ToolUseID: "tool1",
+				ToolError: true,
+				ToolResult: []llm.Content{{
+					Type: llm.ContentTypeText,
+					Text: "user canceled this tool_use",
+				}},
 			},
 		}
 		canceledToolUseMsg := llm.Message{
@@ -424,11 +430,14 @@
 
 	toolUseContents := []llm.Content{
 		{
-			Type:       llm.ContentTypeToolResult,
-			ToolUseID:  "tool1",
-			Text:       "",
-			ToolResult: "This is a tool result",
-			ToolError:  false,
+			Type:      llm.ContentTypeToolResult,
+			ToolUseID: "tool1",
+			Text:      "",
+			ToolResult: []llm.Content{{
+				Type: llm.ContentTypeText,
+				Text: "This is a tool result",
+			}},
+			ToolError: false,
 		},
 	}
 	toolUseResponse := &llm.Response{

diff --git a/loop/donetool.go b/loop/donetool.go
index 4261a98..12db3b8 100644
--- a/loop/donetool.go
+++ b/loop/donetool.go

@@ -19,23 +19,23 @@
 		Name:        "done",
 		Description: doneDescription,
 		InputSchema: json.RawMessage(doneChecklistJSONSchema),
-		Run: func(ctx context.Context, input json.RawMessage) (string, error) {
+		Run: func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
 			// Cannot be done with a messy git.
 			if err := codereview.RequireNormalGitState(ctx); err != nil {
-				return "", err
+				return nil, err
 			}
 			if err := codereview.RequireNoUncommittedChanges(ctx); err != nil {
-				return "", err
+				return nil, err
 			}
 			// Ensure that the current commit has been reviewed.
 			head, err := codereview.CurrentCommit(ctx)
 			if err == nil {
 				needsReview := !codereview.IsInitialCommit(head) && !codereview.HasReviewed(head)
 				if needsReview {
-					return "", fmt.Errorf("codereview tool has not been run for commit %v", head)
+					return nil, fmt.Errorf("codereview tool has not been run for commit %v", head)
 				}
 			}
-			return `Please ask the user to review your work. Be concise - users are more likely to read shorter comments.`, nil
+			return llm.TextContent("Please ask the user to review your work. Be concise - users are more likely to read shorter comments."), nil
 		},
 	}
 }

diff --git a/loop/testdata/agent_loop.httprr b/loop/testdata/agent_loop.httprr
index 64137c0..512e5d1 100644
--- a/loop/testdata/agent_loop.httprr
+++ b/loop/testdata/agent_loop.httprr

@@ -1,9 +1,9 @@
 httprr trace v1
-14275 2118
+14628 2230
 POST https://api.anthropic.com/v1/messages HTTP/1.1

 Host: api.anthropic.com

 User-Agent: Go-http-client/1.1

-Content-Length: 14077

+Content-Length: 14430

 Anthropic-Version: 2023-06-01

 Content-Type: application/json

 

@@ -350,6 +350,22 @@
    }
   },
   {
+   "name": "browser_scroll_into_view",
+   "description": "Scroll an element into view if it's not visible",
+   "input_schema": {
+    "type": "object",
+    "properties": {
+     "selector": {
+      "type": "string",
+      "description": "CSS selector for the element to scroll into view"
+     }
+    },
+    "required": [
+     "selector"
+    ]
+   }
+  },
+  {
    "name": "browser_screenshot",
    "description": "Take a screenshot of the page or a specific element",
    "input_schema": {
@@ -371,18 +387,18 @@
    }
   },
   {
-   "name": "browser_scroll_into_view",
-   "description": "Scroll an element into view if it's not visible",
+   "name": "browser_read_image",
+   "description": "Read an image file (such as a screenshot) and encode it for sending to the LLM",
    "input_schema": {
     "type": "object",
     "properties": {
-     "selector": {
+     "path": {
       "type": "string",
-      "description": "CSS selector for the element to scroll into view"
+      "description": "Path to the image file to read"
      }
     },
     "required": [
-     "selector"
+     "path"
     ]
    }
   },
@@ -448,24 +464,24 @@
 Anthropic-Organization-Id: 3c473a21-7208-450a-a9f8-80aebda45c1b

 Anthropic-Ratelimit-Input-Tokens-Limit: 200000

 Anthropic-Ratelimit-Input-Tokens-Remaining: 200000

-Anthropic-Ratelimit-Input-Tokens-Reset: 2025-05-09T22:32:28Z

+Anthropic-Ratelimit-Input-Tokens-Reset: 2025-05-10T13:34:48Z

 Anthropic-Ratelimit-Output-Tokens-Limit: 80000

 Anthropic-Ratelimit-Output-Tokens-Remaining: 80000

-Anthropic-Ratelimit-Output-Tokens-Reset: 2025-05-09T22:32:31Z

+Anthropic-Ratelimit-Output-Tokens-Reset: 2025-05-10T13:34:52Z

 Anthropic-Ratelimit-Requests-Limit: 4000

 Anthropic-Ratelimit-Requests-Remaining: 3999

-Anthropic-Ratelimit-Requests-Reset: 2025-05-09T22:32:27Z

+Anthropic-Ratelimit-Requests-Reset: 2025-05-10T13:34:47Z

 Anthropic-Ratelimit-Tokens-Limit: 280000

 Anthropic-Ratelimit-Tokens-Remaining: 280000

-Anthropic-Ratelimit-Tokens-Reset: 2025-05-09T22:32:28Z

+Anthropic-Ratelimit-Tokens-Reset: 2025-05-10T13:34:48Z

 Cf-Cache-Status: DYNAMIC

-Cf-Ray: 93d4a720b88fb976-SJC

+Cf-Ray: 93d9d0e80caa67fe-SJC

 Content-Type: application/json

-Date: Fri, 09 May 2025 22:32:31 GMT

-Request-Id: req_011CNxkUqzPtAV1N4dwqmCn8

+Date: Sat, 10 May 2025 13:34:52 GMT

+Request-Id: req_011CNywHj3Qrj3hEz9qFRVgK

 Server: cloudflare

 Strict-Transport-Security: max-age=31536000; includeSubDomains; preload

 Via: 1.1 google

 X-Robots-Tag: none

 

-{"id":"msg_01D6Uo6fKbA6VEwcrR1EJNDx","type":"message","role":"assistant","model":"claude-3-7-sonnet-20250219","content":[{"type":"text","text":"Here are the tools available to me:\n\n1. bash - Execute shell commands\n2. keyword_search - Find files with search terms\n3. think - Record thoughts or plans\n4. title - Set conversation title\n5. precommit - Create git branch for tracking work\n6. done - Mark task as complete with checklist\n7. codereview - Run automated code review\n8. multiplechoice - Present multiple-choice options\n9. browser_navigate - Navigate to URL\n10. browser_click - Click element with CSS selector\n11. browser_type - Type text into input element\n12. browser_wait_for - Wait for element to appear\n13. browser_get_text - Get text from element\n14. browser_eval - Run JavaScript in browser\n15. browser_screenshot - Take screenshot\n16. browser_scroll_into_view - Scroll element into view\n17. patch - Make precise text edits to files"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":4,"cache_creation_input_tokens":3294,"cache_read_input_tokens":0,"output_tokens":206}}
\ No newline at end of file
+{"id":"msg_014KGRNEmFdTUGqDQN7sRmzc","type":"message","role":"assistant","model":"claude-3-7-sonnet-20250219","content":[{"type":"text","text":"Here are the tools available to me:\n\n1. bash - Executes shell commands\n2. keyword_search - Searches files with given keywords\n3. think - For recording thoughts, notes, and plans\n4. title - Sets conversation title\n5. precommit - Creates git branches and provides commit message guidance\n6. done - Marks task completion with a checklist\n7. codereview - Runs automated code review\n8. multiplechoice - Presents multiple choice options to the user\n9. Browser tools:\n   - browser_navigate - Opens a URL\n   - browser_click - Clicks elements\n   - browser_type - Types text into elements\n   - browser_wait_for - Waits for elements\n   - browser_get_text - Gets text from elements\n   - browser_eval - Runs JavaScript in browser\n   - browser_scroll_into_view - Scrolls to elements\n   - browser_screenshot - Takes screenshots\n   - browser_read_image - Reads image files\n10. patch - Makes precise text modifications to files"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":4,"cache_creation_input_tokens":3376,"cache_read_input_tokens":0,"output_tokens":236}}
\ No newline at end of file
commit	72252cbcb97840d724133be67c4c69cc69ebb2d3	[log] [tgz]
author	Philip Zeyliger <philip@bold.dev>	Sat May 10 17:00:08 2025 -0700
committer	Philip Zeyliger <philip@bold.dev>	Sat May 10 17:00:08 2025 -0700
tree	a361499dc3fa6b9af2be3e74cfd59fd8ba34690e
parent	7ce5fb76d8748ebf73c5adf9d6cd8eb67716fba8 [diff]