llm and everything: Update ToolResult to use []Content instead of string for multimodal support

This was a journey. The sketch-generated summary below is acceptable,
but I want to tell you about it in my voice too. The goal was to send
screenshots to Claude, so that it could... look at them. Currently
the take screenshot and read screenshot tools are different, and they'll
need to be renamed/prompt-engineered a bit, but that's all fine.

The miserable part was that we had to change the return value
of tools from string to []Content, and this crosses several layers:
 - llm.Tool
 - llm.Content
 - ant.Content & openai and gemini friends
 - AgentMessage [we left this alone]

Extra fun is that Claude's API for sending images has nested Content
fields, and empty string and missing need to be distinguished for the
Text field (because lots of shell commands return the empty string!).

For the UI, I made us transform the results into a string, dropping
images. This would have been yet more churn for not much obvious
benefit. Plus, it was going to break skaband's compatibility, and ...
yet more work.

OpenAI and Gemini don't obviously support images in this same way,
so they just don't get the tools.

~~~~~~~~~~ Sketch said:

This architectural change transforms tool results from plain strings to []Content arrays, enabling multimodal interaction in the system. Key changes include:

- Core structural changes:
  - Modified ToolResult type from string to []Content across all packages
  - Added MediaType field to Content struct for MIME type support
  - Created TextContent and ImageContent helper functions
  - Updated all tool.Run implementations to return []Content

- Image handling:
  - Implemented base64 image support in Anthropic adapter
  - Added proper media type detection and content formatting
  - Created browser_read_image tool for displaying screenshots
  - Updated browser_screenshot to provide usable image paths

- Adapter improvements:
  - Updated all LLM adapters (ANT, OAI, GEM) to handle content arrays
  - Added specialized image content handling in the Anthropic adapter
  - Ensured proper JSON serialization/deserialization for all content types
  - Improved test coverage for content arrays

- UI enhancements:
  - Added omitempty tags to reduce JSON response size
  - Updated TypeScript types to handle array content
  - Made field naming consistent (tool_error vs is_error)
  - Preserved backward compatibility for existing consumers

Co-Authored-By: sketch <hello@sketch.dev>
Change-ID: s1a2b3c4d5e6f7g8h
diff --git a/llm/ant/ant.go b/llm/ant/ant.go
index d1c366a..fdf2fde 100644
--- a/llm/ant/ant.go
+++ b/llm/ant/ant.go
@@ -44,15 +44,26 @@
 var _ llm.Service = (*Service)(nil)
 
 type content struct {
-	// TODO: image support?
 	// https://docs.anthropic.com/en/api/messages
 	ID   string `json:"id,omitempty"`
 	Type string `json:"type,omitempty"`
-	Text string `json:"text,omitempty"`
+
+	// Subtly, an empty string appears in tool results often, so we have
+	// to distinguish between empty string and no string.
+	// Underlying error looks like one of:
+	//   "messages.46.content.0.tool_result.content.0.text.text: Field required""
+	//   "messages.1.content.1.tool_use.text: Extra inputs are not permitted"
+	//
+	// I haven't found a super great source for the API, but
+	// https://github.com/anthropics/anthropic-sdk-typescript/blob/main/src/resources/messages/messages.ts
+	// is somewhat acceptable but hard to read.
+	Text      *string         `json:"text,omitempty"`
+	MediaType string          `json:"media_type,omitempty"` // for image
+	Source    json.RawMessage `json:"source,omitempty"`     // for image
 
 	// for thinking
 	Thinking  string `json:"thinking,omitempty"`
-	Data      string `json:"data,omitempty"`      // for redacted_thinking
+	Data      string `json:"data,omitempty"`      // for redacted_thinking or image
 	Signature string `json:"signature,omitempty"` // for thinking
 
 	// for tool_use
@@ -60,9 +71,30 @@
 	ToolInput json.RawMessage `json:"input,omitempty"`
 
 	// for tool_result
-	ToolUseID  string `json:"tool_use_id,omitempty"`
-	ToolError  bool   `json:"is_error,omitempty"`
-	ToolResult string `json:"content,omitempty"`
+	ToolUseID string `json:"tool_use_id,omitempty"`
+	ToolError bool   `json:"is_error,omitempty"`
+	// note the recursive nature here; message looks like:
+	// {
+	//  "role": "user",
+	//  "content": [
+	//    {
+	//      "type": "tool_result",
+	//      "tool_use_id": "toolu_01A09q90qw90lq917835lq9",
+	//      "content": [
+	//        {"type": "text", "text": "15 degrees"},
+	//        {
+	//          "type": "image",
+	//          "source": {
+	//            "type": "base64",
+	//            "media_type": "image/jpeg",
+	//            "data": "/9j/4AAQSkZJRg...",
+	//          }
+	//        }
+	//      ]
+	//    }
+	//  ]
+	//}
+	ToolResult []content `json:"content,omitempty"`
 
 	// timing information for tool_result; not sent to Claude
 	StartTime *time.Time `json:"-"`
@@ -217,10 +249,28 @@
 }
 
 func fromLLMContent(c llm.Content) content {
-	return content{
+	var toolResult []content
+	if len(c.ToolResult) > 0 {
+		toolResult = make([]content, len(c.ToolResult))
+		for i, tr := range c.ToolResult {
+			// For image content inside a tool_result, we need to map it to "image" type
+			if tr.MediaType != "" && tr.MediaType == "image/jpeg" || tr.MediaType == "image/png" {
+				// Format as an image for Claude
+				toolResult[i] = content{
+					Type: "image",
+					Source: json.RawMessage(fmt.Sprintf(`{"type":"base64","media_type":"%s","data":"%s"}`,
+						tr.MediaType, tr.Data)),
+				}
+			} else {
+				toolResult[i] = fromLLMContent(tr)
+			}
+		}
+	}
+
+	d := content{
 		ID:           c.ID,
 		Type:         fromLLMContentType[c.Type],
-		Text:         c.Text,
+		MediaType:    c.MediaType,
 		Thinking:     c.Thinking,
 		Data:         c.Data,
 		Signature:    c.Signature,
@@ -228,9 +278,15 @@
 		ToolInput:    c.ToolInput,
 		ToolUseID:    c.ToolUseID,
 		ToolError:    c.ToolError,
-		ToolResult:   c.ToolResult,
+		ToolResult:   toolResult,
 		CacheControl: fromLLMCache(c.Cache),
 	}
+	// Anthropic API complains if Text is specified when it shouldn't be
+	// or not specified when it's the empty string.
+	if c.Type != llm.ContentTypeToolResult && c.Type != llm.ContentTypeToolUse {
+		d.Text = &c.Text
+	}
+	return d
 }
 
 func fromLLMToolUse(tu *llm.ToolUse) *toolUse {
@@ -300,10 +356,19 @@
 }
 
 func toLLMContent(c content) llm.Content {
-	return llm.Content{
+	// Convert toolResult from []content to []llm.Content
+	var toolResultContents []llm.Content
+	if len(c.ToolResult) > 0 {
+		toolResultContents = make([]llm.Content, len(c.ToolResult))
+		for i, tr := range c.ToolResult {
+			toolResultContents[i] = toLLMContent(tr)
+		}
+	}
+
+	ret := llm.Content{
 		ID:         c.ID,
 		Type:       toLLMContentType[c.Type],
-		Text:       c.Text,
+		MediaType:  c.MediaType,
 		Thinking:   c.Thinking,
 		Data:       c.Data,
 		Signature:  c.Signature,
@@ -311,8 +376,12 @@
 		ToolInput:  c.ToolInput,
 		ToolUseID:  c.ToolUseID,
 		ToolError:  c.ToolError,
-		ToolResult: c.ToolResult,
+		ToolResult: toolResultContents,
 	}
+	if c.Text != nil {
+		ret.Text = *c.Text
+	}
+	return ret
 }
 
 func toLLMResponse(r *response) *llm.Response {
diff --git a/llm/ant/ant_image_test.go b/llm/ant/ant_image_test.go
new file mode 100644
index 0000000..1bc0c95
--- /dev/null
+++ b/llm/ant/ant_image_test.go
@@ -0,0 +1,78 @@
+package ant
+
+import (
+	"encoding/json"
+	"testing"
+
+	"sketch.dev/llm"
+)
+
+func TestAnthropicImageToolResult(t *testing.T) {
+	// Create a tool result with both text and image content
+	textContent := llm.Content{
+		Type: llm.ContentTypeText,
+		Text: "15 degrees",
+	}
+
+	imageContent := llm.Content{
+		Type:      llm.ContentTypeText, // Will be mapped to "image" in Anthropic format
+		MediaType: "image/jpeg",
+		Data:      "/9j/4AAQSkZJRg...", // Shortened base64 encoded image
+	}
+
+	toolResult := llm.Content{
+		Type:       llm.ContentTypeToolResult,
+		ToolUseID:  "toolu_01A09q90qw90lq917835lq9",
+		ToolResult: []llm.Content{textContent, imageContent},
+	}
+
+	// Convert to Anthropic format
+	anthropicContent := fromLLMContent(toolResult)
+
+	// Check the type
+	if anthropicContent.Type != "tool_result" {
+		t.Errorf("Expected type to be 'tool_result', got '%s'", anthropicContent.Type)
+	}
+
+	// Check the tool_use_id
+	if anthropicContent.ToolUseID != "toolu_01A09q90qw90lq917835lq9" {
+		t.Errorf("Expected tool_use_id to be 'toolu_01A09q90qw90lq917835lq9', got '%s'", anthropicContent.ToolUseID)
+	}
+
+	// Check that we have two content items in the tool result
+	if len(anthropicContent.ToolResult) != 2 {
+		t.Errorf("Expected 2 content items, got %d", len(anthropicContent.ToolResult))
+	}
+
+	// Check that the first item is text
+	if anthropicContent.ToolResult[0].Type != "text" {
+		t.Errorf("Expected first content type to be 'text', got '%s'", anthropicContent.ToolResult[0].Type)
+	}
+
+	if *anthropicContent.ToolResult[0].Text != "15 degrees" {
+		t.Errorf("Expected first content text to be '15 degrees', got '%s'", *anthropicContent.ToolResult[0].Text)
+	}
+
+	// Check that the second item is an image
+	if anthropicContent.ToolResult[1].Type != "image" {
+		t.Errorf("Expected second content type to be 'image', got '%s'", anthropicContent.ToolResult[1].Type)
+	}
+
+	// Check that the image source contains the expected format
+	var source map[string]any
+	if err := json.Unmarshal(anthropicContent.ToolResult[1].Source, &source); err != nil {
+		t.Errorf("Failed to unmarshal image source: %v", err)
+	}
+
+	if source["type"] != "base64" {
+		t.Errorf("Expected source type to be 'base64', got '%s'", source["type"])
+	}
+
+	if source["media_type"] != "image/jpeg" {
+		t.Errorf("Expected media_type to be 'image/jpeg', got '%s'", source["media_type"])
+	}
+
+	if source["data"] != "/9j/4AAQSkZJRg..." {
+		t.Errorf("Expected data to be '/9j/4AAQSkZJRg...', got '%s'", source["data"])
+	}
+}
diff --git a/llm/conversation/convo.go b/llm/conversation/convo.go
index 7860a07..c46fcc0 100644
--- a/llm/conversation/convo.go
+++ b/llm/conversation/convo.go
@@ -272,10 +272,13 @@
 			continue
 		}
 		content := llm.Content{
-			Type:       llm.ContentTypeToolResult,
-			ToolUseID:  part.ID,
-			ToolError:  true,
-			ToolResult: "not executed; retry possible",
+			Type:      llm.ContentTypeToolResult,
+			ToolUseID: part.ID,
+			ToolError: true,
+			ToolResult: []llm.Content{{
+				Type: llm.ContentTypeText,
+				Text: "not executed; retry possible",
+			}},
 		}
 		prefix = append(prefix, content)
 		msg.Content = append(prefix, msg.Content...)
@@ -361,7 +364,10 @@
 		}
 
 		content.ToolError = true
-		content.ToolResult = "user canceled this too_use"
+		content.ToolResult = []llm.Content{{
+			Type: llm.ContentTypeText,
+			Text: "user canceled this too_use",
+		}}
 		toolResults = append(toolResults, content)
 	}
 	return toolResults, nil
@@ -429,17 +435,24 @@
 				content.ToolUseEndTime = &endTime
 
 				content.ToolError = true
-				content.ToolResult = err.Error()
+				content.ToolResult = []llm.Content{{
+					Type: llm.ContentTypeText,
+					Text: err.Error(),
+				}}
 				c.Listener.OnToolResult(ctx, c, part.ID, part.ToolName, part.ToolInput, content, nil, err)
 				toolResultC <- content
 			}
-			sendRes := func(res string) {
+			sendRes := func(toolResult []llm.Content) {
 				// Record end time
 				endTime := time.Now()
 				content.ToolUseEndTime = &endTime
 
-				content.ToolResult = res
-				c.Listener.OnToolResult(ctx, c, part.ID, part.ToolName, part.ToolInput, content, &res, nil)
+				content.ToolResult = toolResult
+				var firstText string
+				if len(toolResult) > 0 {
+					firstText = toolResult[0].Text
+				}
+				c.Listener.OnToolResult(ctx, c, part.ID, part.ToolName, part.ToolInput, content, &firstText, nil)
 				toolResultC <- content
 			}
 
diff --git a/llm/gem/gem.go b/llm/gem/gem.go
index 6686058..e5cbcf0 100644
--- a/llm/gem/gem.go
+++ b/llm/gem/gem.go
@@ -222,10 +222,24 @@
 				// Tool result becomes a function response
 				// Create a map for the response
 				response := map[string]any{
-					"result": c.ToolResult,
-					"error":  c.ToolError,
+					"error": c.ToolError,
 				}
 
+				// Handle tool results: Gemini only supports string results
+				// Combine all text content into a single string
+				var resultText string
+				if len(c.ToolResult) > 0 {
+					// Collect all text from content objects
+					texts := make([]string, 0, len(c.ToolResult))
+					for _, result := range c.ToolResult {
+						if result.Text != "" {
+							texts = append(texts, result.Text)
+						}
+					}
+					resultText = strings.Join(texts, "\n")
+				}
+				response["result"] = resultText
+
 				// Determine the function name to use - this is critical
 				funcName := ""
 
@@ -254,7 +268,7 @@
 				slog.DebugContext(context.Background(), "gemini_preparing_tool_result",
 					"tool_use_id", c.ToolUseID,
 					"mapped_func_name", funcName,
-					"result_length", len(c.ToolResult))
+					"result_count", len(c.ToolResult))
 
 				content.Parts = append(content.Parts, gemini.Part{
 					FunctionResponse: &gemini.FunctionResponse{
@@ -464,7 +478,7 @@
 					"content_idx", j,
 					"tool_use_id", c.ToolUseID,
 					"tool_error", c.ToolError,
-					"result_length", len(c.ToolResult))
+					"result_count", len(c.ToolResult))
 			}
 		}
 		slog.DebugContext(ctx, "gemini_message",
diff --git a/llm/image_content_test.go b/llm/image_content_test.go
new file mode 100644
index 0000000..4d22838
--- /dev/null
+++ b/llm/image_content_test.go
@@ -0,0 +1,62 @@
+package llm
+
+import (
+	"encoding/json"
+	"testing"
+)
+
+func TestImageContent(t *testing.T) {
+	// Create a Content structure with an image
+	imageContent := Content{
+		Type:      ContentTypeText, // In the future, we might add a specific ContentTypeImage
+		MediaType: "image/jpeg",
+		Data:      "/9j/4AAQSkZJRg...", // Shortened base64 encoded image
+	}
+
+	// Verify the structure is correct
+	if imageContent.MediaType != "image/jpeg" {
+		t.Errorf("Expected MediaType to be 'image/jpeg', got '%s'", imageContent.MediaType)
+	}
+
+	if imageContent.Data != "/9j/4AAQSkZJRg..." {
+		t.Errorf("Expected Data to contain base64 image data")
+	}
+
+	// Create a tool result that contains text and image content
+	toolResult := Content{
+		Type:      ContentTypeToolResult,
+		ToolUseID: "toolu_01A09q90qw90lq917835lq9",
+		ToolResult: []Content{
+			{
+				Type: ContentTypeText,
+				Text: "15 degrees",
+			},
+			imageContent,
+		},
+	}
+
+	// Check that the tool result contains two content items
+	if len(toolResult.ToolResult) != 2 {
+		t.Errorf("Expected tool result to contain 2 content items, got %d", len(toolResult.ToolResult))
+	}
+
+	// Verify JSON marshaling works as expected
+	bytes, err := json.Marshal(toolResult)
+	if err != nil {
+		t.Errorf("Failed to marshal content to JSON: %v", err)
+	}
+
+	// Unmarshal and verify structure is preserved
+	var unmarshaled Content
+	if err := json.Unmarshal(bytes, &unmarshaled); err != nil {
+		t.Errorf("Failed to unmarshal JSON: %v", err)
+	}
+
+	if len(unmarshaled.ToolResult) != 2 {
+		t.Errorf("Expected unmarshaled tool result to contain 2 content items, got %d", len(unmarshaled.ToolResult))
+	}
+
+	if unmarshaled.ToolResult[1].MediaType != "image/jpeg" {
+		t.Errorf("Expected unmarshaled image MediaType to be 'image/jpeg', got '%s'", unmarshaled.ToolResult[1].MediaType)
+	}
+}
diff --git a/llm/llm.go b/llm/llm.go
index 1e53ea3..9331961 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -75,7 +75,7 @@
 	// The outputs from Run will be sent back to Claude.
 	// If you do not want to respond to the tool call request from Claude, return ErrDoNotRespond.
 	// ctx contains extra (rarely used) tool call information; retrieve it with ToolCallInfoFromContext.
-	Run func(ctx context.Context, input json.RawMessage) (string, error) `json:"-"`
+	Run func(ctx context.Context, input json.RawMessage) ([]Content, error) `json:"-"`
 }
 
 type Content struct {
@@ -83,6 +83,9 @@
 	Type ContentType
 	Text string
 
+	// Media type for image content
+	MediaType string
+
 	// for thinking
 	Thinking  string
 	Data      string
@@ -95,7 +98,7 @@
 	// for tool_result
 	ToolUseID  string
 	ToolError  bool
-	ToolResult string
+	ToolResult []Content
 
 	// timing information for tool_result; added externally; not sent to the LLM
 	ToolUseStartTime *time.Time
@@ -121,7 +124,7 @@
 			attrs = append(attrs, slog.String("tool_name", content.ToolName))
 			attrs = append(attrs, slog.String("tool_input", string(content.ToolInput)))
 		case ContentTypeToolResult:
-			attrs = append(attrs, slog.String("tool_result", content.ToolResult))
+			attrs = append(attrs, slog.Any("tool_result", content.ToolResult))
 			attrs = append(attrs, slog.Bool("tool_error", content.ToolError))
 		case ContentTypeThinking:
 			attrs = append(attrs, slog.String("thinking", content.Text))
@@ -229,3 +232,23 @@
 		Content: []Content{StringContent(text)},
 	}
 }
+
+// TextContent creates a simple text content for tool results.
+// This is a helper function to create the most common type of tool result content.
+func TextContent(text string) []Content {
+	return []Content{{
+		Type: ContentTypeText,
+		Text: text,
+	}}
+}
+
+// ImageContent creates an image content for tool results.
+// MediaType should be "image/jpeg" or "image/png"
+func ImageContent(text string, mediaType string, base64Data string) []Content {
+	return []Content{{
+		Type:      ContentTypeText,
+		Text:      text,
+		MediaType: mediaType,
+		Data:      base64Data,
+	}}
+}
diff --git a/llm/oai/oai.go b/llm/oai/oai.go
index 8b64157..37484e0 100644
--- a/llm/oai/oai.go
+++ b/llm/oai/oai.go
@@ -9,6 +9,7 @@
 	"log/slog"
 	"math/rand/v2"
 	"net/http"
+	"strings"
 	"time"
 
 	"github.com/sashabaranov/go-openai"
@@ -320,7 +321,20 @@
 		}
 	case llm.ContentTypeToolResult:
 		// Tool results in OpenAI are sent as a separate message with tool_call_id
-		return c.ToolResult, nil
+		// OpenAI doesn't support multiple content items or images in tool results
+		// Combine all text content into a single string
+		var resultText string
+		if len(c.ToolResult) > 0 {
+			// Collect all text from content objects
+			texts := make([]string, 0, len(c.ToolResult))
+			for _, result := range c.ToolResult {
+				if result.Text != "" {
+					texts = append(texts, result.Text)
+				}
+			}
+			resultText = strings.Join(texts, "\n")
+		}
+		return resultText, nil
 	default:
 		// For thinking or other types, convert to text
 		return c.Text, nil
@@ -348,9 +362,16 @@
 
 	// Process tool results as separate messages, but first
 	for _, tr := range toolResults {
+		// Convert toolresult array to a string for OpenAI
+		var toolResultContent string
+		if len(tr.ToolResult) > 0 {
+			// For now, just use the first text content in the array
+			toolResultContent = tr.ToolResult[0].Text
+		}
+
 		m := openai.ChatCompletionMessage{
 			Role:       "tool",
-			Content:    cmp.Or(tr.ToolResult, " "), // TODO: remove omitempty upstream
+			Content:    cmp.Or(toolResultContent, " "), // Use empty space if empty to avoid omitempty issues
 			ToolCallID: tr.ToolUseID,
 		}
 		messages = append(messages, m)
@@ -472,10 +493,13 @@
 // toToolResultLLMContent converts a tool result message from OpenAI to llm.Content.
 func toToolResultLLMContent(msg openai.ChatCompletionMessage) llm.Content {
 	return llm.Content{
-		Type:       llm.ContentTypeToolResult,
-		ToolUseID:  msg.ToolCallID,
-		ToolResult: msg.Content,
-		ToolError:  false, // OpenAI doesn't specify errors explicitly
+		Type:      llm.ContentTypeToolResult,
+		ToolUseID: msg.ToolCallID,
+		ToolResult: []llm.Content{{
+			Type: llm.ContentTypeText,
+			Text: msg.Content,
+		}},
+		ToolError: false, // OpenAI doesn't specify errors explicitly
 	}
 }
 
diff --git a/llm/tool_content_test.go b/llm/tool_content_test.go
new file mode 100644
index 0000000..bfa5cc6
--- /dev/null
+++ b/llm/tool_content_test.go
@@ -0,0 +1,37 @@
+package llm
+
+import (
+	"testing"
+)
+
+func TestToolResultArray(t *testing.T) {
+	// Test a tool result with multiple content items
+	textContent := Content{
+		Type: ContentTypeText,
+		Text: "15 degrees",
+	}
+
+	imageContent := Content{
+		Type:      ContentTypeText, // In the future, this could be ContentTypeImage
+		Text:      "",
+		MediaType: "image/jpeg",
+		Data:      "/9j/4AAQSkZJRg...", // Base64 encoded image sample
+	}
+
+	toolResult := Content{
+		ToolResult: []Content{textContent, imageContent},
+	}
+
+	// Check the structure
+	if len(toolResult.ToolResult) != 2 {
+		t.Errorf("Expected 2 content items in ToolResult, got %d", len(toolResult.ToolResult))
+	}
+
+	if toolResult.ToolResult[0].Text != "15 degrees" {
+		t.Errorf("Expected first item text to be '15 degrees', got '%s'", toolResult.ToolResult[0].Text)
+	}
+
+	if toolResult.ToolResult[1].MediaType != "image/jpeg" {
+		t.Errorf("Expected second item media type to be 'image/jpeg', got '%s'", toolResult.ToolResult[1].MediaType)
+	}
+}