llm and everything: Update ToolResult to use []Content instead of string for multimodal support

This was a journey. The sketch-generated summary below is acceptable, but I want to tell you about it in my voice too.

The goal was to send screenshots to Claude, so that it could... look at them. Currently the take-screenshot and read-screenshot tools are separate, and they'll need to be renamed/prompt-engineered a bit, but that's all fine.

The miserable part was that we had to change the return value of a tool from string to []Content, and this crosses several layers:

- llm.Tool
- llm.Content
- ant.Content & openai and gemini friends
- AgentMessage [we left this alone]

Extra fun is that Claude's API for sending images has nested Content fields, and an empty string and a missing value need to be distinguished for the Text field (because lots of shell commands return the empty string!).

For the UI, I made us transform the results into a string, dropping images. Doing otherwise would have been yet more churn for not much obvious benefit. Plus, it was going to break skaband's compatibility, and ... yet more work.

OpenAI and Gemini don't obviously support images in this same way, so they just don't get the screenshot tools.

~~~~~~~~~~

Sketch said:

This architectural change transforms tool results from plain strings to []Content arrays, enabling multimodal interaction in the system.
Key changes include:

- Core structural changes:
  - Modified ToolResult type from string to []Content across all packages
  - Added MediaType field to Content struct for MIME type support
  - Created TextContent and ImageContent helper functions
  - Updated all tool.Run implementations to return []Content
- Image handling:
  - Implemented base64 image support in Anthropic adapter
  - Added proper media type detection and content formatting
  - Created browser_read_image tool for displaying screenshots
  - Updated browser_screenshot to provide usable image paths
- Adapter improvements:
  - Updated all LLM adapters (ANT, OAI, GEM) to handle content arrays
  - Added specialized image content handling in the Anthropic adapter
  - Ensured proper JSON serialization/deserialization for all content types
  - Improved test coverage for content arrays
- UI enhancements:
  - Added omitempty tags to reduce JSON response size
  - Updated TypeScript types to handle array content
  - Made field naming consistent (tool_error vs is_error)
  - Preserved backward compatibility for existing consumers

Co-Authored-By: sketch <hello@sketch.dev>
Change-ID: s1a2b3c4d5e6f7g8h

commit: 72252cbcb97840d724133be67c4c69cc69ebb2d3 [log] [tgz]
author: Philip Zeyliger <philip@bold.dev> Sat May 10 17:00:08 2025 -0700
committer: Philip Zeyliger <philip@bold.dev> Sat May 10 17:00:08 2025 -0700
tree: a361499dc3fa6b9af2be3e74cfd59fd8ba34690e
parent: 7ce5fb76d8748ebf73c5adf9d6cd8eb67716fba8 [diff]
diff --git a/claudetool/browse/browse.go b/claudetool/browse/browse.go
index 52248b8..60e66ac 100644
--- a/claudetool/browse/browse.go
+++ b/claudetool/browse/browse.go

@@ -3,11 +3,14 @@
 
 import (
 	"context"
+	"encoding/base64"
 	"encoding/json"
 	"fmt"
 	"log"
+	"net/http"
 	"os"
 	"path/filepath"
+	"strings"
 	"sync"
 	"time"
 
@@ -143,15 +146,15 @@
 	}
 }
 
-func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
 	var input navigateInput
 	if err := json.Unmarshal(m, &input); err != nil {
-		return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
 	}
 
 	browserCtx, err := b.GetBrowserContext()
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	err = chromedp.Run(browserCtx,
@@ -159,10 +162,10 @@
 		chromedp.WaitReady("body"),
 	)
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
-	return successResponse(), nil
+	return llm.TextContent(successResponse()), nil
 }
 
 // ClickTool definition
@@ -194,15 +197,15 @@
 	}
 }
 
-func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
 	var input clickInput
 	if err := json.Unmarshal(m, &input); err != nil {
-		return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
 	}
 
 	browserCtx, err := b.GetBrowserContext()
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	actions := []chromedp.Action{
@@ -217,10 +220,10 @@
 
 	err = chromedp.Run(browserCtx, actions...)
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
-	return successResponse(), nil
+	return llm.TextContent(successResponse()), nil
 }
 
 // TypeTool definition
@@ -257,15 +260,15 @@
 	}
 }
 
-func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
 	var input typeInput
 	if err := json.Unmarshal(m, &input); err != nil {
-		return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
 	}
 
 	browserCtx, err := b.GetBrowserContext()
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	actions := []chromedp.Action{
@@ -281,10 +284,10 @@
 
 	err = chromedp.Run(browserCtx, actions...)
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
-	return successResponse(), nil
+	return llm.TextContent(successResponse()), nil
 }
 
 // WaitForTool definition
@@ -316,10 +319,10 @@
 	}
 }
 
-func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
 	var input waitForInput
 	if err := json.Unmarshal(m, &input); err != nil {
-		return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
 	}
 
 	timeout := 30000 // default timeout 30 seconds
@@ -329,7 +332,7 @@
 
 	browserCtx, err := b.GetBrowserContext()
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	timeoutCtx, cancel := context.WithTimeout(browserCtx, time.Duration(timeout)*time.Millisecond)
@@ -337,10 +340,10 @@
 
 	err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
-	return successResponse(), nil
+	return llm.TextContent(successResponse()), nil
 }
 
 // GetTextTool definition
@@ -371,15 +374,15 @@
 	}
 }
 
-func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
 	var input getTextInput
 	if err := json.Unmarshal(m, &input); err != nil {
-		return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
 	}
 
 	browserCtx, err := b.GetBrowserContext()
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	var text string
@@ -388,16 +391,16 @@
 		chromedp.Text(input.Selector, &text),
 	)
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	output := getTextOutput{Text: text}
 	result, err := json.Marshal(output)
 	if err != nil {
-		return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
 	}
 
-	return string(result), nil
+	return llm.TextContent(string(result)), nil
 }
 
 // EvalTool definition
@@ -428,30 +431,30 @@
 	}
 }
 
-func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
 	var input evalInput
 	if err := json.Unmarshal(m, &input); err != nil {
-		return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
 	}
 
 	browserCtx, err := b.GetBrowserContext()
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	var result any
 	err = chromedp.Run(browserCtx, chromedp.Evaluate(input.Expression, &result))
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	output := evalOutput{Result: result}
 	response, err := json.Marshal(output)
 	if err != nil {
-		return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
 	}
 
-	return string(response), nil
+	return llm.TextContent(string(response)), nil
 }
 
 // ScreenshotTool definition
@@ -487,15 +490,15 @@
 	}
 }
 
-func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
 	var input screenshotInput
 	if err := json.Unmarshal(m, &input); err != nil {
-		return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
 	}
 
 	browserCtx, err := b.GetBrowserContext()
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	var buf []byte
@@ -514,23 +517,26 @@
 
 	err = chromedp.Run(browserCtx, actions...)
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	// Save the screenshot and get its ID
 	id := b.SaveScreenshot(buf)
 	if id == "" {
-		return errorResponse(fmt.Errorf("failed to save screenshot")), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
 	}
 
-	// Return the ID in the response
-	output := screenshotOutput{ID: id}
-	response, err := json.Marshal(output)
-	if err != nil {
-		return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
-	}
+	// Get the full path to the screenshot
+	screenshotPath := GetScreenshotPath(id)
 
-	return string(response), nil
+	// Return the ID and instructions on how to view the screenshot
+	result := fmt.Sprintf(`{
+  "id": "%s",
+  "path": "%s",
+  "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided."
+}`, id, screenshotPath)
+
+	return llm.TextContent(result), nil
 }
 
 // ScrollIntoViewTool definition
@@ -557,15 +563,15 @@
 	}
 }
 
-func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
 	var input scrollIntoViewInput
 	if err := json.Unmarshal(m, &input); err != nil {
-		return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
 	}
 
 	browserCtx, err := b.GetBrowserContext()
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	script := fmt.Sprintf(`
@@ -583,28 +589,35 @@
 		chromedp.Evaluate(script, &result),
 	)
 	if err != nil {
-		return errorResponse(err), nil
+		return llm.TextContent(errorResponse(err)), nil
 	}
 
 	if !result {
-		return errorResponse(fmt.Errorf("element not found: %s", input.Selector)), nil
+		return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
 	}
 
-	return successResponse(), nil
+	return llm.TextContent(successResponse()), nil
 }
 
-// GetAllTools returns all browser tools
-func (b *BrowseTools) GetAllTools() []*llm.Tool {
-	return []*llm.Tool{
+// GetTools returns browser tools, optionally filtering out screenshot-related tools
+func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
+	tools := []*llm.Tool{
 		b.NewNavigateTool(),
 		b.NewClickTool(),
 		b.NewTypeTool(),
 		b.NewWaitForTool(),
 		b.NewGetTextTool(),
 		b.NewEvalTool(),
-		b.NewScreenshotTool(),
 		b.NewScrollIntoViewTool(),
 	}
+
+	// Add screenshot-related tools if supported
+	if includeScreenshotTools {
+		tools = append(tools, b.NewScreenshotTool())
+		tools = append(tools, b.NewReadImageTool())
+	}
+
+	return tools
 }
 
 // SaveScreenshot saves a screenshot to disk and returns its ID
@@ -631,3 +644,67 @@
 func GetScreenshotPath(id string) string {
 	return filepath.Join(ScreenshotDir, id+".png")
 }
+
+// ReadImageTool definition
+type readImageInput struct {
+	Path string `json:"path"`
+}
+
+// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
+func (b *BrowseTools) NewReadImageTool() *llm.Tool {
+	return &llm.Tool{
+		Name:        "browser_read_image",
+		Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
+		InputSchema: json.RawMessage(`{
+			"type": "object",
+			"properties": {
+				"path": {
+					"type": "string",
+					"description": "Path to the image file to read"
+				}
+			},
+			"required": ["path"]
+		}`),
+		Run: b.readImageRun,
+	}
+}
+
+func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
+	var input readImageInput
+	if err := json.Unmarshal(m, &input); err != nil {
+		return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
+	}
+
+	// Check if the path exists
+	if _, err := os.Stat(input.Path); os.IsNotExist(err) {
+		return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
+	}
+
+	// Read the file
+	imageData, err := os.ReadFile(input.Path)
+	if err != nil {
+		return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
+	}
+
+	// Detect the image type
+	imageType := http.DetectContentType(imageData)
+	if !strings.HasPrefix(imageType, "image/") {
+		return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
+	}
+
+	// Encode the image as base64
+	base64Data := base64.StdEncoding.EncodeToString(imageData)
+
+	// Create a Content object that includes both text and the image
+	return []llm.Content{
+		{
+			Type: llm.ContentTypeText,
+			Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
+		},
+		{
+			Type:      llm.ContentTypeText, // Will be mapped to image in content array
+			MediaType: imageType,
+			Data:      base64Data,
+		},
+	}, nil
+}

diff --git a/claudetool/browse/browse_test.go b/claudetool/browse/browse_test.go
index b3168da..f1360d8 100644
--- a/claudetool/browse/browse_test.go
+++ b/claudetool/browse/browse_test.go

@@ -3,7 +3,9 @@
 import (
 	"context"
 	"encoding/json"
+	"fmt"
 	"os"
+	"path/filepath"
 	"slices"
 	"strings"
 	"testing"
@@ -63,24 +65,32 @@
 	}
 }
 
-func TestGetAllTools(t *testing.T) {
+func TestGetTools(t *testing.T) {
 	// Create browser tools instance
 	tools := NewBrowseTools(context.Background())
 
-	// Get all tools
-	allTools := tools.GetAllTools()
-
-	// We should have 8 tools
-	if len(allTools) != 8 {
-		t.Errorf("expected 8 tools, got %d", len(allTools))
-	}
-
-	// Check that each tool has the expected name prefix
-	for _, tool := range allTools {
-		if !strings.HasPrefix(tool.Name, "browser_") {
-			t.Errorf("tool name %q does not have prefix 'browser_'", tool.Name)
+	// Test with screenshot tools included
+	t.Run("with screenshots", func(t *testing.T) {
+		toolsWithScreenshots := tools.GetTools(true)
+		if len(toolsWithScreenshots) != 9 {
+			t.Errorf("expected 9 tools with screenshots, got %d", len(toolsWithScreenshots))
 		}
-	}
+
+		// Check tool naming convention
+		for _, tool := range toolsWithScreenshots {
+			if !strings.HasPrefix(tool.Name, "browser_") {
+				t.Errorf("tool name %q does not have prefix 'browser_'", tool.Name)
+			}
+		}
+	})
+
+	// Test without screenshot tools
+	t.Run("without screenshots", func(t *testing.T) {
+		noScreenshotTools := tools.GetTools(false)
+		if len(noScreenshotTools) != 7 {
+			t.Errorf("expected 7 tools without screenshots, got %d", len(noScreenshotTools))
+		}
+	})
 }
 
 // TestBrowserInitialization verifies that the browser can start correctly
@@ -169,7 +179,8 @@
 		Error  string `json:"error,omitempty"`
 	}
 
-	if err := json.Unmarshal([]byte(result), &response); err != nil {
+	resultText := result[0].Text
+	if err := json.Unmarshal([]byte(resultText), &response); err != nil {
 		t.Fatalf("Error unmarshaling response: %v", err)
 	}
 
@@ -239,3 +250,57 @@
 	// Clean up the test file
 	os.Remove(filePath)
 }
+
+func TestReadImageTool(t *testing.T) {
+	// Create a test BrowseTools instance
+	ctx := context.Background()
+	browseTools := NewBrowseTools(ctx)
+
+	// Create a test image
+	testDir := t.TempDir()
+	testImagePath := filepath.Join(testDir, "test_image.png")
+
+	// Create a small 1x1 black PNG image
+	smallPng := []byte{
+		0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,
+		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53,
+		0xDE, 0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41, 0x54, 0x08, 0xD7, 0x63, 0x60, 0x00, 0x00, 0x00,
+		0x02, 0x00, 0x01, 0xE2, 0x21, 0xBC, 0x33, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE,
+		0x42, 0x60, 0x82,
+	}
+
+	// Write the test image
+	err := os.WriteFile(testImagePath, smallPng, 0o644)
+	if err != nil {
+		t.Fatalf("Failed to create test image: %v", err)
+	}
+
+	// Create the tool
+	readImageTool := browseTools.NewReadImageTool()
+
+	// Prepare input
+	input := fmt.Sprintf(`{"path": "%s"}`, testImagePath)
+
+	// Run the tool
+	result, err := readImageTool.Run(ctx, json.RawMessage(input))
+	if err != nil {
+		t.Fatalf("Read image tool failed: %v", err)
+	}
+
+	// In the updated code, result is already a []llm.Content
+	contents := result
+
+	// Check that we got at least two content objects
+	if len(contents) < 2 {
+		t.Fatalf("Expected at least 2 content objects, got %d", len(contents))
+	}
+
+	// Check that the second content has image data
+	if contents[1].MediaType == "" {
+		t.Errorf("Expected MediaType in second content")
+	}
+
+	if contents[1].Data == "" {
+		t.Errorf("Expected Data in second content")
+	}
+}

diff --git a/claudetool/browse/register.go b/claudetool/browse/register.go
index a540c8f..183bf14 100644
--- a/claudetool/browse/register.go
+++ b/claudetool/browse/register.go

@@ -10,7 +10,7 @@
 // RegisterBrowserTools initializes the browser tools and returns all the tools
 // ready to be added to an agent. It also returns a cleanup function that should
 // be called when done to properly close the browser.
-func RegisterBrowserTools(ctx context.Context) ([]*llm.Tool, func()) {
+func RegisterBrowserTools(ctx context.Context, supportsScreenshots bool) ([]*llm.Tool, func()) {
 	browserTools := NewBrowseTools(ctx)
 
 	// Initialize the browser
@@ -18,8 +18,7 @@
 		log.Printf("Warning: Failed to initialize browser: %v", err)
 	}
 
-	// Return all tools and a cleanup function
-	return browserTools.GetAllTools(), func() {
+	return browserTools.GetTools(supportsScreenshots), func() {
 		browserTools.Close()
 	}
 }
commit	72252cbcb97840d724133be67c4c69cc69ebb2d3	[log] [tgz]
author	Philip Zeyliger <philip@bold.dev>	Sat May 10 17:00:08 2025 -0700
committer	Philip Zeyliger <philip@bold.dev>	Sat May 10 17:00:08 2025 -0700
tree	a361499dc3fa6b9af2be3e74cfd59fd8ba34690e
parent	7ce5fb76d8748ebf73c5adf9d6cd8eb67716fba8 [diff]