llm and everything: Update ToolResult to use []Content instead of string for multimodal support
This was a journey. The sketch-generated summary below is acceptable,
but I want to tell you about it in my voice too. The goal was to send
screenshots to Claude, so that it could... look at them. Currently,
taking a screenshot and reading a screenshot are separate tools, and they'll
need to be renamed/prompt-engineered a bit, but that's all fine.
The miserable part was that we had to change the return value
of a tool from string to []Content, and this crosses several layers:
- llm.Tool
- llm.Content
- ant.Content & openai and gemini friends
- AgentMessage [we left this alone]
Extra fun is that Claude's API for sending images has nested Content
fields, and an empty string must be distinguished from a missing value for
the Text field (because lots of shell commands return the empty string!).
For the UI, I made us transform the results into a string, dropping
images. Plumbing images through to the UI would have been yet more churn
for not much obvious benefit. Plus, it was going to break skaband's
compatibility, and ... yet more work.
OpenAI and Gemini don't obviously support images in this same way,
so they just don't get the screenshot tools.
~~~~~~~~~~ Sketch said:
This architectural change transforms tool results from plain strings to []Content arrays, enabling multimodal interaction in the system. Key changes include:
- Core structural changes:
- Modified ToolResult type from string to []Content across all packages
- Added MediaType field to Content struct for MIME type support
- Created TextContent and ImageContent helper functions
- Updated all tool.Run implementations to return []Content
- Image handling:
- Implemented base64 image support in Anthropic adapter
- Added proper media type detection and content formatting
- Created browser_read_image tool for displaying screenshots
- Updated browser_screenshot to provide usable image paths
- Adapter improvements:
- Updated all LLM adapters (ANT, OAI, GEM) to handle content arrays
- Added specialized image content handling in the Anthropic adapter
- Ensured proper JSON serialization/deserialization for all content types
- Improved test coverage for content arrays
- UI enhancements:
- Added omitempty tags to reduce JSON response size
- Updated TypeScript types to handle array content
- Made field naming consistent (tool_error vs is_error)
- Preserved backward compatibility for existing consumers
Co-Authored-By: sketch <hello@sketch.dev>
Change-ID: s1a2b3c4d5e6f7g8h
diff --git a/claudetool/bash.go b/claudetool/bash.go
index 4684d76..7dec267 100644
--- a/claudetool/bash.go
+++ b/claudetool/bash.go
@@ -102,22 +102,22 @@
}
}
-func (b *BashTool) Run(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BashTool) Run(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var req bashInput
if err := json.Unmarshal(m, &req); err != nil {
- return "", fmt.Errorf("failed to unmarshal bash command input: %w", err)
+ return nil, fmt.Errorf("failed to unmarshal bash command input: %w", err)
}
// do a quick permissions check (NOT a security barrier)
err := bashkit.Check(req.Command)
if err != nil {
- return "", err
+ return nil, err
}
// Custom permission callback if set
if b.CheckPermission != nil {
if err := b.CheckPermission(req.Command); err != nil {
- return "", err
+ return nil, err
}
}
@@ -125,23 +125,23 @@
if req.Background {
result, err := executeBackgroundBash(ctx, req)
if err != nil {
- return "", err
+ return nil, err
}
// Marshal the result to JSON
// TODO: emit XML(-ish) instead?
output, err := json.Marshal(result)
if err != nil {
- return "", fmt.Errorf("failed to marshal background result: %w", err)
+ return nil, fmt.Errorf("failed to marshal background result: %w", err)
}
- return string(output), nil
+ return llm.TextContent(string(output)), nil
}
// For foreground commands, use executeBash
out, execErr := executeBash(ctx, req)
- if execErr == nil {
- return out, nil
+ if execErr != nil {
+ return nil, execErr
}
- return "", execErr
+ return llm.TextContent(out), nil
}
const maxBashOutputLength = 131072
@@ -300,7 +300,7 @@
}
// BashRun is the legacy function for testing compatibility
-func BashRun(ctx context.Context, m json.RawMessage) (string, error) {
+func BashRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
// Use the default Bash tool which has no permission callback
return Bash.Run(ctx, m)
}
diff --git a/claudetool/bash_test.go b/claudetool/bash_test.go
index 3865383..5091e84 100644
--- a/claudetool/bash_test.go
+++ b/claudetool/bash_test.go
@@ -22,8 +22,9 @@
}
expected := "Hello, world!\n"
- if result != expected {
- t.Errorf("Expected %q, got %q", expected, result)
+ resultStr := ContentToString(result)
+ if resultStr != expected {
+ t.Errorf("Expected %q, got %q", expected, resultStr)
}
})
@@ -37,8 +38,9 @@
}
expected := "foobar"
- if result != expected {
- t.Errorf("Expected %q, got %q", expected, result)
+ resultStr := ContentToString(result)
+ if resultStr != expected {
+ t.Errorf("Expected %q, got %q", expected, resultStr)
}
})
@@ -62,8 +64,9 @@
}
expected := "Completed\n"
- if result != expected {
- t.Errorf("Expected %q, got %q", expected, result)
+ resultStr := ContentToString(result)
+ if resultStr != expected {
+ t.Errorf("Expected %q, got %q", expected, resultStr)
}
})
@@ -228,7 +231,8 @@
// Parse the returned JSON
var bgResult BackgroundResult
- if err := json.Unmarshal([]byte(result), &bgResult); err != nil {
+ resultStr := ContentToString(result)
+ if err := json.Unmarshal([]byte(resultStr), &bgResult); err != nil {
t.Fatalf("Failed to unmarshal background result: %v", err)
}
@@ -285,7 +289,8 @@
// Parse the returned JSON
var bgResult BackgroundResult
- if err := json.Unmarshal([]byte(result), &bgResult); err != nil {
+ resultStr := ContentToString(result)
+ if err := json.Unmarshal([]byte(resultStr), &bgResult); err != nil {
t.Fatalf("Failed to unmarshal background result: %v", err)
}
@@ -342,7 +347,8 @@
// Parse the returned JSON
var bgResult BackgroundResult
- if err := json.Unmarshal([]byte(result), &bgResult); err != nil {
+ resultStr := ContentToString(result)
+ if err := json.Unmarshal([]byte(resultStr), &bgResult); err != nil {
t.Fatalf("Failed to unmarshal background result: %v", err)
}
diff --git a/claudetool/browse/browse.go b/claudetool/browse/browse.go
index 52248b8..60e66ac 100644
--- a/claudetool/browse/browse.go
+++ b/claudetool/browse/browse.go
@@ -3,11 +3,14 @@
import (
"context"
+ "encoding/base64"
"encoding/json"
"fmt"
"log"
+ "net/http"
"os"
"path/filepath"
+ "strings"
"sync"
"time"
@@ -143,15 +146,15 @@
}
}
-func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input navigateInput
if err := json.Unmarshal(m, &input); err != nil {
- return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
browserCtx, err := b.GetBrowserContext()
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
err = chromedp.Run(browserCtx,
@@ -159,10 +162,10 @@
chromedp.WaitReady("body"),
)
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
- return successResponse(), nil
+ return llm.TextContent(successResponse()), nil
}
// ClickTool definition
@@ -194,15 +197,15 @@
}
}
-func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input clickInput
if err := json.Unmarshal(m, &input); err != nil {
- return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
browserCtx, err := b.GetBrowserContext()
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
actions := []chromedp.Action{
@@ -217,10 +220,10 @@
err = chromedp.Run(browserCtx, actions...)
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
- return successResponse(), nil
+ return llm.TextContent(successResponse()), nil
}
// TypeTool definition
@@ -257,15 +260,15 @@
}
}
-func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input typeInput
if err := json.Unmarshal(m, &input); err != nil {
- return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
browserCtx, err := b.GetBrowserContext()
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
actions := []chromedp.Action{
@@ -281,10 +284,10 @@
err = chromedp.Run(browserCtx, actions...)
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
- return successResponse(), nil
+ return llm.TextContent(successResponse()), nil
}
// WaitForTool definition
@@ -316,10 +319,10 @@
}
}
-func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input waitForInput
if err := json.Unmarshal(m, &input); err != nil {
- return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
timeout := 30000 // default timeout 30 seconds
@@ -329,7 +332,7 @@
browserCtx, err := b.GetBrowserContext()
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
timeoutCtx, cancel := context.WithTimeout(browserCtx, time.Duration(timeout)*time.Millisecond)
@@ -337,10 +340,10 @@
err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
- return successResponse(), nil
+ return llm.TextContent(successResponse()), nil
}
// GetTextTool definition
@@ -371,15 +374,15 @@
}
}
-func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input getTextInput
if err := json.Unmarshal(m, &input); err != nil {
- return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
browserCtx, err := b.GetBrowserContext()
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
var text string
@@ -388,16 +391,16 @@
chromedp.Text(input.Selector, &text),
)
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
output := getTextOutput{Text: text}
result, err := json.Marshal(output)
if err != nil {
- return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
}
- return string(result), nil
+ return llm.TextContent(string(result)), nil
}
// EvalTool definition
@@ -428,30 +431,30 @@
}
}
-func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input evalInput
if err := json.Unmarshal(m, &input); err != nil {
- return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
browserCtx, err := b.GetBrowserContext()
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
var result any
err = chromedp.Run(browserCtx, chromedp.Evaluate(input.Expression, &result))
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
output := evalOutput{Result: result}
response, err := json.Marshal(output)
if err != nil {
- return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
}
- return string(response), nil
+ return llm.TextContent(string(response)), nil
}
// ScreenshotTool definition
@@ -487,15 +490,15 @@
}
}
-func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input screenshotInput
if err := json.Unmarshal(m, &input); err != nil {
- return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
browserCtx, err := b.GetBrowserContext()
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
var buf []byte
@@ -514,23 +517,26 @@
err = chromedp.Run(browserCtx, actions...)
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
// Save the screenshot and get its ID
id := b.SaveScreenshot(buf)
if id == "" {
- return errorResponse(fmt.Errorf("failed to save screenshot")), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
}
- // Return the ID in the response
- output := screenshotOutput{ID: id}
- response, err := json.Marshal(output)
- if err != nil {
- return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
- }
+ // Get the full path to the screenshot
+ screenshotPath := GetScreenshotPath(id)
- return string(response), nil
+ // Return the ID and instructions on how to view the screenshot
+ result := fmt.Sprintf(`{
+ "id": "%s",
+ "path": "%s",
+ "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided."
+}`, id, screenshotPath)
+
+ return llm.TextContent(result), nil
}
// ScrollIntoViewTool definition
@@ -557,15 +563,15 @@
}
}
-func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) (string, error) {
+func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input scrollIntoViewInput
if err := json.Unmarshal(m, &input); err != nil {
- return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
browserCtx, err := b.GetBrowserContext()
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
script := fmt.Sprintf(`
@@ -583,28 +589,35 @@
chromedp.Evaluate(script, &result),
)
if err != nil {
- return errorResponse(err), nil
+ return llm.TextContent(errorResponse(err)), nil
}
if !result {
- return errorResponse(fmt.Errorf("element not found: %s", input.Selector)), nil
+ return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
}
- return successResponse(), nil
+ return llm.TextContent(successResponse()), nil
}
-// GetAllTools returns all browser tools
-func (b *BrowseTools) GetAllTools() []*llm.Tool {
- return []*llm.Tool{
+// GetTools returns browser tools, optionally filtering out screenshot-related tools
+func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
+ tools := []*llm.Tool{
b.NewNavigateTool(),
b.NewClickTool(),
b.NewTypeTool(),
b.NewWaitForTool(),
b.NewGetTextTool(),
b.NewEvalTool(),
- b.NewScreenshotTool(),
b.NewScrollIntoViewTool(),
}
+
+ // Add screenshot-related tools if supported
+ if includeScreenshotTools {
+ tools = append(tools, b.NewScreenshotTool())
+ tools = append(tools, b.NewReadImageTool())
+ }
+
+ return tools
}
// SaveScreenshot saves a screenshot to disk and returns its ID
@@ -631,3 +644,67 @@
func GetScreenshotPath(id string) string {
return filepath.Join(ScreenshotDir, id+".png")
}
+
+// ReadImageTool definition
+type readImageInput struct {
+ Path string `json:"path"`
+}
+
+// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
+func (b *BrowseTools) NewReadImageTool() *llm.Tool {
+ return &llm.Tool{
+ Name: "browser_read_image",
+ Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
+ InputSchema: json.RawMessage(`{
+ "type": "object",
+ "properties": {
+ "path": {
+ "type": "string",
+ "description": "Path to the image file to read"
+ }
+ },
+ "required": ["path"]
+ }`),
+ Run: b.readImageRun,
+ }
+}
+
+func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
+ var input readImageInput
+ if err := json.Unmarshal(m, &input); err != nil {
+ return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
+ }
+
+ // Check if the path exists
+ if _, err := os.Stat(input.Path); os.IsNotExist(err) {
+ return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
+ }
+
+ // Read the file
+ imageData, err := os.ReadFile(input.Path)
+ if err != nil {
+ return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
+ }
+
+ // Detect the image type
+ imageType := http.DetectContentType(imageData)
+ if !strings.HasPrefix(imageType, "image/") {
+ return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
+ }
+
+ // Encode the image as base64
+ base64Data := base64.StdEncoding.EncodeToString(imageData)
+
+ // Create a Content object that includes both text and the image
+ return []llm.Content{
+ {
+ Type: llm.ContentTypeText,
+ Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
+ },
+ {
+ Type: llm.ContentTypeText, // Will be mapped to image in content array
+ MediaType: imageType,
+ Data: base64Data,
+ },
+ }, nil
+}
diff --git a/claudetool/browse/browse_test.go b/claudetool/browse/browse_test.go
index b3168da..f1360d8 100644
--- a/claudetool/browse/browse_test.go
+++ b/claudetool/browse/browse_test.go
@@ -3,7 +3,9 @@
import (
"context"
"encoding/json"
+ "fmt"
"os"
+ "path/filepath"
"slices"
"strings"
"testing"
@@ -63,24 +65,32 @@
}
}
-func TestGetAllTools(t *testing.T) {
+func TestGetTools(t *testing.T) {
// Create browser tools instance
tools := NewBrowseTools(context.Background())
- // Get all tools
- allTools := tools.GetAllTools()
-
- // We should have 8 tools
- if len(allTools) != 8 {
- t.Errorf("expected 8 tools, got %d", len(allTools))
- }
-
- // Check that each tool has the expected name prefix
- for _, tool := range allTools {
- if !strings.HasPrefix(tool.Name, "browser_") {
- t.Errorf("tool name %q does not have prefix 'browser_'", tool.Name)
+ // Test with screenshot tools included
+ t.Run("with screenshots", func(t *testing.T) {
+ toolsWithScreenshots := tools.GetTools(true)
+ if len(toolsWithScreenshots) != 9 {
+ t.Errorf("expected 9 tools with screenshots, got %d", len(toolsWithScreenshots))
}
- }
+
+ // Check tool naming convention
+ for _, tool := range toolsWithScreenshots {
+ if !strings.HasPrefix(tool.Name, "browser_") {
+ t.Errorf("tool name %q does not have prefix 'browser_'", tool.Name)
+ }
+ }
+ })
+
+ // Test without screenshot tools
+ t.Run("without screenshots", func(t *testing.T) {
+ noScreenshotTools := tools.GetTools(false)
+ if len(noScreenshotTools) != 7 {
+ t.Errorf("expected 7 tools without screenshots, got %d", len(noScreenshotTools))
+ }
+ })
}
// TestBrowserInitialization verifies that the browser can start correctly
@@ -169,7 +179,8 @@
Error string `json:"error,omitempty"`
}
- if err := json.Unmarshal([]byte(result), &response); err != nil {
+ resultText := result[0].Text
+ if err := json.Unmarshal([]byte(resultText), &response); err != nil {
t.Fatalf("Error unmarshaling response: %v", err)
}
@@ -239,3 +250,57 @@
// Clean up the test file
os.Remove(filePath)
}
+
+func TestReadImageTool(t *testing.T) {
+ // Create a test BrowseTools instance
+ ctx := context.Background()
+ browseTools := NewBrowseTools(ctx)
+
+ // Create a test image
+ testDir := t.TempDir()
+ testImagePath := filepath.Join(testDir, "test_image.png")
+
+ // Create a small 1x1 black PNG image
+ smallPng := []byte{
+ 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,
+ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53,
+ 0xDE, 0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41, 0x54, 0x08, 0xD7, 0x63, 0x60, 0x00, 0x00, 0x00,
+ 0x02, 0x00, 0x01, 0xE2, 0x21, 0xBC, 0x33, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE,
+ 0x42, 0x60, 0x82,
+ }
+
+ // Write the test image
+ err := os.WriteFile(testImagePath, smallPng, 0o644)
+ if err != nil {
+ t.Fatalf("Failed to create test image: %v", err)
+ }
+
+ // Create the tool
+ readImageTool := browseTools.NewReadImageTool()
+
+ // Prepare input
+ input := fmt.Sprintf(`{"path": "%s"}`, testImagePath)
+
+ // Run the tool
+ result, err := readImageTool.Run(ctx, json.RawMessage(input))
+ if err != nil {
+ t.Fatalf("Read image tool failed: %v", err)
+ }
+
+ // In the updated code, result is already a []llm.Content
+ contents := result
+
+ // Check that we got at least two content objects
+ if len(contents) < 2 {
+ t.Fatalf("Expected at least 2 content objects, got %d", len(contents))
+ }
+
+ // Check that the second content has image data
+ if contents[1].MediaType == "" {
+ t.Errorf("Expected MediaType in second content")
+ }
+
+ if contents[1].Data == "" {
+ t.Errorf("Expected Data in second content")
+ }
+}
diff --git a/claudetool/browse/register.go b/claudetool/browse/register.go
index a540c8f..183bf14 100644
--- a/claudetool/browse/register.go
+++ b/claudetool/browse/register.go
@@ -10,7 +10,7 @@
// RegisterBrowserTools initializes the browser tools and returns all the tools
// ready to be added to an agent. It also returns a cleanup function that should
// be called when done to properly close the browser.
-func RegisterBrowserTools(ctx context.Context) ([]*llm.Tool, func()) {
+func RegisterBrowserTools(ctx context.Context, supportsScreenshots bool) ([]*llm.Tool, func()) {
browserTools := NewBrowseTools(ctx)
// Initialize the browser
@@ -18,8 +18,7 @@
log.Printf("Warning: Failed to initialize browser: %v", err)
}
- // Return all tools and a cleanup function
- return browserTools.GetAllTools(), func() {
+ return browserTools.GetTools(supportsScreenshots), func() {
browserTools.Close()
}
}
diff --git a/claudetool/codereview/codereview_test.go b/claudetool/codereview/codereview_test.go
index aa7b8f1..4872168 100644
--- a/claudetool/codereview/codereview_test.go
+++ b/claudetool/codereview/codereview_test.go
@@ -246,7 +246,11 @@
}
// Normalize paths in the result
- normalized := normalizePaths(result, dir)
+ resultStr := ""
+ if len(result) > 0 {
+ resultStr = result[0].Text
+ }
+ normalized := normalizePaths(resultStr, dir)
return normalized, nil
}
diff --git a/claudetool/codereview/differential.go b/claudetool/codereview/differential.go
index f358594..37728d4 100644
--- a/claudetool/codereview/differential.go
+++ b/claudetool/codereview/differential.go
@@ -37,27 +37,27 @@
return spec
}
-func (r *CodeReviewer) Run(ctx context.Context, m json.RawMessage) (string, error) {
+func (r *CodeReviewer) Run(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
// NOTE: If you add or modify error messages here, update the corresponding UI parsing in:
// webui/src/web-components/sketch-tool-card.ts (SketchToolCardCodeReview.getStatusIcon)
if err := r.RequireNormalGitState(ctx); err != nil {
slog.DebugContext(ctx, "CodeReviewer.Run: failed to check for normal git state", "err", err)
- return "", err
+ return nil, err
}
if err := r.RequireNoUncommittedChanges(ctx); err != nil {
slog.DebugContext(ctx, "CodeReviewer.Run: failed to check for uncommitted changes", "err", err)
- return "", err
+ return nil, err
}
// Check that the current commit is not the initial commit
currentCommit, err := r.CurrentCommit(ctx)
if err != nil {
slog.DebugContext(ctx, "CodeReviewer.Run: failed to get current commit", "err", err)
- return "", err
+ return nil, err
}
if r.IsInitialCommit(currentCommit) {
slog.DebugContext(ctx, "CodeReviewer.Run: current commit is initial commit, nothing to review")
- return "", fmt.Errorf("no new commits have been added, nothing to review")
+ return nil, fmt.Errorf("no new commits have been added, nothing to review")
}
// No matter what failures happen from here out, we will declare this to have been reviewed.
@@ -67,7 +67,7 @@
changedFiles, err := r.changedFiles(ctx, r.initialCommit, currentCommit)
if err != nil {
slog.DebugContext(ctx, "CodeReviewer.Run: failed to get changed files", "err", err)
- return "", err
+ return nil, err
}
// Prepare to analyze before/after for the impacted files.
@@ -79,7 +79,7 @@
if err != nil {
// TODO: log and skip to stuff that doesn't require packages
slog.DebugContext(ctx, "CodeReviewer.Run: failed to get packages for files", "err", err)
- return "", err
+ return nil, err
}
allPkgList := slices.Collect(maps.Keys(allPkgs))
@@ -101,7 +101,7 @@
testMsg, err := r.checkTests(ctx, allPkgList)
if err != nil {
slog.DebugContext(ctx, "CodeReviewer.Run: failed to check tests", "err", err)
- return "", err
+ return nil, err
}
if testMsg != "" {
errorMessages = append(errorMessages, testMsg)
@@ -110,7 +110,7 @@
goplsMsg, err := r.checkGopls(ctx, changedFiles) // includes vet checks
if err != nil {
slog.DebugContext(ctx, "CodeReviewer.Run: failed to check gopls", "err", err)
- return "", err
+ return nil, err
}
if goplsMsg != "" {
errorMessages = append(errorMessages, goplsMsg)
@@ -143,7 +143,7 @@
if buf.Len() == 0 {
buf.WriteString("OK")
}
- return buf.String(), nil
+ return llm.TextContent(buf.String()), nil
}
func (r *CodeReviewer) initializeInitialCommitWorktree(ctx context.Context) error {
diff --git a/claudetool/edit.go b/claudetool/edit.go
index 50084b7..b539cd6 100644
--- a/claudetool/edit.go
+++ b/claudetool/edit.go
@@ -67,55 +67,75 @@
}
// EditRun is the implementation of the edit tool
-func EditRun(ctx context.Context, input json.RawMessage) (string, error) {
+func EditRun(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
var editRequest editInput
if err := json.Unmarshal(input, &editRequest); err != nil {
- return "", fmt.Errorf("failed to parse edit input: %v", err)
+ return nil, fmt.Errorf("failed to parse edit input: %v", err)
}
// Validate the command
cmd := editCommand(editRequest.Command)
if !isValidCommand(cmd) {
- return "", fmt.Errorf("unrecognized command %s. The allowed commands are: view, create, str_replace, insert, undo_edit", cmd)
+ return nil, fmt.Errorf("unrecognized command %s. The allowed commands are: view, create, str_replace, insert, undo_edit", cmd)
}
path := editRequest.Path
// Validate the path
if err := validatePath(cmd, path); err != nil {
- return "", err
+ return nil, err
}
// Execute the appropriate command
switch cmd {
case viewCommand:
- return handleView(ctx, path, editRequest.ViewRange)
+ result, err := handleView(ctx, path, editRequest.ViewRange)
+ if err != nil {
+ return nil, err
+ }
+ return llm.TextContent(result), nil
case createCommand:
if editRequest.FileText == nil {
- return "", fmt.Errorf("parameter file_text is required for command: create")
+ return nil, fmt.Errorf("parameter file_text is required for command: create")
}
- return handleCreate(path, *editRequest.FileText)
+ result, err := handleCreate(path, *editRequest.FileText)
+ if err != nil {
+ return nil, err
+ }
+ return llm.TextContent(result), nil
case strReplaceCommand:
if editRequest.OldStr == nil {
- return "", fmt.Errorf("parameter old_str is required for command: str_replace")
+ return nil, fmt.Errorf("parameter old_str is required for command: str_replace")
}
newStr := ""
if editRequest.NewStr != nil {
newStr = *editRequest.NewStr
}
- return handleStrReplace(path, *editRequest.OldStr, newStr)
+ result, err := handleStrReplace(path, *editRequest.OldStr, newStr)
+ if err != nil {
+ return nil, err
+ }
+ return llm.TextContent(result), nil
case insertCommand:
if editRequest.InsertLine == nil {
- return "", fmt.Errorf("parameter insert_line is required for command: insert")
+ return nil, fmt.Errorf("parameter insert_line is required for command: insert")
}
if editRequest.NewStr == nil {
- return "", fmt.Errorf("parameter new_str is required for command: insert")
+ return nil, fmt.Errorf("parameter new_str is required for command: insert")
}
- return handleInsert(path, *editRequest.InsertLine, *editRequest.NewStr)
+ result, err := handleInsert(path, *editRequest.InsertLine, *editRequest.NewStr)
+ if err != nil {
+ return nil, err
+ }
+ return llm.TextContent(result), nil
case undoEditCommand:
- return handleUndoEdit(path)
+ result, err := handleUndoEdit(path)
+ if err != nil {
+ return nil, err
+ }
+ return llm.TextContent(result), nil
default:
- return "", fmt.Errorf("command %s is not implemented", cmd)
+ return nil, fmt.Errorf("command %s is not implemented", cmd)
}
}
diff --git a/claudetool/edit_test.go b/claudetool/edit_test.go
index fe3d66c..ab687fa 100644
--- a/claudetool/edit_test.go
+++ b/claudetool/edit_test.go
@@ -50,7 +50,7 @@
t.Fatalf("Tool execution failed: %v", err)
}
- return result
+ return ContentToString(result)
}
// TestEditToolView tests the view command functionality
diff --git a/claudetool/keyword.go b/claudetool/keyword.go
index 048a236..27d9888 100644
--- a/claudetool/keyword.go
+++ b/claudetool/keyword.go
@@ -83,10 +83,10 @@
return strings.TrimSpace(string(out)), nil
}
-func keywordRun(ctx context.Context, m json.RawMessage) (string, error) {
+func keywordRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input keywordInput
if err := json.Unmarshal(m, &input); err != nil {
- return "", err
+ return nil, err
}
wd := WorkingDir(ctx)
root, err := FindRepoRoot(wd)
@@ -100,7 +100,7 @@
for _, term := range input.SearchTerms {
out, err := ripgrep(ctx, wd, []string{term})
if err != nil {
- return "", err
+ return nil, err
}
if len(out) > 64*1024 {
slog.InfoContext(ctx, "keyword search result too large", "term", term, "bytes", len(out))
@@ -115,7 +115,7 @@
var err error
out, err = ripgrep(ctx, wd, keep)
if err != nil {
- return "", err
+ return nil, err
}
if len(out) < 128*1024 {
break
@@ -139,10 +139,10 @@
resp, err := convo.SendMessage(initialMessage)
if err != nil {
- return "", fmt.Errorf("failed to send relevance filtering message: %w", err)
+ return nil, fmt.Errorf("failed to send relevance filtering message: %w", err)
}
if len(resp.Content) != 1 {
- return "", fmt.Errorf("unexpected number of messages in relevance filtering response: %d", len(resp.Content))
+ return nil, fmt.Errorf("unexpected number of messages in relevance filtering response: %d", len(resp.Content))
}
filtered := resp.Content[0].Text
@@ -155,7 +155,7 @@
"filtered", filtered,
)
- return resp.Content[0].Text, nil
+ return llm.TextContent(resp.Content[0].Text), nil
}
func ripgrep(ctx context.Context, wd string, terms []string) (string, error) {
diff --git a/claudetool/patch.go b/claudetool/patch.go
index 419e966..24a22ef 100644
--- a/claudetool/patch.go
+++ b/claudetool/patch.go
@@ -94,18 +94,18 @@
}
// PatchRun is the entry point for the user_patch tool.
-func PatchRun(ctx context.Context, m json.RawMessage) (string, error) {
+func PatchRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
var input patchInput
if err := json.Unmarshal(m, &input); err != nil {
- return "", fmt.Errorf("failed to unmarshal user_patch input: %w", err)
+ return nil, fmt.Errorf("failed to unmarshal user_patch input: %w", err)
}
// Validate the input
if !filepath.IsAbs(input.Path) {
- return "", fmt.Errorf("path %q is not absolute", input.Path)
+ return nil, fmt.Errorf("path %q is not absolute", input.Path)
}
if len(input.Patches) == 0 {
- return "", fmt.Errorf("no patches provided")
+ return nil, fmt.Errorf("no patches provided")
}
// TODO: check whether the file is autogenerated, and if so, require a "force" flag to modify it.
@@ -118,11 +118,11 @@
switch patch.Operation {
case "prepend_bof", "append_eof", "overwrite":
default:
- return "", fmt.Errorf("file %q does not exist", input.Path)
+ return nil, fmt.Errorf("file %q does not exist", input.Path)
}
}
case err != nil:
- return "", fmt.Errorf("failed to read file %q: %w", input.Path, err)
+ return nil, fmt.Errorf("failed to read file %q: %w", input.Path, err)
}
likelyGoFile := strings.HasSuffix(input.Path, ".go")
@@ -151,7 +151,7 @@
buf.Replace(0, len(orig), patch.NewText)
case "replace":
if patch.OldText == "" {
- return "", fmt.Errorf("patch %d: oldText cannot be empty for %s operation", i, patch.Operation)
+ return nil, fmt.Errorf("patch %d: oldText cannot be empty for %s operation", i, patch.Operation)
}
// Attempt to apply the patch.
@@ -214,7 +214,7 @@
patchErr = errors.Join(patchErr, fmt.Errorf("old text not found:\n%s", patch.OldText))
continue
default:
- return "", fmt.Errorf("unrecognized operation %q", patch.Operation)
+ return nil, fmt.Errorf("unrecognized operation %q", patch.Operation)
}
}
@@ -224,18 +224,18 @@
"patches": input.Patches,
"errors": patchErr,
})
- return "", patchErr
+ return nil, patchErr
}
patched, err := buf.Bytes()
if err != nil {
- return "", err
+ return nil, err
}
if err := os.MkdirAll(filepath.Dir(input.Path), 0o700); err != nil {
- return "", fmt.Errorf("failed to create directory %q: %w", filepath.Dir(input.Path), err)
+ return nil, fmt.Errorf("failed to create directory %q: %w", filepath.Dir(input.Path), err)
}
if err := os.WriteFile(input.Path, patched, 0o600); err != nil {
- return "", fmt.Errorf("failed to write patched contents to file %q: %w", input.Path, err)
+ return nil, fmt.Errorf("failed to write patched contents to file %q: %w", input.Path, err)
}
response := new(strings.Builder)
@@ -244,7 +244,7 @@
if parsed {
parseErr := parseGo(patched)
if parseErr != nil {
- return "", fmt.Errorf("after applying all patches, the file no longer parses:\n%w", parseErr)
+ return nil, fmt.Errorf("after applying all patches, the file no longer parses:\n%w", parseErr)
}
}
@@ -253,7 +253,7 @@
}
// TODO: maybe report the patch result to the model, i.e. some/all of the new code after the patches and formatting.
- return response.String(), nil
+ return llm.TextContent(response.String()), nil
}
func parseGo(buf []byte) error {
diff --git a/claudetool/think.go b/claudetool/think.go
index 69aac3c..9611150 100644
--- a/claudetool/think.go
+++ b/claudetool/think.go
@@ -34,6 +34,6 @@
`
)
-func thinkRun(ctx context.Context, m json.RawMessage) (string, error) {
- return "recorded", nil
+func thinkRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
+ return llm.TextContent("recorded"), nil
}
diff --git a/claudetool/util.go b/claudetool/util.go
new file mode 100644
index 0000000..88136e4
--- /dev/null
+++ b/claudetool/util.go
@@ -0,0 +1,13 @@
+package claudetool
+
+import (
+ "sketch.dev/llm"
+)
+
+// ContentToString returns the Text of the first element of content, or "" if content is empty.
+func ContentToString(content []llm.Content) string {
+ if len(content) == 0 {
+ return ""
+ }
+ return content[0].Text
+}
diff --git a/dockerimg/createdockerfile.go b/dockerimg/createdockerfile.go
index bccdc6d..a5a2c4b 100644
--- a/dockerimg/createdockerfile.go
+++ b/dockerimg/createdockerfile.go
@@ -184,19 +184,19 @@
}
toolCalled := false
var dockerfileExtraCmds string
- runDockerfile := func(ctx context.Context, input json.RawMessage) (string, error) {
+ runDockerfile := func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
// TODO: unmarshal straight into a struct
var m map[string]any
if err := json.Unmarshal(input, &m); err != nil {
- return "", fmt.Errorf(`input=%[1]v (%[1]T), wanted a map[string]any, got: %w`, input, err)
+ return nil, fmt.Errorf(`input=%[1]v (%[1]T), wanted a map[string]any, got: %w`, input, err)
}
var ok bool
dockerfileExtraCmds, ok = m["extra_cmds"].(string)
if !ok {
- return "", fmt.Errorf(`input["extra_cmds"]=%[1]v (%[1]T), wanted a string`, m["path"])
+ return nil, fmt.Errorf(`input["extra_cmds"]=%[1]v (%[1]T), wanted a string`, m["path"])
}
toolCalled = true
- return "OK", nil
+ return llm.TextContent("OK"), nil
}
convo := conversation.New(ctx, srv)
diff --git a/llm/ant/ant.go b/llm/ant/ant.go
index d1c366a..fdf2fde 100644
--- a/llm/ant/ant.go
+++ b/llm/ant/ant.go
@@ -44,15 +44,26 @@
var _ llm.Service = (*Service)(nil)
type content struct {
- // TODO: image support?
// https://docs.anthropic.com/en/api/messages
ID string `json:"id,omitempty"`
Type string `json:"type,omitempty"`
- Text string `json:"text,omitempty"`
+
+ // Subtly, an empty string appears in tool results often, so we have
+ // to distinguish between empty string and no string.
+ // Underlying error looks like one of:
+ // "messages.46.content.0.tool_result.content.0.text.text: Field required"
+ // "messages.1.content.1.tool_use.text: Extra inputs are not permitted"
+ //
+ // I haven't found a super great source for the API, but
+ // https://github.com/anthropics/anthropic-sdk-typescript/blob/main/src/resources/messages/messages.ts
+ // is somewhat acceptable but hard to read.
+ Text *string `json:"text,omitempty"`
+ MediaType string `json:"media_type,omitempty"` // for image
+ Source json.RawMessage `json:"source,omitempty"` // for image
// for thinking
Thinking string `json:"thinking,omitempty"`
- Data string `json:"data,omitempty"` // for redacted_thinking
+ Data string `json:"data,omitempty"` // for redacted_thinking or image
Signature string `json:"signature,omitempty"` // for thinking
// for tool_use
@@ -60,9 +71,30 @@
ToolInput json.RawMessage `json:"input,omitempty"`
// for tool_result
- ToolUseID string `json:"tool_use_id,omitempty"`
- ToolError bool `json:"is_error,omitempty"`
- ToolResult string `json:"content,omitempty"`
+ ToolUseID string `json:"tool_use_id,omitempty"`
+ ToolError bool `json:"is_error,omitempty"`
+ // note the recursive nature here; message looks like:
+ // {
+ // "role": "user",
+ // "content": [
+ // {
+ // "type": "tool_result",
+ // "tool_use_id": "toolu_01A09q90qw90lq917835lq9",
+ // "content": [
+ // {"type": "text", "text": "15 degrees"},
+ // {
+ // "type": "image",
+ // "source": {
+ // "type": "base64",
+ // "media_type": "image/jpeg",
+ // "data": "/9j/4AAQSkZJRg..."
+ // }
+ // }
+ // ]
+ // }
+ // ]
+ //}
+ ToolResult []content `json:"content,omitempty"`
// timing information for tool_result; not sent to Claude
StartTime *time.Time `json:"-"`
@@ -217,10 +249,28 @@
}
func fromLLMContent(c llm.Content) content {
- return content{
+ var toolResult []content
+ if len(c.ToolResult) > 0 {
+ toolResult = make([]content, len(c.ToolResult))
+ for i, tr := range c.ToolResult {
+ // For image content inside a tool_result, we need to map it to "image" type
+ if tr.MediaType != "" && tr.MediaType == "image/jpeg" || tr.MediaType == "image/png" {
+ // Format as an image for Claude
+ toolResult[i] = content{
+ Type: "image",
+ Source: json.RawMessage(fmt.Sprintf(`{"type":"base64","media_type":"%s","data":"%s"}`,
+ tr.MediaType, tr.Data)),
+ }
+ } else {
+ toolResult[i] = fromLLMContent(tr)
+ }
+ }
+ }
+
+ d := content{
ID: c.ID,
Type: fromLLMContentType[c.Type],
- Text: c.Text,
+ MediaType: c.MediaType,
Thinking: c.Thinking,
Data: c.Data,
Signature: c.Signature,
@@ -228,9 +278,15 @@
ToolInput: c.ToolInput,
ToolUseID: c.ToolUseID,
ToolError: c.ToolError,
- ToolResult: c.ToolResult,
+ ToolResult: toolResult,
CacheControl: fromLLMCache(c.Cache),
}
+ // Anthropic API complains if Text is specified when it shouldn't be
+ // or not specified when it's the empty string.
+ if c.Type != llm.ContentTypeToolResult && c.Type != llm.ContentTypeToolUse {
+ d.Text = &c.Text
+ }
+ return d
}
func fromLLMToolUse(tu *llm.ToolUse) *toolUse {
@@ -300,10 +356,19 @@
}
func toLLMContent(c content) llm.Content {
- return llm.Content{
+ // Convert toolResult from []content to []llm.Content
+ var toolResultContents []llm.Content
+ if len(c.ToolResult) > 0 {
+ toolResultContents = make([]llm.Content, len(c.ToolResult))
+ for i, tr := range c.ToolResult {
+ toolResultContents[i] = toLLMContent(tr)
+ }
+ }
+
+ ret := llm.Content{
ID: c.ID,
Type: toLLMContentType[c.Type],
- Text: c.Text,
+ MediaType: c.MediaType,
Thinking: c.Thinking,
Data: c.Data,
Signature: c.Signature,
@@ -311,8 +376,12 @@
ToolInput: c.ToolInput,
ToolUseID: c.ToolUseID,
ToolError: c.ToolError,
- ToolResult: c.ToolResult,
+ ToolResult: toolResultContents,
}
+ if c.Text != nil {
+ ret.Text = *c.Text
+ }
+ return ret
}
func toLLMResponse(r *response) *llm.Response {
diff --git a/llm/ant/ant_image_test.go b/llm/ant/ant_image_test.go
new file mode 100644
index 0000000..1bc0c95
--- /dev/null
+++ b/llm/ant/ant_image_test.go
@@ -0,0 +1,78 @@
+package ant
+
+import (
+ "encoding/json"
+ "testing"
+
+ "sketch.dev/llm"
+)
+
+func TestAnthropicImageToolResult(t *testing.T) {
+ // Create a tool result with both text and image content
+ textContent := llm.Content{
+ Type: llm.ContentTypeText,
+ Text: "15 degrees",
+ }
+
+ imageContent := llm.Content{
+ Type: llm.ContentTypeText, // Will be mapped to "image" in Anthropic format
+ MediaType: "image/jpeg",
+ Data: "/9j/4AAQSkZJRg...", // Shortened base64 encoded image
+ }
+
+ toolResult := llm.Content{
+ Type: llm.ContentTypeToolResult,
+ ToolUseID: "toolu_01A09q90qw90lq917835lq9",
+ ToolResult: []llm.Content{textContent, imageContent},
+ }
+
+ // Convert to Anthropic format
+ anthropicContent := fromLLMContent(toolResult)
+
+ // Check the type
+ if anthropicContent.Type != "tool_result" {
+ t.Errorf("Expected type to be 'tool_result', got '%s'", anthropicContent.Type)
+ }
+
+ // Check the tool_use_id
+ if anthropicContent.ToolUseID != "toolu_01A09q90qw90lq917835lq9" {
+ t.Errorf("Expected tool_use_id to be 'toolu_01A09q90qw90lq917835lq9', got '%s'", anthropicContent.ToolUseID)
+ }
+
+ // Check that we have two content items in the tool result
+ if len(anthropicContent.ToolResult) != 2 {
+ t.Errorf("Expected 2 content items, got %d", len(anthropicContent.ToolResult))
+ }
+
+ // Check that the first item is text
+ if anthropicContent.ToolResult[0].Type != "text" {
+ t.Errorf("Expected first content type to be 'text', got '%s'", anthropicContent.ToolResult[0].Type)
+ }
+
+ if *anthropicContent.ToolResult[0].Text != "15 degrees" {
+ t.Errorf("Expected first content text to be '15 degrees', got '%s'", *anthropicContent.ToolResult[0].Text)
+ }
+
+ // Check that the second item is an image
+ if anthropicContent.ToolResult[1].Type != "image" {
+ t.Errorf("Expected second content type to be 'image', got '%s'", anthropicContent.ToolResult[1].Type)
+ }
+
+ // Check that the image source contains the expected format
+ var source map[string]any
+ if err := json.Unmarshal(anthropicContent.ToolResult[1].Source, &source); err != nil {
+ t.Errorf("Failed to unmarshal image source: %v", err)
+ }
+
+ if source["type"] != "base64" {
+ t.Errorf("Expected source type to be 'base64', got '%s'", source["type"])
+ }
+
+ if source["media_type"] != "image/jpeg" {
+ t.Errorf("Expected media_type to be 'image/jpeg', got '%s'", source["media_type"])
+ }
+
+ if source["data"] != "/9j/4AAQSkZJRg..." {
+ t.Errorf("Expected data to be '/9j/4AAQSkZJRg...', got '%s'", source["data"])
+ }
+}
diff --git a/llm/conversation/convo.go b/llm/conversation/convo.go
index 7860a07..c46fcc0 100644
--- a/llm/conversation/convo.go
+++ b/llm/conversation/convo.go
@@ -272,10 +272,13 @@
continue
}
content := llm.Content{
- Type: llm.ContentTypeToolResult,
- ToolUseID: part.ID,
- ToolError: true,
- ToolResult: "not executed; retry possible",
+ Type: llm.ContentTypeToolResult,
+ ToolUseID: part.ID,
+ ToolError: true,
+ ToolResult: []llm.Content{{
+ Type: llm.ContentTypeText,
+ Text: "not executed; retry possible",
+ }},
}
prefix = append(prefix, content)
msg.Content = append(prefix, msg.Content...)
@@ -361,7 +364,10 @@
}
content.ToolError = true
- content.ToolResult = "user canceled this too_use"
+ content.ToolResult = []llm.Content{{
+ Type: llm.ContentTypeText,
+ Text: "user canceled this too_use",
+ }}
toolResults = append(toolResults, content)
}
return toolResults, nil
@@ -429,17 +435,24 @@
content.ToolUseEndTime = &endTime
content.ToolError = true
- content.ToolResult = err.Error()
+ content.ToolResult = []llm.Content{{
+ Type: llm.ContentTypeText,
+ Text: err.Error(),
+ }}
c.Listener.OnToolResult(ctx, c, part.ID, part.ToolName, part.ToolInput, content, nil, err)
toolResultC <- content
}
- sendRes := func(res string) {
+ sendRes := func(toolResult []llm.Content) {
// Record end time
endTime := time.Now()
content.ToolUseEndTime = &endTime
- content.ToolResult = res
- c.Listener.OnToolResult(ctx, c, part.ID, part.ToolName, part.ToolInput, content, &res, nil)
+ content.ToolResult = toolResult
+ var firstText string
+ if len(toolResult) > 0 {
+ firstText = toolResult[0].Text
+ }
+ c.Listener.OnToolResult(ctx, c, part.ID, part.ToolName, part.ToolInput, content, &firstText, nil)
toolResultC <- content
}
diff --git a/llm/gem/gem.go b/llm/gem/gem.go
index 6686058..e5cbcf0 100644
--- a/llm/gem/gem.go
+++ b/llm/gem/gem.go
@@ -222,10 +222,24 @@
// Tool result becomes a function response
// Create a map for the response
response := map[string]any{
- "result": c.ToolResult,
- "error": c.ToolError,
+ "error": c.ToolError,
}
+ // Handle tool results: Gemini only supports string results
+ // Combine all text content into a single string
+ var resultText string
+ if len(c.ToolResult) > 0 {
+ // Collect all text from content objects
+ texts := make([]string, 0, len(c.ToolResult))
+ for _, result := range c.ToolResult {
+ if result.Text != "" {
+ texts = append(texts, result.Text)
+ }
+ }
+ resultText = strings.Join(texts, "\n")
+ }
+ response["result"] = resultText
+
// Determine the function name to use - this is critical
funcName := ""
@@ -254,7 +268,7 @@
slog.DebugContext(context.Background(), "gemini_preparing_tool_result",
"tool_use_id", c.ToolUseID,
"mapped_func_name", funcName,
- "result_length", len(c.ToolResult))
+ "result_count", len(c.ToolResult))
content.Parts = append(content.Parts, gemini.Part{
FunctionResponse: &gemini.FunctionResponse{
@@ -464,7 +478,7 @@
"content_idx", j,
"tool_use_id", c.ToolUseID,
"tool_error", c.ToolError,
- "result_length", len(c.ToolResult))
+ "result_count", len(c.ToolResult))
}
}
slog.DebugContext(ctx, "gemini_message",
diff --git a/llm/image_content_test.go b/llm/image_content_test.go
new file mode 100644
index 0000000..4d22838
--- /dev/null
+++ b/llm/image_content_test.go
@@ -0,0 +1,62 @@
+package llm
+
+import (
+ "encoding/json"
+ "testing"
+)
+
+func TestImageContent(t *testing.T) {
+ // Create a Content structure with an image
+ imageContent := Content{
+ Type: ContentTypeText, // In the future, we might add a specific ContentTypeImage
+ MediaType: "image/jpeg",
+ Data: "/9j/4AAQSkZJRg...", // Shortened base64 encoded image
+ }
+
+ // Verify the structure is correct
+ if imageContent.MediaType != "image/jpeg" {
+ t.Errorf("Expected MediaType to be 'image/jpeg', got '%s'", imageContent.MediaType)
+ }
+
+ if imageContent.Data != "/9j/4AAQSkZJRg..." {
+ t.Errorf("Expected Data to contain base64 image data")
+ }
+
+ // Create a tool result that contains text and image content
+ toolResult := Content{
+ Type: ContentTypeToolResult,
+ ToolUseID: "toolu_01A09q90qw90lq917835lq9",
+ ToolResult: []Content{
+ {
+ Type: ContentTypeText,
+ Text: "15 degrees",
+ },
+ imageContent,
+ },
+ }
+
+ // Check that the tool result contains two content items
+ if len(toolResult.ToolResult) != 2 {
+ t.Errorf("Expected tool result to contain 2 content items, got %d", len(toolResult.ToolResult))
+ }
+
+ // Verify JSON marshaling works as expected
+ bytes, err := json.Marshal(toolResult)
+ if err != nil {
+ t.Errorf("Failed to marshal content to JSON: %v", err)
+ }
+
+ // Unmarshal and verify structure is preserved
+ var unmarshaled Content
+ if err := json.Unmarshal(bytes, &unmarshaled); err != nil {
+ t.Errorf("Failed to unmarshal JSON: %v", err)
+ }
+
+ if len(unmarshaled.ToolResult) != 2 {
+ t.Errorf("Expected unmarshaled tool result to contain 2 content items, got %d", len(unmarshaled.ToolResult))
+ }
+
+ if unmarshaled.ToolResult[1].MediaType != "image/jpeg" {
+ t.Errorf("Expected unmarshaled image MediaType to be 'image/jpeg', got '%s'", unmarshaled.ToolResult[1].MediaType)
+ }
+}
diff --git a/llm/llm.go b/llm/llm.go
index 1e53ea3..9331961 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -75,7 +75,7 @@
// The outputs from Run will be sent back to Claude.
// If you do not want to respond to the tool call request from Claude, return ErrDoNotRespond.
// ctx contains extra (rarely used) tool call information; retrieve it with ToolCallInfoFromContext.
- Run func(ctx context.Context, input json.RawMessage) (string, error) `json:"-"`
+ Run func(ctx context.Context, input json.RawMessage) ([]Content, error) `json:"-"`
}
type Content struct {
@@ -83,6 +83,9 @@
Type ContentType
Text string
+ // Media type for image content
+ MediaType string
+
// for thinking
Thinking string
Data string
@@ -95,7 +98,7 @@
// for tool_result
ToolUseID string
ToolError bool
- ToolResult string
+ ToolResult []Content
// timing information for tool_result; added externally; not sent to the LLM
ToolUseStartTime *time.Time
@@ -121,7 +124,7 @@
attrs = append(attrs, slog.String("tool_name", content.ToolName))
attrs = append(attrs, slog.String("tool_input", string(content.ToolInput)))
case ContentTypeToolResult:
- attrs = append(attrs, slog.String("tool_result", content.ToolResult))
+ attrs = append(attrs, slog.Any("tool_result", content.ToolResult))
attrs = append(attrs, slog.Bool("tool_error", content.ToolError))
case ContentTypeThinking:
attrs = append(attrs, slog.String("thinking", content.Text))
@@ -229,3 +232,23 @@
Content: []Content{StringContent(text)},
}
}
+
+// TextContent creates a simple text content for tool results.
+// This is a helper function to create the most common type of tool result content.
+func TextContent(text string) []Content {
+ return []Content{{
+ Type: ContentTypeText,
+ Text: text,
+ }}
+}
+
+// ImageContent creates an image content for tool results.
+// mediaType should be "image/jpeg" or "image/png".
+func ImageContent(text string, mediaType string, base64Data string) []Content {
+ return []Content{{
+ Type: ContentTypeText,
+ Text: text,
+ MediaType: mediaType,
+ Data: base64Data,
+ }}
+}
diff --git a/llm/oai/oai.go b/llm/oai/oai.go
index 8b64157..37484e0 100644
--- a/llm/oai/oai.go
+++ b/llm/oai/oai.go
@@ -9,6 +9,7 @@
"log/slog"
"math/rand/v2"
"net/http"
+ "strings"
"time"
"github.com/sashabaranov/go-openai"
@@ -320,7 +321,20 @@
}
case llm.ContentTypeToolResult:
// Tool results in OpenAI are sent as a separate message with tool_call_id
- return c.ToolResult, nil
+ // OpenAI doesn't support multiple content items or images in tool results
+ // Combine all text content into a single string
+ var resultText string
+ if len(c.ToolResult) > 0 {
+ // Collect all text from content objects
+ texts := make([]string, 0, len(c.ToolResult))
+ for _, result := range c.ToolResult {
+ if result.Text != "" {
+ texts = append(texts, result.Text)
+ }
+ }
+ resultText = strings.Join(texts, "\n")
+ }
+ return resultText, nil
default:
// For thinking or other types, convert to text
return c.Text, nil
@@ -348,9 +362,16 @@
// Process tool results as separate messages, but first
for _, tr := range toolResults {
+ // Convert toolresult array to a string for OpenAI
+ var toolResultContent string
+ if len(tr.ToolResult) > 0 {
+ // For now, just use the first text content in the array
+ toolResultContent = tr.ToolResult[0].Text
+ }
+
m := openai.ChatCompletionMessage{
Role: "tool",
- Content: cmp.Or(tr.ToolResult, " "), // TODO: remove omitempty upstream
+ Content: cmp.Or(toolResultContent, " "), // Use empty space if empty to avoid omitempty issues
ToolCallID: tr.ToolUseID,
}
messages = append(messages, m)
@@ -472,10 +493,13 @@
// toToolResultLLMContent converts a tool result message from OpenAI to llm.Content.
func toToolResultLLMContent(msg openai.ChatCompletionMessage) llm.Content {
return llm.Content{
- Type: llm.ContentTypeToolResult,
- ToolUseID: msg.ToolCallID,
- ToolResult: msg.Content,
- ToolError: false, // OpenAI doesn't specify errors explicitly
+ Type: llm.ContentTypeToolResult,
+ ToolUseID: msg.ToolCallID,
+ ToolResult: []llm.Content{{
+ Type: llm.ContentTypeText,
+ Text: msg.Content,
+ }},
+ ToolError: false, // OpenAI doesn't specify errors explicitly
}
}
diff --git a/llm/tool_content_test.go b/llm/tool_content_test.go
new file mode 100644
index 0000000..bfa5cc6
--- /dev/null
+++ b/llm/tool_content_test.go
@@ -0,0 +1,37 @@
+package llm
+
+import (
+ "testing"
+)
+
+func TestToolResultArray(t *testing.T) {
+ // Test a tool result with multiple content items
+ textContent := Content{
+ Type: ContentTypeText,
+ Text: "15 degrees",
+ }
+
+ imageContent := Content{
+ Type: ContentTypeText, // In the future, this could be ContentTypeImage
+ Text: "",
+ MediaType: "image/jpeg",
+ Data: "/9j/4AAQSkZJRg...", // Base64 encoded image sample
+ }
+
+ toolResult := Content{
+ ToolResult: []Content{textContent, imageContent},
+ }
+
+ // Check the structure
+ if len(toolResult.ToolResult) != 2 {
+ t.Errorf("Expected 2 content items in ToolResult, got %d", len(toolResult.ToolResult))
+ }
+
+ if toolResult.ToolResult[0].Text != "15 degrees" {
+ t.Errorf("Expected first item text to be '15 degrees', got '%s'", toolResult.ToolResult[0].Text)
+ }
+
+ if toolResult.ToolResult[1].MediaType != "image/jpeg" {
+ t.Errorf("Expected second item media type to be 'image/jpeg', got '%s'", toolResult.ToolResult[1].MediaType)
+ }
+}
diff --git a/loop/agent.go b/loop/agent.go
index c103919..2c8eec9 100644
--- a/loop/agent.go
+++ b/loop/agent.go
@@ -26,6 +26,7 @@
"sketch.dev/claudetool/onstart"
"sketch.dev/experiment"
"sketch.dev/llm"
+ "sketch.dev/llm/ant"
"sketch.dev/llm/conversation"
)
@@ -228,8 +229,8 @@
if a.TurnDuration != nil {
attrs = append(attrs, slog.Int64("turnDuration", a.TurnDuration.Nanoseconds()))
}
- if a.ToolResult != "" {
- attrs = append(attrs, slog.String("tool_result", a.ToolResult))
+ if len(a.ToolResult) > 0 {
+ attrs = append(attrs, slog.Any("tool_result", a.ToolResult))
}
if a.ToolError {
attrs = append(attrs, slog.Bool("tool_error", a.ToolError))
@@ -554,6 +555,33 @@
a.mu.Unlock()
}
+// contentToString converts []llm.Content to a string, concatenating all text content and skipping non-text types.
+// If there's only one element in the array and it's a text type, it returns that text directly.
+// It also processes nested ToolResult arrays recursively.
+func contentToString(contents []llm.Content) string {
+ if len(contents) == 0 {
+ return ""
+ }
+
+ // If there's only one element and it's a text type, return it directly
+ if len(contents) == 1 && contents[0].Type == llm.ContentTypeText {
+ return contents[0].Text
+ }
+
+ // Otherwise, concatenate all text content
+ var result strings.Builder
+ for _, content := range contents {
+ if content.Type == llm.ContentTypeText {
+ result.WriteString(content.Text)
+ } else if content.Type == llm.ContentTypeToolResult && len(content.ToolResult) > 0 {
+ // Recursively process nested tool results
+ result.WriteString(contentToString(content.ToolResult))
+ }
+ }
+
+ return result.String()
+}
+
// OnToolResult implements ant.Listener.
func (a *Agent) OnToolResult(ctx context.Context, convo *conversation.Convo, toolID string, toolName string, toolInput json.RawMessage, content llm.Content, result *string, err error) {
// Remove the tool call from outstanding calls
@@ -564,7 +592,7 @@
m := AgentMessage{
Type: ToolUseMessageType,
Content: content.Text,
- ToolResult: content.ToolResult,
+ ToolResult: contentToString(content.ToolResult),
ToolError: content.ToolError,
ToolName: toolName,
ToolInput: string(toolInput),
@@ -879,7 +907,8 @@
// Add browser tools if enabled
// if experiment.Enabled("browser") {
if true {
- bTools, browserCleanup := browse.RegisterBrowserTools(a.config.Context)
+ _, supportsScreenshots := a.config.Service.(*ant.Service)
+ bTools, browserCleanup := browse.RegisterBrowserTools(a.config.Context, supportsScreenshots)
// Add cleanup function to context cancel
go func() {
<-a.config.Context.Done()
@@ -943,13 +972,13 @@
},
"required": ["question", "responseOptions"]
}`),
- Run: func(ctx context.Context, input json.RawMessage) (string, error) {
+ Run: func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
// The Run logic for "multiplchoice" tool is a no-op on the server.
// The UI will present a list of options for the user to select from,
// and that's it as far as "executing" the tool_use goes.
// When the user *does* select one of the presented options, that
// responseText gets sent as a chat message on behalf of the user.
- return "end your turn and wait for the user to respond", nil
+ return llm.TextContent("end your turn and wait for the user to respond"), nil
},
}
return ret
@@ -997,28 +1026,28 @@
},
"required": ["title"]
}`),
- Run: func(ctx context.Context, input json.RawMessage) (string, error) {
+ Run: func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
var params struct {
Title string `json:"title"`
}
if err := json.Unmarshal(input, &params); err != nil {
- return "", err
+ return nil, err
}
// We don't allow changing the title once set to be consistent with the previous behavior
// and to prevent accidental title changes
t := a.Title()
if t != "" {
- return "", fmt.Errorf("title already set to: %s", t)
+ return nil, fmt.Errorf("title already set to: %s", t)
}
if params.Title == "" {
- return "", fmt.Errorf("title parameter cannot be empty")
+ return nil, fmt.Errorf("title parameter cannot be empty")
}
a.SetTitle(params.Title)
response := fmt.Sprintf("Title set to %q", params.Title)
- return response, nil
+ return llm.TextContent(response), nil
},
}
return titleTool
@@ -1039,28 +1068,28 @@
},
"required": ["branch_name"]
}`),
- Run: func(ctx context.Context, input json.RawMessage) (string, error) {
+ Run: func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
var params struct {
BranchName string `json:"branch_name"`
}
if err := json.Unmarshal(input, &params); err != nil {
- return "", err
+ return nil, err
}
b := a.BranchName()
if b != "" {
- return "", fmt.Errorf("branch already set to: %s", b)
+ return nil, fmt.Errorf("branch already set to: %s", b)
}
if params.BranchName == "" {
- return "", fmt.Errorf("branch_name parameter cannot be empty")
+ return nil, fmt.Errorf("branch_name must not be empty")
}
if params.BranchName != cleanBranchName(params.BranchName) {
- return "", fmt.Errorf("branch_name parameter must be alphanumeric hyphenated slug")
+ return nil, fmt.Errorf("branch_name parameter must be alphanumeric hyphenated slug")
}
branchName := "sketch/" + params.BranchName
if branchExists(a.workingDir, branchName) {
- return "", fmt.Errorf("branch %q already exists; please choose a different branch name", branchName)
+ return nil, fmt.Errorf("branch %q already exists; please choose a different branch name", branchName)
}
a.SetBranch(branchName)
@@ -1074,7 +1103,7 @@
response += "\n\n" + styleHint
}
- return response, nil
+ return llm.TextContent(response), nil
},
}
return preCommit
@@ -1089,11 +1118,6 @@
a.inbox <- msg
}
-func (a *Agent) ToolResultMessage(ctx context.Context, toolCallID, msg string) {
- a.pushToOutbox(ctx, AgentMessage{Type: UserMessageType, Content: msg, ToolCallId: toolCallID})
- a.inbox <- msg
-}
-
func (a *Agent) CancelToolUse(toolUseID string, cause error) error {
return a.convo.CancelToolUse(toolUseID, cause)
}
@@ -1137,6 +1161,11 @@
m.Timestamp = time.Now()
}
+ // If this is a ToolUseMessage and ToolResult is set but Content is not, copy the ToolResult to Content
+ if m.Type == ToolUseMessageType && m.ToolResult != "" && m.Content == "" {
+ m.Content = m.ToolResult
+ }
+
// If this is an end-of-turn message, calculate the turn duration and add it to the message
if m.EndOfTurn && m.Type == AgentMessageType {
turnDuration := time.Since(a.startOfTurn)
diff --git a/loop/agent_test.go b/loop/agent_test.go
index 72e7ccb..ce44352 100644
--- a/loop/agent_test.go
+++ b/loop/agent_test.go
@@ -680,3 +680,117 @@
t.Errorf("Expected to eventually reach StateEndOfTurn, but never did")
}
}
+
+func TestContentToString(t *testing.T) {
+ tests := []struct {
+ name string
+ contents []llm.Content
+ want string
+ }{
+ {
+ name: "empty",
+ contents: []llm.Content{},
+ want: "",
+ },
+ {
+ name: "single text content",
+ contents: []llm.Content{
+ {Type: llm.ContentTypeText, Text: "hello world"},
+ },
+ want: "hello world",
+ },
+ {
+ name: "multiple text content",
+ contents: []llm.Content{
+ {Type: llm.ContentTypeText, Text: "hello "},
+ {Type: llm.ContentTypeText, Text: "world"},
+ },
+ want: "hello world",
+ },
+ {
+ name: "mixed content types",
+ contents: []llm.Content{
+ {Type: llm.ContentTypeText, Text: "hello "},
+ {Type: llm.ContentTypeText, MediaType: "image/png", Data: "base64data"},
+ {Type: llm.ContentTypeText, Text: "world"},
+ },
+ want: "hello world",
+ },
+ {
+ name: "non-text content only",
+ contents: []llm.Content{
+ {Type: llm.ContentTypeToolUse, ToolName: "example"},
+ },
+ want: "",
+ },
+ {
+ name: "nested tool result",
+ contents: []llm.Content{
+ {Type: llm.ContentTypeText, Text: "outer "},
+ {Type: llm.ContentTypeToolResult, ToolResult: []llm.Content{
+ {Type: llm.ContentTypeText, Text: "inner"},
+ }},
+ },
+ want: "outer inner",
+ },
+ {
+ name: "deeply nested tool result",
+ contents: []llm.Content{
+ {Type: llm.ContentTypeToolResult, ToolResult: []llm.Content{
+ {Type: llm.ContentTypeToolResult, ToolResult: []llm.Content{
+ {Type: llm.ContentTypeText, Text: "deeply nested"},
+ }},
+ }},
+ },
+ want: "deeply nested",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ if got := contentToString(tt.contents); got != tt.want {
+ t.Errorf("contentToString() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
+
+func TestPushToOutbox(t *testing.T) {
+ // Create a new agent
+ a := &Agent{
+ outstandingLLMCalls: make(map[string]struct{}),
+ outstandingToolCalls: make(map[string]string),
+ stateMachine: NewStateMachine(),
+ subscribers: make([]chan *AgentMessage, 0),
+ }
+
+ // Create a channel to receive messages
+ messageCh := make(chan *AgentMessage, 1)
+
+ // Add the channel to the subscribers list
+ a.mu.Lock()
+ a.subscribers = append(a.subscribers, messageCh)
+ a.mu.Unlock()
+
+ // We need to set the text that would be produced by our modified contentToString function
+ resultText := "test resultnested result" // Directly set the expected output
+
+ // In a real-world scenario, this would be coming from a toolResult that contained nested content
+
+ m := AgentMessage{
+ Type: ToolUseMessageType,
+ ToolResult: resultText,
+ }
+
+ // Push the message to the outbox
+ a.pushToOutbox(context.Background(), m)
+
+ // Receive the message from the subscriber
+ received := <-messageCh
+
+ // Check that the Content field contains the concatenated text from ToolResult
+ expected := "test resultnested result"
+ if received.Content != expected {
+ t.Errorf("Expected Content to be %q, got %q", expected, received.Content)
+ }
+}
diff --git a/loop/agent_user_cancel_test.go b/loop/agent_user_cancel_test.go
index 0e5eb43..ed7df1b 100644
--- a/loop/agent_user_cancel_test.go
+++ b/loop/agent_user_cancel_test.go
@@ -74,11 +74,14 @@
toolUseContents := []llm.Content{
{
- Type: llm.ContentTypeToolResult,
- ToolUseID: "tool1",
- Text: "",
- ToolResult: "This is a tool result",
- ToolError: false,
+ Type: llm.ContentTypeToolResult,
+ ToolUseID: "tool1",
+ Text: "",
+ ToolResult: []llm.Content{{
+ Type: llm.ContentTypeText,
+ Text: "This is a tool result",
+ }},
+ ToolError: false,
},
}
toolUseResultsMsg := llm.Message{
@@ -344,10 +347,13 @@
}
canceledToolUseContents := []llm.Content{
{
- Type: llm.ContentTypeToolResult,
- ToolUseID: "tool1",
- ToolError: true,
- ToolResult: "user canceled this tool_use",
+ Type: llm.ContentTypeToolResult,
+ ToolUseID: "tool1",
+ ToolError: true,
+ ToolResult: []llm.Content{{
+ Type: llm.ContentTypeText,
+ Text: "user canceled this tool_use",
+ }},
},
}
canceledToolUseMsg := llm.Message{
@@ -424,11 +430,14 @@
toolUseContents := []llm.Content{
{
- Type: llm.ContentTypeToolResult,
- ToolUseID: "tool1",
- Text: "",
- ToolResult: "This is a tool result",
- ToolError: false,
+ Type: llm.ContentTypeToolResult,
+ ToolUseID: "tool1",
+ Text: "",
+ ToolResult: []llm.Content{{
+ Type: llm.ContentTypeText,
+ Text: "This is a tool result",
+ }},
+ ToolError: false,
},
}
toolUseResponse := &llm.Response{
diff --git a/loop/donetool.go b/loop/donetool.go
index 4261a98..12db3b8 100644
--- a/loop/donetool.go
+++ b/loop/donetool.go
@@ -19,23 +19,23 @@
Name: "done",
Description: doneDescription,
InputSchema: json.RawMessage(doneChecklistJSONSchema),
- Run: func(ctx context.Context, input json.RawMessage) (string, error) {
+ Run: func(ctx context.Context, input json.RawMessage) ([]llm.Content, error) {
// Cannot be done with a messy git.
if err := codereview.RequireNormalGitState(ctx); err != nil {
- return "", err
+ return nil, err
}
if err := codereview.RequireNoUncommittedChanges(ctx); err != nil {
- return "", err
+ return nil, err
}
// Ensure that the current commit has been reviewed.
head, err := codereview.CurrentCommit(ctx)
if err == nil {
needsReview := !codereview.IsInitialCommit(head) && !codereview.HasReviewed(head)
if needsReview {
- return "", fmt.Errorf("codereview tool has not been run for commit %v", head)
+ return nil, fmt.Errorf("codereview tool has not been run for commit %v", head)
}
}
- return `Please ask the user to review your work. Be concise - users are more likely to read shorter comments.`, nil
+ return llm.TextContent("Please ask the user to review your work. Be concise - users are more likely to read shorter comments."), nil
},
}
}
diff --git a/loop/testdata/agent_loop.httprr b/loop/testdata/agent_loop.httprr
index 64137c0..512e5d1 100644
--- a/loop/testdata/agent_loop.httprr
+++ b/loop/testdata/agent_loop.httprr
@@ -1,9 +1,9 @@
httprr trace v1
-14275 2118
+14628 2230
POST https://api.anthropic.com/v1/messages HTTP/1.1
Host: api.anthropic.com
User-Agent: Go-http-client/1.1
-Content-Length: 14077
+Content-Length: 14430
Anthropic-Version: 2023-06-01
Content-Type: application/json
@@ -350,6 +350,22 @@
}
},
{
+ "name": "browser_scroll_into_view",
+ "description": "Scroll an element into view if it's not visible",
+ "input_schema": {
+ "type": "object",
+ "properties": {
+ "selector": {
+ "type": "string",
+ "description": "CSS selector for the element to scroll into view"
+ }
+ },
+ "required": [
+ "selector"
+ ]
+ }
+ },
+ {
"name": "browser_screenshot",
"description": "Take a screenshot of the page or a specific element",
"input_schema": {
@@ -371,18 +387,18 @@
}
},
{
- "name": "browser_scroll_into_view",
- "description": "Scroll an element into view if it's not visible",
+ "name": "browser_read_image",
+ "description": "Read an image file (such as a screenshot) and encode it for sending to the LLM",
"input_schema": {
"type": "object",
"properties": {
- "selector": {
+ "path": {
"type": "string",
- "description": "CSS selector for the element to scroll into view"
+ "description": "Path to the image file to read"
}
},
"required": [
- "selector"
+ "path"
]
}
},
@@ -448,24 +464,24 @@
Anthropic-Organization-Id: 3c473a21-7208-450a-a9f8-80aebda45c1b
Anthropic-Ratelimit-Input-Tokens-Limit: 200000
Anthropic-Ratelimit-Input-Tokens-Remaining: 200000
-Anthropic-Ratelimit-Input-Tokens-Reset: 2025-05-09T22:32:28Z
+Anthropic-Ratelimit-Input-Tokens-Reset: 2025-05-10T13:34:48Z
Anthropic-Ratelimit-Output-Tokens-Limit: 80000
Anthropic-Ratelimit-Output-Tokens-Remaining: 80000
-Anthropic-Ratelimit-Output-Tokens-Reset: 2025-05-09T22:32:31Z
+Anthropic-Ratelimit-Output-Tokens-Reset: 2025-05-10T13:34:52Z
Anthropic-Ratelimit-Requests-Limit: 4000
Anthropic-Ratelimit-Requests-Remaining: 3999
-Anthropic-Ratelimit-Requests-Reset: 2025-05-09T22:32:27Z
+Anthropic-Ratelimit-Requests-Reset: 2025-05-10T13:34:47Z
Anthropic-Ratelimit-Tokens-Limit: 280000
Anthropic-Ratelimit-Tokens-Remaining: 280000
-Anthropic-Ratelimit-Tokens-Reset: 2025-05-09T22:32:28Z
+Anthropic-Ratelimit-Tokens-Reset: 2025-05-10T13:34:48Z
Cf-Cache-Status: DYNAMIC
-Cf-Ray: 93d4a720b88fb976-SJC
+Cf-Ray: 93d9d0e80caa67fe-SJC
Content-Type: application/json
-Date: Fri, 09 May 2025 22:32:31 GMT
-Request-Id: req_011CNxkUqzPtAV1N4dwqmCn8
+Date: Sat, 10 May 2025 13:34:52 GMT
+Request-Id: req_011CNywHj3Qrj3hEz9qFRVgK
Server: cloudflare
Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
Via: 1.1 google
X-Robots-Tag: none
-{"id":"msg_01D6Uo6fKbA6VEwcrR1EJNDx","type":"message","role":"assistant","model":"claude-3-7-sonnet-20250219","content":[{"type":"text","text":"Here are the tools available to me:\n\n1. bash - Execute shell commands\n2. keyword_search - Find files with search terms\n3. think - Record thoughts or plans\n4. title - Set conversation title\n5. precommit - Create git branch for tracking work\n6. done - Mark task as complete with checklist\n7. codereview - Run automated code review\n8. multiplechoice - Present multiple-choice options\n9. browser_navigate - Navigate to URL\n10. browser_click - Click element with CSS selector\n11. browser_type - Type text into input element\n12. browser_wait_for - Wait for element to appear\n13. browser_get_text - Get text from element\n14. browser_eval - Run JavaScript in browser\n15. browser_screenshot - Take screenshot\n16. browser_scroll_into_view - Scroll element into view\n17. patch - Make precise text edits to files"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":4,"cache_creation_input_tokens":3294,"cache_read_input_tokens":0,"output_tokens":206}}
\ No newline at end of file
+{"id":"msg_014KGRNEmFdTUGqDQN7sRmzc","type":"message","role":"assistant","model":"claude-3-7-sonnet-20250219","content":[{"type":"text","text":"Here are the tools available to me:\n\n1. bash - Executes shell commands\n2. keyword_search - Searches files with given keywords\n3. think - For recording thoughts, notes, and plans\n4. title - Sets conversation title\n5. precommit - Creates git branches and provides commit message guidance\n6. done - Marks task completion with a checklist\n7. codereview - Runs automated code review\n8. multiplechoice - Presents multiple choice options to the user\n9. Browser tools:\n - browser_navigate - Opens a URL\n - browser_click - Clicks elements\n - browser_type - Types text into elements\n - browser_wait_for - Waits for elements\n - browser_get_text - Gets text from elements\n - browser_eval - Runs JavaScript in browser\n - browser_scroll_into_view - Scrolls to elements\n - browser_screenshot - Takes screenshots\n - browser_read_image - Reads image files\n10. patch - Makes precise text modifications to files"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":4,"cache_creation_input_tokens":3376,"cache_read_input_tokens":0,"output_tokens":236}}
\ No newline at end of file