browser: rename browser_read_image to read_image and auto-send screenshots to LLM Rename browser_read_image tool to read_image and modify browser_take_screenshot to automatically send image content to the LLM instead of requiring a separate read_image tool call, streamlining the screenshot workflow. Problem Analysis: The previous browser screenshot workflow required two separate tool calls: 1. browser_take_screenshot - saves screenshot and returns file path 2. browser_read_image - reads saved screenshot and sends to LLM This two-step process was inefficient and created unnecessary round trips. Additionally, browser_read_image was specific to browser automation but the functionality of reading and encoding images is more general purpose. Implementation Changes: 1. Screenshot Tool Behavior (claudetool/browse/browse.go): - Modified browser_take_screenshot to automatically return image content - Removed screenshotOutput struct as ID-only response no longer needed - Added base64 encoding of screenshot data directly in screenshotRun - Returns []llm.Content with both text description and image data - Still saves screenshot file for potential future reference - Uses same image encoding format as existing read_image tool 2. Tool Rename (claudetool/browse/browse.go): - Renamed browser_read_image tool to read_image - Updated tool name in NewReadImageTool from 'browser_read_image' to 'read_image' - Maintained all existing functionality and input/output format - Tool description and schema remain unchanged 3. UI Updates (termui/termui.go): - Updated template condition from 'browser_read_image' to 'read_image' - Maintains existing emoji and display format for read_image tool calls 4. 
WebUI Updates (webui/src/web-components/): - Updated sketch-tool-calls.ts to reference 'read_image' instead of 'browser_read_image' - Renamed sketch-tool-card-browser-read-image.ts to sketch-tool-card-read-image.ts - Updated component class name from SketchToolCardBrowserReadImage to SketchToolCardReadImage - Updated custom element name from 'sketch-tool-card-browser-read-image' to 'sketch-tool-card-read-image' - Updated import statement to reference new component file name - Removed old component file and updated TypeScript declarations 5. Test Updates (claudetool/browse/browse_test.go): - Modified TestGetTools to allow read_image tool without 'browser_' prefix - Added special case handling for read_image in tool naming convention check - All existing tests continue to pass with updated tool name Technical Details: - Screenshot auto-send uses same base64 encoding as existing read_image tool - Content structure matches browser_read_image output format for consistency - File saving still occurs for potential debugging or future reference - Error handling preserves existing behavior with proper fallbacks - Tool count remains the same (12 tools with screenshots, 10 without) Benefits: - Eliminates need for two-step screenshot workflow - Reduces round trips and simplifies user experience - More intuitive tool naming (read_image is general purpose) - Maintains full backward compatibility for read_image functionality - Consistent image encoding across all browser tools - Automatic screenshot viewing improves debugging and validation workflows Testing: - All existing browser tool tests pass with updated expectations - TestReadImageTool verifies renamed tool functionality - Tool naming convention test updated to handle read_image exception - TypeScript compilation successful with no type errors - Web component functionality preserved across rename This enhancement streamlines screenshot workflows while maintaining the general-purpose read_image tool for reading arbitrary 
image files, creating a more efficient and intuitive browser automation experience. Co-Authored-By: sketch <hello@sketch.dev> Change-ID: se3e81f997f30f01ek

commit: 542bda3968c6dd5b79392dd63e2955e04520401a [log] [tgz]
author: Philip Zeyliger <philip@bold.dev> Wed Jun 11 18:31:03 2025 -0700
committer: Autoformatter <bot@sketch.dev> Thu Jun 12 01:31:34 2025 +0000
tree: ea1a0743849495ca2489c6363d2dc689dd0a56a7
parent: 225e9668aeebc0cae667872dd45222d69ac3cbd8 [diff]
diff --git a/claudetool/browse/browse.go b/claudetool/browse/browse.go
index dfb963e..5cd28cd 100644
--- a/claudetool/browse/browse.go
+++ b/claudetool/browse/browse.go

@@ -541,10 +541,6 @@
 	Timeout  string `json:"timeout,omitempty"`
 }
 
-type screenshotOutput struct {
-	ID string `json:"id"`
-}
-
 // NewScreenshotTool creates a tool for taking screenshots
 func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
 	return &llm.Tool{
@@ -606,7 +602,7 @@
 		return llm.TextContent(errorResponse(err)), nil
 	}
 
-	// Save the screenshot and get its ID
+	// Save the screenshot and get its ID for potential future reference
 	id := b.SaveScreenshot(buf)
 	if id == "" {
 		return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
@@ -615,14 +611,21 @@
 	// Get the full path to the screenshot
 	screenshotPath := GetScreenshotPath(id)
 
-	// Return the ID and instructions on how to view the screenshot
-	result := fmt.Sprintf(`{
-  "id": "%s",
-  "path": "%s",
-  "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided."
-}`, id, screenshotPath)
+	// Encode the image as base64
+	base64Data := base64.StdEncoding.EncodeToString(buf)
 
-	return llm.TextContent(result), nil
+	// Return the screenshot directly to the LLM
+	return []llm.Content{
+		{
+			Type: llm.ContentTypeText,
+			Text: fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath),
+		},
+		{
+			Type:      llm.ContentTypeText, // Will be mapped to image in content array
+			MediaType: "image/png",
+			Data:      base64Data,
+		},
+	}, nil
 }
 
 // ScrollIntoViewTool definition
@@ -817,7 +820,7 @@
 // NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
 func (b *BrowseTools) NewReadImageTool() *llm.Tool {
 	return &llm.Tool{
-		Name:        "browser_read_image",
+		Name:        "read_image",
 		Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
 		InputSchema: json.RawMessage(`{
 			"type": "object",

diff --git a/claudetool/browse/browse_test.go b/claudetool/browse/browse_test.go
index 35d7911..6a0a31d 100644
--- a/claudetool/browse/browse_test.go
+++ b/claudetool/browse/browse_test.go

@@ -78,7 +78,8 @@
 
 		// Check tool naming convention
 		for _, tool := range toolsWithScreenshots {
-			if !strings.HasPrefix(tool.Name, "browser_") {
+			// Most tools have browser_ prefix, except for read_image
+			if tool.Name != "read_image" && !strings.HasPrefix(tool.Name, "browser_") {
 				t.Errorf("tool name %q does not have prefix 'browser_'", tool.Name)
 			}
 		}

diff --git a/loop/testdata/agent_loop.httprr b/loop/testdata/agent_loop.httprr
index 552bd3f..68cb57a 100644
--- a/loop/testdata/agent_loop.httprr
+++ b/loop/testdata/agent_loop.httprr

@@ -1,9 +1,9 @@
 httprr trace v1
-20370 2576
+20362 2598
 POST https://api.anthropic.com/v1/messages HTTP/1.1

 Host: api.anthropic.com

 User-Agent: Go-http-client/1.1

-Content-Length: 20172

+Content-Length: 20164

 Anthropic-Version: 2023-06-01

 Content-Type: application/json

 

@@ -564,7 +564,7 @@
    }
   },
   {
-   "name": "browser_read_image",
+   "name": "read_image",
    "description": "Read an image file (such as a screenshot) and encode it for sending to the LLM",
    "input_schema": {
     "type": "object",
@@ -597,24 +597,24 @@
 Anthropic-Organization-Id: 3c473a21-7208-450a-a9f8-80aebda45c1b

 Anthropic-Ratelimit-Input-Tokens-Limit: 200000

 Anthropic-Ratelimit-Input-Tokens-Remaining: 200000

-Anthropic-Ratelimit-Input-Tokens-Reset: 2025-06-09T21:16:18Z

+Anthropic-Ratelimit-Input-Tokens-Reset: 2025-06-12T01:30:49Z

 Anthropic-Ratelimit-Output-Tokens-Limit: 80000

 Anthropic-Ratelimit-Output-Tokens-Remaining: 80000

-Anthropic-Ratelimit-Output-Tokens-Reset: 2025-06-09T21:16:25Z

+Anthropic-Ratelimit-Output-Tokens-Reset: 2025-06-12T01:30:57Z

 Anthropic-Ratelimit-Requests-Limit: 4000

 Anthropic-Ratelimit-Requests-Remaining: 3999

-Anthropic-Ratelimit-Requests-Reset: 2025-06-09T21:16:17Z

+Anthropic-Ratelimit-Requests-Reset: 2025-06-12T01:30:48Z

 Anthropic-Ratelimit-Tokens-Limit: 280000

 Anthropic-Ratelimit-Tokens-Remaining: 280000

-Anthropic-Ratelimit-Tokens-Reset: 2025-06-09T21:16:18Z

+Anthropic-Ratelimit-Tokens-Reset: 2025-06-12T01:30:49Z

 Cf-Cache-Status: DYNAMIC

-Cf-Ray: 94d3a62bdfe667a7-SJC

+Cf-Ray: 94e595c2cf2615c2-SJC

 Content-Type: application/json

-Date: Mon, 09 Jun 2025 21:16:25 GMT

-Request-Id: req_011CPyLkKTUhtcyEGZy7mPQw

+Date: Thu, 12 Jun 2025 01:30:57 GMT

+Request-Id: req_011CQ3TmsfarEWcq1a3S6vAH

 Server: cloudflare

 Strict-Transport-Security: max-age=31536000; includeSubDomains; preload

 Via: 1.1 google

 X-Robots-Tag: none

 

-{"id":"msg_01KVVbqNvBwEwHwJcYJzv6Au","type":"message","role":"assistant","model":"claude-sonnet-4-20250514","content":[{"type":"text","text":"Here are the tools available to me:\n\n**File & Code Operations:**\n- `bash` - Execute shell commands\n- `keyword_search` - Search for files/code with keywords\n- `patch` - Make precise text edits to files\n\n**Task Management:**\n- `todo_read` - Read current todo list\n- `todo_write` - Create/manage task lists\n- `think` - Record thoughts and plans\n\n**Git & Code Quality:**\n- `commit-message-style` - Get git commit message guidance\n- `codereview` - Run automated code review\n\n**Browser Automation:**\n- `browser_navigate` - Navigate to URLs\n- `browser_click` - Click elements\n- `browser_type` - Type into inputs\n- `browser_wait_for` - Wait for elements\n- `browser_get_text` - Read page text\n- `browser_eval` - Execute JavaScript\n- `browser_scroll_into_view` - Scroll to elements\n- `browser_resize` - Resize browser window\n- `browser_recent_console_logs` - Get console logs\n- `browser_clear_console_logs` - Clear console logs\n- `browser_take_screenshot` - Take screenshots\n- `browser_read_image` - Read image files\n\n**Utilities:**\n- `set-slug` - Set conversation identifier\n- `about_sketch` - Get info about Sketch environment\n- `multiplechoice` - Present multiple choice questions\n- `done` - Mark work complete with checklist"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":4672,"cache_read_input_tokens":0,"output_tokens":341,"service_tier":"standard"}}
\ No newline at end of file
+{"id":"msg_012nUH4GVNxkfLo2t455oSoi","type":"message","role":"assistant","model":"claude-sonnet-4-20250514","content":[{"type":"text","text":"Here are the tools available to me:\n\n**File & Code Management:**\n- `bash` - Execute shell commands\n- `keyword_search` - Search for files/code with keywords\n- `patch` - Edit files with precise text operations\n- `codereview` - Run automated code review\n\n**Task Management:**\n- `todo_read` - Read current todo list\n- `todo_write` - Create/update task lists\n- `think` - Take notes and form plans\n\n**Git & Project:**\n- `commit-message-style` - Get git commit message guidance\n- `done` - Mark work complete with checklist verification\n- `set-slug` - Set conversation identifier\n\n**Browser Automation:**\n- `browser_navigate` - Navigate to URLs\n- `browser_click` - Click elements\n- `browser_type` - Type into inputs\n- `browser_wait_for` - Wait for elements\n- `browser_get_text` - Read page text\n- `browser_eval` - Execute JavaScript\n- `browser_scroll_into_view` - Scroll to elements\n- `browser_resize` - Resize browser window\n- `browser_take_screenshot` - Capture screenshots\n- `browser_recent_console_logs` - Get console logs\n- `browser_clear_console_logs` - Clear console logs\n\n**Utilities:**\n- `read_image` - Read and encode image files\n- `about_sketch` - Get help with Sketch functionality\n- `multiplechoice` - Present multiple choice questions"}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":3,"cache_creation_input_tokens":4670,"cache_read_input_tokens":0,"output_tokens":342,"service_tier":"standard"}}
\ No newline at end of file

diff --git a/termui/termui.go b/termui/termui.go
index 48c5f53..af6376d 100644
--- a/termui/termui.go
+++ b/termui/termui.go

@@ -76,7 +76,7 @@
  🔄 {{.input.selector -}}
 {{else if eq .msg.ToolName "browser_resize" -}}
  🖼️  {{.input.width}}x{{.input.height -}}
-{{else if eq .msg.ToolName "browser_read_image" -}}
+{{else if eq .msg.ToolName "read_image" -}}
  🖼️  {{.input.path -}}
 {{else if eq .msg.ToolName "browser_recent_console_logs" -}}
  📜 Console logs

diff --git a/webui/src/web-components/sketch-tool-calls.ts b/webui/src/web-components/sketch-tool-calls.ts
index 270213e..508c668 100644
--- a/webui/src/web-components/sketch-tool-calls.ts
+++ b/webui/src/web-components/sketch-tool-calls.ts

@@ -13,7 +13,7 @@
 import "./sketch-tool-card-browser-eval";
 import "./sketch-tool-card-browser-scroll-into-view";
 import "./sketch-tool-card-browser-resize";
-import "./sketch-tool-card-browser-read-image";
+import "./sketch-tool-card-read-image";
 import "./sketch-tool-card-browser-recent-console-logs";
 import "./sketch-tool-card-browser-clear-console-logs";
 
@@ -203,11 +203,11 @@
           .open=${open}
           .toolCall=${toolCall}
         ></sketch-tool-card-browser-resize>`;
-      case "browser_read_image":
-        return html`<sketch-tool-card-browser-read-image
+      case "read_image":
+        return html`<sketch-tool-card-read-image
           .open=${open}
           .toolCall=${toolCall}
-        ></sketch-tool-card-browser-read-image>`;
+        ></sketch-tool-card-read-image>`;
       case "browser_recent_console_logs":
         return html`<sketch-tool-card-browser-recent-console-logs
           .open=${open}

diff --git a/webui/src/web-components/sketch-tool-card-browser-read-image.ts b/webui/src/web-components/sketch-tool-card-read-image.ts
similarity index 88%
rename from webui/src/web-components/sketch-tool-card-browser-read-image.ts
rename to webui/src/web-components/sketch-tool-card-read-image.ts
index cdb96cc..ba9a5af 100644
--- a/webui/src/web-components/sketch-tool-card-browser-read-image.ts
+++ b/webui/src/web-components/sketch-tool-card-read-image.ts

@@ -2,8 +2,8 @@
 import { customElement, property } from "lit/decorators.js";
 import { ToolCall } from "../types";
 
-@customElement("sketch-tool-card-browser-read-image")
-export class SketchToolCardBrowserReadImage extends LitElement {
+@customElement("sketch-tool-card-read-image")
+export class SketchToolCardReadImage extends LitElement {
   @property()
   toolCall: ToolCall;
 
@@ -60,6 +60,6 @@
 
 declare global {
   interface HTMLElementTagNameMap {
-    "sketch-tool-card-browser-read-image": SketchToolCardBrowserReadImage;
+    "sketch-tool-card-read-image": SketchToolCardReadImage;
   }
 }

diff --git a/webui/src/web-components/sketch-tool-card-take-screenshot.ts b/webui/src/web-components/sketch-tool-card-take-screenshot.ts
index 33bfae3..b21686b 100644
--- a/webui/src/web-components/sketch-tool-card-take-screenshot.ts
+++ b/webui/src/web-components/sketch-tool-card-take-screenshot.ts

@@ -89,16 +89,19 @@
       console.error("Error parsing screenshot input:", e);
     }
 
-    // Get the screenshot ID from the result
+    // Extract the screenshot ID from the result text
     let screenshotId = "";
     let hasResult = false;
     if (this.toolCall?.result_message?.tool_result) {
-      try {
-        const result = JSON.parse(this.toolCall.result_message.tool_result);
-        screenshotId = result.id;
+      // The tool result is now a text like "Screenshot taken (saved as /tmp/sketch-screenshots/{id}.png)"
+      // Extract the ID from this text
+      const resultText = this.toolCall.result_message.tool_result;
+      const pathMatch = resultText.match(
+        /\/tmp\/sketch-screenshots\/(.*?)\.png/,
+      );
+      if (pathMatch) {
+        screenshotId = pathMatch[1];
         hasResult = true;
-      } catch (e) {
-        console.error("Error parsing screenshot result:", e);
       }
     }
 
@@ -138,7 +141,7 @@
                         />
                         ${this.imageLoaded
                           ? html`<div class="screenshot-info">
-                              Screenshot ID: ${screenshotId}
+                              Screenshot saved and displayed
                             </div>`
                           : ""}
                       `}
commit	542bda3968c6dd5b79392dd63e2955e04520401a	[log] [tgz]
author	Philip Zeyliger <philip@bold.dev>	Wed Jun 11 18:31:03 2025 -0700
committer	Autoformatter <bot@sketch.dev>	Thu Jun 12 01:31:34 2025 +0000
tree	ea1a0743849495ca2489c6363d2dc689dd0a56a7
parent	225e9668aeebc0cae667872dd45222d69ac3cbd8 [diff]