claudetool: streamline browser tools
browser_click, browser_type, browser_get_text, browser_scroll_into_view,
browser_resize, and browser_wait_for can all be easily implemented using
browser_eval, provided browser_eval is given an await parameter.
A bit of testing suggests that they are more robust in practice
that way, and that multiple tool calls can be combined
into a single browser_eval call, which reduces latency.
And Sketch does in fact use browser_eval for these operations as needed.
Also, bump up timeouts; empirically, 5 seconds is not enough.
As a bonus, having fewer tools is better for context management.
Co-Authored-By: sketch <hello@sketch.dev>
Change-ID: s8d8cd418f3e97f26k
diff --git a/claudetool/browse/README.md b/claudetool/browse/README.md
index 69ebf07..74da1bb 100644
--- a/claudetool/browse/README.md
+++ b/claudetool/browse/README.md
@@ -7,14 +7,8 @@
## Available Tools
1. `browser_navigate` - Navigate to a URL and wait for the page to load
-2. `browser_click` - Click an element matching a CSS selector
-3. `browser_type` - Type text into an input field
-4. `browser_wait_for` - Wait for an element to appear in the DOM
-5. `browser_get_text` - Get the text content of an element
-6. `browser_eval` - Evaluate JavaScript in the browser context
-7. `browser_screenshot` - Take a screenshot of the page or a specific element
-8. `browser_scroll_into_view` - Scroll an element into view
-9. `browser_resize` - Resize the browser window to specific dimensions
+2. `browser_eval` - Evaluate JavaScript in the browser context
+3. `browser_screenshot` - Take a screenshot of the page or a specific element
## Usage
diff --git a/claudetool/browse/browse.go b/claudetool/browse/browse.go
index 577a5a9..81ae105 100644
--- a/claudetool/browse/browse.go
+++ b/claudetool/browse/browse.go
@@ -173,7 +173,7 @@
},
"timeout": {
"type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
+ "description": "Timeout as a Go duration string (default: 15s)"
}
},
"required": ["url"]
@@ -212,264 +212,19 @@
return llm.ToolOut{LLMContent: llm.TextContent("done")}
}
-// ClickTool definition
-type clickInput struct {
- Selector string `json:"selector"`
- WaitVisible bool `json:"wait_visible,omitempty"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewClickTool creates a tool for clicking elements
-func (b *BrowseTools) NewClickTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_click",
- Description: "Click the first element matching a CSS selector",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the element to click"
- },
- "wait_visible": {
- "type": "boolean",
- "description": "Wait for the element to be visible before clicking"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector"]
- }`),
- Run: b.clickRun,
- }
-}
-
-func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input clickInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- actions := []chromedp.Action{
- chromedp.WaitReady(input.Selector),
- }
-
- if input.WaitVisible {
- actions = append(actions, chromedp.WaitVisible(input.Selector))
- }
-
- actions = append(actions, chromedp.Click(input.Selector))
-
- err = chromedp.Run(timeoutCtx, actions...)
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
-// TypeTool definition
-type typeInput struct {
- Selector string `json:"selector"`
- Text string `json:"text"`
- Clear bool `json:"clear,omitempty"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewTypeTool creates a tool for typing into input elements
-func (b *BrowseTools) NewTypeTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_type",
- Description: "Type text into an input or textarea element",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the input element"
- },
- "text": {
- "type": "string",
- "description": "Text to type into the element"
- },
- "clear": {
- "type": "boolean",
- "description": "Clear the input field before typing"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector", "text"]
- }`),
- Run: b.typeRun,
- }
-}
-
-func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input typeInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- actions := []chromedp.Action{
- chromedp.WaitReady(input.Selector),
- chromedp.WaitVisible(input.Selector),
- }
-
- if input.Clear {
- actions = append(actions, chromedp.Clear(input.Selector))
- }
-
- actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
-
- err = chromedp.Run(timeoutCtx, actions...)
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
-// WaitForTool definition
-type waitForInput struct {
- Selector string `json:"selector"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewWaitForTool creates a tool for waiting for elements
-func (b *BrowseTools) NewWaitForTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_wait_for",
- Description: "Wait for an element to be present in the DOM",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the element to wait for"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector"]
- }`),
- Run: b.waitForRun,
- }
-}
-
-func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input waitForInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
-// GetTextTool definition
-type getTextInput struct {
- Selector string `json:"selector"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewGetTextTool creates a tool for getting text from elements
-func (b *BrowseTools) NewGetTextTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_get_text",
- Description: "Get the innerText of an element, returned in innerText tag. Can be used to read the web page.",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the element to get text from"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector"]
- }`),
- Run: b.getTextRun,
- }
-}
-
-func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input getTextInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- var text string
- err = chromedp.Run(timeoutCtx,
- chromedp.WaitReady(input.Selector),
- chromedp.Text(input.Selector, &text),
- )
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("<innerText>" + text + "</innerText>")}
-}
-
// EvalTool definition
type evalInput struct {
Expression string `json:"expression"`
Timeout string `json:"timeout,omitempty"`
+ Await *bool `json:"await,omitempty"`
}
// NewEvalTool creates a tool for evaluating JavaScript
func (b *BrowseTools) NewEvalTool() *llm.Tool {
return &llm.Tool{
- Name: "browser_eval",
- Description: "Evaluate JavaScript in the browser context",
+ Name: "browser_eval",
+ Description: `Evaluate JavaScript in the browser context.
+Your go-to tool for interacting with content: clicking buttons, typing, getting content, scrolling, resizing, waiting for content/selector to be ready, etc.`,
InputSchema: json.RawMessage(`{
"type": "object",
"properties": {
@@ -479,7 +234,11 @@
},
"timeout": {
"type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
+ "description": "Timeout as a Go duration string (default: 15s)"
+ },
+ "await": {
+ "type": "boolean",
+ "description": "If true, wait for promises to resolve and return their resolved value (default: true)"
}
},
"required": ["expression"]
@@ -504,7 +263,21 @@
defer cancel()
var result any
- err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
+ var evalOps []chromedp.EvaluateOption
+
+ await := true
+ if input.Await != nil {
+ await = *input.Await
+ }
+ if await {
+ evalOps = append(evalOps, func(p *runtime.EvaluateParams) *runtime.EvaluateParams {
+ return p.WithAwaitPromise(true)
+ })
+ }
+
+ evalAction := chromedp.Evaluate(input.Expression, &result, evalOps...)
+
+ err = chromedp.Run(timeoutCtx, evalAction)
if err != nil {
return llm.ErrorToolOut(err)
}
@@ -538,7 +311,7 @@
},
"timeout": {
"type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
+ "description": "Timeout as a Go duration string (default: 15s)"
}
}
}`),
@@ -606,151 +379,11 @@
}}
}
-// ScrollIntoViewTool definition
-type scrollIntoViewInput struct {
- Selector string `json:"selector"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewScrollIntoViewTool creates a tool for scrolling elements into view
-func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_scroll_into_view",
- Description: "Scroll an element into view if it's not visible",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the element to scroll into view"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector"]
- }`),
- Run: b.scrollIntoViewRun,
- }
-}
-
-func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input scrollIntoViewInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- script := fmt.Sprintf(`
- const el = document.querySelector('%s');
- if (el) {
- el.scrollIntoView({behavior: 'smooth', block: 'center'});
- return true;
- }
- return false;
- `, input.Selector)
-
- var result bool
- err = chromedp.Run(timeoutCtx,
- chromedp.WaitReady(input.Selector),
- chromedp.Evaluate(script, &result),
- )
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- if !result {
- return llm.ErrorToolOut(fmt.Errorf("element not found: %s", input.Selector))
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
-// ResizeTool definition
-type resizeInput struct {
- Width int `json:"width"`
- Height int `json:"height"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewResizeTool creates a tool for resizing the browser window
-func (b *BrowseTools) NewResizeTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_resize",
- Description: "Resize the browser window to a specific width and height",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "width": {
- "type": "integer",
- "description": "Window width in pixels"
- },
- "height": {
- "type": "integer",
- "description": "Window height in pixels"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["width", "height"]
- }`),
- Run: b.resizeRun,
- }
-}
-
-func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input resizeInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- // Validate dimensions
- if input.Width <= 0 || input.Height <= 0 {
- return llm.ErrorToolOut(fmt.Errorf("invalid dimensions: width and height must be positive"))
- }
-
- // Resize the browser window
- err = chromedp.Run(timeoutCtx,
- chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
- )
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
// GetTools returns browser tools, optionally filtering out screenshot-related tools
func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
tools := []*llm.Tool{
b.NewNavigateTool(),
- b.NewClickTool(),
- b.NewTypeTool(),
- b.NewWaitForTool(),
- b.NewGetTextTool(),
b.NewEvalTool(),
- b.NewScrollIntoViewTool(),
- b.NewResizeTool(),
b.NewRecentConsoleLogsTool(),
b.NewClearConsoleLogsTool(),
}
@@ -809,7 +442,7 @@
},
"timeout": {
"type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
+ "description": "Timeout as a Go duration string (default: 15s)"
}
},
"required": ["path"]
@@ -861,16 +494,10 @@
// parseTimeout parses a timeout string and returns a time.Duration
// It returns a default of 5 seconds if the timeout is empty or invalid
func parseTimeout(timeout string) time.Duration {
- if timeout == "" {
- return 5 * time.Second // default 5 seconds
- }
-
dur, err := time.ParseDuration(timeout)
if err != nil {
- // If parsing fails, return the default
- return 5 * time.Second
+ return 15 * time.Second
}
-
return dur
}
diff --git a/claudetool/browse/browse_test.go b/claudetool/browse/browse_test.go
index d1b1eec..def464b 100644
--- a/claudetool/browse/browse_test.go
+++ b/claudetool/browse/browse_test.go
@@ -30,13 +30,8 @@
requiredProps []string
}{
{tools.NewNavigateTool(), "browser_navigate", "Navigate", []string{"url"}},
- {tools.NewClickTool(), "browser_click", "Click", []string{"selector"}},
- {tools.NewTypeTool(), "browser_type", "Type", []string{"selector", "text"}},
- {tools.NewWaitForTool(), "browser_wait_for", "Wait", []string{"selector"}},
- {tools.NewGetTextTool(), "browser_get_text", "Get", []string{"selector"}},
{tools.NewEvalTool(), "browser_eval", "Evaluate", []string{"expression"}},
{tools.NewScreenshotTool(), "browser_take_screenshot", "Take", nil},
- {tools.NewScrollIntoViewTool(), "browser_scroll_into_view", "Scroll", []string{"selector"}},
}
for _, tt := range toolTests {
@@ -78,8 +73,8 @@
// Test with screenshot tools included
t.Run("with screenshots", func(t *testing.T) {
toolsWithScreenshots := tools.GetTools(true)
- if len(toolsWithScreenshots) != 12 {
- t.Errorf("expected 12 tools with screenshots, got %d", len(toolsWithScreenshots))
+ if len(toolsWithScreenshots) != 6 {
+ t.Errorf("expected 6 tools with screenshots, got %d", len(toolsWithScreenshots))
}
// Check tool naming convention
@@ -94,8 +89,8 @@
// Test without screenshot tools
t.Run("without screenshots", func(t *testing.T) {
noScreenshotTools := tools.GetTools(false)
- if len(noScreenshotTools) != 10 {
- t.Errorf("expected 10 tools without screenshots, got %d", len(noScreenshotTools))
+ if len(noScreenshotTools) != 4 {
+ t.Errorf("expected 4 tools without screenshots, got %d", len(noScreenshotTools))
}
})
}
@@ -382,61 +377,3 @@
t.Errorf("Expected default height %v, got %v", expectedHeight, response.Height)
}
}
-
-// TestResizeTool tests the browser resize functionality
-func TestResizeTool(t *testing.T) {
- ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
- defer cancel()
-
- // Skip if CI or headless testing environment
- if os.Getenv("CI") != "" || os.Getenv("HEADLESS_TEST") != "" {
- t.Skip("Skipping browser test in CI/headless environment")
- }
-
- t.Run("ResizeWindow", func(t *testing.T) {
- tools := NewBrowseTools(ctx)
- t.Cleanup(func() {
- tools.Close()
- })
-
- // Resize to mobile dimensions
- resizeTool := tools.NewResizeTool()
- input := json.RawMessage(`{"width": 375, "height": 667}`)
- toolOut := resizeTool.Run(ctx, input)
- if toolOut.Error != nil {
- t.Fatalf("Error: %v", toolOut.Error)
- }
- content := toolOut.LLMContent
- if !strings.Contains(content[0].Text, "done") {
- t.Fatalf("Expected done in response, got: %s", content[0].Text)
- }
-
- // Navigate to a test page and verify using JavaScript to get window dimensions
- navInput := json.RawMessage(`{"url": "https://example.com"}`)
- toolOut = tools.NewNavigateTool().Run(ctx, navInput)
- if toolOut.Error != nil {
- t.Fatalf("Error: %v", toolOut.Error)
- }
- content = toolOut.LLMContent
- if !strings.Contains(content[0].Text, "done") {
- t.Fatalf("Expected done in response, got: %s", content[0].Text)
- }
-
- // Check dimensions via JavaScript
- evalInput := json.RawMessage(`{"expression": "({width: window.innerWidth, height: window.innerHeight})"}`)
- toolOut = tools.NewEvalTool().Run(ctx, evalInput)
- if toolOut.Error != nil {
- t.Fatalf("Error: %v", toolOut.Error)
- }
- content = toolOut.LLMContent
-
- // The dimensions might not be exactly what we set (browser chrome, etc.)
- // but they should be close
- if !strings.Contains(content[0].Text, "width") {
- t.Fatalf("Expected width in response, got: %s", content[0].Text)
- }
- if !strings.Contains(content[0].Text, "height") {
- t.Fatalf("Expected height in response, got: %s", content[0].Text)
- }
- })
-}