claudetool: streamline browser tools
browser_click, browser_type, browser_get_text, browser_scroll_into_view,
browser_resize, and browser_wait_for can all be easily implemented using
browser_eval, if browser_eval is given an await parameter.
A bit of testing suggests that they are more robust in practice
that way, and that multiple tool calls can be combined
into a single browser_eval call, which reduces latency.
And Sketch does in fact use them as needed.
Also, bump the default timeout from 5s to 15s; empirically, 5 seconds is not enough.
As a bonus, fewer tools is better for context management.
Co-Authored-By: sketch <hello@sketch.dev>
Change-ID: s8d8cd418f3e97f26k
diff --git a/claudetool/browse/browse.go b/claudetool/browse/browse.go
index 577a5a9..81ae105 100644
--- a/claudetool/browse/browse.go
+++ b/claudetool/browse/browse.go
@@ -173,7 +173,7 @@
},
"timeout": {
"type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
+ "description": "Timeout as a Go duration string (default: 15s)"
}
},
"required": ["url"]
@@ -212,264 +212,19 @@
return llm.ToolOut{LLMContent: llm.TextContent("done")}
}
-// ClickTool definition
-type clickInput struct {
- Selector string `json:"selector"`
- WaitVisible bool `json:"wait_visible,omitempty"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewClickTool creates a tool for clicking elements
-func (b *BrowseTools) NewClickTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_click",
- Description: "Click the first element matching a CSS selector",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the element to click"
- },
- "wait_visible": {
- "type": "boolean",
- "description": "Wait for the element to be visible before clicking"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector"]
- }`),
- Run: b.clickRun,
- }
-}
-
-func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input clickInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- actions := []chromedp.Action{
- chromedp.WaitReady(input.Selector),
- }
-
- if input.WaitVisible {
- actions = append(actions, chromedp.WaitVisible(input.Selector))
- }
-
- actions = append(actions, chromedp.Click(input.Selector))
-
- err = chromedp.Run(timeoutCtx, actions...)
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
-// TypeTool definition
-type typeInput struct {
- Selector string `json:"selector"`
- Text string `json:"text"`
- Clear bool `json:"clear,omitempty"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewTypeTool creates a tool for typing into input elements
-func (b *BrowseTools) NewTypeTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_type",
- Description: "Type text into an input or textarea element",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the input element"
- },
- "text": {
- "type": "string",
- "description": "Text to type into the element"
- },
- "clear": {
- "type": "boolean",
- "description": "Clear the input field before typing"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector", "text"]
- }`),
- Run: b.typeRun,
- }
-}
-
-func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input typeInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- actions := []chromedp.Action{
- chromedp.WaitReady(input.Selector),
- chromedp.WaitVisible(input.Selector),
- }
-
- if input.Clear {
- actions = append(actions, chromedp.Clear(input.Selector))
- }
-
- actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
-
- err = chromedp.Run(timeoutCtx, actions...)
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
-// WaitForTool definition
-type waitForInput struct {
- Selector string `json:"selector"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewWaitForTool creates a tool for waiting for elements
-func (b *BrowseTools) NewWaitForTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_wait_for",
- Description: "Wait for an element to be present in the DOM",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the element to wait for"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector"]
- }`),
- Run: b.waitForRun,
- }
-}
-
-func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input waitForInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
-// GetTextTool definition
-type getTextInput struct {
- Selector string `json:"selector"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewGetTextTool creates a tool for getting text from elements
-func (b *BrowseTools) NewGetTextTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_get_text",
- Description: "Get the innerText of an element, returned in innerText tag. Can be used to read the web page.",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the element to get text from"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector"]
- }`),
- Run: b.getTextRun,
- }
-}
-
-func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input getTextInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- var text string
- err = chromedp.Run(timeoutCtx,
- chromedp.WaitReady(input.Selector),
- chromedp.Text(input.Selector, &text),
- )
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("<innerText>" + text + "</innerText>")}
-}
-
// EvalTool definition
type evalInput struct {
Expression string `json:"expression"`
Timeout string `json:"timeout,omitempty"`
+ Await *bool `json:"await,omitempty"`
}
// NewEvalTool creates a tool for evaluating JavaScript
func (b *BrowseTools) NewEvalTool() *llm.Tool {
return &llm.Tool{
- Name: "browser_eval",
- Description: "Evaluate JavaScript in the browser context",
+ Name: "browser_eval",
+ Description: `Evaluate JavaScript in the browser context.
+Your go-to tool for interacting with content: clicking buttons, typing, getting content, scrolling, resizing, waiting for content/selector to be ready, etc.`,
InputSchema: json.RawMessage(`{
"type": "object",
"properties": {
@@ -479,7 +234,11 @@
},
"timeout": {
"type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
+ "description": "Timeout as a Go duration string (default: 15s)"
+ },
+ "await": {
+ "type": "boolean",
+ "description": "If true, wait for promises to resolve and return their resolved value (default: true)"
}
},
"required": ["expression"]
@@ -504,7 +263,21 @@
defer cancel()
var result any
- err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
+ var evalOps []chromedp.EvaluateOption
+
+ await := true
+ if input.Await != nil {
+ await = *input.Await
+ }
+ if await {
+ evalOps = append(evalOps, func(p *runtime.EvaluateParams) *runtime.EvaluateParams {
+ return p.WithAwaitPromise(true)
+ })
+ }
+
+ evalAction := chromedp.Evaluate(input.Expression, &result, evalOps...)
+
+ err = chromedp.Run(timeoutCtx, evalAction)
if err != nil {
return llm.ErrorToolOut(err)
}
@@ -538,7 +311,7 @@
},
"timeout": {
"type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
+ "description": "Timeout as a Go duration string (default: 15s)"
}
}
}`),
@@ -606,151 +379,11 @@
}}
}
-// ScrollIntoViewTool definition
-type scrollIntoViewInput struct {
- Selector string `json:"selector"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewScrollIntoViewTool creates a tool for scrolling elements into view
-func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_scroll_into_view",
- Description: "Scroll an element into view if it's not visible",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "selector": {
- "type": "string",
- "description": "CSS selector for the element to scroll into view"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["selector"]
- }`),
- Run: b.scrollIntoViewRun,
- }
-}
-
-func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input scrollIntoViewInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- script := fmt.Sprintf(`
- const el = document.querySelector('%s');
- if (el) {
- el.scrollIntoView({behavior: 'smooth', block: 'center'});
- return true;
- }
- return false;
- `, input.Selector)
-
- var result bool
- err = chromedp.Run(timeoutCtx,
- chromedp.WaitReady(input.Selector),
- chromedp.Evaluate(script, &result),
- )
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- if !result {
- return llm.ErrorToolOut(fmt.Errorf("element not found: %s", input.Selector))
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
-// ResizeTool definition
-type resizeInput struct {
- Width int `json:"width"`
- Height int `json:"height"`
- Timeout string `json:"timeout,omitempty"`
-}
-
-// NewResizeTool creates a tool for resizing the browser window
-func (b *BrowseTools) NewResizeTool() *llm.Tool {
- return &llm.Tool{
- Name: "browser_resize",
- Description: "Resize the browser window to a specific width and height",
- InputSchema: json.RawMessage(`{
- "type": "object",
- "properties": {
- "width": {
- "type": "integer",
- "description": "Window width in pixels"
- },
- "height": {
- "type": "integer",
- "description": "Window height in pixels"
- },
- "timeout": {
- "type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
- }
- },
- "required": ["width", "height"]
- }`),
- Run: b.resizeRun,
- }
-}
-
-func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
- var input resizeInput
- if err := json.Unmarshal(m, &input); err != nil {
- return llm.ErrorfToolOut("invalid input: %w", err)
- }
-
- browserCtx, err := b.GetBrowserContext()
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- // Create a timeout context for this operation
- timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
- defer cancel()
-
- // Validate dimensions
- if input.Width <= 0 || input.Height <= 0 {
- return llm.ErrorToolOut(fmt.Errorf("invalid dimensions: width and height must be positive"))
- }
-
- // Resize the browser window
- err = chromedp.Run(timeoutCtx,
- chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
- )
- if err != nil {
- return llm.ErrorToolOut(err)
- }
-
- return llm.ToolOut{LLMContent: llm.TextContent("done")}
-}
-
// GetTools returns browser tools, optionally filtering out screenshot-related tools
func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
tools := []*llm.Tool{
b.NewNavigateTool(),
- b.NewClickTool(),
- b.NewTypeTool(),
- b.NewWaitForTool(),
- b.NewGetTextTool(),
b.NewEvalTool(),
- b.NewScrollIntoViewTool(),
- b.NewResizeTool(),
b.NewRecentConsoleLogsTool(),
b.NewClearConsoleLogsTool(),
}
@@ -809,7 +442,7 @@
},
"timeout": {
"type": "string",
- "description": "Timeout as a Go duration string (default: 5s)"
+ "description": "Timeout as a Go duration string (default: 15s)"
}
},
"required": ["path"]
@@ -861,16 +494,10 @@
// parseTimeout parses a timeout string and returns a time.Duration
-// It returns a default of 5 seconds if the timeout is empty or invalid
+// It returns a default of 15 seconds if the timeout is empty or invalid
func parseTimeout(timeout string) time.Duration {
- if timeout == "" {
- return 5 * time.Second // default 5 seconds
- }
-
dur, err := time.ParseDuration(timeout)
if err != nil {
- // If parsing fails, return the default
- return 5 * time.Second
+ return 15 * time.Second
}
-
return dur
}