Browser tools: initialize lazily and add timeouts.
Also rename browser_screenshot to browser_take_screenshot for clarity\n- Update both Go and UI code to maintain consistency
Co-Authored-By: sketch <hello@sketch.dev>
Change-ID: s8a5cabff914f88dfk
diff --git a/claudetool/browse/browse.go b/claudetool/browse/browse.go
index 60e66ac..a5a1621 100644
--- a/claudetool/browse/browse.go
+++ b/claudetool/browse/browse.go
@@ -124,10 +124,9 @@
// NavigateTool definition
type navigateInput struct {
- URL string `json:"url"`
-}
-
-// NewNavigateTool creates a tool for navigating to URLs
+ URL string `json:"url"`
+ Timeout string `json:"timeout,omitempty"`
+} // NewNavigateTool creates a tool for navigating to URLs
func (b *BrowseTools) NewNavigateTool() *llm.Tool {
return &llm.Tool{
Name: "browser_navigate",
@@ -138,6 +137,10 @@
"url": {
"type": "string",
"description": "The URL to navigate to"
+ },
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
},
"required": ["url"]
@@ -157,7 +160,11 @@
return llm.TextContent(errorResponse(err)), nil
}
- err = chromedp.Run(browserCtx,
+ // Create a timeout context for this operation
+ timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
+ defer cancel()
+
+ err = chromedp.Run(timeoutCtx,
chromedp.Navigate(input.URL),
chromedp.WaitReady("body"),
)
@@ -172,6 +179,7 @@
type clickInput struct {
Selector string `json:"selector"`
WaitVisible bool `json:"wait_visible,omitempty"`
+ Timeout string `json:"timeout,omitempty"`
}
// NewClickTool creates a tool for clicking elements
@@ -189,6 +197,10 @@
"wait_visible": {
"type": "boolean",
"description": "Wait for the element to be visible before clicking"
+ },
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
},
"required": ["selector"]
@@ -208,6 +220,10 @@
return llm.TextContent(errorResponse(err)), nil
}
+ // Create a timeout context for this operation
+ timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
+ defer cancel()
+
actions := []chromedp.Action{
chromedp.WaitReady(input.Selector),
}
@@ -218,7 +234,7 @@
actions = append(actions, chromedp.Click(input.Selector))
- err = chromedp.Run(browserCtx, actions...)
+ err = chromedp.Run(timeoutCtx, actions...)
if err != nil {
return llm.TextContent(errorResponse(err)), nil
}
@@ -231,6 +247,7 @@
Selector string `json:"selector"`
Text string `json:"text"`
Clear bool `json:"clear,omitempty"`
+ Timeout string `json:"timeout,omitempty"`
}
// NewTypeTool creates a tool for typing into input elements
@@ -252,6 +269,10 @@
"clear": {
"type": "boolean",
"description": "Clear the input field before typing"
+ },
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
},
"required": ["selector", "text"]
@@ -271,6 +292,10 @@
return llm.TextContent(errorResponse(err)), nil
}
+ // Create a timeout context for this operation
+ timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
+ defer cancel()
+
actions := []chromedp.Action{
chromedp.WaitReady(input.Selector),
chromedp.WaitVisible(input.Selector),
@@ -282,7 +307,7 @@
actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
- err = chromedp.Run(browserCtx, actions...)
+ err = chromedp.Run(timeoutCtx, actions...)
if err != nil {
return llm.TextContent(errorResponse(err)), nil
}
@@ -292,8 +317,8 @@
// WaitForTool definition
type waitForInput struct {
- Selector string `json:"selector"`
- TimeoutMS int `json:"timeout_ms,omitempty"`
+ Selector string `json:"selector"`
+ Timeout string `json:"timeout,omitempty"`
}
// NewWaitForTool creates a tool for waiting for elements
@@ -308,9 +333,9 @@
"type": "string",
"description": "CSS selector for the element to wait for"
},
- "timeout_ms": {
- "type": "integer",
- "description": "Maximum time to wait in milliseconds (default: 30000)"
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
},
"required": ["selector"]
@@ -325,17 +350,12 @@
return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
}
- timeout := 30000 // default timeout 30 seconds
- if input.TimeoutMS > 0 {
- timeout = input.TimeoutMS
- }
-
browserCtx, err := b.GetBrowserContext()
if err != nil {
return llm.TextContent(errorResponse(err)), nil
}
- timeoutCtx, cancel := context.WithTimeout(browserCtx, time.Duration(timeout)*time.Millisecond)
+ timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
defer cancel()
err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
@@ -349,6 +369,7 @@
// GetTextTool definition
type getTextInput struct {
Selector string `json:"selector"`
+ Timeout string `json:"timeout,omitempty"`
}
type getTextOutput struct {
@@ -359,13 +380,17 @@
func (b *BrowseTools) NewGetTextTool() *llm.Tool {
return &llm.Tool{
Name: "browser_get_text",
- Description: "Get the innerText of an element",
+ Description: "Get the innerText of an element. Can be used to read the web page.",
InputSchema: json.RawMessage(`{
"type": "object",
"properties": {
"selector": {
"type": "string",
"description": "CSS selector for the element to get text from"
+ },
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
},
"required": ["selector"]
@@ -385,8 +410,12 @@
return llm.TextContent(errorResponse(err)), nil
}
+ // Create a timeout context for this operation
+ timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
+ defer cancel()
+
var text string
- err = chromedp.Run(browserCtx,
+ err = chromedp.Run(timeoutCtx,
chromedp.WaitReady(input.Selector),
chromedp.Text(input.Selector, &text),
)
@@ -406,6 +435,7 @@
// EvalTool definition
type evalInput struct {
Expression string `json:"expression"`
+ Timeout string `json:"timeout,omitempty"`
}
type evalOutput struct {
@@ -423,6 +453,10 @@
"expression": {
"type": "string",
"description": "JavaScript expression to evaluate"
+ },
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
},
"required": ["expression"]
@@ -442,8 +476,12 @@
return llm.TextContent(errorResponse(err)), nil
}
+ // Create a timeout context for this operation
+ timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
+ defer cancel()
+
var result any
- err = chromedp.Run(browserCtx, chromedp.Evaluate(input.Expression, &result))
+ err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
if err != nil {
return llm.TextContent(errorResponse(err)), nil
}
@@ -461,6 +499,7 @@
type screenshotInput struct {
Selector string `json:"selector,omitempty"`
Format string `json:"format,omitempty"`
+ Timeout string `json:"timeout,omitempty"`
}
type screenshotOutput struct {
@@ -470,7 +509,7 @@
// NewScreenshotTool creates a tool for taking screenshots
func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
return &llm.Tool{
- Name: "browser_screenshot",
+ Name: "browser_take_screenshot",
Description: "Take a screenshot of the page or a specific element",
InputSchema: json.RawMessage(`{
"type": "object",
@@ -483,6 +522,10 @@
"type": "string",
"description": "Output format ('base64' or 'png'), defaults to 'base64'",
"enum": ["base64", "png"]
+ },
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
}
}`),
@@ -501,6 +544,10 @@
return llm.TextContent(errorResponse(err)), nil
}
+ // Create a timeout context for this operation
+ timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
+ defer cancel()
+
var buf []byte
var actions []chromedp.Action
@@ -515,7 +562,7 @@
actions = append(actions, chromedp.CaptureScreenshot(&buf))
}
- err = chromedp.Run(browserCtx, actions...)
+ err = chromedp.Run(timeoutCtx, actions...)
if err != nil {
return llm.TextContent(errorResponse(err)), nil
}
@@ -542,6 +589,7 @@
// ScrollIntoViewTool definition
type scrollIntoViewInput struct {
Selector string `json:"selector"`
+ Timeout string `json:"timeout,omitempty"`
}
// NewScrollIntoViewTool creates a tool for scrolling elements into view
@@ -555,6 +603,10 @@
"selector": {
"type": "string",
"description": "CSS selector for the element to scroll into view"
+ },
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
},
"required": ["selector"]
@@ -574,6 +626,10 @@
return llm.TextContent(errorResponse(err)), nil
}
+ // Create a timeout context for this operation
+ timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
+ defer cancel()
+
script := fmt.Sprintf(`
const el = document.querySelector('%s');
if (el) {
@@ -584,7 +640,7 @@
`, input.Selector)
var result bool
- err = chromedp.Run(browserCtx,
+ err = chromedp.Run(timeoutCtx,
chromedp.WaitReady(input.Selector),
chromedp.Evaluate(script, &result),
)
@@ -647,7 +703,8 @@
// ReadImageTool definition
type readImageInput struct {
- Path string `json:"path"`
+ Path string `json:"path"`
+ Timeout string `json:"timeout,omitempty"`
}
// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
@@ -661,6 +718,10 @@
"path": {
"type": "string",
"description": "Path to the image file to read"
+ },
+ "timeout": {
+ "type": "string",
+ "description": "Timeout as a Go duration string (default: 5s)"
}
},
"required": ["path"]
@@ -708,3 +769,19 @@
},
}, nil
}
+
+// parseTimeout parses a timeout string and returns a time.Duration
+// It returns a default of 5 seconds if the timeout is empty or invalid
+func parseTimeout(timeout string) time.Duration {
+ if timeout == "" {
+ return 5 * time.Second // default 5 seconds
+ }
+
+ dur, err := time.ParseDuration(timeout)
+ if err != nil {
+ // If parsing fails, return the default
+ return 5 * time.Second
+ }
+
+ return dur
+}
diff --git a/claudetool/browse/browse_test.go b/claudetool/browse/browse_test.go
index f1360d8..ae4fefd 100644
--- a/claudetool/browse/browse_test.go
+++ b/claudetool/browse/browse_test.go
@@ -32,7 +32,7 @@
{tools.NewWaitForTool(), "browser_wait_for", "Wait", []string{"selector"}},
{tools.NewGetTextTool(), "browser_get_text", "Get", []string{"selector"}},
{tools.NewEvalTool(), "browser_eval", "Evaluate", []string{"expression"}},
- {tools.NewScreenshotTool(), "browser_screenshot", "Take", nil},
+ {tools.NewScreenshotTool(), "browser_take_screenshot", "Take", nil},
{tools.NewScrollIntoViewTool(), "browser_scroll_into_view", "Scroll", []string{"selector"}},
}
@@ -101,7 +101,7 @@
}
// Create browser tools instance
- ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
tools := NewBrowseTools(ctx)
@@ -147,7 +147,7 @@
}
// Create browser tools instance
- ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
tools := NewBrowseTools(ctx)
diff --git a/claudetool/browse/register.go b/claudetool/browse/register.go
index 183bf14..0e60d73 100644
--- a/claudetool/browse/register.go
+++ b/claudetool/browse/register.go
@@ -2,22 +2,16 @@
import (
"context"
- "log"
"sketch.dev/llm"
)
-// RegisterBrowserTools initializes the browser tools and returns all the tools
-// ready to be added to an agent. It also returns a cleanup function that should
-// be called when done to properly close the browser.
+// RegisterBrowserTools returns all browser tools ready to be added to an agent.
+// It also returns a cleanup function that should be called when done to properly close the browser.
+// The browser will be initialized lazily when a browser tool is first used.
func RegisterBrowserTools(ctx context.Context, supportsScreenshots bool) ([]*llm.Tool, func()) {
browserTools := NewBrowseTools(ctx)
- // Initialize the browser
- if err := browserTools.Initialize(); err != nil {
- log.Printf("Warning: Failed to initialize browser: %v", err)
- }
-
return browserTools.GetTools(supportsScreenshots), func() {
browserTools.Close()
}