blob: bebf88cf6f80a1067d2991dd102f7cfee5c5edd9 [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +000011 "net/url"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000012 "os"
13 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070014 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000015 "sync"
16 "time"
17
Philip Zeyliger18e33682025-05-13 16:34:21 -070018 "github.com/chromedp/cdproto/runtime"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000019 "github.com/chromedp/chromedp"
20 "github.com/google/uuid"
21 "sketch.dev/llm"
22)
23
// ScreenshotDir is the directory on disk that holds saved screenshots.
const (
	ScreenshotDir = "/tmp/sketch-screenshots"
)
26
27// BrowseTools contains all browser tools and manages a shared browser instance
28type BrowseTools struct {
29 ctx context.Context
30 cancel context.CancelFunc
31 browserCtx context.Context
32 browserCtxCancel context.CancelFunc
33 mux sync.Mutex
34 initOnce sync.Once
35 initialized bool
36 initErr error
37 // Map to track screenshots by ID and their creation time
38 screenshots map[string]time.Time
39 screenshotsMutex sync.Mutex
Philip Zeyliger18e33682025-05-13 16:34:21 -070040 // Console logs storage
41 consoleLogs []*runtime.EventConsoleAPICalled
42 consoleLogsMutex sync.Mutex
43 maxConsoleLogs int
Philip Zeyliger33d282f2025-05-03 04:01:54 +000044}
45
46// NewBrowseTools creates a new set of browser automation tools
47func NewBrowseTools(ctx context.Context) *BrowseTools {
48 ctx, cancel := context.WithCancel(ctx)
49
50 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000051 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000052 log.Printf("Failed to create screenshot directory: %v", err)
53 }
54
55 b := &BrowseTools{
Philip Zeyliger18e33682025-05-13 16:34:21 -070056 ctx: ctx,
57 cancel: cancel,
58 screenshots: make(map[string]time.Time),
59 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
60 maxConsoleLogs: 100,
Philip Zeyliger33d282f2025-05-03 04:01:54 +000061 }
62
63 return b
64}
65
66// Initialize starts the browser if it's not already running
67func (b *BrowseTools) Initialize() error {
68 b.mux.Lock()
69 defer b.mux.Unlock()
70
71 b.initOnce.Do(func() {
72 // ChromeDP.ExecPath has a list of common places to find Chrome...
73 opts := chromedp.DefaultExecAllocatorOptions[:]
Philip Zeyligerc0131342025-06-13 21:07:08 -070074 // This is the default when running as root, but we generally need it
75 // when running in a container, even when we aren't root (which is largely
76 // the case for tests).
77 opts = append(opts, chromedp.NoSandbox)
Philip Zeyligera35de5f2025-06-14 12:00:48 -070078 // Setting 'DBUS_SESSION_BUS_ADDRESS=""' or this flag allows tests to pass
79 // in GitHub runner contexts. It's a mystery why the failure isn't clear when this fails.
80 opts = append(opts, chromedp.Flag("--disable-dbus", true))
81 // This can be pretty slow in tests
82 opts = append(opts, chromedp.WSURLReadTimeout(30*time.Second))
Philip Zeyliger33d282f2025-05-03 04:01:54 +000083 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
84 browserCtx, browserCancel := chromedp.NewContext(
85 allocCtx,
Philip Zeyligera35de5f2025-06-14 12:00:48 -070086 chromedp.WithLogf(log.Printf), chromedp.WithErrorf(log.Printf), chromedp.WithBrowserOption(chromedp.WithDialTimeout(30*time.Second)),
Philip Zeyliger33d282f2025-05-03 04:01:54 +000087 )
88
89 b.browserCtx = browserCtx
90 b.browserCtxCancel = browserCancel
91
Philip Zeyliger18e33682025-05-13 16:34:21 -070092 // Set up console log listener
93 chromedp.ListenTarget(browserCtx, func(ev any) {
94 switch e := ev.(type) {
95 case *runtime.EventConsoleAPICalled:
96 b.captureConsoleLog(e)
97 }
98 })
99
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000100 // Ensure the browser starts
101 if err := chromedp.Run(browserCtx); err != nil {
102 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
103 return
104 }
Josh Bleecher Snyder7fbc8e42025-05-29 19:42:25 +0000105
106 // Set default viewport size to 1280x720 (16:9 widescreen)
107 if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
108 b.initErr = fmt.Errorf("failed to set default viewport: %w", err)
109 return
110 }
111
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000112 b.initialized = true
113 })
114
115 return b.initErr
116}
117
118// Close shuts down the browser
119func (b *BrowseTools) Close() {
120 b.mux.Lock()
121 defer b.mux.Unlock()
122
123 if b.browserCtxCancel != nil {
124 b.browserCtxCancel()
125 b.browserCtxCancel = nil
126 }
127
128 if b.cancel != nil {
129 b.cancel()
130 }
131
132 b.initialized = false
133 log.Println("Browser closed")
134}
135
136// GetBrowserContext returns the context for browser operations
137func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
138 if err := b.Initialize(); err != nil {
139 return nil, err
140 }
141 return b.browserCtx, nil
142}
143
// baseResponse is the response shape that tools return on success.
type baseResponse struct {
	// Status is left out of the JSON when it is empty.
	Status string `json:"status,omitempty"`
}
148
// successResponse returns the fixed JSON document reported by tools that
// complete without an error.
func successResponse() string {
	const ok = `{"status":"success"}`
	return ok
}
152
// errorResponse renders err as the JSON document
// {"status":"error","error":"<message>"}.
//
// The message goes through a JSON encoder, so quotes, backslashes and
// newlines in it are escaped and the result is always valid JSON. HTML
// escaping is turned off so characters such as '>' (common in CSS selectors)
// appear verbatim rather than as \u003e.
func errorResponse(err error) string {
	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{
		Status: "error",
		Error:  err.Error(),
	}

	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	enc.SetEscapeHTML(false)
	if encErr := enc.Encode(payload); encErr != nil {
		// Not expected for a struct of two strings; still return valid JSON.
		return `{"status":"error","error":"failed to encode error message"}`
	}
	// Encode appends a trailing newline that is not part of the response.
	return strings.TrimSuffix(sb.String(), "\n")
}
156
// navigateInput is the decoded argument object of the browser_navigate tool.
type navigateInput struct {
	// URL is the address to open.
	URL string `json:"url"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
162
// isPort80 reports whether urlStr definitely targets port 80: either the port
// is spelled out as 80, or no port is given and the scheme is http. URLs that
// fail to parse are reported as false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	switch u.Port() {
	case "80":
		return true
	case "":
		// No explicit port: only plain http implies 80.
		return u.Scheme == "http"
	default:
		return false
	}
}
172
173// NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000174func (b *BrowseTools) NewNavigateTool() *llm.Tool {
175 return &llm.Tool{
176 Name: "browser_navigate",
177 Description: "Navigate the browser to a specific URL and wait for page to load",
178 InputSchema: json.RawMessage(`{
179 "type": "object",
180 "properties": {
181 "url": {
182 "type": "string",
183 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700184 },
185 "timeout": {
186 "type": "string",
187 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000188 }
189 },
190 "required": ["url"]
191 }`),
192 Run: b.navigateRun,
193 }
194}
195
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700196func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000197 var input navigateInput
198 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700199 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000200 }
201
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +0000202 if isPort80(input.URL) {
203 return llm.TextContent(errorResponse(fmt.Errorf("port 80 is not the port you're looking for--it is the main sketch server"))), nil
204 }
205
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000206 browserCtx, err := b.GetBrowserContext()
207 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700208 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000209 }
210
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700211 // Create a timeout context for this operation
212 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
213 defer cancel()
214
215 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000216 chromedp.Navigate(input.URL),
217 chromedp.WaitReady("body"),
218 )
219 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700220 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000221 }
222
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700223 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000224}
225
// clickInput is the decoded argument object of the browser_click tool.
type clickInput struct {
	// Selector is the CSS selector of the element to click.
	Selector string `json:"selector"`
	// WaitVisible additionally waits for the element to be visible first.
	WaitVisible bool `json:"wait_visible,omitempty"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
232
233// NewClickTool creates a tool for clicking elements
234func (b *BrowseTools) NewClickTool() *llm.Tool {
235 return &llm.Tool{
236 Name: "browser_click",
237 Description: "Click the first element matching a CSS selector",
238 InputSchema: json.RawMessage(`{
239 "type": "object",
240 "properties": {
241 "selector": {
242 "type": "string",
243 "description": "CSS selector for the element to click"
244 },
245 "wait_visible": {
246 "type": "boolean",
247 "description": "Wait for the element to be visible before clicking"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700248 },
249 "timeout": {
250 "type": "string",
251 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000252 }
253 },
254 "required": ["selector"]
255 }`),
256 Run: b.clickRun,
257 }
258}
259
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700260func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000261 var input clickInput
262 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700263 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000264 }
265
266 browserCtx, err := b.GetBrowserContext()
267 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700268 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000269 }
270
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700271 // Create a timeout context for this operation
272 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
273 defer cancel()
274
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000275 actions := []chromedp.Action{
276 chromedp.WaitReady(input.Selector),
277 }
278
279 if input.WaitVisible {
280 actions = append(actions, chromedp.WaitVisible(input.Selector))
281 }
282
283 actions = append(actions, chromedp.Click(input.Selector))
284
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700285 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000286 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700287 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000288 }
289
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700290 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000291}
292
// typeInput is the decoded argument object of the browser_type tool.
type typeInput struct {
	// Selector is the CSS selector of the input element.
	Selector string `json:"selector"`
	// Text is what gets typed into the element.
	Text string `json:"text"`
	// Clear empties the field before typing.
	Clear bool `json:"clear,omitempty"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
300
301// NewTypeTool creates a tool for typing into input elements
302func (b *BrowseTools) NewTypeTool() *llm.Tool {
303 return &llm.Tool{
304 Name: "browser_type",
305 Description: "Type text into an input or textarea element",
306 InputSchema: json.RawMessage(`{
307 "type": "object",
308 "properties": {
309 "selector": {
310 "type": "string",
311 "description": "CSS selector for the input element"
312 },
313 "text": {
314 "type": "string",
315 "description": "Text to type into the element"
316 },
317 "clear": {
318 "type": "boolean",
319 "description": "Clear the input field before typing"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700320 },
321 "timeout": {
322 "type": "string",
323 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000324 }
325 },
326 "required": ["selector", "text"]
327 }`),
328 Run: b.typeRun,
329 }
330}
331
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700332func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000333 var input typeInput
334 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700335 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000336 }
337
338 browserCtx, err := b.GetBrowserContext()
339 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700340 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000341 }
342
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700343 // Create a timeout context for this operation
344 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
345 defer cancel()
346
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000347 actions := []chromedp.Action{
348 chromedp.WaitReady(input.Selector),
349 chromedp.WaitVisible(input.Selector),
350 }
351
352 if input.Clear {
353 actions = append(actions, chromedp.Clear(input.Selector))
354 }
355
356 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
357
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700358 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000359 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700360 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000361 }
362
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700363 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000364}
365
// waitForInput is the decoded argument object of the browser_wait_for tool.
type waitForInput struct {
	// Selector is the CSS selector of the element to wait for.
	Selector string `json:"selector"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
371
372// NewWaitForTool creates a tool for waiting for elements
373func (b *BrowseTools) NewWaitForTool() *llm.Tool {
374 return &llm.Tool{
375 Name: "browser_wait_for",
376 Description: "Wait for an element to be present in the DOM",
377 InputSchema: json.RawMessage(`{
378 "type": "object",
379 "properties": {
380 "selector": {
381 "type": "string",
382 "description": "CSS selector for the element to wait for"
383 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700384 "timeout": {
385 "type": "string",
386 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000387 }
388 },
389 "required": ["selector"]
390 }`),
391 Run: b.waitForRun,
392 }
393}
394
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700395func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000396 var input waitForInput
397 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700398 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000399 }
400
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000401 browserCtx, err := b.GetBrowserContext()
402 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700403 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000404 }
405
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700406 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000407 defer cancel()
408
409 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
410 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700411 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000412 }
413
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700414 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000415}
416
// getTextInput is the decoded argument object of the browser_get_text tool.
type getTextInput struct {
	// Selector is the CSS selector of the element to read.
	Selector string `json:"selector"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
422
// getTextOutput is the JSON result of the browser_get_text tool.
type getTextOutput struct {
	// Text is the text read from the matched element.
	Text string `json:"text"`
}
426
427// NewGetTextTool creates a tool for getting text from elements
428func (b *BrowseTools) NewGetTextTool() *llm.Tool {
429 return &llm.Tool{
430 Name: "browser_get_text",
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700431 Description: "Get the innerText of an element. Can be used to read the web page.",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000432 InputSchema: json.RawMessage(`{
433 "type": "object",
434 "properties": {
435 "selector": {
436 "type": "string",
437 "description": "CSS selector for the element to get text from"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700438 },
439 "timeout": {
440 "type": "string",
441 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000442 }
443 },
444 "required": ["selector"]
445 }`),
446 Run: b.getTextRun,
447 }
448}
449
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700450func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000451 var input getTextInput
452 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700453 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000454 }
455
456 browserCtx, err := b.GetBrowserContext()
457 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700458 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000459 }
460
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700461 // Create a timeout context for this operation
462 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
463 defer cancel()
464
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000465 var text string
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700466 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000467 chromedp.WaitReady(input.Selector),
468 chromedp.Text(input.Selector, &text),
469 )
470 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700471 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000472 }
473
474 output := getTextOutput{Text: text}
475 result, err := json.Marshal(output)
476 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700477 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000478 }
479
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700480 return llm.TextContent(string(result)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000481}
482
// evalInput is the decoded argument object of the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript to evaluate.
	Expression string `json:"expression"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
488
// evalOutput is the JSON result of the browser_eval tool.
type evalOutput struct {
	// Result holds whatever value the evaluated expression produced.
	Result any `json:"result"`
}
492
493// NewEvalTool creates a tool for evaluating JavaScript
494func (b *BrowseTools) NewEvalTool() *llm.Tool {
495 return &llm.Tool{
496 Name: "browser_eval",
497 Description: "Evaluate JavaScript in the browser context",
498 InputSchema: json.RawMessage(`{
499 "type": "object",
500 "properties": {
501 "expression": {
502 "type": "string",
503 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700504 },
505 "timeout": {
506 "type": "string",
507 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000508 }
509 },
510 "required": ["expression"]
511 }`),
512 Run: b.evalRun,
513 }
514}
515
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700516func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000517 var input evalInput
518 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700519 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000520 }
521
522 browserCtx, err := b.GetBrowserContext()
523 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700524 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000525 }
526
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700527 // Create a timeout context for this operation
528 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
529 defer cancel()
530
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000531 var result any
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700532 err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000533 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700534 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000535 }
536
537 output := evalOutput{Result: result}
538 response, err := json.Marshal(output)
539 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700540 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000541 }
542
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700543 return llm.TextContent(string(response)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000544}
545
// screenshotInput is the decoded argument object of the
// browser_take_screenshot tool.
type screenshotInput struct {
	// Selector, when set, limits the screenshot to the matching element.
	Selector string `json:"selector,omitempty"`
	// Format is the output format named in the tool schema.
	Format string `json:"format,omitempty"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
552
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000553// NewScreenshotTool creates a tool for taking screenshots
554func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
555 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700556 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000557 Description: "Take a screenshot of the page or a specific element",
558 InputSchema: json.RawMessage(`{
559 "type": "object",
560 "properties": {
561 "selector": {
562 "type": "string",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700563 "description": "CSS selector for the element to screenshot (optional)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000564 },
565 "format": {
566 "type": "string",
567 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
568 "enum": ["base64", "png"]
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700569 },
570 "timeout": {
571 "type": "string",
572 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000573 }
574 }
575 }`),
576 Run: b.screenshotRun,
577 }
578}
579
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700580func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000581 var input screenshotInput
582 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700583 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000584 }
585
586 browserCtx, err := b.GetBrowserContext()
587 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700588 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000589 }
590
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700591 // Create a timeout context for this operation
592 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
593 defer cancel()
594
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000595 var buf []byte
596 var actions []chromedp.Action
597
598 if input.Selector != "" {
599 // Take screenshot of specific element
600 actions = append(actions,
601 chromedp.WaitReady(input.Selector),
602 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
603 )
604 } else {
605 // Take full page screenshot
606 actions = append(actions, chromedp.CaptureScreenshot(&buf))
607 }
608
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700609 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000610 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700611 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000612 }
613
Philip Zeyliger542bda32025-06-11 18:31:03 -0700614 // Save the screenshot and get its ID for potential future reference
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000615 id := b.SaveScreenshot(buf)
616 if id == "" {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700617 return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000618 }
619
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700620 // Get the full path to the screenshot
621 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000622
Philip Zeyliger542bda32025-06-11 18:31:03 -0700623 // Encode the image as base64
624 base64Data := base64.StdEncoding.EncodeToString(buf)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700625
Philip Zeyliger542bda32025-06-11 18:31:03 -0700626 // Return the screenshot directly to the LLM
627 return []llm.Content{
628 {
629 Type: llm.ContentTypeText,
630 Text: fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath),
631 },
632 {
633 Type: llm.ContentTypeText, // Will be mapped to image in content array
634 MediaType: "image/png",
635 Data: base64Data,
636 },
637 }, nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000638}
639
// scrollIntoViewInput is the decoded argument object of the
// browser_scroll_into_view tool.
type scrollIntoViewInput struct {
	// Selector is the CSS selector of the element to bring into view.
	Selector string `json:"selector"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
645
646// NewScrollIntoViewTool creates a tool for scrolling elements into view
647func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
648 return &llm.Tool{
649 Name: "browser_scroll_into_view",
650 Description: "Scroll an element into view if it's not visible",
651 InputSchema: json.RawMessage(`{
652 "type": "object",
653 "properties": {
654 "selector": {
655 "type": "string",
656 "description": "CSS selector for the element to scroll into view"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700657 },
658 "timeout": {
659 "type": "string",
660 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000661 }
662 },
663 "required": ["selector"]
664 }`),
665 Run: b.scrollIntoViewRun,
666 }
667}
668
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700669func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000670 var input scrollIntoViewInput
671 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700672 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000673 }
674
675 browserCtx, err := b.GetBrowserContext()
676 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700677 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000678 }
679
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700680 // Create a timeout context for this operation
681 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
682 defer cancel()
683
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000684 script := fmt.Sprintf(`
685 const el = document.querySelector('%s');
686 if (el) {
687 el.scrollIntoView({behavior: 'smooth', block: 'center'});
688 return true;
689 }
690 return false;
691 `, input.Selector)
692
693 var result bool
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700694 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000695 chromedp.WaitReady(input.Selector),
696 chromedp.Evaluate(script, &result),
697 )
698 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700699 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000700 }
701
702 if !result {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700703 return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000704 }
705
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700706 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000707}
708
// resizeInput is the decoded argument object of the browser_resize tool.
type resizeInput struct {
	// Width and Height are the requested size in pixels.
	Width  int `json:"width"`
	Height int `json:"height"`
	// Timeout is an optional duration string, interpreted by parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
715
716// NewResizeTool creates a tool for resizing the browser window
717func (b *BrowseTools) NewResizeTool() *llm.Tool {
718 return &llm.Tool{
719 Name: "browser_resize",
720 Description: "Resize the browser window to a specific width and height",
721 InputSchema: json.RawMessage(`{
722 "type": "object",
723 "properties": {
724 "width": {
725 "type": "integer",
726 "description": "Window width in pixels"
727 },
728 "height": {
729 "type": "integer",
730 "description": "Window height in pixels"
731 },
732 "timeout": {
733 "type": "string",
734 "description": "Timeout as a Go duration string (default: 5s)"
735 }
736 },
737 "required": ["width", "height"]
738 }`),
739 Run: b.resizeRun,
740 }
741}
742
743func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
744 var input resizeInput
745 if err := json.Unmarshal(m, &input); err != nil {
746 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
747 }
748
749 browserCtx, err := b.GetBrowserContext()
750 if err != nil {
751 return llm.TextContent(errorResponse(err)), nil
752 }
753
754 // Create a timeout context for this operation
755 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
756 defer cancel()
757
758 // Validate dimensions
759 if input.Width <= 0 || input.Height <= 0 {
760 return llm.TextContent(errorResponse(fmt.Errorf("invalid dimensions: width and height must be positive"))), nil
761 }
762
763 // Resize the browser window
764 err = chromedp.Run(timeoutCtx,
765 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
766 )
767 if err != nil {
768 return llm.TextContent(errorResponse(err)), nil
769 }
770
771 return llm.TextContent(successResponse()), nil
772}
773
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700774// GetTools returns browser tools, optionally filtering out screenshot-related tools
775func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
776 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000777 b.NewNavigateTool(),
778 b.NewClickTool(),
779 b.NewTypeTool(),
780 b.NewWaitForTool(),
781 b.NewGetTextTool(),
782 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000783 b.NewScrollIntoViewTool(),
Philip Zeyliger05224842025-05-10 18:26:08 -0700784 b.NewResizeTool(),
Philip Zeyliger18e33682025-05-13 16:34:21 -0700785 b.NewRecentConsoleLogsTool(),
786 b.NewClearConsoleLogsTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000787 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700788
789 // Add screenshot-related tools if supported
790 if includeScreenshotTools {
791 tools = append(tools, b.NewScreenshotTool())
792 tools = append(tools, b.NewReadImageTool())
793 }
794
795 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000796}
797
798// SaveScreenshot saves a screenshot to disk and returns its ID
799func (b *BrowseTools) SaveScreenshot(data []byte) string {
800 // Generate a unique ID
801 id := uuid.New().String()
802
803 // Save the file
804 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000805 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000806 log.Printf("Failed to save screenshot: %v", err)
807 return ""
808 }
809
810 // Track this screenshot
811 b.screenshotsMutex.Lock()
812 b.screenshots[id] = time.Now()
813 b.screenshotsMutex.Unlock()
814
815 return id
816}
817
818// GetScreenshotPath returns the full path to a screenshot by ID
819func GetScreenshotPath(id string) string {
820 return filepath.Join(ScreenshotDir, id+".png")
821}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700822
// readImageInput is the JSON input decoded by the read_image tool.
type readImageInput struct {
	// Path is the location of the image file to read (required by the schema).
	Path string `json:"path"`
	// Timeout is an optional Go duration string, as described in the tool's
	// input schema.
	Timeout string `json:"timeout,omitempty"`
}
828
829// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
830func (b *BrowseTools) NewReadImageTool() *llm.Tool {
831 return &llm.Tool{
Philip Zeyliger542bda32025-06-11 18:31:03 -0700832 Name: "read_image",
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700833 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
834 InputSchema: json.RawMessage(`{
835 "type": "object",
836 "properties": {
837 "path": {
838 "type": "string",
839 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700840 },
841 "timeout": {
842 "type": "string",
843 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700844 }
845 },
846 "required": ["path"]
847 }`),
848 Run: b.readImageRun,
849 }
850}
851
852func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
853 var input readImageInput
854 if err := json.Unmarshal(m, &input); err != nil {
855 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
856 }
857
858 // Check if the path exists
859 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
860 return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
861 }
862
863 // Read the file
864 imageData, err := os.ReadFile(input.Path)
865 if err != nil {
866 return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
867 }
868
869 // Detect the image type
870 imageType := http.DetectContentType(imageData)
871 if !strings.HasPrefix(imageType, "image/") {
872 return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
873 }
874
875 // Encode the image as base64
876 base64Data := base64.StdEncoding.EncodeToString(imageData)
877
878 // Create a Content object that includes both text and the image
879 return []llm.Content{
880 {
881 Type: llm.ContentTypeText,
882 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
883 },
884 {
885 Type: llm.ContentTypeText, // Will be mapped to image in content array
886 MediaType: imageType,
887 Data: base64Data,
888 },
889 }, nil
890}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700891
// parseTimeout converts a Go duration string such as "10s" or "250ms" into
// a time.Duration.
//
// It returns a default of 5 seconds when the string is empty, cannot be
// parsed, or parses to a zero or negative duration. Non-positive values are
// treated as invalid because a context created with such a timeout is
// already expired, which would make every operation fail immediately.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 5 * time.Second

	if timeout == "" {
		return defaultTimeout
	}

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}

	return dur
}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700907
908// captureConsoleLog captures a console log event and stores it
909func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
910 // Add to logs with mutex protection
911 b.consoleLogsMutex.Lock()
912 defer b.consoleLogsMutex.Unlock()
913
914 // Add the log and maintain max size
915 b.consoleLogs = append(b.consoleLogs, e)
916 if len(b.consoleLogs) > b.maxConsoleLogs {
917 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
918 }
919}
920
// recentConsoleLogsInput is the JSON input decoded by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit caps how many of the most recent entries are returned. Values
	// that are zero or negative make recentConsoleLogsRun use its default
	// of 100.
	Limit int `json:"limit,omitempty"`
}
925
926// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
927func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
928 return &llm.Tool{
929 Name: "browser_recent_console_logs",
930 Description: "Get recent browser console logs",
931 InputSchema: json.RawMessage(`{
932 "type": "object",
933 "properties": {
934 "limit": {
935 "type": "integer",
936 "description": "Maximum number of log entries to return (default: 100)"
937 }
938 }
939 }`),
940 Run: b.recentConsoleLogsRun,
941 }
942}
943
944func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
945 var input recentConsoleLogsInput
946 if err := json.Unmarshal(m, &input); err != nil {
947 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
948 }
949
950 // Ensure browser is initialized
951 _, err := b.GetBrowserContext()
952 if err != nil {
953 return llm.TextContent(errorResponse(err)), nil
954 }
955
956 // Apply limit (default to 100 if not specified)
957 limit := 100
958 if input.Limit > 0 {
959 limit = input.Limit
960 }
961
962 // Get console logs with mutex protection
963 b.consoleLogsMutex.Lock()
964 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
965 start := 0
966 if len(b.consoleLogs) > limit {
967 start = len(b.consoleLogs) - limit
968 }
969 logs = append(logs, b.consoleLogs[start:]...)
970 b.consoleLogsMutex.Unlock()
971
972 // Format the logs as JSON
973 logData, err := json.MarshalIndent(logs, "", " ")
974 if err != nil {
975 return llm.TextContent(errorResponse(fmt.Errorf("failed to serialize logs: %w", err))), nil
976 }
977
978 // Format the logs
979 var sb strings.Builder
980 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
981
982 if len(logs) == 0 {
983 sb.WriteString("No console logs captured.")
984 } else {
985 // Add the JSON data for full details
986 sb.WriteString(string(logData))
987 }
988
989 return llm.TextContent(sb.String()), nil
990}
991
// clearConsoleLogsInput is the JSON input decoded by the
// browser_clear_console_logs tool. It has no fields: the tool takes no
// arguments.
type clearConsoleLogsInput struct{}
994
995// NewClearConsoleLogsTool creates a tool for clearing console logs
996func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
997 return &llm.Tool{
998 Name: "browser_clear_console_logs",
999 Description: "Clear all captured browser console logs",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -07001000 InputSchema: llm.EmptySchema(),
1001 Run: b.clearConsoleLogsRun,
Philip Zeyliger18e33682025-05-13 16:34:21 -07001002 }
1003}
1004
1005func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
1006 var input clearConsoleLogsInput
1007 if err := json.Unmarshal(m, &input); err != nil {
1008 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
1009 }
1010
1011 // Ensure browser is initialized
1012 _, err := b.GetBrowserContext()
1013 if err != nil {
1014 return llm.TextContent(errorResponse(err)), nil
1015 }
1016
1017 // Clear console logs with mutex protection
1018 b.consoleLogsMutex.Lock()
1019 logCount := len(b.consoleLogs)
1020 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
1021 b.consoleLogsMutex.Unlock()
1022
1023 return llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount)), nil
1024}