blob: f600c722262901fc9f84c19db1acd2e5828472de [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +000011 "net/url"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000012 "os"
13 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070014 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000015 "sync"
16 "time"
17
Philip Zeyliger18e33682025-05-13 16:34:21 -070018 "github.com/chromedp/cdproto/runtime"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000019 "github.com/chromedp/chromedp"
20 "github.com/google/uuid"
21 "sketch.dev/llm"
22)
23
// ScreenshotDir is the directory on disk under which screenshot PNG files
// are written and from which their paths are derived.
const ScreenshotDir = "/tmp/sketch-screenshots"
26
27// BrowseTools contains all browser tools and manages a shared browser instance
28type BrowseTools struct {
29 ctx context.Context
30 cancel context.CancelFunc
31 browserCtx context.Context
32 browserCtxCancel context.CancelFunc
33 mux sync.Mutex
34 initOnce sync.Once
35 initialized bool
36 initErr error
37 // Map to track screenshots by ID and their creation time
38 screenshots map[string]time.Time
39 screenshotsMutex sync.Mutex
Philip Zeyliger18e33682025-05-13 16:34:21 -070040 // Console logs storage
41 consoleLogs []*runtime.EventConsoleAPICalled
42 consoleLogsMutex sync.Mutex
43 maxConsoleLogs int
Philip Zeyliger33d282f2025-05-03 04:01:54 +000044}
45
46// NewBrowseTools creates a new set of browser automation tools
47func NewBrowseTools(ctx context.Context) *BrowseTools {
48 ctx, cancel := context.WithCancel(ctx)
49
50 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000051 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000052 log.Printf("Failed to create screenshot directory: %v", err)
53 }
54
55 b := &BrowseTools{
Philip Zeyliger18e33682025-05-13 16:34:21 -070056 ctx: ctx,
57 cancel: cancel,
58 screenshots: make(map[string]time.Time),
59 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
60 maxConsoleLogs: 100,
Philip Zeyliger33d282f2025-05-03 04:01:54 +000061 }
62
63 return b
64}
65
66// Initialize starts the browser if it's not already running
67func (b *BrowseTools) Initialize() error {
68 b.mux.Lock()
69 defer b.mux.Unlock()
70
71 b.initOnce.Do(func() {
72 // ChromeDP.ExecPath has a list of common places to find Chrome...
73 opts := chromedp.DefaultExecAllocatorOptions[:]
Philip Zeyligerc0131342025-06-13 21:07:08 -070074 // This is the default when running as root, but we generally need it
75 // when running in a container, even when we aren't root (which is largely
76 // the case for tests).
77 opts = append(opts, chromedp.NoSandbox)
Philip Zeyligera35de5f2025-06-14 12:00:48 -070078 // Setting 'DBUS_SESSION_BUS_ADDRESS=""' or this flag allows tests to pass
79 // in GitHub runner contexts. It's a mystery why the failure isn't clear when this fails.
80 opts = append(opts, chromedp.Flag("--disable-dbus", true))
81 // This can be pretty slow in tests
Philip Zeyligerfe51d1d2025-06-16 21:19:44 -070082 opts = append(opts, chromedp.WSURLReadTimeout(60*time.Second))
Philip Zeyliger33d282f2025-05-03 04:01:54 +000083 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
84 browserCtx, browserCancel := chromedp.NewContext(
85 allocCtx,
Philip Zeyligerfe51d1d2025-06-16 21:19:44 -070086 chromedp.WithLogf(log.Printf), chromedp.WithErrorf(log.Printf), chromedp.WithBrowserOption(chromedp.WithDialTimeout(60*time.Second)),
Philip Zeyliger33d282f2025-05-03 04:01:54 +000087 )
88
89 b.browserCtx = browserCtx
90 b.browserCtxCancel = browserCancel
91
Philip Zeyliger18e33682025-05-13 16:34:21 -070092 // Set up console log listener
93 chromedp.ListenTarget(browserCtx, func(ev any) {
94 switch e := ev.(type) {
95 case *runtime.EventConsoleAPICalled:
96 b.captureConsoleLog(e)
97 }
98 })
99
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000100 // Ensure the browser starts
101 if err := chromedp.Run(browserCtx); err != nil {
102 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
103 return
104 }
Josh Bleecher Snyder7fbc8e42025-05-29 19:42:25 +0000105
106 // Set default viewport size to 1280x720 (16:9 widescreen)
107 if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
108 b.initErr = fmt.Errorf("failed to set default viewport: %w", err)
109 return
110 }
111
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000112 b.initialized = true
113 })
114
115 return b.initErr
116}
117
118// Close shuts down the browser
119func (b *BrowseTools) Close() {
120 b.mux.Lock()
121 defer b.mux.Unlock()
122
123 if b.browserCtxCancel != nil {
124 b.browserCtxCancel()
125 b.browserCtxCancel = nil
126 }
127
128 if b.cancel != nil {
129 b.cancel()
130 }
131
132 b.initialized = false
133 log.Println("Browser closed")
134}
135
136// GetBrowserContext returns the context for browser operations
137func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
138 if err := b.Initialize(); err != nil {
139 return nil, err
140 }
141 return b.browserCtx, nil
142}
143
// successResponse returns the fixed JSON document reported by tools that
// completed without producing any data.
func successResponse() string {
	const body = `{"status":"success"}`
	return body
}
147
// errorResponse renders err as a JSON document of the form
// {"status":"error","error":"<message>"}.
//
// The message is encoded with encoding/json so that quotes, backslashes and
// control characters in the error text cannot produce malformed JSON. A nil
// err is reported as "unknown error" rather than panicking.
func errorResponse(err error) string {
	msg := "unknown error"
	if err != nil {
		msg = err.Error()
	}

	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{Status: "error", Error: msg}

	encoded, marshalErr := json.Marshal(payload)
	if marshalErr != nil {
		// Marshalling two plain strings is not expected to fail; keep the
		// response well-formed regardless.
		return `{"status":"error","error":"failed to encode error message"}`
	}
	return string(encoded)
}
151
// navigateInput is the JSON argument object accepted by browser_navigate.
type navigateInput struct {
	// URL is the address to load.
	URL string `json:"url"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
157
// isPort80 reports whether urlStr definitely targets port 80: either the
// port is spelled out as 80, or no port is given and the scheme is http.
// A URL that fails to parse is reported as false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	switch u.Port() {
	case "80":
		return true
	case "":
		return u.Scheme == "http"
	default:
		return false
	}
}
167
168// NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000169func (b *BrowseTools) NewNavigateTool() *llm.Tool {
170 return &llm.Tool{
171 Name: "browser_navigate",
172 Description: "Navigate the browser to a specific URL and wait for page to load",
173 InputSchema: json.RawMessage(`{
174 "type": "object",
175 "properties": {
176 "url": {
177 "type": "string",
178 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700179 },
180 "timeout": {
181 "type": "string",
182 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000183 }
184 },
185 "required": ["url"]
186 }`),
187 Run: b.navigateRun,
188 }
189}
190
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700191func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000192 var input navigateInput
193 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700194 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000195 }
196
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +0000197 if isPort80(input.URL) {
198 return llm.TextContent(errorResponse(fmt.Errorf("port 80 is not the port you're looking for--it is the main sketch server"))), nil
199 }
200
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000201 browserCtx, err := b.GetBrowserContext()
202 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700203 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000204 }
205
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700206 // Create a timeout context for this operation
207 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
208 defer cancel()
209
210 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000211 chromedp.Navigate(input.URL),
212 chromedp.WaitReady("body"),
213 )
214 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700215 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000216 }
217
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700218 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000219}
220
// clickInput is the JSON argument object accepted by browser_click.
type clickInput struct {
	// Selector is the CSS selector of the element to click.
	Selector string `json:"selector"`
	// WaitVisible additionally waits for the element to be visible first.
	WaitVisible bool `json:"wait_visible,omitempty"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
227
228// NewClickTool creates a tool for clicking elements
229func (b *BrowseTools) NewClickTool() *llm.Tool {
230 return &llm.Tool{
231 Name: "browser_click",
232 Description: "Click the first element matching a CSS selector",
233 InputSchema: json.RawMessage(`{
234 "type": "object",
235 "properties": {
236 "selector": {
237 "type": "string",
238 "description": "CSS selector for the element to click"
239 },
240 "wait_visible": {
241 "type": "boolean",
242 "description": "Wait for the element to be visible before clicking"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700243 },
244 "timeout": {
245 "type": "string",
246 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000247 }
248 },
249 "required": ["selector"]
250 }`),
251 Run: b.clickRun,
252 }
253}
254
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700255func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000256 var input clickInput
257 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700258 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000259 }
260
261 browserCtx, err := b.GetBrowserContext()
262 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700263 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000264 }
265
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700266 // Create a timeout context for this operation
267 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
268 defer cancel()
269
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000270 actions := []chromedp.Action{
271 chromedp.WaitReady(input.Selector),
272 }
273
274 if input.WaitVisible {
275 actions = append(actions, chromedp.WaitVisible(input.Selector))
276 }
277
278 actions = append(actions, chromedp.Click(input.Selector))
279
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700280 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000281 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700282 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000283 }
284
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700285 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000286}
287
// typeInput is the JSON argument object accepted by browser_type.
type typeInput struct {
	// Selector is the CSS selector of the input element.
	Selector string `json:"selector"`
	// Text is what gets typed into the element.
	Text string `json:"text"`
	// Clear empties the field before typing.
	Clear bool `json:"clear,omitempty"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
295
296// NewTypeTool creates a tool for typing into input elements
297func (b *BrowseTools) NewTypeTool() *llm.Tool {
298 return &llm.Tool{
299 Name: "browser_type",
300 Description: "Type text into an input or textarea element",
301 InputSchema: json.RawMessage(`{
302 "type": "object",
303 "properties": {
304 "selector": {
305 "type": "string",
306 "description": "CSS selector for the input element"
307 },
308 "text": {
309 "type": "string",
310 "description": "Text to type into the element"
311 },
312 "clear": {
313 "type": "boolean",
314 "description": "Clear the input field before typing"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700315 },
316 "timeout": {
317 "type": "string",
318 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000319 }
320 },
321 "required": ["selector", "text"]
322 }`),
323 Run: b.typeRun,
324 }
325}
326
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700327func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000328 var input typeInput
329 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700330 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000331 }
332
333 browserCtx, err := b.GetBrowserContext()
334 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700335 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000336 }
337
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700338 // Create a timeout context for this operation
339 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
340 defer cancel()
341
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000342 actions := []chromedp.Action{
343 chromedp.WaitReady(input.Selector),
344 chromedp.WaitVisible(input.Selector),
345 }
346
347 if input.Clear {
348 actions = append(actions, chromedp.Clear(input.Selector))
349 }
350
351 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
352
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700353 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000354 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700355 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000356 }
357
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700358 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000359}
360
// waitForInput is the JSON argument object accepted by browser_wait_for.
type waitForInput struct {
	// Selector is the CSS selector of the element to wait for.
	Selector string `json:"selector"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
366
367// NewWaitForTool creates a tool for waiting for elements
368func (b *BrowseTools) NewWaitForTool() *llm.Tool {
369 return &llm.Tool{
370 Name: "browser_wait_for",
371 Description: "Wait for an element to be present in the DOM",
372 InputSchema: json.RawMessage(`{
373 "type": "object",
374 "properties": {
375 "selector": {
376 "type": "string",
377 "description": "CSS selector for the element to wait for"
378 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700379 "timeout": {
380 "type": "string",
381 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000382 }
383 },
384 "required": ["selector"]
385 }`),
386 Run: b.waitForRun,
387 }
388}
389
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700390func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000391 var input waitForInput
392 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700393 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000394 }
395
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000396 browserCtx, err := b.GetBrowserContext()
397 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700398 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000399 }
400
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700401 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000402 defer cancel()
403
404 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
405 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700406 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000407 }
408
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700409 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000410}
411
// getTextInput is the JSON argument object accepted by browser_get_text.
type getTextInput struct {
	// Selector is the CSS selector of the element whose text is read.
	Selector string `json:"selector"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
417
// getTextOutput is the JSON result object produced by browser_get_text.
type getTextOutput struct {
	// Text is the text read from the selected element.
	Text string `json:"text"`
}
421
422// NewGetTextTool creates a tool for getting text from elements
423func (b *BrowseTools) NewGetTextTool() *llm.Tool {
424 return &llm.Tool{
425 Name: "browser_get_text",
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700426 Description: "Get the innerText of an element. Can be used to read the web page.",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000427 InputSchema: json.RawMessage(`{
428 "type": "object",
429 "properties": {
430 "selector": {
431 "type": "string",
432 "description": "CSS selector for the element to get text from"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700433 },
434 "timeout": {
435 "type": "string",
436 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000437 }
438 },
439 "required": ["selector"]
440 }`),
441 Run: b.getTextRun,
442 }
443}
444
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700445func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000446 var input getTextInput
447 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700448 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000449 }
450
451 browserCtx, err := b.GetBrowserContext()
452 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700453 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000454 }
455
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700456 // Create a timeout context for this operation
457 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
458 defer cancel()
459
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000460 var text string
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700461 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000462 chromedp.WaitReady(input.Selector),
463 chromedp.Text(input.Selector, &text),
464 )
465 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700466 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000467 }
468
469 output := getTextOutput{Text: text}
470 result, err := json.Marshal(output)
471 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700472 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000473 }
474
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700475 return llm.TextContent(string(result)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000476}
477
// evalInput is the JSON argument object accepted by browser_eval.
type evalInput struct {
	// Expression is the JavaScript to evaluate.
	Expression string `json:"expression"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
483
// evalOutput is the JSON result object produced by browser_eval.
type evalOutput struct {
	// Result holds whatever value the evaluation decoded into.
	Result any `json:"result"`
}
487
488// NewEvalTool creates a tool for evaluating JavaScript
489func (b *BrowseTools) NewEvalTool() *llm.Tool {
490 return &llm.Tool{
491 Name: "browser_eval",
492 Description: "Evaluate JavaScript in the browser context",
493 InputSchema: json.RawMessage(`{
494 "type": "object",
495 "properties": {
496 "expression": {
497 "type": "string",
498 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700499 },
500 "timeout": {
501 "type": "string",
502 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000503 }
504 },
505 "required": ["expression"]
506 }`),
507 Run: b.evalRun,
508 }
509}
510
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700511func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000512 var input evalInput
513 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700514 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000515 }
516
517 browserCtx, err := b.GetBrowserContext()
518 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700519 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000520 }
521
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700522 // Create a timeout context for this operation
523 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
524 defer cancel()
525
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000526 var result any
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700527 err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000528 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700529 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000530 }
531
532 output := evalOutput{Result: result}
533 response, err := json.Marshal(output)
534 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700535 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000536 }
537
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700538 return llm.TextContent(string(response)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000539}
540
// screenshotInput is the JSON argument object accepted by
// browser_take_screenshot.
type screenshotInput struct {
	// Selector optionally restricts the screenshot to one element.
	Selector string `json:"selector,omitempty"`
	// Format is the requested output format ('base64' or 'png' per the schema).
	Format string `json:"format,omitempty"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
547
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000548// NewScreenshotTool creates a tool for taking screenshots
549func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
550 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700551 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000552 Description: "Take a screenshot of the page or a specific element",
553 InputSchema: json.RawMessage(`{
554 "type": "object",
555 "properties": {
556 "selector": {
557 "type": "string",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700558 "description": "CSS selector for the element to screenshot (optional)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000559 },
560 "format": {
561 "type": "string",
562 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
563 "enum": ["base64", "png"]
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700564 },
565 "timeout": {
566 "type": "string",
567 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000568 }
569 }
570 }`),
571 Run: b.screenshotRun,
572 }
573}
574
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700575func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000576 var input screenshotInput
577 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700578 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000579 }
580
581 browserCtx, err := b.GetBrowserContext()
582 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700583 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000584 }
585
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700586 // Create a timeout context for this operation
587 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
588 defer cancel()
589
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000590 var buf []byte
591 var actions []chromedp.Action
592
593 if input.Selector != "" {
594 // Take screenshot of specific element
595 actions = append(actions,
596 chromedp.WaitReady(input.Selector),
597 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
598 )
599 } else {
600 // Take full page screenshot
601 actions = append(actions, chromedp.CaptureScreenshot(&buf))
602 }
603
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700604 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000605 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700606 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000607 }
608
Philip Zeyliger542bda32025-06-11 18:31:03 -0700609 // Save the screenshot and get its ID for potential future reference
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000610 id := b.SaveScreenshot(buf)
611 if id == "" {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700612 return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000613 }
614
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700615 // Get the full path to the screenshot
616 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000617
Philip Zeyliger542bda32025-06-11 18:31:03 -0700618 // Encode the image as base64
619 base64Data := base64.StdEncoding.EncodeToString(buf)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700620
Philip Zeyliger542bda32025-06-11 18:31:03 -0700621 // Return the screenshot directly to the LLM
622 return []llm.Content{
623 {
624 Type: llm.ContentTypeText,
625 Text: fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath),
626 },
627 {
628 Type: llm.ContentTypeText, // Will be mapped to image in content array
629 MediaType: "image/png",
630 Data: base64Data,
631 },
632 }, nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000633}
634
// scrollIntoViewInput is the JSON argument object accepted by
// browser_scroll_into_view.
type scrollIntoViewInput struct {
	// Selector is the CSS selector of the element to bring into view.
	Selector string `json:"selector"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
640
641// NewScrollIntoViewTool creates a tool for scrolling elements into view
642func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
643 return &llm.Tool{
644 Name: "browser_scroll_into_view",
645 Description: "Scroll an element into view if it's not visible",
646 InputSchema: json.RawMessage(`{
647 "type": "object",
648 "properties": {
649 "selector": {
650 "type": "string",
651 "description": "CSS selector for the element to scroll into view"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700652 },
653 "timeout": {
654 "type": "string",
655 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000656 }
657 },
658 "required": ["selector"]
659 }`),
660 Run: b.scrollIntoViewRun,
661 }
662}
663
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700664func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000665 var input scrollIntoViewInput
666 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700667 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000668 }
669
670 browserCtx, err := b.GetBrowserContext()
671 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700672 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000673 }
674
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700675 // Create a timeout context for this operation
676 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
677 defer cancel()
678
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000679 script := fmt.Sprintf(`
680 const el = document.querySelector('%s');
681 if (el) {
682 el.scrollIntoView({behavior: 'smooth', block: 'center'});
683 return true;
684 }
685 return false;
686 `, input.Selector)
687
688 var result bool
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700689 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000690 chromedp.WaitReady(input.Selector),
691 chromedp.Evaluate(script, &result),
692 )
693 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700694 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000695 }
696
697 if !result {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700698 return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000699 }
700
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700701 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000702}
703
// resizeInput is the JSON argument object accepted by browser_resize.
type resizeInput struct {
	// Width is the requested width in pixels.
	Width int `json:"width"`
	// Height is the requested height in pixels.
	Height int `json:"height"`
	// Timeout is an optional duration string, handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
710
711// NewResizeTool creates a tool for resizing the browser window
712func (b *BrowseTools) NewResizeTool() *llm.Tool {
713 return &llm.Tool{
714 Name: "browser_resize",
715 Description: "Resize the browser window to a specific width and height",
716 InputSchema: json.RawMessage(`{
717 "type": "object",
718 "properties": {
719 "width": {
720 "type": "integer",
721 "description": "Window width in pixels"
722 },
723 "height": {
724 "type": "integer",
725 "description": "Window height in pixels"
726 },
727 "timeout": {
728 "type": "string",
729 "description": "Timeout as a Go duration string (default: 5s)"
730 }
731 },
732 "required": ["width", "height"]
733 }`),
734 Run: b.resizeRun,
735 }
736}
737
738func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
739 var input resizeInput
740 if err := json.Unmarshal(m, &input); err != nil {
741 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
742 }
743
744 browserCtx, err := b.GetBrowserContext()
745 if err != nil {
746 return llm.TextContent(errorResponse(err)), nil
747 }
748
749 // Create a timeout context for this operation
750 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
751 defer cancel()
752
753 // Validate dimensions
754 if input.Width <= 0 || input.Height <= 0 {
755 return llm.TextContent(errorResponse(fmt.Errorf("invalid dimensions: width and height must be positive"))), nil
756 }
757
758 // Resize the browser window
759 err = chromedp.Run(timeoutCtx,
760 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
761 )
762 if err != nil {
763 return llm.TextContent(errorResponse(err)), nil
764 }
765
766 return llm.TextContent(successResponse()), nil
767}
768
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700769// GetTools returns browser tools, optionally filtering out screenshot-related tools
770func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
771 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000772 b.NewNavigateTool(),
773 b.NewClickTool(),
774 b.NewTypeTool(),
775 b.NewWaitForTool(),
776 b.NewGetTextTool(),
777 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000778 b.NewScrollIntoViewTool(),
Philip Zeyliger05224842025-05-10 18:26:08 -0700779 b.NewResizeTool(),
Philip Zeyliger18e33682025-05-13 16:34:21 -0700780 b.NewRecentConsoleLogsTool(),
781 b.NewClearConsoleLogsTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000782 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700783
784 // Add screenshot-related tools if supported
785 if includeScreenshotTools {
786 tools = append(tools, b.NewScreenshotTool())
787 tools = append(tools, b.NewReadImageTool())
788 }
789
790 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000791}
792
793// SaveScreenshot saves a screenshot to disk and returns its ID
794func (b *BrowseTools) SaveScreenshot(data []byte) string {
795 // Generate a unique ID
796 id := uuid.New().String()
797
798 // Save the file
799 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000800 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000801 log.Printf("Failed to save screenshot: %v", err)
802 return ""
803 }
804
805 // Track this screenshot
806 b.screenshotsMutex.Lock()
807 b.screenshots[id] = time.Now()
808 b.screenshotsMutex.Unlock()
809
810 return id
811}
812
813// GetScreenshotPath returns the full path to a screenshot by ID
814func GetScreenshotPath(id string) string {
815 return filepath.Join(ScreenshotDir, id+".png")
816}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700817
// readImageInput is the JSON payload accepted by the read_image tool.
type readImageInput struct {
	// Path is the location of the image file to read.
	Path string `json:"path"`
	// Timeout is an optional Go duration string.
	Timeout string `json:"timeout,omitempty"`
}
823
824// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
825func (b *BrowseTools) NewReadImageTool() *llm.Tool {
826 return &llm.Tool{
Philip Zeyliger542bda32025-06-11 18:31:03 -0700827 Name: "read_image",
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700828 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
829 InputSchema: json.RawMessage(`{
830 "type": "object",
831 "properties": {
832 "path": {
833 "type": "string",
834 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700835 },
836 "timeout": {
837 "type": "string",
838 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700839 }
840 },
841 "required": ["path"]
842 }`),
843 Run: b.readImageRun,
844 }
845}
846
847func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
848 var input readImageInput
849 if err := json.Unmarshal(m, &input); err != nil {
850 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
851 }
852
853 // Check if the path exists
854 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
855 return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
856 }
857
858 // Read the file
859 imageData, err := os.ReadFile(input.Path)
860 if err != nil {
861 return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
862 }
863
864 // Detect the image type
865 imageType := http.DetectContentType(imageData)
866 if !strings.HasPrefix(imageType, "image/") {
867 return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
868 }
869
870 // Encode the image as base64
871 base64Data := base64.StdEncoding.EncodeToString(imageData)
872
873 // Create a Content object that includes both text and the image
874 return []llm.Content{
875 {
876 Type: llm.ContentTypeText,
877 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
878 },
879 {
880 Type: llm.ContentTypeText, // Will be mapped to image in content array
881 MediaType: imageType,
882 Data: base64Data,
883 },
884 }, nil
885}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700886
// parseTimeout converts a Go duration string (e.g. "10s", "1500ms") into a
// time.Duration suitable for context.WithTimeout.
//
// It returns a default of 5 seconds when the input is empty, cannot be
// parsed, or parses to a zero or negative duration. A non-positive timeout
// would produce a context that has already expired, so it is treated the
// same as an invalid value.
func parseTimeout(timeout string) time.Duration {
	const fallback = 5 * time.Second

	// time.ParseDuration rejects the empty string, so that case is covered
	// by the error branch as well.
	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return fallback
	}

	return dur
}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700902
903// captureConsoleLog captures a console log event and stores it
904func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
905 // Add to logs with mutex protection
906 b.consoleLogsMutex.Lock()
907 defer b.consoleLogsMutex.Unlock()
908
909 // Add the log and maintain max size
910 b.consoleLogs = append(b.consoleLogs, e)
911 if len(b.consoleLogs) > b.maxConsoleLogs {
912 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
913 }
914}
915
// recentConsoleLogsInput is the JSON payload accepted by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit is the optional maximum number of log entries to return.
	Limit int `json:"limit,omitempty"`
}
920
921// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
922func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
923 return &llm.Tool{
924 Name: "browser_recent_console_logs",
925 Description: "Get recent browser console logs",
926 InputSchema: json.RawMessage(`{
927 "type": "object",
928 "properties": {
929 "limit": {
930 "type": "integer",
931 "description": "Maximum number of log entries to return (default: 100)"
932 }
933 }
934 }`),
935 Run: b.recentConsoleLogsRun,
936 }
937}
938
939func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
940 var input recentConsoleLogsInput
941 if err := json.Unmarshal(m, &input); err != nil {
942 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
943 }
944
945 // Ensure browser is initialized
946 _, err := b.GetBrowserContext()
947 if err != nil {
948 return llm.TextContent(errorResponse(err)), nil
949 }
950
951 // Apply limit (default to 100 if not specified)
952 limit := 100
953 if input.Limit > 0 {
954 limit = input.Limit
955 }
956
957 // Get console logs with mutex protection
958 b.consoleLogsMutex.Lock()
959 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
960 start := 0
961 if len(b.consoleLogs) > limit {
962 start = len(b.consoleLogs) - limit
963 }
964 logs = append(logs, b.consoleLogs[start:]...)
965 b.consoleLogsMutex.Unlock()
966
967 // Format the logs as JSON
968 logData, err := json.MarshalIndent(logs, "", " ")
969 if err != nil {
970 return llm.TextContent(errorResponse(fmt.Errorf("failed to serialize logs: %w", err))), nil
971 }
972
973 // Format the logs
974 var sb strings.Builder
975 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
976
977 if len(logs) == 0 {
978 sb.WriteString("No console logs captured.")
979 } else {
980 // Add the JSON data for full details
981 sb.WriteString(string(logData))
982 }
983
984 return llm.TextContent(sb.String()), nil
985}
986
// clearConsoleLogsInput is the JSON payload accepted by the
// browser_clear_console_logs tool. It declares no fields.
type clearConsoleLogsInput struct{}
989
990// NewClearConsoleLogsTool creates a tool for clearing console logs
991func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
992 return &llm.Tool{
993 Name: "browser_clear_console_logs",
994 Description: "Clear all captured browser console logs",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700995 InputSchema: llm.EmptySchema(),
996 Run: b.clearConsoleLogsRun,
Philip Zeyliger18e33682025-05-13 16:34:21 -0700997 }
998}
999
1000func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
1001 var input clearConsoleLogsInput
1002 if err := json.Unmarshal(m, &input); err != nil {
1003 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
1004 }
1005
1006 // Ensure browser is initialized
1007 _, err := b.GetBrowserContext()
1008 if err != nil {
1009 return llm.TextContent(errorResponse(err)), nil
1010 }
1011
1012 // Clear console logs with mutex protection
1013 b.consoleLogsMutex.Lock()
1014 logCount := len(b.consoleLogs)
1015 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
1016 b.consoleLogsMutex.Unlock()
1017
1018 return llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount)), nil
1019}