blob: 928860dc90acf03b35185641b3f0ae52c8729eff [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +000011 "net/url"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000012 "os"
13 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070014 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000015 "sync"
16 "time"
17
Philip Zeyliger18e33682025-05-13 16:34:21 -070018 "github.com/chromedp/cdproto/runtime"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000019 "github.com/chromedp/chromedp"
20 "github.com/google/uuid"
21 "sketch.dev/llm"
22)
23
const (
	// ScreenshotDir is the directory on disk that holds saved screenshots.
	ScreenshotDir = "/tmp/sketch-screenshots"
)
26
27// BrowseTools contains all browser tools and manages a shared browser instance
28type BrowseTools struct {
29 ctx context.Context
30 cancel context.CancelFunc
31 browserCtx context.Context
32 browserCtxCancel context.CancelFunc
33 mux sync.Mutex
34 initOnce sync.Once
35 initialized bool
36 initErr error
37 // Map to track screenshots by ID and their creation time
38 screenshots map[string]time.Time
39 screenshotsMutex sync.Mutex
Philip Zeyliger18e33682025-05-13 16:34:21 -070040 // Console logs storage
41 consoleLogs []*runtime.EventConsoleAPICalled
42 consoleLogsMutex sync.Mutex
43 maxConsoleLogs int
Philip Zeyliger33d282f2025-05-03 04:01:54 +000044}
45
46// NewBrowseTools creates a new set of browser automation tools
47func NewBrowseTools(ctx context.Context) *BrowseTools {
48 ctx, cancel := context.WithCancel(ctx)
49
50 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000051 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000052 log.Printf("Failed to create screenshot directory: %v", err)
53 }
54
55 b := &BrowseTools{
Philip Zeyliger18e33682025-05-13 16:34:21 -070056 ctx: ctx,
57 cancel: cancel,
58 screenshots: make(map[string]time.Time),
59 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
60 maxConsoleLogs: 100,
Philip Zeyliger33d282f2025-05-03 04:01:54 +000061 }
62
63 return b
64}
65
66// Initialize starts the browser if it's not already running
67func (b *BrowseTools) Initialize() error {
68 b.mux.Lock()
69 defer b.mux.Unlock()
70
71 b.initOnce.Do(func() {
72 // ChromeDP.ExecPath has a list of common places to find Chrome...
73 opts := chromedp.DefaultExecAllocatorOptions[:]
Philip Zeyligerc0131342025-06-13 21:07:08 -070074 // This is the default when running as root, but we generally need it
75 // when running in a container, even when we aren't root (which is largely
76 // the case for tests).
77 opts = append(opts, chromedp.NoSandbox)
Philip Zeyligera35de5f2025-06-14 12:00:48 -070078 // Setting 'DBUS_SESSION_BUS_ADDRESS=""' or this flag allows tests to pass
79 // in GitHub runner contexts. It's a mystery why the failure isn't clear when this fails.
80 opts = append(opts, chromedp.Flag("--disable-dbus", true))
81 // This can be pretty slow in tests
Philip Zeyligerfe51d1d2025-06-16 21:19:44 -070082 opts = append(opts, chromedp.WSURLReadTimeout(60*time.Second))
Philip Zeyliger33d282f2025-05-03 04:01:54 +000083 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
84 browserCtx, browserCancel := chromedp.NewContext(
85 allocCtx,
Philip Zeyligerfe51d1d2025-06-16 21:19:44 -070086 chromedp.WithLogf(log.Printf), chromedp.WithErrorf(log.Printf), chromedp.WithBrowserOption(chromedp.WithDialTimeout(60*time.Second)),
Philip Zeyliger33d282f2025-05-03 04:01:54 +000087 )
88
89 b.browserCtx = browserCtx
90 b.browserCtxCancel = browserCancel
91
Philip Zeyliger18e33682025-05-13 16:34:21 -070092 // Set up console log listener
93 chromedp.ListenTarget(browserCtx, func(ev any) {
94 switch e := ev.(type) {
95 case *runtime.EventConsoleAPICalled:
96 b.captureConsoleLog(e)
97 }
98 })
99
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000100 // Ensure the browser starts
101 if err := chromedp.Run(browserCtx); err != nil {
102 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
103 return
104 }
Josh Bleecher Snyder7fbc8e42025-05-29 19:42:25 +0000105
106 // Set default viewport size to 1280x720 (16:9 widescreen)
107 if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
108 b.initErr = fmt.Errorf("failed to set default viewport: %w", err)
109 return
110 }
111
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000112 b.initialized = true
113 })
114
115 return b.initErr
116}
117
118// Close shuts down the browser
119func (b *BrowseTools) Close() {
120 b.mux.Lock()
121 defer b.mux.Unlock()
122
123 if b.browserCtxCancel != nil {
124 b.browserCtxCancel()
125 b.browserCtxCancel = nil
126 }
127
128 if b.cancel != nil {
129 b.cancel()
130 }
131
132 b.initialized = false
133 log.Println("Browser closed")
134}
135
136// GetBrowserContext returns the context for browser operations
137func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
138 if err := b.Initialize(); err != nil {
139 return nil, err
140 }
141 return b.browserCtx, nil
142}
143
// navigateInput is the decoded JSON input of the browser_navigate tool.
type navigateInput struct {
	// URL is the address to navigate to.
	URL string `json:"url"`
	// Timeout is an optional Go duration string limiting the operation.
	Timeout string `json:"timeout,omitempty"`
}
149
// isPort80 reports whether urlStr definitely uses port 80: either the port is
// written out as "80", or no port is given and the scheme is "http".
// A URL that fails to parse yields false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	switch u.Port() {
	case "80":
		return true
	case "":
		// No explicit port: only plain http implies port 80.
		return u.Scheme == "http"
	default:
		return false
	}
}
159
160// NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000161func (b *BrowseTools) NewNavigateTool() *llm.Tool {
162 return &llm.Tool{
163 Name: "browser_navigate",
164 Description: "Navigate the browser to a specific URL and wait for page to load",
165 InputSchema: json.RawMessage(`{
166 "type": "object",
167 "properties": {
168 "url": {
169 "type": "string",
170 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700171 },
172 "timeout": {
173 "type": "string",
174 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000175 }
176 },
177 "required": ["url"]
178 }`),
179 Run: b.navigateRun,
180 }
181}
182
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700183func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000184 var input navigateInput
185 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000186 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000187 }
188
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +0000189 if isPort80(input.URL) {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000190 return nil, fmt.Errorf("port 80 is not the port you're looking for--port 80 is the main sketch server")
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +0000191 }
192
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000193 browserCtx, err := b.GetBrowserContext()
194 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000195 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000196 }
197
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700198 // Create a timeout context for this operation
199 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
200 defer cancel()
201
202 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000203 chromedp.Navigate(input.URL),
204 chromedp.WaitReady("body"),
205 )
206 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000207 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000208 }
209
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000210 return llm.TextContent("done"), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000211}
212
// clickInput is the decoded JSON input of the browser_click tool.
type clickInput struct {
	// Selector is the CSS selector of the element to click.
	Selector string `json:"selector"`
	// WaitVisible requests an extra wait for visibility before the click.
	WaitVisible bool `json:"wait_visible,omitempty"`
	// Timeout is an optional Go duration string limiting the operation.
	Timeout string `json:"timeout,omitempty"`
}
219
220// NewClickTool creates a tool for clicking elements
221func (b *BrowseTools) NewClickTool() *llm.Tool {
222 return &llm.Tool{
223 Name: "browser_click",
224 Description: "Click the first element matching a CSS selector",
225 InputSchema: json.RawMessage(`{
226 "type": "object",
227 "properties": {
228 "selector": {
229 "type": "string",
230 "description": "CSS selector for the element to click"
231 },
232 "wait_visible": {
233 "type": "boolean",
234 "description": "Wait for the element to be visible before clicking"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700235 },
236 "timeout": {
237 "type": "string",
238 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000239 }
240 },
241 "required": ["selector"]
242 }`),
243 Run: b.clickRun,
244 }
245}
246
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700247func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000248 var input clickInput
249 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000250 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000251 }
252
253 browserCtx, err := b.GetBrowserContext()
254 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000255 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000256 }
257
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700258 // Create a timeout context for this operation
259 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
260 defer cancel()
261
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000262 actions := []chromedp.Action{
263 chromedp.WaitReady(input.Selector),
264 }
265
266 if input.WaitVisible {
267 actions = append(actions, chromedp.WaitVisible(input.Selector))
268 }
269
270 actions = append(actions, chromedp.Click(input.Selector))
271
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700272 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000273 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000274 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000275 }
276
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000277 return llm.TextContent("done"), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000278}
279
// typeInput is the decoded JSON input of the browser_type tool.
type typeInput struct {
	// Selector is the CSS selector of the input element.
	Selector string `json:"selector"`
	// Text is the text to send to the element.
	Text string `json:"text"`
	// Clear requests that the field be cleared before typing.
	Clear bool `json:"clear,omitempty"`
	// Timeout is an optional Go duration string limiting the operation.
	Timeout string `json:"timeout,omitempty"`
}
287
288// NewTypeTool creates a tool for typing into input elements
289func (b *BrowseTools) NewTypeTool() *llm.Tool {
290 return &llm.Tool{
291 Name: "browser_type",
292 Description: "Type text into an input or textarea element",
293 InputSchema: json.RawMessage(`{
294 "type": "object",
295 "properties": {
296 "selector": {
297 "type": "string",
298 "description": "CSS selector for the input element"
299 },
300 "text": {
301 "type": "string",
302 "description": "Text to type into the element"
303 },
304 "clear": {
305 "type": "boolean",
306 "description": "Clear the input field before typing"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700307 },
308 "timeout": {
309 "type": "string",
310 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000311 }
312 },
313 "required": ["selector", "text"]
314 }`),
315 Run: b.typeRun,
316 }
317}
318
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700319func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000320 var input typeInput
321 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000322 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000323 }
324
325 browserCtx, err := b.GetBrowserContext()
326 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000327 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000328 }
329
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700330 // Create a timeout context for this operation
331 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
332 defer cancel()
333
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000334 actions := []chromedp.Action{
335 chromedp.WaitReady(input.Selector),
336 chromedp.WaitVisible(input.Selector),
337 }
338
339 if input.Clear {
340 actions = append(actions, chromedp.Clear(input.Selector))
341 }
342
343 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
344
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700345 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000346 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000347 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000348 }
349
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000350 return llm.TextContent("done"), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000351}
352
// waitForInput is the decoded JSON input of the browser_wait_for tool.
type waitForInput struct {
	// Selector is the CSS selector of the element to wait for.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string limiting the wait.
	Timeout string `json:"timeout,omitempty"`
}
358
359// NewWaitForTool creates a tool for waiting for elements
360func (b *BrowseTools) NewWaitForTool() *llm.Tool {
361 return &llm.Tool{
362 Name: "browser_wait_for",
363 Description: "Wait for an element to be present in the DOM",
364 InputSchema: json.RawMessage(`{
365 "type": "object",
366 "properties": {
367 "selector": {
368 "type": "string",
369 "description": "CSS selector for the element to wait for"
370 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700371 "timeout": {
372 "type": "string",
373 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000374 }
375 },
376 "required": ["selector"]
377 }`),
378 Run: b.waitForRun,
379 }
380}
381
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700382func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000383 var input waitForInput
384 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000385 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000386 }
387
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000388 browserCtx, err := b.GetBrowserContext()
389 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000390 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000391 }
392
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700393 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000394 defer cancel()
395
396 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
397 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000398 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000399 }
400
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000401 return llm.TextContent("done"), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000402}
403
// getTextInput is the decoded JSON input of the browser_get_text tool.
type getTextInput struct {
	// Selector is the CSS selector of the element whose text is read.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string limiting the operation.
	Timeout string `json:"timeout,omitempty"`
}
409
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000410// NewGetTextTool creates a tool for getting text from elements
411func (b *BrowseTools) NewGetTextTool() *llm.Tool {
412 return &llm.Tool{
413 Name: "browser_get_text",
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000414 Description: "Get the innerText of an element, returned in innerText tag. Can be used to read the web page.",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000415 InputSchema: json.RawMessage(`{
416 "type": "object",
417 "properties": {
418 "selector": {
419 "type": "string",
420 "description": "CSS selector for the element to get text from"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700421 },
422 "timeout": {
423 "type": "string",
424 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000425 }
426 },
427 "required": ["selector"]
428 }`),
429 Run: b.getTextRun,
430 }
431}
432
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700433func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000434 var input getTextInput
435 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000436 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000437 }
438
439 browserCtx, err := b.GetBrowserContext()
440 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000441 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000442 }
443
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700444 // Create a timeout context for this operation
445 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
446 defer cancel()
447
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000448 var text string
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700449 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000450 chromedp.WaitReady(input.Selector),
451 chromedp.Text(input.Selector, &text),
452 )
453 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000454 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000455 }
456
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000457 return llm.TextContent("<innerText>" + text + "</innerText>"), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000458}
459
// evalInput is the decoded JSON input of the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript to evaluate.
	Expression string `json:"expression"`
	// Timeout is an optional Go duration string limiting the evaluation.
	Timeout string `json:"timeout,omitempty"`
}
465
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000466// NewEvalTool creates a tool for evaluating JavaScript
467func (b *BrowseTools) NewEvalTool() *llm.Tool {
468 return &llm.Tool{
469 Name: "browser_eval",
470 Description: "Evaluate JavaScript in the browser context",
471 InputSchema: json.RawMessage(`{
472 "type": "object",
473 "properties": {
474 "expression": {
475 "type": "string",
476 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700477 },
478 "timeout": {
479 "type": "string",
480 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000481 }
482 },
483 "required": ["expression"]
484 }`),
485 Run: b.evalRun,
486 }
487}
488
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700489func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000490 var input evalInput
491 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000492 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000493 }
494
495 browserCtx, err := b.GetBrowserContext()
496 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000497 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000498 }
499
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700500 // Create a timeout context for this operation
501 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
502 defer cancel()
503
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000504 var result any
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700505 err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000506 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000507 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000508 }
509
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000510 // Return the result as JSON
511 response, err := json.Marshal(result)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000512 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000513 return nil, fmt.Errorf("failed to marshal response: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000514 }
515
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000516 return llm.TextContent("<javascript_result>" + string(response) + "</javascript_result>"), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000517}
518
// screenshotInput is the decoded JSON input of the browser_take_screenshot tool.
type screenshotInput struct {
	// Selector optionally restricts the screenshot to one element.
	Selector string `json:"selector,omitempty"`
	// Format is the requested output format ("base64" or "png" per the schema).
	Format string `json:"format,omitempty"`
	// Timeout is an optional Go duration string limiting the operation.
	Timeout string `json:"timeout,omitempty"`
}
525
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000526// NewScreenshotTool creates a tool for taking screenshots
527func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
528 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700529 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000530 Description: "Take a screenshot of the page or a specific element",
531 InputSchema: json.RawMessage(`{
532 "type": "object",
533 "properties": {
534 "selector": {
535 "type": "string",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700536 "description": "CSS selector for the element to screenshot (optional)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000537 },
538 "format": {
539 "type": "string",
540 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
541 "enum": ["base64", "png"]
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700542 },
543 "timeout": {
544 "type": "string",
545 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000546 }
547 }
548 }`),
549 Run: b.screenshotRun,
550 }
551}
552
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700553func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000554 var input screenshotInput
555 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000556 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000557 }
558
559 browserCtx, err := b.GetBrowserContext()
560 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000561 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000562 }
563
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700564 // Create a timeout context for this operation
565 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
566 defer cancel()
567
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000568 var buf []byte
569 var actions []chromedp.Action
570
571 if input.Selector != "" {
572 // Take screenshot of specific element
573 actions = append(actions,
574 chromedp.WaitReady(input.Selector),
575 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
576 )
577 } else {
578 // Take full page screenshot
579 actions = append(actions, chromedp.CaptureScreenshot(&buf))
580 }
581
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700582 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000583 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000584 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000585 }
586
Philip Zeyliger542bda32025-06-11 18:31:03 -0700587 // Save the screenshot and get its ID for potential future reference
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000588 id := b.SaveScreenshot(buf)
589 if id == "" {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000590 return nil, fmt.Errorf("failed to save screenshot")
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000591 }
592
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700593 // Get the full path to the screenshot
594 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000595
Philip Zeyliger542bda32025-06-11 18:31:03 -0700596 // Encode the image as base64
597 base64Data := base64.StdEncoding.EncodeToString(buf)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700598
Philip Zeyliger542bda32025-06-11 18:31:03 -0700599 // Return the screenshot directly to the LLM
600 return []llm.Content{
601 {
602 Type: llm.ContentTypeText,
603 Text: fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath),
604 },
605 {
606 Type: llm.ContentTypeText, // Will be mapped to image in content array
607 MediaType: "image/png",
608 Data: base64Data,
609 },
610 }, nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000611}
612
// scrollIntoViewInput is the decoded JSON input of the
// browser_scroll_into_view tool.
type scrollIntoViewInput struct {
	// Selector is the CSS selector of the element to scroll to.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string limiting the operation.
	Timeout string `json:"timeout,omitempty"`
}
618
619// NewScrollIntoViewTool creates a tool for scrolling elements into view
620func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
621 return &llm.Tool{
622 Name: "browser_scroll_into_view",
623 Description: "Scroll an element into view if it's not visible",
624 InputSchema: json.RawMessage(`{
625 "type": "object",
626 "properties": {
627 "selector": {
628 "type": "string",
629 "description": "CSS selector for the element to scroll into view"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700630 },
631 "timeout": {
632 "type": "string",
633 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000634 }
635 },
636 "required": ["selector"]
637 }`),
638 Run: b.scrollIntoViewRun,
639 }
640}
641
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700642func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000643 var input scrollIntoViewInput
644 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000645 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000646 }
647
648 browserCtx, err := b.GetBrowserContext()
649 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000650 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000651 }
652
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700653 // Create a timeout context for this operation
654 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
655 defer cancel()
656
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000657 script := fmt.Sprintf(`
658 const el = document.querySelector('%s');
659 if (el) {
660 el.scrollIntoView({behavior: 'smooth', block: 'center'});
661 return true;
662 }
663 return false;
664 `, input.Selector)
665
666 var result bool
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700667 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000668 chromedp.WaitReady(input.Selector),
669 chromedp.Evaluate(script, &result),
670 )
671 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000672 return nil, err
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000673 }
674
675 if !result {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000676 return nil, fmt.Errorf("element not found: %s", input.Selector)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000677 }
678
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000679 return llm.TextContent("done"), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000680}
681
// resizeInput is the decoded JSON input of the browser_resize tool.
type resizeInput struct {
	// Width is the requested width in pixels.
	Width int `json:"width"`
	// Height is the requested height in pixels.
	Height int `json:"height"`
	// Timeout is an optional Go duration string limiting the operation.
	Timeout string `json:"timeout,omitempty"`
}
688
689// NewResizeTool creates a tool for resizing the browser window
690func (b *BrowseTools) NewResizeTool() *llm.Tool {
691 return &llm.Tool{
692 Name: "browser_resize",
693 Description: "Resize the browser window to a specific width and height",
694 InputSchema: json.RawMessage(`{
695 "type": "object",
696 "properties": {
697 "width": {
698 "type": "integer",
699 "description": "Window width in pixels"
700 },
701 "height": {
702 "type": "integer",
703 "description": "Window height in pixels"
704 },
705 "timeout": {
706 "type": "string",
707 "description": "Timeout as a Go duration string (default: 5s)"
708 }
709 },
710 "required": ["width", "height"]
711 }`),
712 Run: b.resizeRun,
713 }
714}
715
716func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
717 var input resizeInput
718 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000719 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger05224842025-05-10 18:26:08 -0700720 }
721
722 browserCtx, err := b.GetBrowserContext()
723 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000724 return nil, err
Philip Zeyliger05224842025-05-10 18:26:08 -0700725 }
726
727 // Create a timeout context for this operation
728 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
729 defer cancel()
730
731 // Validate dimensions
732 if input.Width <= 0 || input.Height <= 0 {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000733 return nil, fmt.Errorf("invalid dimensions: width and height must be positive")
Philip Zeyliger05224842025-05-10 18:26:08 -0700734 }
735
736 // Resize the browser window
737 err = chromedp.Run(timeoutCtx,
738 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
739 )
740 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000741 return nil, err
Philip Zeyliger05224842025-05-10 18:26:08 -0700742 }
743
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000744 return llm.TextContent("done"), nil
Philip Zeyliger05224842025-05-10 18:26:08 -0700745}
746
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700747// GetTools returns browser tools, optionally filtering out screenshot-related tools
748func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
749 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000750 b.NewNavigateTool(),
751 b.NewClickTool(),
752 b.NewTypeTool(),
753 b.NewWaitForTool(),
754 b.NewGetTextTool(),
755 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000756 b.NewScrollIntoViewTool(),
Philip Zeyliger05224842025-05-10 18:26:08 -0700757 b.NewResizeTool(),
Philip Zeyliger18e33682025-05-13 16:34:21 -0700758 b.NewRecentConsoleLogsTool(),
759 b.NewClearConsoleLogsTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000760 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700761
762 // Add screenshot-related tools if supported
763 if includeScreenshotTools {
764 tools = append(tools, b.NewScreenshotTool())
765 tools = append(tools, b.NewReadImageTool())
766 }
767
768 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000769}
770
771// SaveScreenshot saves a screenshot to disk and returns its ID
772func (b *BrowseTools) SaveScreenshot(data []byte) string {
773 // Generate a unique ID
774 id := uuid.New().String()
775
776 // Save the file
777 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000778 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000779 log.Printf("Failed to save screenshot: %v", err)
780 return ""
781 }
782
783 // Track this screenshot
784 b.screenshotsMutex.Lock()
785 b.screenshots[id] = time.Now()
786 b.screenshotsMutex.Unlock()
787
788 return id
789}
790
791// GetScreenshotPath returns the full path to a screenshot by ID
792func GetScreenshotPath(id string) string {
793 return filepath.Join(ScreenshotDir, id+".png")
794}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700795
// readImageInput is the JSON payload accepted by the read_image tool.
type readImageInput struct {
	// Path is the location of the image file to read (required by the tool schema).
	Path string `json:"path"`
	// Timeout is an optional Go duration string, e.g. "5s".
	// NOTE(review): readImageRun decodes this field but does not consult it.
	Timeout string `json:"timeout,omitempty"`
}
801
802// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
803func (b *BrowseTools) NewReadImageTool() *llm.Tool {
804 return &llm.Tool{
Philip Zeyliger542bda32025-06-11 18:31:03 -0700805 Name: "read_image",
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700806 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
807 InputSchema: json.RawMessage(`{
808 "type": "object",
809 "properties": {
810 "path": {
811 "type": "string",
812 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700813 },
814 "timeout": {
815 "type": "string",
816 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700817 }
818 },
819 "required": ["path"]
820 }`),
821 Run: b.readImageRun,
822 }
823}
824
825func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
826 var input readImageInput
827 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000828 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700829 }
830
831 // Check if the path exists
832 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000833 return nil, fmt.Errorf("image file not found: %s", input.Path)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700834 }
835
836 // Read the file
837 imageData, err := os.ReadFile(input.Path)
838 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000839 return nil, fmt.Errorf("failed to read image file: %w", err)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700840 }
841
842 // Detect the image type
843 imageType := http.DetectContentType(imageData)
844 if !strings.HasPrefix(imageType, "image/") {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000845 return nil, fmt.Errorf("file is not an image: %s", imageType)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700846 }
847
848 // Encode the image as base64
849 base64Data := base64.StdEncoding.EncodeToString(imageData)
850
851 // Create a Content object that includes both text and the image
852 return []llm.Content{
853 {
854 Type: llm.ContentTypeText,
855 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
856 },
857 {
858 Type: llm.ContentTypeText, // Will be mapped to image in content array
859 MediaType: imageType,
860 Data: base64Data,
861 },
862 }, nil
863}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700864
// parseTimeout converts a Go duration string (e.g. "250ms", "10s") into a
// time.Duration suitable for use as an operation timeout.
//
// It returns a default of 5 seconds when the string is empty, cannot be
// parsed, or parses to a zero or negative duration. Non-positive values are
// rejected because a context created with such a timeout is already expired,
// which would make the operation fail immediately.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 5 * time.Second

	// time.ParseDuration reports an error for the empty string, so the
	// "not specified" case is covered by the same check.
	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}

	return dur
}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700880
881// captureConsoleLog captures a console log event and stores it
882func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
883 // Add to logs with mutex protection
884 b.consoleLogsMutex.Lock()
885 defer b.consoleLogsMutex.Unlock()
886
887 // Add the log and maintain max size
888 b.consoleLogs = append(b.consoleLogs, e)
889 if len(b.consoleLogs) > b.maxConsoleLogs {
890 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
891 }
892}
893
// recentConsoleLogsInput is the JSON payload accepted by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit is the maximum number of log entries to return. Values that are
	// zero or negative make recentConsoleLogsRun fall back to 100.
	Limit int `json:"limit,omitempty"`
}
898
899// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
900func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
901 return &llm.Tool{
902 Name: "browser_recent_console_logs",
903 Description: "Get recent browser console logs",
904 InputSchema: json.RawMessage(`{
905 "type": "object",
906 "properties": {
907 "limit": {
908 "type": "integer",
909 "description": "Maximum number of log entries to return (default: 100)"
910 }
911 }
912 }`),
913 Run: b.recentConsoleLogsRun,
914 }
915}
916
917func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
918 var input recentConsoleLogsInput
919 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000920 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger18e33682025-05-13 16:34:21 -0700921 }
922
923 // Ensure browser is initialized
924 _, err := b.GetBrowserContext()
925 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000926 return nil, err
Philip Zeyliger18e33682025-05-13 16:34:21 -0700927 }
928
929 // Apply limit (default to 100 if not specified)
930 limit := 100
931 if input.Limit > 0 {
932 limit = input.Limit
933 }
934
935 // Get console logs with mutex protection
936 b.consoleLogsMutex.Lock()
937 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
938 start := 0
939 if len(b.consoleLogs) > limit {
940 start = len(b.consoleLogs) - limit
941 }
942 logs = append(logs, b.consoleLogs[start:]...)
943 b.consoleLogsMutex.Unlock()
944
945 // Format the logs as JSON
946 logData, err := json.MarshalIndent(logs, "", " ")
947 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000948 return nil, fmt.Errorf("failed to serialize logs: %w", err)
Philip Zeyliger18e33682025-05-13 16:34:21 -0700949 }
950
951 // Format the logs
952 var sb strings.Builder
953 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
954
955 if len(logs) == 0 {
956 sb.WriteString("No console logs captured.")
957 } else {
958 // Add the JSON data for full details
959 sb.WriteString(string(logData))
960 }
961
962 return llm.TextContent(sb.String()), nil
963}
964
// clearConsoleLogsInput is the JSON payload accepted by the
// browser_clear_console_logs tool. The tool takes no arguments, so the struct
// has no fields.
type clearConsoleLogsInput struct{}
967
968// NewClearConsoleLogsTool creates a tool for clearing console logs
969func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
970 return &llm.Tool{
971 Name: "browser_clear_console_logs",
972 Description: "Clear all captured browser console logs",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700973 InputSchema: llm.EmptySchema(),
974 Run: b.clearConsoleLogsRun,
Philip Zeyliger18e33682025-05-13 16:34:21 -0700975 }
976}
977
978func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
979 var input clearConsoleLogsInput
980 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000981 return nil, fmt.Errorf("invalid input: %w", err)
Philip Zeyliger18e33682025-05-13 16:34:21 -0700982 }
983
984 // Ensure browser is initialized
985 _, err := b.GetBrowserContext()
986 if err != nil {
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000987 return nil, err
Philip Zeyliger18e33682025-05-13 16:34:21 -0700988 }
989
990 // Clear console logs with mutex protection
991 b.consoleLogsMutex.Lock()
992 logCount := len(b.consoleLogs)
993 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
994 b.consoleLogsMutex.Unlock()
995
996 return llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount)), nil
997}