blob: 5cd28cd5a0e7aa22c4bb2bd56df9e262e24973b3 [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +000011 "net/url"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000012 "os"
13 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070014 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000015 "sync"
16 "time"
17
Philip Zeyliger18e33682025-05-13 16:34:21 -070018 "github.com/chromedp/cdproto/runtime"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000019 "github.com/chromedp/chromedp"
20 "github.com/google/uuid"
21 "sketch.dev/llm"
22)
23
// ScreenshotDir is the on-disk directory that holds saved screenshots.
const ScreenshotDir = "/tmp/sketch-screenshots"
26
27// BrowseTools contains all browser tools and manages a shared browser instance
28type BrowseTools struct {
29 ctx context.Context
30 cancel context.CancelFunc
31 browserCtx context.Context
32 browserCtxCancel context.CancelFunc
33 mux sync.Mutex
34 initOnce sync.Once
35 initialized bool
36 initErr error
37 // Map to track screenshots by ID and their creation time
38 screenshots map[string]time.Time
39 screenshotsMutex sync.Mutex
Philip Zeyliger18e33682025-05-13 16:34:21 -070040 // Console logs storage
41 consoleLogs []*runtime.EventConsoleAPICalled
42 consoleLogsMutex sync.Mutex
43 maxConsoleLogs int
Philip Zeyliger33d282f2025-05-03 04:01:54 +000044}
45
46// NewBrowseTools creates a new set of browser automation tools
47func NewBrowseTools(ctx context.Context) *BrowseTools {
48 ctx, cancel := context.WithCancel(ctx)
49
50 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000051 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000052 log.Printf("Failed to create screenshot directory: %v", err)
53 }
54
55 b := &BrowseTools{
Philip Zeyliger18e33682025-05-13 16:34:21 -070056 ctx: ctx,
57 cancel: cancel,
58 screenshots: make(map[string]time.Time),
59 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
60 maxConsoleLogs: 100,
Philip Zeyliger33d282f2025-05-03 04:01:54 +000061 }
62
63 return b
64}
65
66// Initialize starts the browser if it's not already running
67func (b *BrowseTools) Initialize() error {
68 b.mux.Lock()
69 defer b.mux.Unlock()
70
71 b.initOnce.Do(func() {
72 // ChromeDP.ExecPath has a list of common places to find Chrome...
73 opts := chromedp.DefaultExecAllocatorOptions[:]
74 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
75 browserCtx, browserCancel := chromedp.NewContext(
76 allocCtx,
77 chromedp.WithLogf(log.Printf),
78 )
79
80 b.browserCtx = browserCtx
81 b.browserCtxCancel = browserCancel
82
Philip Zeyliger18e33682025-05-13 16:34:21 -070083 // Set up console log listener
84 chromedp.ListenTarget(browserCtx, func(ev any) {
85 switch e := ev.(type) {
86 case *runtime.EventConsoleAPICalled:
87 b.captureConsoleLog(e)
88 }
89 })
90
Philip Zeyliger33d282f2025-05-03 04:01:54 +000091 // Ensure the browser starts
92 if err := chromedp.Run(browserCtx); err != nil {
93 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
94 return
95 }
Josh Bleecher Snyder7fbc8e42025-05-29 19:42:25 +000096
97 // Set default viewport size to 1280x720 (16:9 widescreen)
98 if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
99 b.initErr = fmt.Errorf("failed to set default viewport: %w", err)
100 return
101 }
102
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000103 b.initialized = true
104 })
105
106 return b.initErr
107}
108
109// Close shuts down the browser
110func (b *BrowseTools) Close() {
111 b.mux.Lock()
112 defer b.mux.Unlock()
113
114 if b.browserCtxCancel != nil {
115 b.browserCtxCancel()
116 b.browserCtxCancel = nil
117 }
118
119 if b.cancel != nil {
120 b.cancel()
121 }
122
123 b.initialized = false
124 log.Println("Browser closed")
125}
126
127// GetBrowserContext returns the context for browser operations
128func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
129 if err := b.Initialize(); err != nil {
130 return nil, err
131 }
132 return b.browserCtx, nil
133}
134
// baseResponse is the JSON shape shared by tool responses: a single
// "status" field that is omitted when empty.
type baseResponse struct {
	// Status is serialized as "status".
	Status string `json:"status,omitempty"`
}
139
// successResponse returns the fixed JSON document that tools send back when
// an operation succeeds.
func successResponse() string {
	const body = `{"status":"success"}`
	return body
}
143
// errorResponse renders err as a JSON document of the form
// {"status":"error","error":"<message>"}.
//
// The message is JSON-encoded rather than spliced into a template, so quotes,
// backslashes and control characters in the error text still yield valid
// JSON. HTML escaping is turned off so that characters such as '<' and '>'
// (common in CSS selectors) stay readable. A nil err is reported as
// "unknown error" instead of panicking.
func errorResponse(err error) string {
	msg := "unknown error"
	if err != nil {
		msg = err.Error()
	}

	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{Status: "error", Error: msg}

	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	enc.SetEscapeHTML(false)
	if encErr := enc.Encode(payload); encErr != nil {
		// Not expected for a struct of two strings; keep the output valid JSON.
		return `{"status":"error","error":"failed to encode error message"}`
	}
	// Encode appends a newline that the response format does not include.
	return strings.TrimSuffix(sb.String(), "\n")
}
147
// navigateInput is the decoded JSON input of the browser_navigate tool.
type navigateInput struct {
	// URL is the address to open ("url").
	URL string `json:"url"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
153
// isPort80 reports whether urlStr definitely targets port 80: either the
// port is written out as 80, or no port is given and the scheme is http.
// A URL that fails to parse is reported as false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	switch u.Port() {
	case "80":
		return true
	case "":
		// No explicit port: only plain http implies port 80.
		return u.Scheme == "http"
	default:
		return false
	}
}
163
164// NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000165func (b *BrowseTools) NewNavigateTool() *llm.Tool {
166 return &llm.Tool{
167 Name: "browser_navigate",
168 Description: "Navigate the browser to a specific URL and wait for page to load",
169 InputSchema: json.RawMessage(`{
170 "type": "object",
171 "properties": {
172 "url": {
173 "type": "string",
174 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700175 },
176 "timeout": {
177 "type": "string",
178 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000179 }
180 },
181 "required": ["url"]
182 }`),
183 Run: b.navigateRun,
184 }
185}
186
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700187func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000188 var input navigateInput
189 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700190 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000191 }
192
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +0000193 if isPort80(input.URL) {
194 return llm.TextContent(errorResponse(fmt.Errorf("port 80 is not the port you're looking for--it is the main sketch server"))), nil
195 }
196
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000197 browserCtx, err := b.GetBrowserContext()
198 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700199 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000200 }
201
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700202 // Create a timeout context for this operation
203 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
204 defer cancel()
205
206 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000207 chromedp.Navigate(input.URL),
208 chromedp.WaitReady("body"),
209 )
210 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700211 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000212 }
213
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700214 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000215}
216
// clickInput is the decoded JSON input of the browser_click tool.
type clickInput struct {
	// Selector is the CSS selector of the element to click ("selector").
	Selector string `json:"selector"`
	// WaitVisible asks for a visibility wait before the click ("wait_visible").
	WaitVisible bool `json:"wait_visible,omitempty"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
223
224// NewClickTool creates a tool for clicking elements
225func (b *BrowseTools) NewClickTool() *llm.Tool {
226 return &llm.Tool{
227 Name: "browser_click",
228 Description: "Click the first element matching a CSS selector",
229 InputSchema: json.RawMessage(`{
230 "type": "object",
231 "properties": {
232 "selector": {
233 "type": "string",
234 "description": "CSS selector for the element to click"
235 },
236 "wait_visible": {
237 "type": "boolean",
238 "description": "Wait for the element to be visible before clicking"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700239 },
240 "timeout": {
241 "type": "string",
242 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000243 }
244 },
245 "required": ["selector"]
246 }`),
247 Run: b.clickRun,
248 }
249}
250
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700251func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000252 var input clickInput
253 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700254 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000255 }
256
257 browserCtx, err := b.GetBrowserContext()
258 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700259 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000260 }
261
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700262 // Create a timeout context for this operation
263 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
264 defer cancel()
265
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000266 actions := []chromedp.Action{
267 chromedp.WaitReady(input.Selector),
268 }
269
270 if input.WaitVisible {
271 actions = append(actions, chromedp.WaitVisible(input.Selector))
272 }
273
274 actions = append(actions, chromedp.Click(input.Selector))
275
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700276 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000277 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700278 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000279 }
280
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700281 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000282}
283
// typeInput is the decoded JSON input of the browser_type tool.
type typeInput struct {
	// Selector is the CSS selector of the input element ("selector").
	Selector string `json:"selector"`
	// Text is the text to send to the element ("text").
	Text string `json:"text"`
	// Clear asks for the field to be cleared before typing ("clear").
	Clear bool `json:"clear,omitempty"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
291
292// NewTypeTool creates a tool for typing into input elements
293func (b *BrowseTools) NewTypeTool() *llm.Tool {
294 return &llm.Tool{
295 Name: "browser_type",
296 Description: "Type text into an input or textarea element",
297 InputSchema: json.RawMessage(`{
298 "type": "object",
299 "properties": {
300 "selector": {
301 "type": "string",
302 "description": "CSS selector for the input element"
303 },
304 "text": {
305 "type": "string",
306 "description": "Text to type into the element"
307 },
308 "clear": {
309 "type": "boolean",
310 "description": "Clear the input field before typing"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700311 },
312 "timeout": {
313 "type": "string",
314 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000315 }
316 },
317 "required": ["selector", "text"]
318 }`),
319 Run: b.typeRun,
320 }
321}
322
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700323func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000324 var input typeInput
325 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700326 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000327 }
328
329 browserCtx, err := b.GetBrowserContext()
330 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700331 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000332 }
333
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700334 // Create a timeout context for this operation
335 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
336 defer cancel()
337
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000338 actions := []chromedp.Action{
339 chromedp.WaitReady(input.Selector),
340 chromedp.WaitVisible(input.Selector),
341 }
342
343 if input.Clear {
344 actions = append(actions, chromedp.Clear(input.Selector))
345 }
346
347 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
348
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700349 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000350 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700351 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000352 }
353
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700354 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000355}
356
// waitForInput is the decoded JSON input of the browser_wait_for tool.
type waitForInput struct {
	// Selector is the CSS selector to wait for ("selector").
	Selector string `json:"selector"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
362
363// NewWaitForTool creates a tool for waiting for elements
364func (b *BrowseTools) NewWaitForTool() *llm.Tool {
365 return &llm.Tool{
366 Name: "browser_wait_for",
367 Description: "Wait for an element to be present in the DOM",
368 InputSchema: json.RawMessage(`{
369 "type": "object",
370 "properties": {
371 "selector": {
372 "type": "string",
373 "description": "CSS selector for the element to wait for"
374 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700375 "timeout": {
376 "type": "string",
377 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000378 }
379 },
380 "required": ["selector"]
381 }`),
382 Run: b.waitForRun,
383 }
384}
385
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700386func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000387 var input waitForInput
388 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700389 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000390 }
391
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000392 browserCtx, err := b.GetBrowserContext()
393 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700394 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000395 }
396
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700397 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000398 defer cancel()
399
400 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
401 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700402 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000403 }
404
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700405 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000406}
407
// getTextInput is the decoded JSON input of the browser_get_text tool.
type getTextInput struct {
	// Selector is the CSS selector of the element to read ("selector").
	Selector string `json:"selector"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
413
// getTextOutput is the JSON result of the browser_get_text tool.
type getTextOutput struct {
	// Text is the text read from the element ("text").
	Text string `json:"text"`
}
417
418// NewGetTextTool creates a tool for getting text from elements
419func (b *BrowseTools) NewGetTextTool() *llm.Tool {
420 return &llm.Tool{
421 Name: "browser_get_text",
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700422 Description: "Get the innerText of an element. Can be used to read the web page.",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000423 InputSchema: json.RawMessage(`{
424 "type": "object",
425 "properties": {
426 "selector": {
427 "type": "string",
428 "description": "CSS selector for the element to get text from"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700429 },
430 "timeout": {
431 "type": "string",
432 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000433 }
434 },
435 "required": ["selector"]
436 }`),
437 Run: b.getTextRun,
438 }
439}
440
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700441func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000442 var input getTextInput
443 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700444 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000445 }
446
447 browserCtx, err := b.GetBrowserContext()
448 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700449 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000450 }
451
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700452 // Create a timeout context for this operation
453 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
454 defer cancel()
455
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000456 var text string
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700457 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000458 chromedp.WaitReady(input.Selector),
459 chromedp.Text(input.Selector, &text),
460 )
461 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700462 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000463 }
464
465 output := getTextOutput{Text: text}
466 result, err := json.Marshal(output)
467 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700468 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000469 }
470
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700471 return llm.TextContent(string(result)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000472}
473
// evalInput is the decoded JSON input of the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript to evaluate ("expression").
	Expression string `json:"expression"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
479
// evalOutput is the JSON result of the browser_eval tool.
type evalOutput struct {
	// Result holds whatever value the evaluation produced ("result").
	Result any `json:"result"`
}
483
484// NewEvalTool creates a tool for evaluating JavaScript
485func (b *BrowseTools) NewEvalTool() *llm.Tool {
486 return &llm.Tool{
487 Name: "browser_eval",
488 Description: "Evaluate JavaScript in the browser context",
489 InputSchema: json.RawMessage(`{
490 "type": "object",
491 "properties": {
492 "expression": {
493 "type": "string",
494 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700495 },
496 "timeout": {
497 "type": "string",
498 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000499 }
500 },
501 "required": ["expression"]
502 }`),
503 Run: b.evalRun,
504 }
505}
506
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700507func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000508 var input evalInput
509 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700510 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000511 }
512
513 browserCtx, err := b.GetBrowserContext()
514 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700515 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000516 }
517
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700518 // Create a timeout context for this operation
519 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
520 defer cancel()
521
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000522 var result any
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700523 err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000524 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700525 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000526 }
527
528 output := evalOutput{Result: result}
529 response, err := json.Marshal(output)
530 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700531 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000532 }
533
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700534 return llm.TextContent(string(response)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000535}
536
// screenshotInput is the decoded JSON input of the browser_take_screenshot
// tool. Every field is optional.
type screenshotInput struct {
	// Selector limits the screenshot to one element when set ("selector").
	Selector string `json:"selector,omitempty"`
	// Format is the requested output format ("format").
	Format string `json:"format,omitempty"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
543
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000544// NewScreenshotTool creates a tool for taking screenshots
545func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
546 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700547 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000548 Description: "Take a screenshot of the page or a specific element",
549 InputSchema: json.RawMessage(`{
550 "type": "object",
551 "properties": {
552 "selector": {
553 "type": "string",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700554 "description": "CSS selector for the element to screenshot (optional)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000555 },
556 "format": {
557 "type": "string",
558 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
559 "enum": ["base64", "png"]
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700560 },
561 "timeout": {
562 "type": "string",
563 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000564 }
565 }
566 }`),
567 Run: b.screenshotRun,
568 }
569}
570
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700571func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000572 var input screenshotInput
573 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700574 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000575 }
576
577 browserCtx, err := b.GetBrowserContext()
578 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700579 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000580 }
581
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700582 // Create a timeout context for this operation
583 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
584 defer cancel()
585
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000586 var buf []byte
587 var actions []chromedp.Action
588
589 if input.Selector != "" {
590 // Take screenshot of specific element
591 actions = append(actions,
592 chromedp.WaitReady(input.Selector),
593 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
594 )
595 } else {
596 // Take full page screenshot
597 actions = append(actions, chromedp.CaptureScreenshot(&buf))
598 }
599
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700600 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000601 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700602 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000603 }
604
Philip Zeyliger542bda32025-06-11 18:31:03 -0700605 // Save the screenshot and get its ID for potential future reference
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000606 id := b.SaveScreenshot(buf)
607 if id == "" {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700608 return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000609 }
610
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700611 // Get the full path to the screenshot
612 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000613
Philip Zeyliger542bda32025-06-11 18:31:03 -0700614 // Encode the image as base64
615 base64Data := base64.StdEncoding.EncodeToString(buf)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700616
Philip Zeyliger542bda32025-06-11 18:31:03 -0700617 // Return the screenshot directly to the LLM
618 return []llm.Content{
619 {
620 Type: llm.ContentTypeText,
621 Text: fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath),
622 },
623 {
624 Type: llm.ContentTypeText, // Will be mapped to image in content array
625 MediaType: "image/png",
626 Data: base64Data,
627 },
628 }, nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000629}
630
// scrollIntoViewInput is the decoded JSON input of the
// browser_scroll_into_view tool.
type scrollIntoViewInput struct {
	// Selector is the CSS selector of the element to scroll to ("selector").
	Selector string `json:"selector"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
636
637// NewScrollIntoViewTool creates a tool for scrolling elements into view
638func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
639 return &llm.Tool{
640 Name: "browser_scroll_into_view",
641 Description: "Scroll an element into view if it's not visible",
642 InputSchema: json.RawMessage(`{
643 "type": "object",
644 "properties": {
645 "selector": {
646 "type": "string",
647 "description": "CSS selector for the element to scroll into view"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700648 },
649 "timeout": {
650 "type": "string",
651 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000652 }
653 },
654 "required": ["selector"]
655 }`),
656 Run: b.scrollIntoViewRun,
657 }
658}
659
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700660func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000661 var input scrollIntoViewInput
662 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700663 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000664 }
665
666 browserCtx, err := b.GetBrowserContext()
667 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700668 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000669 }
670
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700671 // Create a timeout context for this operation
672 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
673 defer cancel()
674
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000675 script := fmt.Sprintf(`
676 const el = document.querySelector('%s');
677 if (el) {
678 el.scrollIntoView({behavior: 'smooth', block: 'center'});
679 return true;
680 }
681 return false;
682 `, input.Selector)
683
684 var result bool
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700685 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000686 chromedp.WaitReady(input.Selector),
687 chromedp.Evaluate(script, &result),
688 )
689 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700690 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000691 }
692
693 if !result {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700694 return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000695 }
696
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700697 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000698}
699
// resizeInput is the decoded JSON input of the browser_resize tool.
type resizeInput struct {
	// Width is the requested width in pixels ("width").
	Width int `json:"width"`
	// Height is the requested height in pixels ("height").
	Height int `json:"height"`
	// Timeout is an optional duration string ("timeout").
	Timeout string `json:"timeout,omitempty"`
}
706
707// NewResizeTool creates a tool for resizing the browser window
708func (b *BrowseTools) NewResizeTool() *llm.Tool {
709 return &llm.Tool{
710 Name: "browser_resize",
711 Description: "Resize the browser window to a specific width and height",
712 InputSchema: json.RawMessage(`{
713 "type": "object",
714 "properties": {
715 "width": {
716 "type": "integer",
717 "description": "Window width in pixels"
718 },
719 "height": {
720 "type": "integer",
721 "description": "Window height in pixels"
722 },
723 "timeout": {
724 "type": "string",
725 "description": "Timeout as a Go duration string (default: 5s)"
726 }
727 },
728 "required": ["width", "height"]
729 }`),
730 Run: b.resizeRun,
731 }
732}
733
734func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
735 var input resizeInput
736 if err := json.Unmarshal(m, &input); err != nil {
737 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
738 }
739
740 browserCtx, err := b.GetBrowserContext()
741 if err != nil {
742 return llm.TextContent(errorResponse(err)), nil
743 }
744
745 // Create a timeout context for this operation
746 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
747 defer cancel()
748
749 // Validate dimensions
750 if input.Width <= 0 || input.Height <= 0 {
751 return llm.TextContent(errorResponse(fmt.Errorf("invalid dimensions: width and height must be positive"))), nil
752 }
753
754 // Resize the browser window
755 err = chromedp.Run(timeoutCtx,
756 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
757 )
758 if err != nil {
759 return llm.TextContent(errorResponse(err)), nil
760 }
761
762 return llm.TextContent(successResponse()), nil
763}
764
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700765// GetTools returns browser tools, optionally filtering out screenshot-related tools
766func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
767 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000768 b.NewNavigateTool(),
769 b.NewClickTool(),
770 b.NewTypeTool(),
771 b.NewWaitForTool(),
772 b.NewGetTextTool(),
773 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000774 b.NewScrollIntoViewTool(),
Philip Zeyliger05224842025-05-10 18:26:08 -0700775 b.NewResizeTool(),
Philip Zeyliger18e33682025-05-13 16:34:21 -0700776 b.NewRecentConsoleLogsTool(),
777 b.NewClearConsoleLogsTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000778 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700779
780 // Add screenshot-related tools if supported
781 if includeScreenshotTools {
782 tools = append(tools, b.NewScreenshotTool())
783 tools = append(tools, b.NewReadImageTool())
784 }
785
786 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000787}
788
789// SaveScreenshot saves a screenshot to disk and returns its ID
790func (b *BrowseTools) SaveScreenshot(data []byte) string {
791 // Generate a unique ID
792 id := uuid.New().String()
793
794 // Save the file
795 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000796 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000797 log.Printf("Failed to save screenshot: %v", err)
798 return ""
799 }
800
801 // Track this screenshot
802 b.screenshotsMutex.Lock()
803 b.screenshots[id] = time.Now()
804 b.screenshotsMutex.Unlock()
805
806 return id
807}
808
809// GetScreenshotPath returns the full path to a screenshot by ID
810func GetScreenshotPath(id string) string {
811 return filepath.Join(ScreenshotDir, id+".png")
812}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700813
// readImageInput is the JSON payload decoded by the read-image tool.
type readImageInput struct {
	// Path is the location of the image file to read.
	Path string `json:"path"`
	// Timeout is an optional Go duration string (for example "5s").
	Timeout string `json:"timeout,omitempty"`
}
819
820// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
821func (b *BrowseTools) NewReadImageTool() *llm.Tool {
822 return &llm.Tool{
Philip Zeyliger542bda32025-06-11 18:31:03 -0700823 Name: "read_image",
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700824 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
825 InputSchema: json.RawMessage(`{
826 "type": "object",
827 "properties": {
828 "path": {
829 "type": "string",
830 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700831 },
832 "timeout": {
833 "type": "string",
834 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700835 }
836 },
837 "required": ["path"]
838 }`),
839 Run: b.readImageRun,
840 }
841}
842
843func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
844 var input readImageInput
845 if err := json.Unmarshal(m, &input); err != nil {
846 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
847 }
848
849 // Check if the path exists
850 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
851 return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
852 }
853
854 // Read the file
855 imageData, err := os.ReadFile(input.Path)
856 if err != nil {
857 return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
858 }
859
860 // Detect the image type
861 imageType := http.DetectContentType(imageData)
862 if !strings.HasPrefix(imageType, "image/") {
863 return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
864 }
865
866 // Encode the image as base64
867 base64Data := base64.StdEncoding.EncodeToString(imageData)
868
869 // Create a Content object that includes both text and the image
870 return []llm.Content{
871 {
872 Type: llm.ContentTypeText,
873 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
874 },
875 {
876 Type: llm.ContentTypeText, // Will be mapped to image in content array
877 MediaType: imageType,
878 Data: base64Data,
879 },
880 }, nil
881}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700882
// parseTimeout converts a Go duration string (for example "500ms" or "10s")
// into a time.Duration suitable for use as an operation timeout.
//
// It returns a default of 5 seconds when timeout is empty, cannot be parsed
// by time.ParseDuration, or parses to a zero or negative duration. A
// non-positive value is rejected because passing it to context.WithTimeout
// yields a context whose deadline has already passed, so the operation would
// fail before it starts.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 5 * time.Second

	if timeout == "" {
		return defaultTimeout
	}

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		// Unparseable or non-positive: fall back to the default.
		return defaultTimeout
	}

	return dur
}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700898
899// captureConsoleLog captures a console log event and stores it
900func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
901 // Add to logs with mutex protection
902 b.consoleLogsMutex.Lock()
903 defer b.consoleLogsMutex.Unlock()
904
905 // Add the log and maintain max size
906 b.consoleLogs = append(b.consoleLogs, e)
907 if len(b.consoleLogs) > b.maxConsoleLogs {
908 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
909 }
910}
911
// recentConsoleLogsInput is the JSON payload decoded by the recent-console-logs
// tool.
type recentConsoleLogsInput struct {
	// Limit is the optional maximum number of log entries to return.
	Limit int `json:"limit,omitempty"`
}
916
917// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
918func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
919 return &llm.Tool{
920 Name: "browser_recent_console_logs",
921 Description: "Get recent browser console logs",
922 InputSchema: json.RawMessage(`{
923 "type": "object",
924 "properties": {
925 "limit": {
926 "type": "integer",
927 "description": "Maximum number of log entries to return (default: 100)"
928 }
929 }
930 }`),
931 Run: b.recentConsoleLogsRun,
932 }
933}
934
935func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
936 var input recentConsoleLogsInput
937 if err := json.Unmarshal(m, &input); err != nil {
938 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
939 }
940
941 // Ensure browser is initialized
942 _, err := b.GetBrowserContext()
943 if err != nil {
944 return llm.TextContent(errorResponse(err)), nil
945 }
946
947 // Apply limit (default to 100 if not specified)
948 limit := 100
949 if input.Limit > 0 {
950 limit = input.Limit
951 }
952
953 // Get console logs with mutex protection
954 b.consoleLogsMutex.Lock()
955 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
956 start := 0
957 if len(b.consoleLogs) > limit {
958 start = len(b.consoleLogs) - limit
959 }
960 logs = append(logs, b.consoleLogs[start:]...)
961 b.consoleLogsMutex.Unlock()
962
963 // Format the logs as JSON
964 logData, err := json.MarshalIndent(logs, "", " ")
965 if err != nil {
966 return llm.TextContent(errorResponse(fmt.Errorf("failed to serialize logs: %w", err))), nil
967 }
968
969 // Format the logs
970 var sb strings.Builder
971 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
972
973 if len(logs) == 0 {
974 sb.WriteString("No console logs captured.")
975 } else {
976 // Add the JSON data for full details
977 sb.WriteString(string(logData))
978 }
979
980 return llm.TextContent(sb.String()), nil
981}
982
// clearConsoleLogsInput is the input payload of the clear-console-logs tool.
// It has no fields: the tool takes no parameters.
type clearConsoleLogsInput struct{}
985
986// NewClearConsoleLogsTool creates a tool for clearing console logs
987func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
988 return &llm.Tool{
989 Name: "browser_clear_console_logs",
990 Description: "Clear all captured browser console logs",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700991 InputSchema: llm.EmptySchema(),
992 Run: b.clearConsoleLogsRun,
Philip Zeyliger18e33682025-05-13 16:34:21 -0700993 }
994}
995
996func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
997 var input clearConsoleLogsInput
998 if err := json.Unmarshal(m, &input); err != nil {
999 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
1000 }
1001
1002 // Ensure browser is initialized
1003 _, err := b.GetBrowserContext()
1004 if err != nil {
1005 return llm.TextContent(errorResponse(err)), nil
1006 }
1007
1008 // Clear console logs with mutex protection
1009 b.consoleLogsMutex.Lock()
1010 logCount := len(b.consoleLogs)
1011 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
1012 b.consoleLogsMutex.Unlock()
1013
1014 return llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount)), nil
1015}