blob: dfb963ea8a1ca8e5233d8a20185b835e9ef70e0c [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +000011 "net/url"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000012 "os"
13 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070014 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000015 "sync"
16 "time"
17
Philip Zeyliger18e33682025-05-13 16:34:21 -070018 "github.com/chromedp/cdproto/runtime"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000019 "github.com/chromedp/chromedp"
20 "github.com/google/uuid"
21 "sketch.dev/llm"
22)
23
// ScreenshotDir is the directory screenshots are written to and read from.
const (
	ScreenshotDir = "/tmp/sketch-screenshots"
)
26
27// BrowseTools contains all browser tools and manages a shared browser instance
28type BrowseTools struct {
29 ctx context.Context
30 cancel context.CancelFunc
31 browserCtx context.Context
32 browserCtxCancel context.CancelFunc
33 mux sync.Mutex
34 initOnce sync.Once
35 initialized bool
36 initErr error
37 // Map to track screenshots by ID and their creation time
38 screenshots map[string]time.Time
39 screenshotsMutex sync.Mutex
Philip Zeyliger18e33682025-05-13 16:34:21 -070040 // Console logs storage
41 consoleLogs []*runtime.EventConsoleAPICalled
42 consoleLogsMutex sync.Mutex
43 maxConsoleLogs int
Philip Zeyliger33d282f2025-05-03 04:01:54 +000044}
45
46// NewBrowseTools creates a new set of browser automation tools
47func NewBrowseTools(ctx context.Context) *BrowseTools {
48 ctx, cancel := context.WithCancel(ctx)
49
50 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000051 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000052 log.Printf("Failed to create screenshot directory: %v", err)
53 }
54
55 b := &BrowseTools{
Philip Zeyliger18e33682025-05-13 16:34:21 -070056 ctx: ctx,
57 cancel: cancel,
58 screenshots: make(map[string]time.Time),
59 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
60 maxConsoleLogs: 100,
Philip Zeyliger33d282f2025-05-03 04:01:54 +000061 }
62
63 return b
64}
65
66// Initialize starts the browser if it's not already running
67func (b *BrowseTools) Initialize() error {
68 b.mux.Lock()
69 defer b.mux.Unlock()
70
71 b.initOnce.Do(func() {
72 // ChromeDP.ExecPath has a list of common places to find Chrome...
73 opts := chromedp.DefaultExecAllocatorOptions[:]
74 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
75 browserCtx, browserCancel := chromedp.NewContext(
76 allocCtx,
77 chromedp.WithLogf(log.Printf),
78 )
79
80 b.browserCtx = browserCtx
81 b.browserCtxCancel = browserCancel
82
Philip Zeyliger18e33682025-05-13 16:34:21 -070083 // Set up console log listener
84 chromedp.ListenTarget(browserCtx, func(ev any) {
85 switch e := ev.(type) {
86 case *runtime.EventConsoleAPICalled:
87 b.captureConsoleLog(e)
88 }
89 })
90
Philip Zeyliger33d282f2025-05-03 04:01:54 +000091 // Ensure the browser starts
92 if err := chromedp.Run(browserCtx); err != nil {
93 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
94 return
95 }
Josh Bleecher Snyder7fbc8e42025-05-29 19:42:25 +000096
97 // Set default viewport size to 1280x720 (16:9 widescreen)
98 if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
99 b.initErr = fmt.Errorf("failed to set default viewport: %w", err)
100 return
101 }
102
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000103 b.initialized = true
104 })
105
106 return b.initErr
107}
108
109// Close shuts down the browser
110func (b *BrowseTools) Close() {
111 b.mux.Lock()
112 defer b.mux.Unlock()
113
114 if b.browserCtxCancel != nil {
115 b.browserCtxCancel()
116 b.browserCtxCancel = nil
117 }
118
119 if b.cancel != nil {
120 b.cancel()
121 }
122
123 b.initialized = false
124 log.Println("Browser closed")
125}
126
127// GetBrowserContext returns the context for browser operations
128func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
129 if err := b.Initialize(); err != nil {
130 return nil, err
131 }
132 return b.browserCtx, nil
133}
134
// baseResponse is the JSON shape tools reply with on success.
type baseResponse struct {
	// Status is omitted from the encoded JSON when empty.
	Status string `json:"status,omitempty"`
}
139
// successResponse returns the fixed JSON document reporting a successful
// tool call.
func successResponse() string {
	const okJSON = "{\"status\":\"success\"}"
	return okJSON
}
143
// errorResponse renders err as a JSON document of the form
// {"status":"error","error":"<message>"}.
//
// The message is encoded with encoding/json rather than spliced into a format
// string, so quotes, backslashes and control characters in err.Error() (for
// example a selector quoted inside a browser error) still yield valid JSON.
// err must be non-nil.
func errorResponse(err error) string {
	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{
		Status: "error",
		Error:  err.Error(),
	}

	encoded, marshalErr := json.Marshal(payload)
	if marshalErr != nil {
		// Not expected for a struct of two strings; keep the reply well formed.
		return `{"status":"error","error":"failed to encode error message"}`
	}
	return string(encoded)
}
147
// navigateInput is the argument payload decoded by the browser_navigate tool.
type navigateInput struct {
	// URL is the address to load.
	URL string `json:"url"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
153
// isPort80 reports whether urlStr definitely targets port 80: either the port
// is written out as 80, or no port is given and the scheme is http. A URL
// that fails to parse is reported as false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}

	switch u.Port() {
	case "80":
		return true
	case "":
		// No explicit port: only plain http implies port 80.
		return u.Scheme == "http"
	default:
		return false
	}
}
163
164// NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000165func (b *BrowseTools) NewNavigateTool() *llm.Tool {
166 return &llm.Tool{
167 Name: "browser_navigate",
168 Description: "Navigate the browser to a specific URL and wait for page to load",
169 InputSchema: json.RawMessage(`{
170 "type": "object",
171 "properties": {
172 "url": {
173 "type": "string",
174 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700175 },
176 "timeout": {
177 "type": "string",
178 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000179 }
180 },
181 "required": ["url"]
182 }`),
183 Run: b.navigateRun,
184 }
185}
186
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700187func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000188 var input navigateInput
189 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700190 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000191 }
192
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +0000193 if isPort80(input.URL) {
194 return llm.TextContent(errorResponse(fmt.Errorf("port 80 is not the port you're looking for--it is the main sketch server"))), nil
195 }
196
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000197 browserCtx, err := b.GetBrowserContext()
198 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700199 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000200 }
201
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700202 // Create a timeout context for this operation
203 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
204 defer cancel()
205
206 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000207 chromedp.Navigate(input.URL),
208 chromedp.WaitReady("body"),
209 )
210 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700211 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000212 }
213
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700214 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000215}
216
// clickInput is the argument payload decoded by the browser_click tool.
type clickInput struct {
	// Selector is the CSS selector of the element to click.
	Selector string `json:"selector"`
	// WaitVisible asks the tool to wait until the element is visible first.
	WaitVisible bool `json:"wait_visible,omitempty"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
223
224// NewClickTool creates a tool for clicking elements
225func (b *BrowseTools) NewClickTool() *llm.Tool {
226 return &llm.Tool{
227 Name: "browser_click",
228 Description: "Click the first element matching a CSS selector",
229 InputSchema: json.RawMessage(`{
230 "type": "object",
231 "properties": {
232 "selector": {
233 "type": "string",
234 "description": "CSS selector for the element to click"
235 },
236 "wait_visible": {
237 "type": "boolean",
238 "description": "Wait for the element to be visible before clicking"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700239 },
240 "timeout": {
241 "type": "string",
242 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000243 }
244 },
245 "required": ["selector"]
246 }`),
247 Run: b.clickRun,
248 }
249}
250
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700251func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000252 var input clickInput
253 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700254 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000255 }
256
257 browserCtx, err := b.GetBrowserContext()
258 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700259 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000260 }
261
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700262 // Create a timeout context for this operation
263 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
264 defer cancel()
265
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000266 actions := []chromedp.Action{
267 chromedp.WaitReady(input.Selector),
268 }
269
270 if input.WaitVisible {
271 actions = append(actions, chromedp.WaitVisible(input.Selector))
272 }
273
274 actions = append(actions, chromedp.Click(input.Selector))
275
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700276 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000277 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700278 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000279 }
280
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700281 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000282}
283
// typeInput is the argument payload decoded by the browser_type tool.
type typeInput struct {
	// Selector is the CSS selector of the input element.
	Selector string `json:"selector"`
	// Text is what gets typed into the element.
	Text string `json:"text"`
	// Clear asks the tool to clear the field before typing.
	Clear bool `json:"clear,omitempty"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
291
292// NewTypeTool creates a tool for typing into input elements
293func (b *BrowseTools) NewTypeTool() *llm.Tool {
294 return &llm.Tool{
295 Name: "browser_type",
296 Description: "Type text into an input or textarea element",
297 InputSchema: json.RawMessage(`{
298 "type": "object",
299 "properties": {
300 "selector": {
301 "type": "string",
302 "description": "CSS selector for the input element"
303 },
304 "text": {
305 "type": "string",
306 "description": "Text to type into the element"
307 },
308 "clear": {
309 "type": "boolean",
310 "description": "Clear the input field before typing"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700311 },
312 "timeout": {
313 "type": "string",
314 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000315 }
316 },
317 "required": ["selector", "text"]
318 }`),
319 Run: b.typeRun,
320 }
321}
322
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700323func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000324 var input typeInput
325 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700326 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000327 }
328
329 browserCtx, err := b.GetBrowserContext()
330 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700331 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000332 }
333
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700334 // Create a timeout context for this operation
335 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
336 defer cancel()
337
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000338 actions := []chromedp.Action{
339 chromedp.WaitReady(input.Selector),
340 chromedp.WaitVisible(input.Selector),
341 }
342
343 if input.Clear {
344 actions = append(actions, chromedp.Clear(input.Selector))
345 }
346
347 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
348
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700349 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000350 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700351 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000352 }
353
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700354 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000355}
356
// waitForInput is the argument payload decoded by the browser_wait_for tool.
type waitForInput struct {
	// Selector is the CSS selector of the element to wait for.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
362
363// NewWaitForTool creates a tool for waiting for elements
364func (b *BrowseTools) NewWaitForTool() *llm.Tool {
365 return &llm.Tool{
366 Name: "browser_wait_for",
367 Description: "Wait for an element to be present in the DOM",
368 InputSchema: json.RawMessage(`{
369 "type": "object",
370 "properties": {
371 "selector": {
372 "type": "string",
373 "description": "CSS selector for the element to wait for"
374 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700375 "timeout": {
376 "type": "string",
377 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000378 }
379 },
380 "required": ["selector"]
381 }`),
382 Run: b.waitForRun,
383 }
384}
385
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700386func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000387 var input waitForInput
388 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700389 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000390 }
391
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000392 browserCtx, err := b.GetBrowserContext()
393 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700394 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000395 }
396
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700397 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000398 defer cancel()
399
400 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
401 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700402 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000403 }
404
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700405 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000406}
407
// getTextInput is the argument payload decoded by the browser_get_text tool.
type getTextInput struct {
	// Selector is the CSS selector of the element whose text is read.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
413
// getTextOutput is the JSON reply of the browser_get_text tool.
type getTextOutput struct {
	// Text is the text read from the selected element.
	Text string `json:"text"`
}
417
418// NewGetTextTool creates a tool for getting text from elements
419func (b *BrowseTools) NewGetTextTool() *llm.Tool {
420 return &llm.Tool{
421 Name: "browser_get_text",
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700422 Description: "Get the innerText of an element. Can be used to read the web page.",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000423 InputSchema: json.RawMessage(`{
424 "type": "object",
425 "properties": {
426 "selector": {
427 "type": "string",
428 "description": "CSS selector for the element to get text from"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700429 },
430 "timeout": {
431 "type": "string",
432 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000433 }
434 },
435 "required": ["selector"]
436 }`),
437 Run: b.getTextRun,
438 }
439}
440
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700441func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000442 var input getTextInput
443 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700444 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000445 }
446
447 browserCtx, err := b.GetBrowserContext()
448 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700449 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000450 }
451
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700452 // Create a timeout context for this operation
453 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
454 defer cancel()
455
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000456 var text string
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700457 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000458 chromedp.WaitReady(input.Selector),
459 chromedp.Text(input.Selector, &text),
460 )
461 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700462 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000463 }
464
465 output := getTextOutput{Text: text}
466 result, err := json.Marshal(output)
467 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700468 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000469 }
470
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700471 return llm.TextContent(string(result)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000472}
473
// evalInput is the argument payload decoded by the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript to evaluate.
	Expression string `json:"expression"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
479
// evalOutput is the JSON reply of the browser_eval tool.
type evalOutput struct {
	// Result holds whatever value the evaluation produced.
	Result any `json:"result"`
}
483
484// NewEvalTool creates a tool for evaluating JavaScript
485func (b *BrowseTools) NewEvalTool() *llm.Tool {
486 return &llm.Tool{
487 Name: "browser_eval",
488 Description: "Evaluate JavaScript in the browser context",
489 InputSchema: json.RawMessage(`{
490 "type": "object",
491 "properties": {
492 "expression": {
493 "type": "string",
494 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700495 },
496 "timeout": {
497 "type": "string",
498 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000499 }
500 },
501 "required": ["expression"]
502 }`),
503 Run: b.evalRun,
504 }
505}
506
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700507func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000508 var input evalInput
509 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700510 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000511 }
512
513 browserCtx, err := b.GetBrowserContext()
514 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700515 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000516 }
517
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700518 // Create a timeout context for this operation
519 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
520 defer cancel()
521
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000522 var result any
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700523 err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000524 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700525 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000526 }
527
528 output := evalOutput{Result: result}
529 response, err := json.Marshal(output)
530 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700531 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000532 }
533
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700534 return llm.TextContent(string(response)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000535}
536
// screenshotInput is the argument payload decoded by the
// browser_take_screenshot tool.
type screenshotInput struct {
	// Selector optionally restricts the screenshot to one element.
	Selector string `json:"selector,omitempty"`
	// Format is the requested output format ('base64' or 'png' per the tool
	// schema).
	Format string `json:"format,omitempty"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
543
// screenshotOutput carries the ID of a saved screenshot.
type screenshotOutput struct {
	// ID identifies the screenshot.
	ID string `json:"id"`
}
547
548// NewScreenshotTool creates a tool for taking screenshots
549func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
550 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700551 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000552 Description: "Take a screenshot of the page or a specific element",
553 InputSchema: json.RawMessage(`{
554 "type": "object",
555 "properties": {
556 "selector": {
557 "type": "string",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700558 "description": "CSS selector for the element to screenshot (optional)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000559 },
560 "format": {
561 "type": "string",
562 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
563 "enum": ["base64", "png"]
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700564 },
565 "timeout": {
566 "type": "string",
567 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000568 }
569 }
570 }`),
571 Run: b.screenshotRun,
572 }
573}
574
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700575func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000576 var input screenshotInput
577 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700578 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000579 }
580
581 browserCtx, err := b.GetBrowserContext()
582 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700583 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000584 }
585
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700586 // Create a timeout context for this operation
587 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
588 defer cancel()
589
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000590 var buf []byte
591 var actions []chromedp.Action
592
593 if input.Selector != "" {
594 // Take screenshot of specific element
595 actions = append(actions,
596 chromedp.WaitReady(input.Selector),
597 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
598 )
599 } else {
600 // Take full page screenshot
601 actions = append(actions, chromedp.CaptureScreenshot(&buf))
602 }
603
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700604 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000605 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700606 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000607 }
608
609 // Save the screenshot and get its ID
610 id := b.SaveScreenshot(buf)
611 if id == "" {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700612 return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000613 }
614
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700615 // Get the full path to the screenshot
616 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000617
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700618 // Return the ID and instructions on how to view the screenshot
619 result := fmt.Sprintf(`{
620 "id": "%s",
621 "path": "%s",
622 "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided."
623}`, id, screenshotPath)
624
625 return llm.TextContent(result), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000626}
627
// scrollIntoViewInput is the argument payload decoded by the
// browser_scroll_into_view tool.
type scrollIntoViewInput struct {
	// Selector is the CSS selector of the element to scroll to.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
633
634// NewScrollIntoViewTool creates a tool for scrolling elements into view
635func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
636 return &llm.Tool{
637 Name: "browser_scroll_into_view",
638 Description: "Scroll an element into view if it's not visible",
639 InputSchema: json.RawMessage(`{
640 "type": "object",
641 "properties": {
642 "selector": {
643 "type": "string",
644 "description": "CSS selector for the element to scroll into view"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700645 },
646 "timeout": {
647 "type": "string",
648 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000649 }
650 },
651 "required": ["selector"]
652 }`),
653 Run: b.scrollIntoViewRun,
654 }
655}
656
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700657func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000658 var input scrollIntoViewInput
659 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700660 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000661 }
662
663 browserCtx, err := b.GetBrowserContext()
664 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700665 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000666 }
667
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700668 // Create a timeout context for this operation
669 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
670 defer cancel()
671
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000672 script := fmt.Sprintf(`
673 const el = document.querySelector('%s');
674 if (el) {
675 el.scrollIntoView({behavior: 'smooth', block: 'center'});
676 return true;
677 }
678 return false;
679 `, input.Selector)
680
681 var result bool
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700682 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000683 chromedp.WaitReady(input.Selector),
684 chromedp.Evaluate(script, &result),
685 )
686 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700687 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000688 }
689
690 if !result {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700691 return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000692 }
693
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700694 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000695}
696
// resizeInput is the argument payload decoded by the browser_resize tool.
type resizeInput struct {
	// Width is the requested width in pixels.
	Width int `json:"width"`
	// Height is the requested height in pixels.
	Height int `json:"height"`
	// Timeout is an optional Go duration string; the tool schema documents a
	// 5s default.
	Timeout string `json:"timeout,omitempty"`
}
703
704// NewResizeTool creates a tool for resizing the browser window
705func (b *BrowseTools) NewResizeTool() *llm.Tool {
706 return &llm.Tool{
707 Name: "browser_resize",
708 Description: "Resize the browser window to a specific width and height",
709 InputSchema: json.RawMessage(`{
710 "type": "object",
711 "properties": {
712 "width": {
713 "type": "integer",
714 "description": "Window width in pixels"
715 },
716 "height": {
717 "type": "integer",
718 "description": "Window height in pixels"
719 },
720 "timeout": {
721 "type": "string",
722 "description": "Timeout as a Go duration string (default: 5s)"
723 }
724 },
725 "required": ["width", "height"]
726 }`),
727 Run: b.resizeRun,
728 }
729}
730
731func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
732 var input resizeInput
733 if err := json.Unmarshal(m, &input); err != nil {
734 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
735 }
736
737 browserCtx, err := b.GetBrowserContext()
738 if err != nil {
739 return llm.TextContent(errorResponse(err)), nil
740 }
741
742 // Create a timeout context for this operation
743 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
744 defer cancel()
745
746 // Validate dimensions
747 if input.Width <= 0 || input.Height <= 0 {
748 return llm.TextContent(errorResponse(fmt.Errorf("invalid dimensions: width and height must be positive"))), nil
749 }
750
751 // Resize the browser window
752 err = chromedp.Run(timeoutCtx,
753 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
754 )
755 if err != nil {
756 return llm.TextContent(errorResponse(err)), nil
757 }
758
759 return llm.TextContent(successResponse()), nil
760}
761
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700762// GetTools returns browser tools, optionally filtering out screenshot-related tools
763func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
764 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000765 b.NewNavigateTool(),
766 b.NewClickTool(),
767 b.NewTypeTool(),
768 b.NewWaitForTool(),
769 b.NewGetTextTool(),
770 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000771 b.NewScrollIntoViewTool(),
Philip Zeyliger05224842025-05-10 18:26:08 -0700772 b.NewResizeTool(),
Philip Zeyliger18e33682025-05-13 16:34:21 -0700773 b.NewRecentConsoleLogsTool(),
774 b.NewClearConsoleLogsTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000775 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700776
777 // Add screenshot-related tools if supported
778 if includeScreenshotTools {
779 tools = append(tools, b.NewScreenshotTool())
780 tools = append(tools, b.NewReadImageTool())
781 }
782
783 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000784}
785
786// SaveScreenshot saves a screenshot to disk and returns its ID
787func (b *BrowseTools) SaveScreenshot(data []byte) string {
788 // Generate a unique ID
789 id := uuid.New().String()
790
791 // Save the file
792 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000793 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000794 log.Printf("Failed to save screenshot: %v", err)
795 return ""
796 }
797
798 // Track this screenshot
799 b.screenshotsMutex.Lock()
800 b.screenshots[id] = time.Now()
801 b.screenshotsMutex.Unlock()
802
803 return id
804}
805
806// GetScreenshotPath returns the full path to a screenshot by ID
807func GetScreenshotPath(id string) string {
808 return filepath.Join(ScreenshotDir, id+".png")
809}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700810
// readImageInput is the JSON payload accepted by the browser_read_image tool.
type readImageInput struct {
	// Path is the location of the image file to read.
	Path string `json:"path"`
	// Timeout is an optional Go duration string.
	Timeout string `json:"timeout,omitempty"`
}
816
817// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
818func (b *BrowseTools) NewReadImageTool() *llm.Tool {
819 return &llm.Tool{
820 Name: "browser_read_image",
821 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
822 InputSchema: json.RawMessage(`{
823 "type": "object",
824 "properties": {
825 "path": {
826 "type": "string",
827 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700828 },
829 "timeout": {
830 "type": "string",
831 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700832 }
833 },
834 "required": ["path"]
835 }`),
836 Run: b.readImageRun,
837 }
838}
839
840func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
841 var input readImageInput
842 if err := json.Unmarshal(m, &input); err != nil {
843 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
844 }
845
846 // Check if the path exists
847 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
848 return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
849 }
850
851 // Read the file
852 imageData, err := os.ReadFile(input.Path)
853 if err != nil {
854 return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
855 }
856
857 // Detect the image type
858 imageType := http.DetectContentType(imageData)
859 if !strings.HasPrefix(imageType, "image/") {
860 return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
861 }
862
863 // Encode the image as base64
864 base64Data := base64.StdEncoding.EncodeToString(imageData)
865
866 // Create a Content object that includes both text and the image
867 return []llm.Content{
868 {
869 Type: llm.ContentTypeText,
870 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
871 },
872 {
873 Type: llm.ContentTypeText, // Will be mapped to image in content array
874 MediaType: imageType,
875 Data: base64Data,
876 },
877 }, nil
878}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700879
// parseTimeout converts a Go duration string (for example "500ms" or "10s")
// into a time.Duration.
//
// It returns the 5-second default when timeout is empty, when it cannot be
// parsed by time.ParseDuration, or when it parses to a zero or negative
// duration. A non-positive value is treated as invalid because, if used as a
// context timeout, it would expire immediately.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 5 * time.Second

	if timeout == "" {
		return defaultTimeout
	}

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}

	return dur
}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700895
896// captureConsoleLog captures a console log event and stores it
897func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
898 // Add to logs with mutex protection
899 b.consoleLogsMutex.Lock()
900 defer b.consoleLogsMutex.Unlock()
901
902 // Add the log and maintain max size
903 b.consoleLogs = append(b.consoleLogs, e)
904 if len(b.consoleLogs) > b.maxConsoleLogs {
905 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
906 }
907}
908
// recentConsoleLogsInput is the JSON payload accepted by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit caps the number of log entries returned; values <= 0 select the
	// tool's default.
	Limit int `json:"limit,omitempty"`
}
913
914// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
915func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
916 return &llm.Tool{
917 Name: "browser_recent_console_logs",
918 Description: "Get recent browser console logs",
919 InputSchema: json.RawMessage(`{
920 "type": "object",
921 "properties": {
922 "limit": {
923 "type": "integer",
924 "description": "Maximum number of log entries to return (default: 100)"
925 }
926 }
927 }`),
928 Run: b.recentConsoleLogsRun,
929 }
930}
931
932func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
933 var input recentConsoleLogsInput
934 if err := json.Unmarshal(m, &input); err != nil {
935 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
936 }
937
938 // Ensure browser is initialized
939 _, err := b.GetBrowserContext()
940 if err != nil {
941 return llm.TextContent(errorResponse(err)), nil
942 }
943
944 // Apply limit (default to 100 if not specified)
945 limit := 100
946 if input.Limit > 0 {
947 limit = input.Limit
948 }
949
950 // Get console logs with mutex protection
951 b.consoleLogsMutex.Lock()
952 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
953 start := 0
954 if len(b.consoleLogs) > limit {
955 start = len(b.consoleLogs) - limit
956 }
957 logs = append(logs, b.consoleLogs[start:]...)
958 b.consoleLogsMutex.Unlock()
959
960 // Format the logs as JSON
961 logData, err := json.MarshalIndent(logs, "", " ")
962 if err != nil {
963 return llm.TextContent(errorResponse(fmt.Errorf("failed to serialize logs: %w", err))), nil
964 }
965
966 // Format the logs
967 var sb strings.Builder
968 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
969
970 if len(logs) == 0 {
971 sb.WriteString("No console logs captured.")
972 } else {
973 // Add the JSON data for full details
974 sb.WriteString(string(logData))
975 }
976
977 return llm.TextContent(sb.String()), nil
978}
979
// clearConsoleLogsInput is the (empty) JSON payload accepted by the
// browser_clear_console_logs tool; the tool takes no parameters.
type clearConsoleLogsInput struct{}
982
983// NewClearConsoleLogsTool creates a tool for clearing console logs
984func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
985 return &llm.Tool{
986 Name: "browser_clear_console_logs",
987 Description: "Clear all captured browser console logs",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700988 InputSchema: llm.EmptySchema(),
989 Run: b.clearConsoleLogsRun,
Philip Zeyliger18e33682025-05-13 16:34:21 -0700990 }
991}
992
993func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
994 var input clearConsoleLogsInput
995 if err := json.Unmarshal(m, &input); err != nil {
996 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
997 }
998
999 // Ensure browser is initialized
1000 _, err := b.GetBrowserContext()
1001 if err != nil {
1002 return llm.TextContent(errorResponse(err)), nil
1003 }
1004
1005 // Clear console logs with mutex protection
1006 b.consoleLogsMutex.Lock()
1007 logCount := len(b.consoleLogs)
1008 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
1009 b.consoleLogsMutex.Unlock()
1010
1011 return llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount)), nil
1012}