// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000011 "os"
12 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070013 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000014 "sync"
15 "time"
16
Philip Zeyliger18e33682025-05-13 16:34:21 -070017 "github.com/chromedp/cdproto/runtime"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000018 "github.com/chromedp/chromedp"
19 "github.com/google/uuid"
20 "sketch.dev/llm"
21)
22
// ScreenshotDir is the filesystem directory that screenshot files are
// written to and resolved against.
const ScreenshotDir = "/tmp/sketch-screenshots"
25
26// BrowseTools contains all browser tools and manages a shared browser instance
27type BrowseTools struct {
28 ctx context.Context
29 cancel context.CancelFunc
30 browserCtx context.Context
31 browserCtxCancel context.CancelFunc
32 mux sync.Mutex
33 initOnce sync.Once
34 initialized bool
35 initErr error
36 // Map to track screenshots by ID and their creation time
37 screenshots map[string]time.Time
38 screenshotsMutex sync.Mutex
Philip Zeyliger18e33682025-05-13 16:34:21 -070039 // Console logs storage
40 consoleLogs []*runtime.EventConsoleAPICalled
41 consoleLogsMutex sync.Mutex
42 maxConsoleLogs int
Philip Zeyliger33d282f2025-05-03 04:01:54 +000043}
44
45// NewBrowseTools creates a new set of browser automation tools
46func NewBrowseTools(ctx context.Context) *BrowseTools {
47 ctx, cancel := context.WithCancel(ctx)
48
49 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000050 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000051 log.Printf("Failed to create screenshot directory: %v", err)
52 }
53
54 b := &BrowseTools{
Philip Zeyliger18e33682025-05-13 16:34:21 -070055 ctx: ctx,
56 cancel: cancel,
57 screenshots: make(map[string]time.Time),
58 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
59 maxConsoleLogs: 100,
Philip Zeyliger33d282f2025-05-03 04:01:54 +000060 }
61
62 return b
63}
64
65// Initialize starts the browser if it's not already running
66func (b *BrowseTools) Initialize() error {
67 b.mux.Lock()
68 defer b.mux.Unlock()
69
70 b.initOnce.Do(func() {
71 // ChromeDP.ExecPath has a list of common places to find Chrome...
72 opts := chromedp.DefaultExecAllocatorOptions[:]
73 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
74 browserCtx, browserCancel := chromedp.NewContext(
75 allocCtx,
76 chromedp.WithLogf(log.Printf),
77 )
78
79 b.browserCtx = browserCtx
80 b.browserCtxCancel = browserCancel
81
Philip Zeyliger18e33682025-05-13 16:34:21 -070082 // Set up console log listener
83 chromedp.ListenTarget(browserCtx, func(ev any) {
84 switch e := ev.(type) {
85 case *runtime.EventConsoleAPICalled:
86 b.captureConsoleLog(e)
87 }
88 })
89
Philip Zeyliger33d282f2025-05-03 04:01:54 +000090 // Ensure the browser starts
91 if err := chromedp.Run(browserCtx); err != nil {
92 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
93 return
94 }
95 b.initialized = true
96 })
97
98 return b.initErr
99}
100
101// Close shuts down the browser
102func (b *BrowseTools) Close() {
103 b.mux.Lock()
104 defer b.mux.Unlock()
105
106 if b.browserCtxCancel != nil {
107 b.browserCtxCancel()
108 b.browserCtxCancel = nil
109 }
110
111 if b.cancel != nil {
112 b.cancel()
113 }
114
115 b.initialized = false
116 log.Println("Browser closed")
117}
118
119// GetBrowserContext returns the context for browser operations
120func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
121 if err := b.Initialize(); err != nil {
122 return nil, err
123 }
124 return b.browserCtx, nil
125}
126
// baseResponse describes the status payload that tools return on success.
// NOTE(review): nothing in the visible part of this file references the type;
// successResponse emits the equivalent JSON as a literal.
type baseResponse struct {
	// Status is left out of the encoded JSON when empty.
	Status string `json:"status,omitempty"`
}
131
// successResponse returns the fixed JSON document that tools send back when
// an operation succeeded.
func successResponse() string {
	const body = `{"status":"success"}`
	return body
}
135
// errorResponse renders err as a JSON document of the form
// {"status":"error","error":"<message>"}.
//
// The message is encoded with encoding/json rather than spliced into a
// template, so quotes, backslashes and newlines inside the error text cannot
// produce malformed JSON. err must be non-nil.
func errorResponse(err error) string {
	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{
		Status: "error",
		Error:  err.Error(),
	}

	encoded, marshalErr := json.Marshal(payload)
	if marshalErr != nil {
		// Not expected for a struct of two strings; still answer with valid JSON.
		return `{"status":"error","error":"unable to encode error message"}`
	}
	return string(encoded)
}
139
// navigateInput is the JSON argument payload decoded by the browser_navigate
// tool.
type navigateInput struct {
	// URL is the address to navigate to.
	URL string `json:"url"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}

// NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000145func (b *BrowseTools) NewNavigateTool() *llm.Tool {
146 return &llm.Tool{
147 Name: "browser_navigate",
148 Description: "Navigate the browser to a specific URL and wait for page to load",
149 InputSchema: json.RawMessage(`{
150 "type": "object",
151 "properties": {
152 "url": {
153 "type": "string",
154 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700155 },
156 "timeout": {
157 "type": "string",
158 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000159 }
160 },
161 "required": ["url"]
162 }`),
163 Run: b.navigateRun,
164 }
165}
166
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700167func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000168 var input navigateInput
169 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700170 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000171 }
172
173 browserCtx, err := b.GetBrowserContext()
174 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700175 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000176 }
177
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700178 // Create a timeout context for this operation
179 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
180 defer cancel()
181
182 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000183 chromedp.Navigate(input.URL),
184 chromedp.WaitReady("body"),
185 )
186 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700187 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000188 }
189
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700190 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000191}
192
// clickInput is the JSON argument payload decoded by the browser_click tool.
type clickInput struct {
	// Selector is the CSS selector of the element to click.
	Selector string `json:"selector"`
	// WaitVisible asks for a visibility wait before the click is issued.
	WaitVisible bool `json:"wait_visible,omitempty"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
199
200// NewClickTool creates a tool for clicking elements
201func (b *BrowseTools) NewClickTool() *llm.Tool {
202 return &llm.Tool{
203 Name: "browser_click",
204 Description: "Click the first element matching a CSS selector",
205 InputSchema: json.RawMessage(`{
206 "type": "object",
207 "properties": {
208 "selector": {
209 "type": "string",
210 "description": "CSS selector for the element to click"
211 },
212 "wait_visible": {
213 "type": "boolean",
214 "description": "Wait for the element to be visible before clicking"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700215 },
216 "timeout": {
217 "type": "string",
218 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000219 }
220 },
221 "required": ["selector"]
222 }`),
223 Run: b.clickRun,
224 }
225}
226
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700227func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000228 var input clickInput
229 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700230 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000231 }
232
233 browserCtx, err := b.GetBrowserContext()
234 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700235 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000236 }
237
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700238 // Create a timeout context for this operation
239 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
240 defer cancel()
241
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000242 actions := []chromedp.Action{
243 chromedp.WaitReady(input.Selector),
244 }
245
246 if input.WaitVisible {
247 actions = append(actions, chromedp.WaitVisible(input.Selector))
248 }
249
250 actions = append(actions, chromedp.Click(input.Selector))
251
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700252 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000253 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700254 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000255 }
256
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700257 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000258}
259
// typeInput is the JSON argument payload decoded by the browser_type tool.
type typeInput struct {
	// Selector is the CSS selector of the input element.
	Selector string `json:"selector"`
	// Text is what gets typed into the element.
	Text string `json:"text"`
	// Clear requests that the field be cleared before typing.
	Clear bool `json:"clear,omitempty"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
267
268// NewTypeTool creates a tool for typing into input elements
269func (b *BrowseTools) NewTypeTool() *llm.Tool {
270 return &llm.Tool{
271 Name: "browser_type",
272 Description: "Type text into an input or textarea element",
273 InputSchema: json.RawMessage(`{
274 "type": "object",
275 "properties": {
276 "selector": {
277 "type": "string",
278 "description": "CSS selector for the input element"
279 },
280 "text": {
281 "type": "string",
282 "description": "Text to type into the element"
283 },
284 "clear": {
285 "type": "boolean",
286 "description": "Clear the input field before typing"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700287 },
288 "timeout": {
289 "type": "string",
290 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000291 }
292 },
293 "required": ["selector", "text"]
294 }`),
295 Run: b.typeRun,
296 }
297}
298
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700299func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000300 var input typeInput
301 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700302 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000303 }
304
305 browserCtx, err := b.GetBrowserContext()
306 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700307 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000308 }
309
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700310 // Create a timeout context for this operation
311 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
312 defer cancel()
313
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000314 actions := []chromedp.Action{
315 chromedp.WaitReady(input.Selector),
316 chromedp.WaitVisible(input.Selector),
317 }
318
319 if input.Clear {
320 actions = append(actions, chromedp.Clear(input.Selector))
321 }
322
323 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
324
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700325 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000326 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700327 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000328 }
329
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700330 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000331}
332
// waitForInput is the JSON argument payload decoded by the browser_wait_for
// tool.
type waitForInput struct {
	// Selector is the CSS selector of the element to wait for.
	Selector string `json:"selector"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
338
339// NewWaitForTool creates a tool for waiting for elements
340func (b *BrowseTools) NewWaitForTool() *llm.Tool {
341 return &llm.Tool{
342 Name: "browser_wait_for",
343 Description: "Wait for an element to be present in the DOM",
344 InputSchema: json.RawMessage(`{
345 "type": "object",
346 "properties": {
347 "selector": {
348 "type": "string",
349 "description": "CSS selector for the element to wait for"
350 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700351 "timeout": {
352 "type": "string",
353 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000354 }
355 },
356 "required": ["selector"]
357 }`),
358 Run: b.waitForRun,
359 }
360}
361
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700362func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000363 var input waitForInput
364 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700365 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000366 }
367
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000368 browserCtx, err := b.GetBrowserContext()
369 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700370 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000371 }
372
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700373 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000374 defer cancel()
375
376 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
377 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700378 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000379 }
380
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700381 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000382}
383
// getTextInput is the JSON argument payload decoded by the browser_get_text
// tool.
type getTextInput struct {
	// Selector is the CSS selector of the element whose text is read.
	Selector string `json:"selector"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
389
// getTextOutput is the JSON result document produced by the browser_get_text
// tool.
type getTextOutput struct {
	// Text is the text retrieved from the selected element.
	Text string `json:"text"`
}
393
394// NewGetTextTool creates a tool for getting text from elements
395func (b *BrowseTools) NewGetTextTool() *llm.Tool {
396 return &llm.Tool{
397 Name: "browser_get_text",
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700398 Description: "Get the innerText of an element. Can be used to read the web page.",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000399 InputSchema: json.RawMessage(`{
400 "type": "object",
401 "properties": {
402 "selector": {
403 "type": "string",
404 "description": "CSS selector for the element to get text from"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700405 },
406 "timeout": {
407 "type": "string",
408 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000409 }
410 },
411 "required": ["selector"]
412 }`),
413 Run: b.getTextRun,
414 }
415}
416
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700417func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000418 var input getTextInput
419 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700420 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000421 }
422
423 browserCtx, err := b.GetBrowserContext()
424 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700425 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000426 }
427
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700428 // Create a timeout context for this operation
429 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
430 defer cancel()
431
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000432 var text string
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700433 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000434 chromedp.WaitReady(input.Selector),
435 chromedp.Text(input.Selector, &text),
436 )
437 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700438 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000439 }
440
441 output := getTextOutput{Text: text}
442 result, err := json.Marshal(output)
443 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700444 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000445 }
446
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700447 return llm.TextContent(string(result)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000448}
449
// evalInput is the JSON argument payload decoded by the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript source to evaluate.
	Expression string `json:"expression"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
455
// evalOutput is the JSON result document produced by the browser_eval tool.
type evalOutput struct {
	// Result carries whatever value the evaluation decoded into.
	Result any `json:"result"`
}
459
460// NewEvalTool creates a tool for evaluating JavaScript
461func (b *BrowseTools) NewEvalTool() *llm.Tool {
462 return &llm.Tool{
463 Name: "browser_eval",
464 Description: "Evaluate JavaScript in the browser context",
465 InputSchema: json.RawMessage(`{
466 "type": "object",
467 "properties": {
468 "expression": {
469 "type": "string",
470 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700471 },
472 "timeout": {
473 "type": "string",
474 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000475 }
476 },
477 "required": ["expression"]
478 }`),
479 Run: b.evalRun,
480 }
481}
482
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700483func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000484 var input evalInput
485 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700486 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000487 }
488
489 browserCtx, err := b.GetBrowserContext()
490 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700491 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000492 }
493
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700494 // Create a timeout context for this operation
495 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
496 defer cancel()
497
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000498 var result any
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700499 err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000500 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700501 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000502 }
503
504 output := evalOutput{Result: result}
505 response, err := json.Marshal(output)
506 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700507 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000508 }
509
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700510 return llm.TextContent(string(response)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000511}
512
// screenshotInput is the JSON argument payload decoded by the
// browser_take_screenshot tool.
type screenshotInput struct {
	// Selector optionally limits the capture to one element; when empty the
	// page is captured instead.
	Selector string `json:"selector,omitempty"`
	// Format is accepted in the payload; screenshotRun does not read it.
	Format string `json:"format,omitempty"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
519
// screenshotOutput models a screenshot result that carries only the ID.
// NOTE(review): not referenced in the visible part of this file;
// screenshotRun formats its JSON reply directly.
type screenshotOutput struct {
	// ID identifies the saved screenshot.
	ID string `json:"id"`
}
523
524// NewScreenshotTool creates a tool for taking screenshots
525func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
526 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700527 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000528 Description: "Take a screenshot of the page or a specific element",
529 InputSchema: json.RawMessage(`{
530 "type": "object",
531 "properties": {
532 "selector": {
533 "type": "string",
534 "description": "CSS selector for the element to screenshot (optional)"
535 },
536 "format": {
537 "type": "string",
538 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
539 "enum": ["base64", "png"]
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700540 },
541 "timeout": {
542 "type": "string",
543 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000544 }
545 }
546 }`),
547 Run: b.screenshotRun,
548 }
549}
550
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700551func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000552 var input screenshotInput
553 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700554 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000555 }
556
557 browserCtx, err := b.GetBrowserContext()
558 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700559 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000560 }
561
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700562 // Create a timeout context for this operation
563 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
564 defer cancel()
565
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000566 var buf []byte
567 var actions []chromedp.Action
568
569 if input.Selector != "" {
570 // Take screenshot of specific element
571 actions = append(actions,
572 chromedp.WaitReady(input.Selector),
573 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
574 )
575 } else {
576 // Take full page screenshot
577 actions = append(actions, chromedp.CaptureScreenshot(&buf))
578 }
579
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700580 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000581 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700582 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000583 }
584
585 // Save the screenshot and get its ID
586 id := b.SaveScreenshot(buf)
587 if id == "" {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700588 return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000589 }
590
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700591 // Get the full path to the screenshot
592 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000593
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700594 // Return the ID and instructions on how to view the screenshot
595 result := fmt.Sprintf(`{
596 "id": "%s",
597 "path": "%s",
598 "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided."
599}`, id, screenshotPath)
600
601 return llm.TextContent(result), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000602}
603
// scrollIntoViewInput is the JSON argument payload decoded by the
// browser_scroll_into_view tool.
type scrollIntoViewInput struct {
	// Selector is the CSS selector of the element to scroll to.
	Selector string `json:"selector"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
609
610// NewScrollIntoViewTool creates a tool for scrolling elements into view
611func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
612 return &llm.Tool{
613 Name: "browser_scroll_into_view",
614 Description: "Scroll an element into view if it's not visible",
615 InputSchema: json.RawMessage(`{
616 "type": "object",
617 "properties": {
618 "selector": {
619 "type": "string",
620 "description": "CSS selector for the element to scroll into view"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700621 },
622 "timeout": {
623 "type": "string",
624 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000625 }
626 },
627 "required": ["selector"]
628 }`),
629 Run: b.scrollIntoViewRun,
630 }
631}
632
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700633func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000634 var input scrollIntoViewInput
635 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700636 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000637 }
638
639 browserCtx, err := b.GetBrowserContext()
640 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700641 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000642 }
643
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700644 // Create a timeout context for this operation
645 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
646 defer cancel()
647
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000648 script := fmt.Sprintf(`
649 const el = document.querySelector('%s');
650 if (el) {
651 el.scrollIntoView({behavior: 'smooth', block: 'center'});
652 return true;
653 }
654 return false;
655 `, input.Selector)
656
657 var result bool
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700658 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000659 chromedp.WaitReady(input.Selector),
660 chromedp.Evaluate(script, &result),
661 )
662 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700663 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000664 }
665
666 if !result {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700667 return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000668 }
669
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700670 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000671}
672
// resizeInput is the JSON argument payload decoded by the browser_resize
// tool.
type resizeInput struct {
	// Width and Height are the requested dimensions in pixels; resizeRun
	// rejects values that are not positive.
	Width  int `json:"width"`
	Height int `json:"height"`
	// Timeout is an optional duration string handed to parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
679
680// NewResizeTool creates a tool for resizing the browser window
681func (b *BrowseTools) NewResizeTool() *llm.Tool {
682 return &llm.Tool{
683 Name: "browser_resize",
684 Description: "Resize the browser window to a specific width and height",
685 InputSchema: json.RawMessage(`{
686 "type": "object",
687 "properties": {
688 "width": {
689 "type": "integer",
690 "description": "Window width in pixels"
691 },
692 "height": {
693 "type": "integer",
694 "description": "Window height in pixels"
695 },
696 "timeout": {
697 "type": "string",
698 "description": "Timeout as a Go duration string (default: 5s)"
699 }
700 },
701 "required": ["width", "height"]
702 }`),
703 Run: b.resizeRun,
704 }
705}
706
707func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
708 var input resizeInput
709 if err := json.Unmarshal(m, &input); err != nil {
710 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
711 }
712
713 browserCtx, err := b.GetBrowserContext()
714 if err != nil {
715 return llm.TextContent(errorResponse(err)), nil
716 }
717
718 // Create a timeout context for this operation
719 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
720 defer cancel()
721
722 // Validate dimensions
723 if input.Width <= 0 || input.Height <= 0 {
724 return llm.TextContent(errorResponse(fmt.Errorf("invalid dimensions: width and height must be positive"))), nil
725 }
726
727 // Resize the browser window
728 err = chromedp.Run(timeoutCtx,
729 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
730 )
731 if err != nil {
732 return llm.TextContent(errorResponse(err)), nil
733 }
734
735 return llm.TextContent(successResponse()), nil
736}
737
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700738// GetTools returns browser tools, optionally filtering out screenshot-related tools
739func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
740 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000741 b.NewNavigateTool(),
742 b.NewClickTool(),
743 b.NewTypeTool(),
744 b.NewWaitForTool(),
745 b.NewGetTextTool(),
746 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000747 b.NewScrollIntoViewTool(),
Philip Zeyliger05224842025-05-10 18:26:08 -0700748 b.NewResizeTool(),
Philip Zeyliger18e33682025-05-13 16:34:21 -0700749 b.NewRecentConsoleLogsTool(),
750 b.NewClearConsoleLogsTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000751 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700752
753 // Add screenshot-related tools if supported
754 if includeScreenshotTools {
755 tools = append(tools, b.NewScreenshotTool())
756 tools = append(tools, b.NewReadImageTool())
757 }
758
759 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000760}
761
762// SaveScreenshot saves a screenshot to disk and returns its ID
763func (b *BrowseTools) SaveScreenshot(data []byte) string {
764 // Generate a unique ID
765 id := uuid.New().String()
766
767 // Save the file
768 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000769 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000770 log.Printf("Failed to save screenshot: %v", err)
771 return ""
772 }
773
774 // Track this screenshot
775 b.screenshotsMutex.Lock()
776 b.screenshots[id] = time.Now()
777 b.screenshotsMutex.Unlock()
778
779 return id
780}
781
782// GetScreenshotPath returns the full path to a screenshot by ID
783func GetScreenshotPath(id string) string {
784 return filepath.Join(ScreenshotDir, id+".png")
785}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700786
// readImageInput is the JSON argument payload decoded by the
// browser_read_image tool.
type readImageInput struct {
	// Path is the location of the image file to read.
	Path string `json:"path"`
	// Timeout is accepted in the payload; readImageRun does not read it.
	Timeout string `json:"timeout,omitempty"`
}
792
793// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
794func (b *BrowseTools) NewReadImageTool() *llm.Tool {
795 return &llm.Tool{
796 Name: "browser_read_image",
797 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
798 InputSchema: json.RawMessage(`{
799 "type": "object",
800 "properties": {
801 "path": {
802 "type": "string",
803 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700804 },
805 "timeout": {
806 "type": "string",
807 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700808 }
809 },
810 "required": ["path"]
811 }`),
812 Run: b.readImageRun,
813 }
814}
815
816func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
817 var input readImageInput
818 if err := json.Unmarshal(m, &input); err != nil {
819 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
820 }
821
822 // Check if the path exists
823 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
824 return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
825 }
826
827 // Read the file
828 imageData, err := os.ReadFile(input.Path)
829 if err != nil {
830 return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
831 }
832
833 // Detect the image type
834 imageType := http.DetectContentType(imageData)
835 if !strings.HasPrefix(imageType, "image/") {
836 return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
837 }
838
839 // Encode the image as base64
840 base64Data := base64.StdEncoding.EncodeToString(imageData)
841
842 // Create a Content object that includes both text and the image
843 return []llm.Content{
844 {
845 Type: llm.ContentTypeText,
846 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
847 },
848 {
849 Type: llm.ContentTypeText, // Will be mapped to image in content array
850 MediaType: imageType,
851 Data: base64Data,
852 },
853 }, nil
854}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700855
// parseTimeout converts a Go duration string such as "500ms" or "2s" into a
// time.Duration.
//
// It falls back to a default of 5 seconds when timeout is empty, cannot be
// parsed by time.ParseDuration, or does not describe a positive duration. A
// zero or negative value is treated as invalid because, used as a timeout, it
// would expire immediately.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 5 * time.Second

	// time.ParseDuration rejects the empty string, so the "no timeout given"
	// case is covered by the error branch as well.
	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}

	return dur
}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700871
872// captureConsoleLog captures a console log event and stores it
873func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
874 // Add to logs with mutex protection
875 b.consoleLogsMutex.Lock()
876 defer b.consoleLogsMutex.Unlock()
877
878 // Add the log and maintain max size
879 b.consoleLogs = append(b.consoleLogs, e)
880 if len(b.consoleLogs) > b.maxConsoleLogs {
881 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
882 }
883}
884
// recentConsoleLogsInput is the JSON input accepted by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit caps how many of the most recent entries are returned. When it is
	// omitted, zero, or negative, recentConsoleLogsRun uses its default of 100.
	Limit int `json:"limit,omitempty"`
}
889
890// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
891func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
892 return &llm.Tool{
893 Name: "browser_recent_console_logs",
894 Description: "Get recent browser console logs",
895 InputSchema: json.RawMessage(`{
896 "type": "object",
897 "properties": {
898 "limit": {
899 "type": "integer",
900 "description": "Maximum number of log entries to return (default: 100)"
901 }
902 }
903 }`),
904 Run: b.recentConsoleLogsRun,
905 }
906}
907
908func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
909 var input recentConsoleLogsInput
910 if err := json.Unmarshal(m, &input); err != nil {
911 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
912 }
913
914 // Ensure browser is initialized
915 _, err := b.GetBrowserContext()
916 if err != nil {
917 return llm.TextContent(errorResponse(err)), nil
918 }
919
920 // Apply limit (default to 100 if not specified)
921 limit := 100
922 if input.Limit > 0 {
923 limit = input.Limit
924 }
925
926 // Get console logs with mutex protection
927 b.consoleLogsMutex.Lock()
928 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
929 start := 0
930 if len(b.consoleLogs) > limit {
931 start = len(b.consoleLogs) - limit
932 }
933 logs = append(logs, b.consoleLogs[start:]...)
934 b.consoleLogsMutex.Unlock()
935
936 // Format the logs as JSON
937 logData, err := json.MarshalIndent(logs, "", " ")
938 if err != nil {
939 return llm.TextContent(errorResponse(fmt.Errorf("failed to serialize logs: %w", err))), nil
940 }
941
942 // Format the logs
943 var sb strings.Builder
944 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
945
946 if len(logs) == 0 {
947 sb.WriteString("No console logs captured.")
948 } else {
949 // Add the JSON data for full details
950 sb.WriteString(string(logData))
951 }
952
953 return llm.TextContent(sb.String()), nil
954}
955
// clearConsoleLogsInput is the JSON input accepted by the
// browser_clear_console_logs tool. It has no fields because the tool's schema
// declares no properties.
type clearConsoleLogsInput struct{}
958
959// NewClearConsoleLogsTool creates a tool for clearing console logs
960func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
961 return &llm.Tool{
962 Name: "browser_clear_console_logs",
963 Description: "Clear all captured browser console logs",
964 InputSchema: json.RawMessage(`{
965 "type": "object",
966 "properties": {}
967 }`),
968 Run: b.clearConsoleLogsRun,
969 }
970}
971
972func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
973 var input clearConsoleLogsInput
974 if err := json.Unmarshal(m, &input); err != nil {
975 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
976 }
977
978 // Ensure browser is initialized
979 _, err := b.GetBrowserContext()
980 if err != nil {
981 return llm.TextContent(errorResponse(err)), nil
982 }
983
984 // Clear console logs with mutex protection
985 b.consoleLogsMutex.Lock()
986 logCount := len(b.consoleLogs)
987 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
988 b.consoleLogsMutex.Unlock()
989
990 return llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount)), nil
991}