blob: 9c2cca10ed1df25b557a6c81856465f20685f005 [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000011 "os"
12 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070013 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000014 "sync"
15 "time"
16
Philip Zeyliger18e33682025-05-13 16:34:21 -070017 "github.com/chromedp/cdproto/runtime"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000018 "github.com/chromedp/chromedp"
19 "github.com/google/uuid"
20 "sketch.dev/llm"
21)
22
// ScreenshotDir is the directory on disk into which screenshots are written.
const ScreenshotDir = "/tmp/sketch-screenshots"
25
26// BrowseTools contains all browser tools and manages a shared browser instance
27type BrowseTools struct {
28 ctx context.Context
29 cancel context.CancelFunc
30 browserCtx context.Context
31 browserCtxCancel context.CancelFunc
32 mux sync.Mutex
33 initOnce sync.Once
34 initialized bool
35 initErr error
36 // Map to track screenshots by ID and their creation time
37 screenshots map[string]time.Time
38 screenshotsMutex sync.Mutex
Philip Zeyliger18e33682025-05-13 16:34:21 -070039 // Console logs storage
40 consoleLogs []*runtime.EventConsoleAPICalled
41 consoleLogsMutex sync.Mutex
42 maxConsoleLogs int
Philip Zeyliger33d282f2025-05-03 04:01:54 +000043}
44
45// NewBrowseTools creates a new set of browser automation tools
46func NewBrowseTools(ctx context.Context) *BrowseTools {
47 ctx, cancel := context.WithCancel(ctx)
48
49 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000050 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000051 log.Printf("Failed to create screenshot directory: %v", err)
52 }
53
54 b := &BrowseTools{
Philip Zeyliger18e33682025-05-13 16:34:21 -070055 ctx: ctx,
56 cancel: cancel,
57 screenshots: make(map[string]time.Time),
58 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
59 maxConsoleLogs: 100,
Philip Zeyliger33d282f2025-05-03 04:01:54 +000060 }
61
62 return b
63}
64
65// Initialize starts the browser if it's not already running
66func (b *BrowseTools) Initialize() error {
67 b.mux.Lock()
68 defer b.mux.Unlock()
69
70 b.initOnce.Do(func() {
71 // ChromeDP.ExecPath has a list of common places to find Chrome...
72 opts := chromedp.DefaultExecAllocatorOptions[:]
73 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
74 browserCtx, browserCancel := chromedp.NewContext(
75 allocCtx,
76 chromedp.WithLogf(log.Printf),
77 )
78
79 b.browserCtx = browserCtx
80 b.browserCtxCancel = browserCancel
81
Philip Zeyliger18e33682025-05-13 16:34:21 -070082 // Set up console log listener
83 chromedp.ListenTarget(browserCtx, func(ev any) {
84 switch e := ev.(type) {
85 case *runtime.EventConsoleAPICalled:
86 b.captureConsoleLog(e)
87 }
88 })
89
Philip Zeyliger33d282f2025-05-03 04:01:54 +000090 // Ensure the browser starts
91 if err := chromedp.Run(browserCtx); err != nil {
92 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
93 return
94 }
Josh Bleecher Snyder7fbc8e42025-05-29 19:42:25 +000095
96 // Set default viewport size to 1280x720 (16:9 widescreen)
97 if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
98 b.initErr = fmt.Errorf("failed to set default viewport: %w", err)
99 return
100 }
101
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000102 b.initialized = true
103 })
104
105 return b.initErr
106}
107
108// Close shuts down the browser
109func (b *BrowseTools) Close() {
110 b.mux.Lock()
111 defer b.mux.Unlock()
112
113 if b.browserCtxCancel != nil {
114 b.browserCtxCancel()
115 b.browserCtxCancel = nil
116 }
117
118 if b.cancel != nil {
119 b.cancel()
120 }
121
122 b.initialized = false
123 log.Println("Browser closed")
124}
125
126// GetBrowserContext returns the context for browser operations
127func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
128 if err := b.Initialize(); err != nil {
129 return nil, err
130 }
131 return b.browserCtx, nil
132}
133
// baseResponse is the response shape that all tools return on success.
// Status is left out of the JSON encoding when it is empty.
type baseResponse struct {
	Status string `json:"status,omitempty"`
}
138
// successResponse returns the fixed JSON document that reports success.
func successResponse() string {
	const body = `{"status":"success"}`
	return body
}
142
// errorResponse renders err as a JSON object of the form
// {"status":"error","error":"<message>"}.
//
// The message is encoded with encoding/json so that quotes, backslashes and
// control characters in err.Error() are escaped and the result is always
// valid JSON. HTML escaping is switched off so characters such as '>' in a
// CSS selector stay readable. err must be non-nil.
func errorResponse(err error) string {
	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{Status: "error", Error: err.Error()}

	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	enc.SetEscapeHTML(false)
	if encErr := enc.Encode(payload); encErr != nil {
		// Not expected for a struct of two strings; still answer with valid JSON.
		return `{"status":"error","error":"error message could not be encoded"}`
	}

	// Encode appends a trailing newline that callers do not expect.
	return strings.TrimSuffix(sb.String(), "\n")
}
146
// navigateInput is the JSON argument object of the browser_navigate tool:
// the URL to open and an optional timeout string.
type navigateInput struct {
	URL     string `json:"url"`
	Timeout string `json:"timeout,omitempty"`
}

// NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000152func (b *BrowseTools) NewNavigateTool() *llm.Tool {
153 return &llm.Tool{
154 Name: "browser_navigate",
155 Description: "Navigate the browser to a specific URL and wait for page to load",
156 InputSchema: json.RawMessage(`{
157 "type": "object",
158 "properties": {
159 "url": {
160 "type": "string",
161 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700162 },
163 "timeout": {
164 "type": "string",
165 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000166 }
167 },
168 "required": ["url"]
169 }`),
170 Run: b.navigateRun,
171 }
172}
173
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700174func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000175 var input navigateInput
176 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700177 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000178 }
179
180 browserCtx, err := b.GetBrowserContext()
181 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700182 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000183 }
184
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700185 // Create a timeout context for this operation
186 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
187 defer cancel()
188
189 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000190 chromedp.Navigate(input.URL),
191 chromedp.WaitReady("body"),
192 )
193 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700194 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000195 }
196
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700197 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000198}
199
// clickInput is the JSON argument object of the browser_click tool: the CSS
// selector to click, whether to wait for visibility first, and an optional
// timeout string.
type clickInput struct {
	Selector    string `json:"selector"`
	WaitVisible bool   `json:"wait_visible,omitempty"`
	Timeout     string `json:"timeout,omitempty"`
}
206
207// NewClickTool creates a tool for clicking elements
208func (b *BrowseTools) NewClickTool() *llm.Tool {
209 return &llm.Tool{
210 Name: "browser_click",
211 Description: "Click the first element matching a CSS selector",
212 InputSchema: json.RawMessage(`{
213 "type": "object",
214 "properties": {
215 "selector": {
216 "type": "string",
217 "description": "CSS selector for the element to click"
218 },
219 "wait_visible": {
220 "type": "boolean",
221 "description": "Wait for the element to be visible before clicking"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700222 },
223 "timeout": {
224 "type": "string",
225 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000226 }
227 },
228 "required": ["selector"]
229 }`),
230 Run: b.clickRun,
231 }
232}
233
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700234func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000235 var input clickInput
236 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700237 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000238 }
239
240 browserCtx, err := b.GetBrowserContext()
241 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700242 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000243 }
244
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700245 // Create a timeout context for this operation
246 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
247 defer cancel()
248
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000249 actions := []chromedp.Action{
250 chromedp.WaitReady(input.Selector),
251 }
252
253 if input.WaitVisible {
254 actions = append(actions, chromedp.WaitVisible(input.Selector))
255 }
256
257 actions = append(actions, chromedp.Click(input.Selector))
258
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700259 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000260 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700261 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000262 }
263
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700264 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000265}
266
// typeInput is the JSON argument object of the browser_type tool: the target
// selector, the text to send, whether to clear the field first, and an
// optional timeout string.
type typeInput struct {
	Selector string `json:"selector"`
	Text     string `json:"text"`
	Clear    bool   `json:"clear,omitempty"`
	Timeout  string `json:"timeout,omitempty"`
}
274
275// NewTypeTool creates a tool for typing into input elements
276func (b *BrowseTools) NewTypeTool() *llm.Tool {
277 return &llm.Tool{
278 Name: "browser_type",
279 Description: "Type text into an input or textarea element",
280 InputSchema: json.RawMessage(`{
281 "type": "object",
282 "properties": {
283 "selector": {
284 "type": "string",
285 "description": "CSS selector for the input element"
286 },
287 "text": {
288 "type": "string",
289 "description": "Text to type into the element"
290 },
291 "clear": {
292 "type": "boolean",
293 "description": "Clear the input field before typing"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700294 },
295 "timeout": {
296 "type": "string",
297 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000298 }
299 },
300 "required": ["selector", "text"]
301 }`),
302 Run: b.typeRun,
303 }
304}
305
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700306func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000307 var input typeInput
308 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700309 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000310 }
311
312 browserCtx, err := b.GetBrowserContext()
313 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700314 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000315 }
316
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700317 // Create a timeout context for this operation
318 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
319 defer cancel()
320
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000321 actions := []chromedp.Action{
322 chromedp.WaitReady(input.Selector),
323 chromedp.WaitVisible(input.Selector),
324 }
325
326 if input.Clear {
327 actions = append(actions, chromedp.Clear(input.Selector))
328 }
329
330 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
331
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700332 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000333 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700334 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000335 }
336
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700337 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000338}
339
// waitForInput is the JSON argument object of the browser_wait_for tool: the
// CSS selector to wait for and an optional timeout string.
type waitForInput struct {
	Selector string `json:"selector"`
	Timeout  string `json:"timeout,omitempty"`
}
345
346// NewWaitForTool creates a tool for waiting for elements
347func (b *BrowseTools) NewWaitForTool() *llm.Tool {
348 return &llm.Tool{
349 Name: "browser_wait_for",
350 Description: "Wait for an element to be present in the DOM",
351 InputSchema: json.RawMessage(`{
352 "type": "object",
353 "properties": {
354 "selector": {
355 "type": "string",
356 "description": "CSS selector for the element to wait for"
357 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700358 "timeout": {
359 "type": "string",
360 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000361 }
362 },
363 "required": ["selector"]
364 }`),
365 Run: b.waitForRun,
366 }
367}
368
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700369func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000370 var input waitForInput
371 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700372 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000373 }
374
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000375 browserCtx, err := b.GetBrowserContext()
376 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700377 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000378 }
379
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700380 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000381 defer cancel()
382
383 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
384 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700385 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000386 }
387
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700388 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000389}
390
// getTextInput is the JSON argument object of the browser_get_text tool: the
// CSS selector to read from and an optional timeout string.
type getTextInput struct {
	Selector string `json:"selector"`
	Timeout  string `json:"timeout,omitempty"`
}
396
// getTextOutput is the JSON result of the browser_get_text tool. Text is
// always emitted, even when empty.
type getTextOutput struct {
	Text string `json:"text"`
}
400
401// NewGetTextTool creates a tool for getting text from elements
402func (b *BrowseTools) NewGetTextTool() *llm.Tool {
403 return &llm.Tool{
404 Name: "browser_get_text",
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700405 Description: "Get the innerText of an element. Can be used to read the web page.",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000406 InputSchema: json.RawMessage(`{
407 "type": "object",
408 "properties": {
409 "selector": {
410 "type": "string",
411 "description": "CSS selector for the element to get text from"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700412 },
413 "timeout": {
414 "type": "string",
415 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000416 }
417 },
418 "required": ["selector"]
419 }`),
420 Run: b.getTextRun,
421 }
422}
423
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700424func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000425 var input getTextInput
426 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700427 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000428 }
429
430 browserCtx, err := b.GetBrowserContext()
431 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700432 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000433 }
434
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700435 // Create a timeout context for this operation
436 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
437 defer cancel()
438
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000439 var text string
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700440 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000441 chromedp.WaitReady(input.Selector),
442 chromedp.Text(input.Selector, &text),
443 )
444 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700445 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000446 }
447
448 output := getTextOutput{Text: text}
449 result, err := json.Marshal(output)
450 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700451 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000452 }
453
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700454 return llm.TextContent(string(result)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000455}
456
// evalInput is the JSON argument object of the browser_eval tool: the
// JavaScript expression to evaluate and an optional timeout string.
type evalInput struct {
	Expression string `json:"expression"`
	Timeout    string `json:"timeout,omitempty"`
}
462
// evalOutput is the JSON result of the browser_eval tool. Result holds
// whatever value the evaluation produced and is always emitted.
type evalOutput struct {
	Result any `json:"result"`
}
466
467// NewEvalTool creates a tool for evaluating JavaScript
468func (b *BrowseTools) NewEvalTool() *llm.Tool {
469 return &llm.Tool{
470 Name: "browser_eval",
471 Description: "Evaluate JavaScript in the browser context",
472 InputSchema: json.RawMessage(`{
473 "type": "object",
474 "properties": {
475 "expression": {
476 "type": "string",
477 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700478 },
479 "timeout": {
480 "type": "string",
481 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000482 }
483 },
484 "required": ["expression"]
485 }`),
486 Run: b.evalRun,
487 }
488}
489
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700490func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000491 var input evalInput
492 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700493 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000494 }
495
496 browserCtx, err := b.GetBrowserContext()
497 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700498 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000499 }
500
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700501 // Create a timeout context for this operation
502 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
503 defer cancel()
504
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000505 var result any
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700506 err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000507 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700508 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000509 }
510
511 output := evalOutput{Result: result}
512 response, err := json.Marshal(output)
513 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700514 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000515 }
516
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700517 return llm.TextContent(string(response)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000518}
519
// screenshotInput is the JSON argument object of the browser_take_screenshot
// tool. Every field is optional: an element selector, an output format, and
// a timeout string.
type screenshotInput struct {
	Selector string `json:"selector,omitempty"`
	Format   string `json:"format,omitempty"`
	Timeout  string `json:"timeout,omitempty"`
}
526
// screenshotOutput carries the ID of a saved screenshot in JSON form.
type screenshotOutput struct {
	ID string `json:"id"`
}
530
531// NewScreenshotTool creates a tool for taking screenshots
532func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
533 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700534 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000535 Description: "Take a screenshot of the page or a specific element",
536 InputSchema: json.RawMessage(`{
537 "type": "object",
538 "properties": {
539 "selector": {
540 "type": "string",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700541 "description": "CSS selector for the element to screenshot (optional)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000542 },
543 "format": {
544 "type": "string",
545 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
546 "enum": ["base64", "png"]
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700547 },
548 "timeout": {
549 "type": "string",
550 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000551 }
552 }
553 }`),
554 Run: b.screenshotRun,
555 }
556}
557
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700558func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000559 var input screenshotInput
560 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700561 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000562 }
563
564 browserCtx, err := b.GetBrowserContext()
565 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700566 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000567 }
568
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700569 // Create a timeout context for this operation
570 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
571 defer cancel()
572
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000573 var buf []byte
574 var actions []chromedp.Action
575
576 if input.Selector != "" {
577 // Take screenshot of specific element
578 actions = append(actions,
579 chromedp.WaitReady(input.Selector),
580 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
581 )
582 } else {
583 // Take full page screenshot
584 actions = append(actions, chromedp.CaptureScreenshot(&buf))
585 }
586
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700587 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000588 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700589 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000590 }
591
592 // Save the screenshot and get its ID
593 id := b.SaveScreenshot(buf)
594 if id == "" {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700595 return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000596 }
597
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700598 // Get the full path to the screenshot
599 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000600
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700601 // Return the ID and instructions on how to view the screenshot
602 result := fmt.Sprintf(`{
603 "id": "%s",
604 "path": "%s",
605 "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided."
606}`, id, screenshotPath)
607
608 return llm.TextContent(result), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000609}
610
// scrollIntoViewInput is the JSON argument object of the
// browser_scroll_into_view tool: the CSS selector to scroll to and an
// optional timeout string.
type scrollIntoViewInput struct {
	Selector string `json:"selector"`
	Timeout  string `json:"timeout,omitempty"`
}
616
617// NewScrollIntoViewTool creates a tool for scrolling elements into view
618func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
619 return &llm.Tool{
620 Name: "browser_scroll_into_view",
621 Description: "Scroll an element into view if it's not visible",
622 InputSchema: json.RawMessage(`{
623 "type": "object",
624 "properties": {
625 "selector": {
626 "type": "string",
627 "description": "CSS selector for the element to scroll into view"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700628 },
629 "timeout": {
630 "type": "string",
631 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000632 }
633 },
634 "required": ["selector"]
635 }`),
636 Run: b.scrollIntoViewRun,
637 }
638}
639
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700640func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000641 var input scrollIntoViewInput
642 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700643 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000644 }
645
646 browserCtx, err := b.GetBrowserContext()
647 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700648 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000649 }
650
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700651 // Create a timeout context for this operation
652 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
653 defer cancel()
654
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000655 script := fmt.Sprintf(`
656 const el = document.querySelector('%s');
657 if (el) {
658 el.scrollIntoView({behavior: 'smooth', block: 'center'});
659 return true;
660 }
661 return false;
662 `, input.Selector)
663
664 var result bool
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700665 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000666 chromedp.WaitReady(input.Selector),
667 chromedp.Evaluate(script, &result),
668 )
669 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700670 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000671 }
672
673 if !result {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700674 return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000675 }
676
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700677 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000678}
679
// resizeInput is the JSON argument object of the browser_resize tool: the
// requested width and height in pixels and an optional timeout string.
type resizeInput struct {
	Width   int    `json:"width"`
	Height  int    `json:"height"`
	Timeout string `json:"timeout,omitempty"`
}
686
687// NewResizeTool creates a tool for resizing the browser window
688func (b *BrowseTools) NewResizeTool() *llm.Tool {
689 return &llm.Tool{
690 Name: "browser_resize",
691 Description: "Resize the browser window to a specific width and height",
692 InputSchema: json.RawMessage(`{
693 "type": "object",
694 "properties": {
695 "width": {
696 "type": "integer",
697 "description": "Window width in pixels"
698 },
699 "height": {
700 "type": "integer",
701 "description": "Window height in pixels"
702 },
703 "timeout": {
704 "type": "string",
705 "description": "Timeout as a Go duration string (default: 5s)"
706 }
707 },
708 "required": ["width", "height"]
709 }`),
710 Run: b.resizeRun,
711 }
712}
713
714func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
715 var input resizeInput
716 if err := json.Unmarshal(m, &input); err != nil {
717 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
718 }
719
720 browserCtx, err := b.GetBrowserContext()
721 if err != nil {
722 return llm.TextContent(errorResponse(err)), nil
723 }
724
725 // Create a timeout context for this operation
726 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
727 defer cancel()
728
729 // Validate dimensions
730 if input.Width <= 0 || input.Height <= 0 {
731 return llm.TextContent(errorResponse(fmt.Errorf("invalid dimensions: width and height must be positive"))), nil
732 }
733
734 // Resize the browser window
735 err = chromedp.Run(timeoutCtx,
736 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
737 )
738 if err != nil {
739 return llm.TextContent(errorResponse(err)), nil
740 }
741
742 return llm.TextContent(successResponse()), nil
743}
744
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700745// GetTools returns browser tools, optionally filtering out screenshot-related tools
746func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
747 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000748 b.NewNavigateTool(),
749 b.NewClickTool(),
750 b.NewTypeTool(),
751 b.NewWaitForTool(),
752 b.NewGetTextTool(),
753 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000754 b.NewScrollIntoViewTool(),
Philip Zeyliger05224842025-05-10 18:26:08 -0700755 b.NewResizeTool(),
Philip Zeyliger18e33682025-05-13 16:34:21 -0700756 b.NewRecentConsoleLogsTool(),
757 b.NewClearConsoleLogsTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000758 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700759
760 // Add screenshot-related tools if supported
761 if includeScreenshotTools {
762 tools = append(tools, b.NewScreenshotTool())
763 tools = append(tools, b.NewReadImageTool())
764 }
765
766 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000767}
768
769// SaveScreenshot saves a screenshot to disk and returns its ID
770func (b *BrowseTools) SaveScreenshot(data []byte) string {
771 // Generate a unique ID
772 id := uuid.New().String()
773
774 // Save the file
775 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000776 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000777 log.Printf("Failed to save screenshot: %v", err)
778 return ""
779 }
780
781 // Track this screenshot
782 b.screenshotsMutex.Lock()
783 b.screenshots[id] = time.Now()
784 b.screenshotsMutex.Unlock()
785
786 return id
787}
788
789// GetScreenshotPath returns the full path to a screenshot by ID
790func GetScreenshotPath(id string) string {
791 return filepath.Join(ScreenshotDir, id+".png")
792}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700793
// readImageInput is the JSON argument object of the browser_read_image tool:
// the path of the image file and an optional timeout string.
type readImageInput struct {
	Path    string `json:"path"`
	Timeout string `json:"timeout,omitempty"`
}
799
800// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
801func (b *BrowseTools) NewReadImageTool() *llm.Tool {
802 return &llm.Tool{
803 Name: "browser_read_image",
804 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
805 InputSchema: json.RawMessage(`{
806 "type": "object",
807 "properties": {
808 "path": {
809 "type": "string",
810 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700811 },
812 "timeout": {
813 "type": "string",
814 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700815 }
816 },
817 "required": ["path"]
818 }`),
819 Run: b.readImageRun,
820 }
821}
822
823func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
824 var input readImageInput
825 if err := json.Unmarshal(m, &input); err != nil {
826 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
827 }
828
829 // Check if the path exists
830 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
831 return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
832 }
833
834 // Read the file
835 imageData, err := os.ReadFile(input.Path)
836 if err != nil {
837 return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
838 }
839
840 // Detect the image type
841 imageType := http.DetectContentType(imageData)
842 if !strings.HasPrefix(imageType, "image/") {
843 return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
844 }
845
846 // Encode the image as base64
847 base64Data := base64.StdEncoding.EncodeToString(imageData)
848
849 // Create a Content object that includes both text and the image
850 return []llm.Content{
851 {
852 Type: llm.ContentTypeText,
853 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
854 },
855 {
856 Type: llm.ContentTypeText, // Will be mapped to image in content array
857 MediaType: imageType,
858 Data: base64Data,
859 },
860 }, nil
861}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700862
// parseTimeout converts a Go duration string such as "500ms" or "10s" into a
// time.Duration.
//
// The 5-second default is returned when timeout is empty, cannot be parsed by
// time.ParseDuration, or parses to a zero or negative duration. Non-positive
// values are rejected because a timeout must be positive to be meaningful.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 5 * time.Second

	// time.ParseDuration reports an error for the empty string, so the
	// "not specified" case is covered by the error branch as well.
	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}

	return dur
}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700878
879// captureConsoleLog captures a console log event and stores it
880func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
881 // Add to logs with mutex protection
882 b.consoleLogsMutex.Lock()
883 defer b.consoleLogsMutex.Unlock()
884
885 // Add the log and maintain max size
886 b.consoleLogs = append(b.consoleLogs, e)
887 if len(b.consoleLogs) > b.maxConsoleLogs {
888 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
889 }
890}
891
// recentConsoleLogsInput is the JSON payload accepted by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit caps how many log entries are returned; values that are not
	// positive select the default of 100.
	Limit int `json:"limit,omitempty"`
}
896
897// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
898func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
899 return &llm.Tool{
900 Name: "browser_recent_console_logs",
901 Description: "Get recent browser console logs",
902 InputSchema: json.RawMessage(`{
903 "type": "object",
904 "properties": {
905 "limit": {
906 "type": "integer",
907 "description": "Maximum number of log entries to return (default: 100)"
908 }
909 }
910 }`),
911 Run: b.recentConsoleLogsRun,
912 }
913}
914
915func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
916 var input recentConsoleLogsInput
917 if err := json.Unmarshal(m, &input); err != nil {
918 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
919 }
920
921 // Ensure browser is initialized
922 _, err := b.GetBrowserContext()
923 if err != nil {
924 return llm.TextContent(errorResponse(err)), nil
925 }
926
927 // Apply limit (default to 100 if not specified)
928 limit := 100
929 if input.Limit > 0 {
930 limit = input.Limit
931 }
932
933 // Get console logs with mutex protection
934 b.consoleLogsMutex.Lock()
935 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
936 start := 0
937 if len(b.consoleLogs) > limit {
938 start = len(b.consoleLogs) - limit
939 }
940 logs = append(logs, b.consoleLogs[start:]...)
941 b.consoleLogsMutex.Unlock()
942
943 // Format the logs as JSON
944 logData, err := json.MarshalIndent(logs, "", " ")
945 if err != nil {
946 return llm.TextContent(errorResponse(fmt.Errorf("failed to serialize logs: %w", err))), nil
947 }
948
949 // Format the logs
950 var sb strings.Builder
951 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
952
953 if len(logs) == 0 {
954 sb.WriteString("No console logs captured.")
955 } else {
956 // Add the JSON data for full details
957 sb.WriteString(string(logData))
958 }
959
960 return llm.TextContent(sb.String()), nil
961}
962
// clearConsoleLogsInput is the (empty) JSON payload accepted by the
// browser_clear_console_logs tool; the tool takes no arguments.
type clearConsoleLogsInput struct{}
965
966// NewClearConsoleLogsTool creates a tool for clearing console logs
967func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
968 return &llm.Tool{
969 Name: "browser_clear_console_logs",
970 Description: "Clear all captured browser console logs",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700971 InputSchema: llm.EmptySchema(),
972 Run: b.clearConsoleLogsRun,
Philip Zeyliger18e33682025-05-13 16:34:21 -0700973 }
974}
975
976func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
977 var input clearConsoleLogsInput
978 if err := json.Unmarshal(m, &input); err != nil {
979 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
980 }
981
982 // Ensure browser is initialized
983 _, err := b.GetBrowserContext()
984 if err != nil {
985 return llm.TextContent(errorResponse(err)), nil
986 }
987
988 // Clear console logs with mutex protection
989 b.consoleLogsMutex.Lock()
990 logCount := len(b.consoleLogs)
991 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
992 b.consoleLogsMutex.Unlock()
993
994 return llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount)), nil
995}