blob: 60e66ac8a4e8a10bad52351ed627fcd6ee27152c [file] [log] [blame]
// Package browse provides browser automation tools for the agent.
package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000011 "os"
12 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070013 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000014 "sync"
15 "time"
16
17 "github.com/chromedp/chromedp"
18 "github.com/google/uuid"
19 "sketch.dev/llm"
20)
21
// ScreenshotDir is the directory where screenshots are stored.
const ScreenshotDir = "/tmp/sketch-screenshots"
24
// BrowseTools contains all browser tools and manages a shared browser instance.
type BrowseTools struct {
	// ctx is the cancellable context derived in NewBrowseTools; cancel cancels it.
	ctx    context.Context
	cancel context.CancelFunc

	// browserCtx and browserCtxCancel are populated by Initialize.
	browserCtx       context.Context
	browserCtxCancel context.CancelFunc

	// mux is held for the duration of Initialize and Close.
	mux         sync.Mutex
	initOnce    sync.Once
	initialized bool
	initErr     error

	// screenshots tracks saved screenshots by ID with their creation time.
	// screenshotsMutex guards it.
	screenshots      map[string]time.Time
	screenshotsMutex sync.Mutex
}
39
40// NewBrowseTools creates a new set of browser automation tools
41func NewBrowseTools(ctx context.Context) *BrowseTools {
42 ctx, cancel := context.WithCancel(ctx)
43
44 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000045 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000046 log.Printf("Failed to create screenshot directory: %v", err)
47 }
48
49 b := &BrowseTools{
50 ctx: ctx,
51 cancel: cancel,
52 screenshots: make(map[string]time.Time),
53 }
54
55 return b
56}
57
58// Initialize starts the browser if it's not already running
59func (b *BrowseTools) Initialize() error {
60 b.mux.Lock()
61 defer b.mux.Unlock()
62
63 b.initOnce.Do(func() {
64 // ChromeDP.ExecPath has a list of common places to find Chrome...
65 opts := chromedp.DefaultExecAllocatorOptions[:]
66 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
67 browserCtx, browserCancel := chromedp.NewContext(
68 allocCtx,
69 chromedp.WithLogf(log.Printf),
70 )
71
72 b.browserCtx = browserCtx
73 b.browserCtxCancel = browserCancel
74
75 // Ensure the browser starts
76 if err := chromedp.Run(browserCtx); err != nil {
77 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
78 return
79 }
80 b.initialized = true
81 })
82
83 return b.initErr
84}
85
86// Close shuts down the browser
87func (b *BrowseTools) Close() {
88 b.mux.Lock()
89 defer b.mux.Unlock()
90
91 if b.browserCtxCancel != nil {
92 b.browserCtxCancel()
93 b.browserCtxCancel = nil
94 }
95
96 if b.cancel != nil {
97 b.cancel()
98 }
99
100 b.initialized = false
101 log.Println("Browser closed")
102}
103
104// GetBrowserContext returns the context for browser operations
105func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
106 if err := b.Initialize(); err != nil {
107 return nil, err
108 }
109 return b.browserCtx, nil
110}
111
// baseResponse is the response shape all tools return when successful.
type baseResponse struct {
	// Status is omitted from the JSON encoding when empty.
	Status string `json:"status,omitempty"`
}
116
// successResponse returns the JSON text reported by tools that succeed
// without producing any other output.
func successResponse() string {
	return `{"status":"success"}`
}
120
// errorResponse returns a JSON object of the form
// {"status":"error","error":"<message>"} describing err.
//
// The message is JSON-encoded rather than spliced in with %s, so quotes,
// backslashes and newlines in the error text still yield valid JSON. A nil
// err is reported as "unknown error" instead of panicking.
func errorResponse(err error) string {
	msg := "unknown error"
	if err != nil {
		msg = err.Error()
	}

	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{Status: "error", Error: msg}

	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	// Keep <, > and & readable; the output is consumed as text, not HTML.
	enc.SetEscapeHTML(false)
	if encErr := enc.Encode(payload); encErr != nil {
		return `{"status":"error","error":"failed to encode error message"}`
	}

	// Encode appends a trailing newline that the original format did not have.
	return strings.TrimSuffix(sb.String(), "\n")
}
124
125// NavigateTool definition
126type navigateInput struct {
127 URL string `json:"url"`
128}
129
130// NewNavigateTool creates a tool for navigating to URLs
131func (b *BrowseTools) NewNavigateTool() *llm.Tool {
132 return &llm.Tool{
133 Name: "browser_navigate",
134 Description: "Navigate the browser to a specific URL and wait for page to load",
135 InputSchema: json.RawMessage(`{
136 "type": "object",
137 "properties": {
138 "url": {
139 "type": "string",
140 "description": "The URL to navigate to"
141 }
142 },
143 "required": ["url"]
144 }`),
145 Run: b.navigateRun,
146 }
147}
148
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700149func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000150 var input navigateInput
151 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700152 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000153 }
154
155 browserCtx, err := b.GetBrowserContext()
156 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700157 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000158 }
159
160 err = chromedp.Run(browserCtx,
161 chromedp.Navigate(input.URL),
162 chromedp.WaitReady("body"),
163 )
164 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700165 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000166 }
167
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700168 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000169}
170
171// ClickTool definition
172type clickInput struct {
173 Selector string `json:"selector"`
174 WaitVisible bool `json:"wait_visible,omitempty"`
175}
176
177// NewClickTool creates a tool for clicking elements
178func (b *BrowseTools) NewClickTool() *llm.Tool {
179 return &llm.Tool{
180 Name: "browser_click",
181 Description: "Click the first element matching a CSS selector",
182 InputSchema: json.RawMessage(`{
183 "type": "object",
184 "properties": {
185 "selector": {
186 "type": "string",
187 "description": "CSS selector for the element to click"
188 },
189 "wait_visible": {
190 "type": "boolean",
191 "description": "Wait for the element to be visible before clicking"
192 }
193 },
194 "required": ["selector"]
195 }`),
196 Run: b.clickRun,
197 }
198}
199
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700200func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000201 var input clickInput
202 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700203 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000204 }
205
206 browserCtx, err := b.GetBrowserContext()
207 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700208 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000209 }
210
211 actions := []chromedp.Action{
212 chromedp.WaitReady(input.Selector),
213 }
214
215 if input.WaitVisible {
216 actions = append(actions, chromedp.WaitVisible(input.Selector))
217 }
218
219 actions = append(actions, chromedp.Click(input.Selector))
220
221 err = chromedp.Run(browserCtx, actions...)
222 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700223 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000224 }
225
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700226 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000227}
228
229// TypeTool definition
230type typeInput struct {
231 Selector string `json:"selector"`
232 Text string `json:"text"`
233 Clear bool `json:"clear,omitempty"`
234}
235
236// NewTypeTool creates a tool for typing into input elements
237func (b *BrowseTools) NewTypeTool() *llm.Tool {
238 return &llm.Tool{
239 Name: "browser_type",
240 Description: "Type text into an input or textarea element",
241 InputSchema: json.RawMessage(`{
242 "type": "object",
243 "properties": {
244 "selector": {
245 "type": "string",
246 "description": "CSS selector for the input element"
247 },
248 "text": {
249 "type": "string",
250 "description": "Text to type into the element"
251 },
252 "clear": {
253 "type": "boolean",
254 "description": "Clear the input field before typing"
255 }
256 },
257 "required": ["selector", "text"]
258 }`),
259 Run: b.typeRun,
260 }
261}
262
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700263func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000264 var input typeInput
265 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700266 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000267 }
268
269 browserCtx, err := b.GetBrowserContext()
270 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700271 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000272 }
273
274 actions := []chromedp.Action{
275 chromedp.WaitReady(input.Selector),
276 chromedp.WaitVisible(input.Selector),
277 }
278
279 if input.Clear {
280 actions = append(actions, chromedp.Clear(input.Selector))
281 }
282
283 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
284
285 err = chromedp.Run(browserCtx, actions...)
286 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700287 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000288 }
289
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700290 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000291}
292
293// WaitForTool definition
294type waitForInput struct {
295 Selector string `json:"selector"`
296 TimeoutMS int `json:"timeout_ms,omitempty"`
297}
298
299// NewWaitForTool creates a tool for waiting for elements
300func (b *BrowseTools) NewWaitForTool() *llm.Tool {
301 return &llm.Tool{
302 Name: "browser_wait_for",
303 Description: "Wait for an element to be present in the DOM",
304 InputSchema: json.RawMessage(`{
305 "type": "object",
306 "properties": {
307 "selector": {
308 "type": "string",
309 "description": "CSS selector for the element to wait for"
310 },
311 "timeout_ms": {
312 "type": "integer",
313 "description": "Maximum time to wait in milliseconds (default: 30000)"
314 }
315 },
316 "required": ["selector"]
317 }`),
318 Run: b.waitForRun,
319 }
320}
321
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700322func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000323 var input waitForInput
324 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700325 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000326 }
327
328 timeout := 30000 // default timeout 30 seconds
329 if input.TimeoutMS > 0 {
330 timeout = input.TimeoutMS
331 }
332
333 browserCtx, err := b.GetBrowserContext()
334 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700335 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000336 }
337
338 timeoutCtx, cancel := context.WithTimeout(browserCtx, time.Duration(timeout)*time.Millisecond)
339 defer cancel()
340
341 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
342 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700343 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000344 }
345
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700346 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000347}
348
349// GetTextTool definition
350type getTextInput struct {
351 Selector string `json:"selector"`
352}
353
354type getTextOutput struct {
355 Text string `json:"text"`
356}
357
358// NewGetTextTool creates a tool for getting text from elements
359func (b *BrowseTools) NewGetTextTool() *llm.Tool {
360 return &llm.Tool{
361 Name: "browser_get_text",
362 Description: "Get the innerText of an element",
363 InputSchema: json.RawMessage(`{
364 "type": "object",
365 "properties": {
366 "selector": {
367 "type": "string",
368 "description": "CSS selector for the element to get text from"
369 }
370 },
371 "required": ["selector"]
372 }`),
373 Run: b.getTextRun,
374 }
375}
376
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700377func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000378 var input getTextInput
379 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700380 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000381 }
382
383 browserCtx, err := b.GetBrowserContext()
384 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700385 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000386 }
387
388 var text string
389 err = chromedp.Run(browserCtx,
390 chromedp.WaitReady(input.Selector),
391 chromedp.Text(input.Selector, &text),
392 )
393 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700394 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000395 }
396
397 output := getTextOutput{Text: text}
398 result, err := json.Marshal(output)
399 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700400 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000401 }
402
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700403 return llm.TextContent(string(result)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000404}
405
406// EvalTool definition
407type evalInput struct {
408 Expression string `json:"expression"`
409}
410
411type evalOutput struct {
412 Result any `json:"result"`
413}
414
415// NewEvalTool creates a tool for evaluating JavaScript
416func (b *BrowseTools) NewEvalTool() *llm.Tool {
417 return &llm.Tool{
418 Name: "browser_eval",
419 Description: "Evaluate JavaScript in the browser context",
420 InputSchema: json.RawMessage(`{
421 "type": "object",
422 "properties": {
423 "expression": {
424 "type": "string",
425 "description": "JavaScript expression to evaluate"
426 }
427 },
428 "required": ["expression"]
429 }`),
430 Run: b.evalRun,
431 }
432}
433
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700434func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000435 var input evalInput
436 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700437 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000438 }
439
440 browserCtx, err := b.GetBrowserContext()
441 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700442 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000443 }
444
445 var result any
446 err = chromedp.Run(browserCtx, chromedp.Evaluate(input.Expression, &result))
447 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700448 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000449 }
450
451 output := evalOutput{Result: result}
452 response, err := json.Marshal(output)
453 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700454 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000455 }
456
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700457 return llm.TextContent(string(response)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000458}
459
460// ScreenshotTool definition
461type screenshotInput struct {
462 Selector string `json:"selector,omitempty"`
463 Format string `json:"format,omitempty"`
464}
465
466type screenshotOutput struct {
467 ID string `json:"id"`
468}
469
470// NewScreenshotTool creates a tool for taking screenshots
471func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
472 return &llm.Tool{
473 Name: "browser_screenshot",
474 Description: "Take a screenshot of the page or a specific element",
475 InputSchema: json.RawMessage(`{
476 "type": "object",
477 "properties": {
478 "selector": {
479 "type": "string",
480 "description": "CSS selector for the element to screenshot (optional)"
481 },
482 "format": {
483 "type": "string",
484 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
485 "enum": ["base64", "png"]
486 }
487 }
488 }`),
489 Run: b.screenshotRun,
490 }
491}
492
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700493func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000494 var input screenshotInput
495 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700496 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000497 }
498
499 browserCtx, err := b.GetBrowserContext()
500 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700501 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000502 }
503
504 var buf []byte
505 var actions []chromedp.Action
506
507 if input.Selector != "" {
508 // Take screenshot of specific element
509 actions = append(actions,
510 chromedp.WaitReady(input.Selector),
511 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
512 )
513 } else {
514 // Take full page screenshot
515 actions = append(actions, chromedp.CaptureScreenshot(&buf))
516 }
517
518 err = chromedp.Run(browserCtx, actions...)
519 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700520 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000521 }
522
523 // Save the screenshot and get its ID
524 id := b.SaveScreenshot(buf)
525 if id == "" {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700526 return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000527 }
528
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700529 // Get the full path to the screenshot
530 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000531
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700532 // Return the ID and instructions on how to view the screenshot
533 result := fmt.Sprintf(`{
534 "id": "%s",
535 "path": "%s",
536 "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided."
537}`, id, screenshotPath)
538
539 return llm.TextContent(result), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000540}
541
542// ScrollIntoViewTool definition
543type scrollIntoViewInput struct {
544 Selector string `json:"selector"`
545}
546
547// NewScrollIntoViewTool creates a tool for scrolling elements into view
548func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
549 return &llm.Tool{
550 Name: "browser_scroll_into_view",
551 Description: "Scroll an element into view if it's not visible",
552 InputSchema: json.RawMessage(`{
553 "type": "object",
554 "properties": {
555 "selector": {
556 "type": "string",
557 "description": "CSS selector for the element to scroll into view"
558 }
559 },
560 "required": ["selector"]
561 }`),
562 Run: b.scrollIntoViewRun,
563 }
564}
565
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700566func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000567 var input scrollIntoViewInput
568 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700569 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000570 }
571
572 browserCtx, err := b.GetBrowserContext()
573 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700574 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000575 }
576
577 script := fmt.Sprintf(`
578 const el = document.querySelector('%s');
579 if (el) {
580 el.scrollIntoView({behavior: 'smooth', block: 'center'});
581 return true;
582 }
583 return false;
584 `, input.Selector)
585
586 var result bool
587 err = chromedp.Run(browserCtx,
588 chromedp.WaitReady(input.Selector),
589 chromedp.Evaluate(script, &result),
590 )
591 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700592 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000593 }
594
595 if !result {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700596 return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000597 }
598
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700599 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000600}
601
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700602// GetTools returns browser tools, optionally filtering out screenshot-related tools
603func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
604 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000605 b.NewNavigateTool(),
606 b.NewClickTool(),
607 b.NewTypeTool(),
608 b.NewWaitForTool(),
609 b.NewGetTextTool(),
610 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000611 b.NewScrollIntoViewTool(),
612 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700613
614 // Add screenshot-related tools if supported
615 if includeScreenshotTools {
616 tools = append(tools, b.NewScreenshotTool())
617 tools = append(tools, b.NewReadImageTool())
618 }
619
620 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000621}
622
623// SaveScreenshot saves a screenshot to disk and returns its ID
624func (b *BrowseTools) SaveScreenshot(data []byte) string {
625 // Generate a unique ID
626 id := uuid.New().String()
627
628 // Save the file
629 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000630 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000631 log.Printf("Failed to save screenshot: %v", err)
632 return ""
633 }
634
635 // Track this screenshot
636 b.screenshotsMutex.Lock()
637 b.screenshots[id] = time.Now()
638 b.screenshotsMutex.Unlock()
639
640 return id
641}
642
643// GetScreenshotPath returns the full path to a screenshot by ID
644func GetScreenshotPath(id string) string {
645 return filepath.Join(ScreenshotDir, id+".png")
646}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700647
648// ReadImageTool definition
649type readImageInput struct {
650 Path string `json:"path"`
651}
652
653// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
654func (b *BrowseTools) NewReadImageTool() *llm.Tool {
655 return &llm.Tool{
656 Name: "browser_read_image",
657 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
658 InputSchema: json.RawMessage(`{
659 "type": "object",
660 "properties": {
661 "path": {
662 "type": "string",
663 "description": "Path to the image file to read"
664 }
665 },
666 "required": ["path"]
667 }`),
668 Run: b.readImageRun,
669 }
670}
671
672func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
673 var input readImageInput
674 if err := json.Unmarshal(m, &input); err != nil {
675 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
676 }
677
678 // Check if the path exists
679 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
680 return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
681 }
682
683 // Read the file
684 imageData, err := os.ReadFile(input.Path)
685 if err != nil {
686 return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
687 }
688
689 // Detect the image type
690 imageType := http.DetectContentType(imageData)
691 if !strings.HasPrefix(imageType, "image/") {
692 return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
693 }
694
695 // Encode the image as base64
696 base64Data := base64.StdEncoding.EncodeToString(imageData)
697
698 // Create a Content object that includes both text and the image
699 return []llm.Content{
700 {
701 Type: llm.ContentTypeText,
702 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
703 },
704 {
705 Type: llm.ContentTypeText, // Will be mapped to image in content array
706 MediaType: imageType,
707 Data: base64Data,
708 },
709 }, nil
710}