blob: 8a72390582f4cc4c15b900079bc3348d07e90ed9 [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000011 "os"
12 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070013 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000014 "sync"
15 "time"
16
17 "github.com/chromedp/chromedp"
18 "github.com/google/uuid"
19 "sketch.dev/llm"
20)
21
// ScreenshotDir is the on-disk directory that holds saved screenshot files.
// NewBrowseTools creates it, SaveScreenshot writes into it, and
// GetScreenshotPath resolves screenshot IDs against it.
const ScreenshotDir = "/tmp/sketch-screenshots"
24
// BrowseTools bundles every browser tool together with the single browser
// instance they share. The browser is started lazily by Initialize.
type BrowseTools struct {
	// ctx is the cancellable root context created in NewBrowseTools;
	// cancel releases it (see Close).
	ctx    context.Context
	cancel context.CancelFunc

	// browserCtx and browserCtxCancel are populated by Initialize.
	browserCtx       context.Context
	browserCtxCancel context.CancelFunc

	// mux is held for the duration of Initialize and Close.
	mux sync.Mutex

	// initOnce makes the browser start-up run a single time; initialized
	// and initErr record how that attempt ended.
	initOnce    sync.Once
	initialized bool
	initErr     error

	// screenshots maps a screenshot ID to the time it was saved.
	// Writes in SaveScreenshot are guarded by screenshotsMutex.
	screenshots      map[string]time.Time
	screenshotsMutex sync.Mutex
}
39
40// NewBrowseTools creates a new set of browser automation tools
41func NewBrowseTools(ctx context.Context) *BrowseTools {
42 ctx, cancel := context.WithCancel(ctx)
43
44 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000045 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000046 log.Printf("Failed to create screenshot directory: %v", err)
47 }
48
49 b := &BrowseTools{
50 ctx: ctx,
51 cancel: cancel,
52 screenshots: make(map[string]time.Time),
53 }
54
55 return b
56}
57
58// Initialize starts the browser if it's not already running
59func (b *BrowseTools) Initialize() error {
60 b.mux.Lock()
61 defer b.mux.Unlock()
62
63 b.initOnce.Do(func() {
64 // ChromeDP.ExecPath has a list of common places to find Chrome...
65 opts := chromedp.DefaultExecAllocatorOptions[:]
66 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
67 browserCtx, browserCancel := chromedp.NewContext(
68 allocCtx,
69 chromedp.WithLogf(log.Printf),
70 )
71
72 b.browserCtx = browserCtx
73 b.browserCtxCancel = browserCancel
74
75 // Ensure the browser starts
76 if err := chromedp.Run(browserCtx); err != nil {
77 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
78 return
79 }
80 b.initialized = true
81 })
82
83 return b.initErr
84}
85
86// Close shuts down the browser
87func (b *BrowseTools) Close() {
88 b.mux.Lock()
89 defer b.mux.Unlock()
90
91 if b.browserCtxCancel != nil {
92 b.browserCtxCancel()
93 b.browserCtxCancel = nil
94 }
95
96 if b.cancel != nil {
97 b.cancel()
98 }
99
100 b.initialized = false
101 log.Println("Browser closed")
102}
103
104// GetBrowserContext returns the context for browser operations
105func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
106 if err := b.Initialize(); err != nil {
107 return nil, err
108 }
109 return b.browserCtx, nil
110}
111
// baseResponse is the minimal JSON shape of a tool reply: a single "status"
// field, omitted when empty.
type baseResponse struct {
	Status string `json:"status,omitempty"`
}
116
// successResponse returns the fixed JSON document tools send back when an
// action completed without error.
func successResponse() string {
	const body = `{"status":"success"}`
	return body
}
120
// errorResponse renders err as the JSON document
// {"status":"error","error":"<message>"}.
//
// The message is encoded with encoding/json rather than spliced into a
// format string, so quotes, backslashes and newlines in err.Error() cannot
// produce invalid JSON. HTML escaping is turned off so characters such as
// '>' (common in CSS selectors quoted by browser errors) stay readable.
// err must be non-nil.
func errorResponse(err error) string {
	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{
		Status: "error",
		Error:  err.Error(),
	}

	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	enc.SetEscapeHTML(false)
	if encErr := enc.Encode(payload); encErr != nil {
		// Not expected for a struct of two strings; still return valid JSON.
		return `{"status":"error","error":"failed to encode error message"}`
	}

	// Encode appends a trailing newline that callers do not expect.
	return strings.TrimSuffix(sb.String(), "\n")
}
124
125// NavigateTool definition
126type navigateInput struct {
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700127 URL string `json:"url"`
128 Timeout string `json:"timeout,omitempty"`
129} // NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000130func (b *BrowseTools) NewNavigateTool() *llm.Tool {
131 return &llm.Tool{
132 Name: "browser_navigate",
133 Description: "Navigate the browser to a specific URL and wait for page to load",
134 InputSchema: json.RawMessage(`{
135 "type": "object",
136 "properties": {
137 "url": {
138 "type": "string",
139 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700140 },
141 "timeout": {
142 "type": "string",
143 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000144 }
145 },
146 "required": ["url"]
147 }`),
148 Run: b.navigateRun,
149 }
150}
151
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700152func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000153 var input navigateInput
154 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700155 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000156 }
157
158 browserCtx, err := b.GetBrowserContext()
159 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700160 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000161 }
162
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700163 // Create a timeout context for this operation
164 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
165 defer cancel()
166
167 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000168 chromedp.Navigate(input.URL),
169 chromedp.WaitReady("body"),
170 )
171 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700172 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000173 }
174
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700175 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000176}
177
// clickInput is the decoded argument object of the browser_click tool.
type clickInput struct {
	// Selector is the CSS selector of the element to click.
	Selector string `json:"selector"`
	// WaitVisible additionally waits for the element to be visible first.
	WaitVisible bool `json:"wait_visible,omitempty"`
	// Timeout is an optional Go duration string; see parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
184
185// NewClickTool creates a tool for clicking elements
186func (b *BrowseTools) NewClickTool() *llm.Tool {
187 return &llm.Tool{
188 Name: "browser_click",
189 Description: "Click the first element matching a CSS selector",
190 InputSchema: json.RawMessage(`{
191 "type": "object",
192 "properties": {
193 "selector": {
194 "type": "string",
195 "description": "CSS selector for the element to click"
196 },
197 "wait_visible": {
198 "type": "boolean",
199 "description": "Wait for the element to be visible before clicking"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700200 },
201 "timeout": {
202 "type": "string",
203 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000204 }
205 },
206 "required": ["selector"]
207 }`),
208 Run: b.clickRun,
209 }
210}
211
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700212func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000213 var input clickInput
214 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700215 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000216 }
217
218 browserCtx, err := b.GetBrowserContext()
219 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700220 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000221 }
222
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700223 // Create a timeout context for this operation
224 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
225 defer cancel()
226
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000227 actions := []chromedp.Action{
228 chromedp.WaitReady(input.Selector),
229 }
230
231 if input.WaitVisible {
232 actions = append(actions, chromedp.WaitVisible(input.Selector))
233 }
234
235 actions = append(actions, chromedp.Click(input.Selector))
236
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700237 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000238 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700239 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000240 }
241
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700242 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000243}
244
// typeInput is the decoded argument object of the browser_type tool.
type typeInput struct {
	// Selector is the CSS selector of the input element.
	Selector string `json:"selector"`
	// Text is what gets typed into the element.
	Text string `json:"text"`
	// Clear empties the field before typing when true.
	Clear bool `json:"clear,omitempty"`
	// Timeout is an optional Go duration string; see parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
252
253// NewTypeTool creates a tool for typing into input elements
254func (b *BrowseTools) NewTypeTool() *llm.Tool {
255 return &llm.Tool{
256 Name: "browser_type",
257 Description: "Type text into an input or textarea element",
258 InputSchema: json.RawMessage(`{
259 "type": "object",
260 "properties": {
261 "selector": {
262 "type": "string",
263 "description": "CSS selector for the input element"
264 },
265 "text": {
266 "type": "string",
267 "description": "Text to type into the element"
268 },
269 "clear": {
270 "type": "boolean",
271 "description": "Clear the input field before typing"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700272 },
273 "timeout": {
274 "type": "string",
275 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000276 }
277 },
278 "required": ["selector", "text"]
279 }`),
280 Run: b.typeRun,
281 }
282}
283
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700284func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000285 var input typeInput
286 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700287 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000288 }
289
290 browserCtx, err := b.GetBrowserContext()
291 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700292 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000293 }
294
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700295 // Create a timeout context for this operation
296 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
297 defer cancel()
298
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000299 actions := []chromedp.Action{
300 chromedp.WaitReady(input.Selector),
301 chromedp.WaitVisible(input.Selector),
302 }
303
304 if input.Clear {
305 actions = append(actions, chromedp.Clear(input.Selector))
306 }
307
308 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
309
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700310 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000311 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700312 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000313 }
314
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700315 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000316}
317
// waitForInput is the decoded argument object of the browser_wait_for tool.
type waitForInput struct {
	// Selector is the CSS selector of the element to wait for.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string; see parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
323
324// NewWaitForTool creates a tool for waiting for elements
325func (b *BrowseTools) NewWaitForTool() *llm.Tool {
326 return &llm.Tool{
327 Name: "browser_wait_for",
328 Description: "Wait for an element to be present in the DOM",
329 InputSchema: json.RawMessage(`{
330 "type": "object",
331 "properties": {
332 "selector": {
333 "type": "string",
334 "description": "CSS selector for the element to wait for"
335 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700336 "timeout": {
337 "type": "string",
338 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000339 }
340 },
341 "required": ["selector"]
342 }`),
343 Run: b.waitForRun,
344 }
345}
346
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700347func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000348 var input waitForInput
349 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700350 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000351 }
352
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000353 browserCtx, err := b.GetBrowserContext()
354 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700355 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000356 }
357
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700358 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000359 defer cancel()
360
361 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
362 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700363 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000364 }
365
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700366 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000367}
368
// getTextInput is the decoded argument object of the browser_get_text tool.
type getTextInput struct {
	// Selector is the CSS selector of the element to read.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string; see parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
374
// getTextOutput is the JSON reply of the browser_get_text tool.
type getTextOutput struct {
	// Text is the text read from the selected element.
	Text string `json:"text"`
}
378
379// NewGetTextTool creates a tool for getting text from elements
380func (b *BrowseTools) NewGetTextTool() *llm.Tool {
381 return &llm.Tool{
382 Name: "browser_get_text",
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700383 Description: "Get the innerText of an element. Can be used to read the web page.",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000384 InputSchema: json.RawMessage(`{
385 "type": "object",
386 "properties": {
387 "selector": {
388 "type": "string",
389 "description": "CSS selector for the element to get text from"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700390 },
391 "timeout": {
392 "type": "string",
393 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000394 }
395 },
396 "required": ["selector"]
397 }`),
398 Run: b.getTextRun,
399 }
400}
401
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700402func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000403 var input getTextInput
404 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700405 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000406 }
407
408 browserCtx, err := b.GetBrowserContext()
409 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700410 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000411 }
412
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700413 // Create a timeout context for this operation
414 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
415 defer cancel()
416
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000417 var text string
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700418 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000419 chromedp.WaitReady(input.Selector),
420 chromedp.Text(input.Selector, &text),
421 )
422 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700423 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000424 }
425
426 output := getTextOutput{Text: text}
427 result, err := json.Marshal(output)
428 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700429 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000430 }
431
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700432 return llm.TextContent(string(result)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000433}
434
// evalInput is the decoded argument object of the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript source to evaluate in the page.
	Expression string `json:"expression"`
	// Timeout is an optional Go duration string; see parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
440
// evalOutput is the JSON reply of the browser_eval tool.
type evalOutput struct {
	// Result holds whatever value the evaluated expression produced.
	Result any `json:"result"`
}
444
445// NewEvalTool creates a tool for evaluating JavaScript
446func (b *BrowseTools) NewEvalTool() *llm.Tool {
447 return &llm.Tool{
448 Name: "browser_eval",
449 Description: "Evaluate JavaScript in the browser context",
450 InputSchema: json.RawMessage(`{
451 "type": "object",
452 "properties": {
453 "expression": {
454 "type": "string",
455 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700456 },
457 "timeout": {
458 "type": "string",
459 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000460 }
461 },
462 "required": ["expression"]
463 }`),
464 Run: b.evalRun,
465 }
466}
467
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700468func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000469 var input evalInput
470 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700471 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000472 }
473
474 browserCtx, err := b.GetBrowserContext()
475 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700476 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000477 }
478
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700479 // Create a timeout context for this operation
480 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
481 defer cancel()
482
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000483 var result any
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700484 err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000485 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700486 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000487 }
488
489 output := evalOutput{Result: result}
490 response, err := json.Marshal(output)
491 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700492 return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000493 }
494
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700495 return llm.TextContent(string(response)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000496}
497
// screenshotInput is the decoded argument object of the
// browser_take_screenshot tool.
type screenshotInput struct {
	// Selector optionally limits the screenshot to one element.
	Selector string `json:"selector,omitempty"`
	// Format is the requested output format as declared in the tool schema.
	Format string `json:"format,omitempty"`
	// Timeout is an optional Go duration string; see parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
504
// screenshotOutput carries the ID of a saved screenshot.
type screenshotOutput struct {
	ID string `json:"id"`
}
508
509// NewScreenshotTool creates a tool for taking screenshots
510func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
511 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700512 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000513 Description: "Take a screenshot of the page or a specific element",
514 InputSchema: json.RawMessage(`{
515 "type": "object",
516 "properties": {
517 "selector": {
518 "type": "string",
519 "description": "CSS selector for the element to screenshot (optional)"
520 },
521 "format": {
522 "type": "string",
523 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
524 "enum": ["base64", "png"]
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700525 },
526 "timeout": {
527 "type": "string",
528 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000529 }
530 }
531 }`),
532 Run: b.screenshotRun,
533 }
534}
535
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700536func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000537 var input screenshotInput
538 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700539 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000540 }
541
542 browserCtx, err := b.GetBrowserContext()
543 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700544 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000545 }
546
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700547 // Create a timeout context for this operation
548 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
549 defer cancel()
550
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000551 var buf []byte
552 var actions []chromedp.Action
553
554 if input.Selector != "" {
555 // Take screenshot of specific element
556 actions = append(actions,
557 chromedp.WaitReady(input.Selector),
558 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
559 )
560 } else {
561 // Take full page screenshot
562 actions = append(actions, chromedp.CaptureScreenshot(&buf))
563 }
564
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700565 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000566 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700567 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000568 }
569
570 // Save the screenshot and get its ID
571 id := b.SaveScreenshot(buf)
572 if id == "" {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700573 return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000574 }
575
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700576 // Get the full path to the screenshot
577 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000578
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700579 // Return the ID and instructions on how to view the screenshot
580 result := fmt.Sprintf(`{
581 "id": "%s",
582 "path": "%s",
583 "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided."
584}`, id, screenshotPath)
585
586 return llm.TextContent(result), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000587}
588
// scrollIntoViewInput is the decoded argument object of the
// browser_scroll_into_view tool.
type scrollIntoViewInput struct {
	// Selector is the CSS selector of the element to bring into view.
	Selector string `json:"selector"`
	// Timeout is an optional Go duration string; see parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
594
595// NewScrollIntoViewTool creates a tool for scrolling elements into view
596func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
597 return &llm.Tool{
598 Name: "browser_scroll_into_view",
599 Description: "Scroll an element into view if it's not visible",
600 InputSchema: json.RawMessage(`{
601 "type": "object",
602 "properties": {
603 "selector": {
604 "type": "string",
605 "description": "CSS selector for the element to scroll into view"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700606 },
607 "timeout": {
608 "type": "string",
609 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000610 }
611 },
612 "required": ["selector"]
613 }`),
614 Run: b.scrollIntoViewRun,
615 }
616}
617
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700618func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000619 var input scrollIntoViewInput
620 if err := json.Unmarshal(m, &input); err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700621 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000622 }
623
624 browserCtx, err := b.GetBrowserContext()
625 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700626 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000627 }
628
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700629 // Create a timeout context for this operation
630 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
631 defer cancel()
632
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000633 script := fmt.Sprintf(`
634 const el = document.querySelector('%s');
635 if (el) {
636 el.scrollIntoView({behavior: 'smooth', block: 'center'});
637 return true;
638 }
639 return false;
640 `, input.Selector)
641
642 var result bool
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700643 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000644 chromedp.WaitReady(input.Selector),
645 chromedp.Evaluate(script, &result),
646 )
647 if err != nil {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700648 return llm.TextContent(errorResponse(err)), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000649 }
650
651 if !result {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700652 return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000653 }
654
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700655 return llm.TextContent(successResponse()), nil
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000656}
657
// resizeInput is the decoded argument object of the browser_resize tool.
type resizeInput struct {
	// Width and Height are the requested dimensions in pixels.
	Width  int `json:"width"`
	Height int `json:"height"`
	// Timeout is an optional Go duration string; see parseTimeout.
	Timeout string `json:"timeout,omitempty"`
}
664
665// NewResizeTool creates a tool for resizing the browser window
666func (b *BrowseTools) NewResizeTool() *llm.Tool {
667 return &llm.Tool{
668 Name: "browser_resize",
669 Description: "Resize the browser window to a specific width and height",
670 InputSchema: json.RawMessage(`{
671 "type": "object",
672 "properties": {
673 "width": {
674 "type": "integer",
675 "description": "Window width in pixels"
676 },
677 "height": {
678 "type": "integer",
679 "description": "Window height in pixels"
680 },
681 "timeout": {
682 "type": "string",
683 "description": "Timeout as a Go duration string (default: 5s)"
684 }
685 },
686 "required": ["width", "height"]
687 }`),
688 Run: b.resizeRun,
689 }
690}
691
692func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
693 var input resizeInput
694 if err := json.Unmarshal(m, &input); err != nil {
695 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
696 }
697
698 browserCtx, err := b.GetBrowserContext()
699 if err != nil {
700 return llm.TextContent(errorResponse(err)), nil
701 }
702
703 // Create a timeout context for this operation
704 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
705 defer cancel()
706
707 // Validate dimensions
708 if input.Width <= 0 || input.Height <= 0 {
709 return llm.TextContent(errorResponse(fmt.Errorf("invalid dimensions: width and height must be positive"))), nil
710 }
711
712 // Resize the browser window
713 err = chromedp.Run(timeoutCtx,
714 chromedp.EmulateViewport(int64(input.Width), int64(input.Height)),
715 )
716 if err != nil {
717 return llm.TextContent(errorResponse(err)), nil
718 }
719
720 return llm.TextContent(successResponse()), nil
721}
722
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700723// GetTools returns browser tools, optionally filtering out screenshot-related tools
724func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
725 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000726 b.NewNavigateTool(),
727 b.NewClickTool(),
728 b.NewTypeTool(),
729 b.NewWaitForTool(),
730 b.NewGetTextTool(),
731 b.NewEvalTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000732 b.NewScrollIntoViewTool(),
Philip Zeyliger05224842025-05-10 18:26:08 -0700733 b.NewResizeTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000734 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700735
736 // Add screenshot-related tools if supported
737 if includeScreenshotTools {
738 tools = append(tools, b.NewScreenshotTool())
739 tools = append(tools, b.NewReadImageTool())
740 }
741
742 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000743}
744
745// SaveScreenshot saves a screenshot to disk and returns its ID
746func (b *BrowseTools) SaveScreenshot(data []byte) string {
747 // Generate a unique ID
748 id := uuid.New().String()
749
750 // Save the file
751 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000752 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000753 log.Printf("Failed to save screenshot: %v", err)
754 return ""
755 }
756
757 // Track this screenshot
758 b.screenshotsMutex.Lock()
759 b.screenshots[id] = time.Now()
760 b.screenshotsMutex.Unlock()
761
762 return id
763}
764
765// GetScreenshotPath returns the full path to a screenshot by ID
766func GetScreenshotPath(id string) string {
767 return filepath.Join(ScreenshotDir, id+".png")
768}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700769
// readImageInput is the decoded argument object of the browser_read_image tool.
type readImageInput struct {
	// Path is the location of the image file on disk.
	Path string `json:"path"`
	// Timeout is declared in the tool schema as a Go duration string.
	Timeout string `json:"timeout,omitempty"`
}
775
776// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
777func (b *BrowseTools) NewReadImageTool() *llm.Tool {
778 return &llm.Tool{
779 Name: "browser_read_image",
780 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
781 InputSchema: json.RawMessage(`{
782 "type": "object",
783 "properties": {
784 "path": {
785 "type": "string",
786 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700787 },
788 "timeout": {
789 "type": "string",
790 "description": "Timeout as a Go duration string (default: 5s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700791 }
792 },
793 "required": ["path"]
794 }`),
795 Run: b.readImageRun,
796 }
797}
798
799func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) {
800 var input readImageInput
801 if err := json.Unmarshal(m, &input); err != nil {
802 return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil
803 }
804
805 // Check if the path exists
806 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
807 return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil
808 }
809
810 // Read the file
811 imageData, err := os.ReadFile(input.Path)
812 if err != nil {
813 return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil
814 }
815
816 // Detect the image type
817 imageType := http.DetectContentType(imageData)
818 if !strings.HasPrefix(imageType, "image/") {
819 return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil
820 }
821
822 // Encode the image as base64
823 base64Data := base64.StdEncoding.EncodeToString(imageData)
824
825 // Create a Content object that includes both text and the image
826 return []llm.Content{
827 {
828 Type: llm.ContentTypeText,
829 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
830 },
831 {
832 Type: llm.ContentTypeText, // Will be mapped to image in content array
833 MediaType: imageType,
834 Data: base64Data,
835 },
836 }, nil
837}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700838
// parseTimeout converts a Go duration string (for example "500ms" or "10s")
// into a time.Duration for use as a per-operation timeout.
//
// It returns the default of 5 seconds when the string is empty, cannot be
// parsed, or parses to a zero or negative duration. Non-positive values are
// rejected because a context created with such a timeout is already expired
// and every browser action run under it would fail immediately.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 5 * time.Second

	if timeout == "" {
		return defaultTimeout
	}

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}

	return dur
}