blob: 52248b8c9784cb3f826e3610dabcd90e22d16197 [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
6 "encoding/json"
7 "fmt"
8 "log"
9 "os"
10 "path/filepath"
11 "sync"
12 "time"
13
14 "github.com/chromedp/chromedp"
15 "github.com/google/uuid"
16 "sketch.dev/llm"
17)
18
// ScreenshotDir names the directory on disk under which screenshot files
// are written and from which they are looked up by ID.
const ScreenshotDir = "/tmp/sketch-screenshots"
21
// BrowseTools bundles every browser tool together with the single browser
// instance they share. Construct it with NewBrowseTools.
type BrowseTools struct {
	// ctx is the cancellable context created in NewBrowseTools; cancel ends it.
	ctx    context.Context
	cancel context.CancelFunc

	// browserCtx and browserCtxCancel are filled in by Initialize when the
	// browser context is created.
	browserCtx       context.Context
	browserCtxCancel context.CancelFunc

	// mux is held by Initialize and Close.
	mux sync.Mutex
	// initOnce makes the browser start-up code run a single time.
	initOnce sync.Once
	// initialized is set after a successful start and cleared by Close.
	initialized bool
	// initErr keeps the start-up failure that Initialize reports.
	initErr error

	// screenshots maps a screenshot ID to the time it was recorded;
	// screenshotsMutex is held while the map is written.
	screenshots      map[string]time.Time
	screenshotsMutex sync.Mutex
}
36
37// NewBrowseTools creates a new set of browser automation tools
38func NewBrowseTools(ctx context.Context) *BrowseTools {
39 ctx, cancel := context.WithCancel(ctx)
40
41 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000042 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000043 log.Printf("Failed to create screenshot directory: %v", err)
44 }
45
46 b := &BrowseTools{
47 ctx: ctx,
48 cancel: cancel,
49 screenshots: make(map[string]time.Time),
50 }
51
52 return b
53}
54
55// Initialize starts the browser if it's not already running
56func (b *BrowseTools) Initialize() error {
57 b.mux.Lock()
58 defer b.mux.Unlock()
59
60 b.initOnce.Do(func() {
61 // ChromeDP.ExecPath has a list of common places to find Chrome...
62 opts := chromedp.DefaultExecAllocatorOptions[:]
63 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
64 browserCtx, browserCancel := chromedp.NewContext(
65 allocCtx,
66 chromedp.WithLogf(log.Printf),
67 )
68
69 b.browserCtx = browserCtx
70 b.browserCtxCancel = browserCancel
71
72 // Ensure the browser starts
73 if err := chromedp.Run(browserCtx); err != nil {
74 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
75 return
76 }
77 b.initialized = true
78 })
79
80 return b.initErr
81}
82
83// Close shuts down the browser
84func (b *BrowseTools) Close() {
85 b.mux.Lock()
86 defer b.mux.Unlock()
87
88 if b.browserCtxCancel != nil {
89 b.browserCtxCancel()
90 b.browserCtxCancel = nil
91 }
92
93 if b.cancel != nil {
94 b.cancel()
95 }
96
97 b.initialized = false
98 log.Println("Browser closed")
99}
100
101// GetBrowserContext returns the context for browser operations
102func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
103 if err := b.Initialize(); err != nil {
104 return nil, err
105 }
106 return b.browserCtx, nil
107}
108
// baseResponse is the JSON shape of a plain status reply from a tool.
// Status is left out of the encoded object when it is empty.
type baseResponse struct {
	Status string `json:"status,omitempty"`
}
113
// successResponse returns the fixed JSON body tools send back when an action
// completed and there is no further data to report.
func successResponse() string {
	const body = `{"status":"success"}`
	return body
}
117
// errorResponse renders err as the JSON object
// {"status":"error","error":"<message>"} that tools return on failure.
//
// The message is encoded with encoding/json instead of being spliced into a
// format string, so quotes, backslashes and control characters in err.Error()
// cannot produce malformed JSON. A nil err is reported as "unknown error"
// rather than panicking.
func errorResponse(err error) string {
	msg := "unknown error"
	if err != nil {
		msg = err.Error()
	}

	body, marshalErr := json.Marshal(struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{Status: "error", Error: msg})
	if marshalErr != nil {
		// Not expected for a struct of two strings; stay well-formed regardless.
		return `{"status":"error","error":"failed to encode error message"}`
	}
	return string(body)
}
121
// navigateInput is the argument payload decoded by navigateRun.
type navigateInput struct {
	// URL is read from the "url" key and is the address to load.
	URL string `json:"url"`
}
126
127// NewNavigateTool creates a tool for navigating to URLs
128func (b *BrowseTools) NewNavigateTool() *llm.Tool {
129 return &llm.Tool{
130 Name: "browser_navigate",
131 Description: "Navigate the browser to a specific URL and wait for page to load",
132 InputSchema: json.RawMessage(`{
133 "type": "object",
134 "properties": {
135 "url": {
136 "type": "string",
137 "description": "The URL to navigate to"
138 }
139 },
140 "required": ["url"]
141 }`),
142 Run: b.navigateRun,
143 }
144}
145
146func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) (string, error) {
147 var input navigateInput
148 if err := json.Unmarshal(m, &input); err != nil {
149 return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
150 }
151
152 browserCtx, err := b.GetBrowserContext()
153 if err != nil {
154 return errorResponse(err), nil
155 }
156
157 err = chromedp.Run(browserCtx,
158 chromedp.Navigate(input.URL),
159 chromedp.WaitReady("body"),
160 )
161 if err != nil {
162 return errorResponse(err), nil
163 }
164
165 return successResponse(), nil
166}
167
// clickInput is the argument payload decoded by clickRun.
type clickInput struct {
	// Selector is read from "selector" and identifies the element to click.
	Selector string `json:"selector"`
	// WaitVisible is read from "wait_visible"; when true, clickRun adds a
	// visibility wait before clicking.
	WaitVisible bool `json:"wait_visible,omitempty"`
}
173
174// NewClickTool creates a tool for clicking elements
175func (b *BrowseTools) NewClickTool() *llm.Tool {
176 return &llm.Tool{
177 Name: "browser_click",
178 Description: "Click the first element matching a CSS selector",
179 InputSchema: json.RawMessage(`{
180 "type": "object",
181 "properties": {
182 "selector": {
183 "type": "string",
184 "description": "CSS selector for the element to click"
185 },
186 "wait_visible": {
187 "type": "boolean",
188 "description": "Wait for the element to be visible before clicking"
189 }
190 },
191 "required": ["selector"]
192 }`),
193 Run: b.clickRun,
194 }
195}
196
197func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) (string, error) {
198 var input clickInput
199 if err := json.Unmarshal(m, &input); err != nil {
200 return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
201 }
202
203 browserCtx, err := b.GetBrowserContext()
204 if err != nil {
205 return errorResponse(err), nil
206 }
207
208 actions := []chromedp.Action{
209 chromedp.WaitReady(input.Selector),
210 }
211
212 if input.WaitVisible {
213 actions = append(actions, chromedp.WaitVisible(input.Selector))
214 }
215
216 actions = append(actions, chromedp.Click(input.Selector))
217
218 err = chromedp.Run(browserCtx, actions...)
219 if err != nil {
220 return errorResponse(err), nil
221 }
222
223 return successResponse(), nil
224}
225
// typeInput is the argument payload decoded by typeRun.
type typeInput struct {
	// Selector is read from "selector" and identifies the target element.
	Selector string `json:"selector"`
	// Text is read from "text" and is what gets sent to the element.
	Text string `json:"text"`
	// Clear is read from "clear"; when true, the element is cleared first.
	Clear bool `json:"clear,omitempty"`
}
232
233// NewTypeTool creates a tool for typing into input elements
234func (b *BrowseTools) NewTypeTool() *llm.Tool {
235 return &llm.Tool{
236 Name: "browser_type",
237 Description: "Type text into an input or textarea element",
238 InputSchema: json.RawMessage(`{
239 "type": "object",
240 "properties": {
241 "selector": {
242 "type": "string",
243 "description": "CSS selector for the input element"
244 },
245 "text": {
246 "type": "string",
247 "description": "Text to type into the element"
248 },
249 "clear": {
250 "type": "boolean",
251 "description": "Clear the input field before typing"
252 }
253 },
254 "required": ["selector", "text"]
255 }`),
256 Run: b.typeRun,
257 }
258}
259
260func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) (string, error) {
261 var input typeInput
262 if err := json.Unmarshal(m, &input); err != nil {
263 return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
264 }
265
266 browserCtx, err := b.GetBrowserContext()
267 if err != nil {
268 return errorResponse(err), nil
269 }
270
271 actions := []chromedp.Action{
272 chromedp.WaitReady(input.Selector),
273 chromedp.WaitVisible(input.Selector),
274 }
275
276 if input.Clear {
277 actions = append(actions, chromedp.Clear(input.Selector))
278 }
279
280 actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
281
282 err = chromedp.Run(browserCtx, actions...)
283 if err != nil {
284 return errorResponse(err), nil
285 }
286
287 return successResponse(), nil
288}
289
// waitForInput is the argument payload decoded by waitForRun.
type waitForInput struct {
	// Selector is read from "selector" and identifies the awaited element.
	Selector string `json:"selector"`
	// TimeoutMS is read from "timeout_ms"; values that are not positive make
	// waitForRun fall back to its 30000 ms default.
	TimeoutMS int `json:"timeout_ms,omitempty"`
}
295
296// NewWaitForTool creates a tool for waiting for elements
297func (b *BrowseTools) NewWaitForTool() *llm.Tool {
298 return &llm.Tool{
299 Name: "browser_wait_for",
300 Description: "Wait for an element to be present in the DOM",
301 InputSchema: json.RawMessage(`{
302 "type": "object",
303 "properties": {
304 "selector": {
305 "type": "string",
306 "description": "CSS selector for the element to wait for"
307 },
308 "timeout_ms": {
309 "type": "integer",
310 "description": "Maximum time to wait in milliseconds (default: 30000)"
311 }
312 },
313 "required": ["selector"]
314 }`),
315 Run: b.waitForRun,
316 }
317}
318
319func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) (string, error) {
320 var input waitForInput
321 if err := json.Unmarshal(m, &input); err != nil {
322 return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
323 }
324
325 timeout := 30000 // default timeout 30 seconds
326 if input.TimeoutMS > 0 {
327 timeout = input.TimeoutMS
328 }
329
330 browserCtx, err := b.GetBrowserContext()
331 if err != nil {
332 return errorResponse(err), nil
333 }
334
335 timeoutCtx, cancel := context.WithTimeout(browserCtx, time.Duration(timeout)*time.Millisecond)
336 defer cancel()
337
338 err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
339 if err != nil {
340 return errorResponse(err), nil
341 }
342
343 return successResponse(), nil
344}
345
// Payload types used by getRunText's tool, "browser_get_text".
type (
	// getTextInput is the argument payload decoded by getTextRun; Selector is
	// read from the "selector" key.
	getTextInput struct {
		Selector string `json:"selector"`
	}

	// getTextOutput is the reply encoded by getTextRun; Text is written under
	// the "text" key.
	getTextOutput struct {
		Text string `json:"text"`
	}
)
354
355// NewGetTextTool creates a tool for getting text from elements
356func (b *BrowseTools) NewGetTextTool() *llm.Tool {
357 return &llm.Tool{
358 Name: "browser_get_text",
359 Description: "Get the innerText of an element",
360 InputSchema: json.RawMessage(`{
361 "type": "object",
362 "properties": {
363 "selector": {
364 "type": "string",
365 "description": "CSS selector for the element to get text from"
366 }
367 },
368 "required": ["selector"]
369 }`),
370 Run: b.getTextRun,
371 }
372}
373
374func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) (string, error) {
375 var input getTextInput
376 if err := json.Unmarshal(m, &input); err != nil {
377 return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
378 }
379
380 browserCtx, err := b.GetBrowserContext()
381 if err != nil {
382 return errorResponse(err), nil
383 }
384
385 var text string
386 err = chromedp.Run(browserCtx,
387 chromedp.WaitReady(input.Selector),
388 chromedp.Text(input.Selector, &text),
389 )
390 if err != nil {
391 return errorResponse(err), nil
392 }
393
394 output := getTextOutput{Text: text}
395 result, err := json.Marshal(output)
396 if err != nil {
397 return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
398 }
399
400 return string(result), nil
401}
402
// Payload types used by evalRun's tool, "browser_eval".
type (
	// evalInput is the argument payload decoded by evalRun; Expression is
	// read from the "expression" key.
	evalInput struct {
		Expression string `json:"expression"`
	}

	// evalOutput is the reply encoded by evalRun; Result holds whatever value
	// the evaluation produced and is written under the "result" key.
	evalOutput struct {
		Result any `json:"result"`
	}
)
411
412// NewEvalTool creates a tool for evaluating JavaScript
413func (b *BrowseTools) NewEvalTool() *llm.Tool {
414 return &llm.Tool{
415 Name: "browser_eval",
416 Description: "Evaluate JavaScript in the browser context",
417 InputSchema: json.RawMessage(`{
418 "type": "object",
419 "properties": {
420 "expression": {
421 "type": "string",
422 "description": "JavaScript expression to evaluate"
423 }
424 },
425 "required": ["expression"]
426 }`),
427 Run: b.evalRun,
428 }
429}
430
431func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) (string, error) {
432 var input evalInput
433 if err := json.Unmarshal(m, &input); err != nil {
434 return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
435 }
436
437 browserCtx, err := b.GetBrowserContext()
438 if err != nil {
439 return errorResponse(err), nil
440 }
441
442 var result any
443 err = chromedp.Run(browserCtx, chromedp.Evaluate(input.Expression, &result))
444 if err != nil {
445 return errorResponse(err), nil
446 }
447
448 output := evalOutput{Result: result}
449 response, err := json.Marshal(output)
450 if err != nil {
451 return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
452 }
453
454 return string(response), nil
455}
456
// Payload types used by screenshotRun's tool, "browser_screenshot".
type (
	// screenshotInput is the argument payload decoded by screenshotRun.
	// Selector ("selector") optionally limits the capture to one element.
	// Format ("format") is decoded, but screenshotRun does not read it.
	screenshotInput struct {
		Selector string `json:"selector,omitempty"`
		Format   string `json:"format,omitempty"`
	}

	// screenshotOutput is the reply encoded by screenshotRun; ID is the
	// identifier returned by SaveScreenshot, written under the "id" key.
	screenshotOutput struct {
		ID string `json:"id"`
	}
)
466
467// NewScreenshotTool creates a tool for taking screenshots
468func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
469 return &llm.Tool{
470 Name: "browser_screenshot",
471 Description: "Take a screenshot of the page or a specific element",
472 InputSchema: json.RawMessage(`{
473 "type": "object",
474 "properties": {
475 "selector": {
476 "type": "string",
477 "description": "CSS selector for the element to screenshot (optional)"
478 },
479 "format": {
480 "type": "string",
481 "description": "Output format ('base64' or 'png'), defaults to 'base64'",
482 "enum": ["base64", "png"]
483 }
484 }
485 }`),
486 Run: b.screenshotRun,
487 }
488}
489
490func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) (string, error) {
491 var input screenshotInput
492 if err := json.Unmarshal(m, &input); err != nil {
493 return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
494 }
495
496 browserCtx, err := b.GetBrowserContext()
497 if err != nil {
498 return errorResponse(err), nil
499 }
500
501 var buf []byte
502 var actions []chromedp.Action
503
504 if input.Selector != "" {
505 // Take screenshot of specific element
506 actions = append(actions,
507 chromedp.WaitReady(input.Selector),
508 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
509 )
510 } else {
511 // Take full page screenshot
512 actions = append(actions, chromedp.CaptureScreenshot(&buf))
513 }
514
515 err = chromedp.Run(browserCtx, actions...)
516 if err != nil {
517 return errorResponse(err), nil
518 }
519
520 // Save the screenshot and get its ID
521 id := b.SaveScreenshot(buf)
522 if id == "" {
523 return errorResponse(fmt.Errorf("failed to save screenshot")), nil
524 }
525
526 // Return the ID in the response
527 output := screenshotOutput{ID: id}
528 response, err := json.Marshal(output)
529 if err != nil {
530 return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
531 }
532
533 return string(response), nil
534}
535
// scrollIntoViewInput is the argument payload decoded by scrollIntoViewRun.
type scrollIntoViewInput struct {
	// Selector is read from "selector" and identifies the element to reveal.
	Selector string `json:"selector"`
}
540
541// NewScrollIntoViewTool creates a tool for scrolling elements into view
542func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool {
543 return &llm.Tool{
544 Name: "browser_scroll_into_view",
545 Description: "Scroll an element into view if it's not visible",
546 InputSchema: json.RawMessage(`{
547 "type": "object",
548 "properties": {
549 "selector": {
550 "type": "string",
551 "description": "CSS selector for the element to scroll into view"
552 }
553 },
554 "required": ["selector"]
555 }`),
556 Run: b.scrollIntoViewRun,
557 }
558}
559
560func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) (string, error) {
561 var input scrollIntoViewInput
562 if err := json.Unmarshal(m, &input); err != nil {
563 return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
564 }
565
566 browserCtx, err := b.GetBrowserContext()
567 if err != nil {
568 return errorResponse(err), nil
569 }
570
571 script := fmt.Sprintf(`
572 const el = document.querySelector('%s');
573 if (el) {
574 el.scrollIntoView({behavior: 'smooth', block: 'center'});
575 return true;
576 }
577 return false;
578 `, input.Selector)
579
580 var result bool
581 err = chromedp.Run(browserCtx,
582 chromedp.WaitReady(input.Selector),
583 chromedp.Evaluate(script, &result),
584 )
585 if err != nil {
586 return errorResponse(err), nil
587 }
588
589 if !result {
590 return errorResponse(fmt.Errorf("element not found: %s", input.Selector)), nil
591 }
592
593 return successResponse(), nil
594}
595
596// GetAllTools returns all browser tools
597func (b *BrowseTools) GetAllTools() []*llm.Tool {
598 return []*llm.Tool{
599 b.NewNavigateTool(),
600 b.NewClickTool(),
601 b.NewTypeTool(),
602 b.NewWaitForTool(),
603 b.NewGetTextTool(),
604 b.NewEvalTool(),
605 b.NewScreenshotTool(),
606 b.NewScrollIntoViewTool(),
607 }
608}
609
610// SaveScreenshot saves a screenshot to disk and returns its ID
611func (b *BrowseTools) SaveScreenshot(data []byte) string {
612 // Generate a unique ID
613 id := uuid.New().String()
614
615 // Save the file
616 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000617 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000618 log.Printf("Failed to save screenshot: %v", err)
619 return ""
620 }
621
622 // Track this screenshot
623 b.screenshotsMutex.Lock()
624 b.screenshots[id] = time.Now()
625 b.screenshotsMutex.Unlock()
626
627 return id
628}
629
630// GetScreenshotPath returns the full path to a screenshot by ID
631func GetScreenshotPath(id string) string {
632 return filepath.Join(ScreenshotDir, id+".png")
633}