blob: 81ae105bd94756ed8c9349000af26c07798b51a4 [file] [log] [blame]
Philip Zeyliger33d282f2025-05-03 04:01:54 +00001// Package browse provides browser automation tools for the agent
2package browse
3
4import (
5 "context"
Philip Zeyliger72252cb2025-05-10 17:00:08 -07006 "encoding/base64"
Philip Zeyliger33d282f2025-05-03 04:01:54 +00007 "encoding/json"
8 "fmt"
9 "log"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070010 "net/http"
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +000011 "net/url"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000012 "os"
13 "path/filepath"
Philip Zeyliger72252cb2025-05-10 17:00:08 -070014 "strings"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000015 "sync"
16 "time"
17
Philip Zeyliger18e33682025-05-13 16:34:21 -070018 "github.com/chromedp/cdproto/runtime"
Philip Zeyliger33d282f2025-05-03 04:01:54 +000019 "github.com/chromedp/chromedp"
20 "github.com/google/uuid"
21 "sketch.dev/llm"
22)
23
const (
	// ScreenshotDir is the on-disk directory that receives every screenshot
	// file written by the browser tools.
	ScreenshotDir = "/tmp/sketch-screenshots"
)
26
27// BrowseTools contains all browser tools and manages a shared browser instance
28type BrowseTools struct {
29 ctx context.Context
30 cancel context.CancelFunc
31 browserCtx context.Context
32 browserCtxCancel context.CancelFunc
33 mux sync.Mutex
34 initOnce sync.Once
35 initialized bool
36 initErr error
37 // Map to track screenshots by ID and their creation time
38 screenshots map[string]time.Time
39 screenshotsMutex sync.Mutex
Philip Zeyliger18e33682025-05-13 16:34:21 -070040 // Console logs storage
41 consoleLogs []*runtime.EventConsoleAPICalled
42 consoleLogsMutex sync.Mutex
43 maxConsoleLogs int
Philip Zeyliger33d282f2025-05-03 04:01:54 +000044}
45
46// NewBrowseTools creates a new set of browser automation tools
47func NewBrowseTools(ctx context.Context) *BrowseTools {
48 ctx, cancel := context.WithCancel(ctx)
49
50 // Ensure the screenshot directory exists
Autoformatter4962f152025-05-06 17:24:20 +000051 if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +000052 log.Printf("Failed to create screenshot directory: %v", err)
53 }
54
55 b := &BrowseTools{
Philip Zeyliger18e33682025-05-13 16:34:21 -070056 ctx: ctx,
57 cancel: cancel,
58 screenshots: make(map[string]time.Time),
59 consoleLogs: make([]*runtime.EventConsoleAPICalled, 0),
60 maxConsoleLogs: 100,
Philip Zeyliger33d282f2025-05-03 04:01:54 +000061 }
62
63 return b
64}
65
66// Initialize starts the browser if it's not already running
67func (b *BrowseTools) Initialize() error {
68 b.mux.Lock()
69 defer b.mux.Unlock()
70
71 b.initOnce.Do(func() {
72 // ChromeDP.ExecPath has a list of common places to find Chrome...
73 opts := chromedp.DefaultExecAllocatorOptions[:]
Philip Zeyligerc0131342025-06-13 21:07:08 -070074 // This is the default when running as root, but we generally need it
75 // when running in a container, even when we aren't root (which is largely
76 // the case for tests).
77 opts = append(opts, chromedp.NoSandbox)
Philip Zeyligera35de5f2025-06-14 12:00:48 -070078 // Setting 'DBUS_SESSION_BUS_ADDRESS=""' or this flag allows tests to pass
79 // in GitHub runner contexts. It's a mystery why the failure isn't clear when this fails.
80 opts = append(opts, chromedp.Flag("--disable-dbus", true))
81 // This can be pretty slow in tests
Philip Zeyligerfe51d1d2025-06-16 21:19:44 -070082 opts = append(opts, chromedp.WSURLReadTimeout(60*time.Second))
Philip Zeyliger9b39aa62025-07-14 11:56:02 -070083 // Add environment variable to mark this as a sketch internal process
84 opts = append(opts, chromedp.Env("SKETCH_IGNORE_PORTS=1"))
Philip Zeyliger33d282f2025-05-03 04:01:54 +000085 allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
86 browserCtx, browserCancel := chromedp.NewContext(
87 allocCtx,
Philip Zeyligerfe51d1d2025-06-16 21:19:44 -070088 chromedp.WithLogf(log.Printf), chromedp.WithErrorf(log.Printf), chromedp.WithBrowserOption(chromedp.WithDialTimeout(60*time.Second)),
Philip Zeyliger33d282f2025-05-03 04:01:54 +000089 )
90
91 b.browserCtx = browserCtx
92 b.browserCtxCancel = browserCancel
93
Philip Zeyliger18e33682025-05-13 16:34:21 -070094 // Set up console log listener
95 chromedp.ListenTarget(browserCtx, func(ev any) {
96 switch e := ev.(type) {
97 case *runtime.EventConsoleAPICalled:
98 b.captureConsoleLog(e)
99 }
100 })
101
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000102 // Ensure the browser starts
103 if err := chromedp.Run(browserCtx); err != nil {
104 b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
105 return
106 }
Josh Bleecher Snyder7fbc8e42025-05-29 19:42:25 +0000107
108 // Set default viewport size to 1280x720 (16:9 widescreen)
109 if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
110 b.initErr = fmt.Errorf("failed to set default viewport: %w", err)
111 return
112 }
113
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000114 b.initialized = true
115 })
116
117 return b.initErr
118}
119
120// Close shuts down the browser
121func (b *BrowseTools) Close() {
122 b.mux.Lock()
123 defer b.mux.Unlock()
124
125 if b.browserCtxCancel != nil {
126 b.browserCtxCancel()
127 b.browserCtxCancel = nil
128 }
129
130 if b.cancel != nil {
131 b.cancel()
132 }
133
134 b.initialized = false
135 log.Println("Browser closed")
136}
137
138// GetBrowserContext returns the context for browser operations
139func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
140 if err := b.Initialize(); err != nil {
141 return nil, err
142 }
143 return b.browserCtx, nil
144}
145
// navigateInput is the JSON payload decoded by the browser_navigate tool.
type navigateInput struct {
	// URL is the address to navigate to.
	URL string `json:"url"`
	// Timeout is an optional Go duration string.
	Timeout string `json:"timeout,omitempty"`
}
151
// isPort80 reports whether urlStr definitely targets port 80: either the port
// is written out as 80, or no port is given and the scheme is http. A URL that
// fails to parse is reported as false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}

	switch u.Port() {
	case "80":
		return true
	case "":
		// No explicit port: only plain http implies port 80.
		return u.Scheme == "http"
	default:
		return false
	}
}
161
162// NewNavigateTool creates a tool for navigating to URLs
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000163func (b *BrowseTools) NewNavigateTool() *llm.Tool {
164 return &llm.Tool{
165 Name: "browser_navigate",
166 Description: "Navigate the browser to a specific URL and wait for page to load",
167 InputSchema: json.RawMessage(`{
168 "type": "object",
169 "properties": {
170 "url": {
171 "type": "string",
172 "description": "The URL to navigate to"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700173 },
174 "timeout": {
175 "type": "string",
Josh Bleecher Snydera271a212025-07-30 23:08:00 +0000176 "description": "Timeout as a Go duration string (default: 15s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000177 }
178 },
179 "required": ["url"]
180 }`),
181 Run: b.navigateRun,
182 }
183}
184
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700185func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000186 var input navigateInput
187 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700188 return llm.ErrorfToolOut("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000189 }
190
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +0000191 if isPort80(input.URL) {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700192 return llm.ErrorToolOut(fmt.Errorf("port 80 is not the port you're looking for--port 80 is the main sketch server"))
Josh Bleecher Snyderbf381a72025-05-29 23:45:02 +0000193 }
194
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000195 browserCtx, err := b.GetBrowserContext()
196 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700197 return llm.ErrorToolOut(err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000198 }
199
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700200 // Create a timeout context for this operation
201 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
202 defer cancel()
203
204 err = chromedp.Run(timeoutCtx,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000205 chromedp.Navigate(input.URL),
206 chromedp.WaitReady("body"),
207 )
208 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700209 return llm.ErrorToolOut(err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000210 }
211
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700212 return llm.ToolOut{LLMContent: llm.TextContent("done")}
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000213}
214
// evalInput is the JSON payload decoded by the browser_eval tool.
type evalInput struct {
	// Expression is the JavaScript to evaluate.
	Expression string `json:"expression"`
	// Timeout is an optional Go duration string.
	Timeout string `json:"timeout,omitempty"`
	// Await is a pointer so that an absent field (nil) can be told apart
	// from an explicit false.
	Await *bool `json:"await,omitempty"`
}
221
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000222// NewEvalTool creates a tool for evaluating JavaScript
223func (b *BrowseTools) NewEvalTool() *llm.Tool {
224 return &llm.Tool{
Josh Bleecher Snydera271a212025-07-30 23:08:00 +0000225 Name: "browser_eval",
226 Description: `Evaluate JavaScript in the browser context.
227Your go-to tool for interacting with content: clicking buttons, typing, getting content, scrolling, resizing, waiting for content/selector to be ready, etc.`,
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000228 InputSchema: json.RawMessage(`{
229 "type": "object",
230 "properties": {
231 "expression": {
232 "type": "string",
233 "description": "JavaScript expression to evaluate"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700234 },
235 "timeout": {
236 "type": "string",
Josh Bleecher Snydera271a212025-07-30 23:08:00 +0000237 "description": "Timeout as a Go duration string (default: 15s)"
238 },
239 "await": {
240 "type": "boolean",
241 "description": "If true, wait for promises to resolve and return their resolved value (default: true)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000242 }
243 },
244 "required": ["expression"]
245 }`),
246 Run: b.evalRun,
247 }
248}
249
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700250func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000251 var input evalInput
252 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700253 return llm.ErrorfToolOut("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000254 }
255
256 browserCtx, err := b.GetBrowserContext()
257 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700258 return llm.ErrorToolOut(err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000259 }
260
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700261 // Create a timeout context for this operation
262 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
263 defer cancel()
264
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000265 var result any
Josh Bleecher Snydera271a212025-07-30 23:08:00 +0000266 var evalOps []chromedp.EvaluateOption
267
268 await := true
269 if input.Await != nil {
270 await = *input.Await
271 }
272 if await {
273 evalOps = append(evalOps, func(p *runtime.EvaluateParams) *runtime.EvaluateParams {
274 return p.WithAwaitPromise(true)
275 })
276 }
277
278 evalAction := chromedp.Evaluate(input.Expression, &result, evalOps...)
279
280 err = chromedp.Run(timeoutCtx, evalAction)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000281 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700282 return llm.ErrorToolOut(err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000283 }
284
Josh Bleecher Snydercb557262025-06-30 23:55:20 +0000285 // Return the result as JSON
286 response, err := json.Marshal(result)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000287 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700288 return llm.ErrorfToolOut("failed to marshal response: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000289 }
290
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700291 return llm.ToolOut{LLMContent: llm.TextContent("<javascript_result>" + string(response) + "</javascript_result>")}
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000292}
293
// screenshotInput is the JSON payload decoded by the browser_take_screenshot
// tool.
type screenshotInput struct {
	// Selector optionally names the element to capture.
	Selector string `json:"selector,omitempty"`
	// Timeout is an optional Go duration string.
	Timeout string `json:"timeout,omitempty"`
}
299
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000300// NewScreenshotTool creates a tool for taking screenshots
301func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
302 return &llm.Tool{
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700303 Name: "browser_take_screenshot",
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000304 Description: "Take a screenshot of the page or a specific element",
305 InputSchema: json.RawMessage(`{
306 "type": "object",
307 "properties": {
308 "selector": {
309 "type": "string",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700310 "description": "CSS selector for the element to screenshot (optional)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000311 },
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700312 "timeout": {
313 "type": "string",
Josh Bleecher Snydera271a212025-07-30 23:08:00 +0000314 "description": "Timeout as a Go duration string (default: 15s)"
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000315 }
316 }
317 }`),
318 Run: b.screenshotRun,
319 }
320}
321
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700322func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000323 var input screenshotInput
324 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700325 return llm.ErrorfToolOut("invalid input: %w", err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000326 }
327
328 browserCtx, err := b.GetBrowserContext()
329 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700330 return llm.ErrorToolOut(err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000331 }
332
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700333 // Create a timeout context for this operation
334 timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout))
335 defer cancel()
336
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000337 var buf []byte
338 var actions []chromedp.Action
339
340 if input.Selector != "" {
341 // Take screenshot of specific element
342 actions = append(actions,
343 chromedp.WaitReady(input.Selector),
344 chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
345 )
346 } else {
347 // Take full page screenshot
348 actions = append(actions, chromedp.CaptureScreenshot(&buf))
349 }
350
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700351 err = chromedp.Run(timeoutCtx, actions...)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000352 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700353 return llm.ErrorToolOut(err)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000354 }
355
Philip Zeyliger542bda32025-06-11 18:31:03 -0700356 // Save the screenshot and get its ID for potential future reference
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000357 id := b.SaveScreenshot(buf)
358 if id == "" {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700359 return llm.ErrorToolOut(fmt.Errorf("failed to save screenshot"))
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000360 }
361
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700362 // Get the full path to the screenshot
363 screenshotPath := GetScreenshotPath(id)
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000364
Philip Zeyliger542bda32025-06-11 18:31:03 -0700365 // Encode the image as base64
366 base64Data := base64.StdEncoding.EncodeToString(buf)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700367
Philip Zeyliger542bda32025-06-11 18:31:03 -0700368 // Return the screenshot directly to the LLM
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700369 return llm.ToolOut{LLMContent: []llm.Content{
Philip Zeyliger542bda32025-06-11 18:31:03 -0700370 {
371 Type: llm.ContentTypeText,
372 Text: fmt.Sprintf("Screenshot taken (saved as %s)", screenshotPath),
373 },
374 {
375 Type: llm.ContentTypeText, // Will be mapped to image in content array
376 MediaType: "image/png",
377 Data: base64Data,
378 },
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700379 }}
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000380}
381
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700382// GetTools returns browser tools, optionally filtering out screenshot-related tools
383func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
384 tools := []*llm.Tool{
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000385 b.NewNavigateTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000386 b.NewEvalTool(),
Philip Zeyliger18e33682025-05-13 16:34:21 -0700387 b.NewRecentConsoleLogsTool(),
388 b.NewClearConsoleLogsTool(),
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000389 }
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700390
391 // Add screenshot-related tools if supported
392 if includeScreenshotTools {
393 tools = append(tools, b.NewScreenshotTool())
394 tools = append(tools, b.NewReadImageTool())
395 }
396
397 return tools
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000398}
399
400// SaveScreenshot saves a screenshot to disk and returns its ID
401func (b *BrowseTools) SaveScreenshot(data []byte) string {
402 // Generate a unique ID
403 id := uuid.New().String()
404
405 // Save the file
406 filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter4962f152025-05-06 17:24:20 +0000407 if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger33d282f2025-05-03 04:01:54 +0000408 log.Printf("Failed to save screenshot: %v", err)
409 return ""
410 }
411
412 // Track this screenshot
413 b.screenshotsMutex.Lock()
414 b.screenshots[id] = time.Now()
415 b.screenshotsMutex.Unlock()
416
417 return id
418}
419
420// GetScreenshotPath returns the full path to a screenshot by ID
421func GetScreenshotPath(id string) string {
422 return filepath.Join(ScreenshotDir, id+".png")
423}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700424
// readImageInput is the JSON payload decoded by the read_image tool.
type readImageInput struct {
	// Path is the location of the image file to read.
	Path string `json:"path"`
	// Timeout is an optional Go duration string.
	Timeout string `json:"timeout,omitempty"`
}
430
431// NewReadImageTool creates a tool for reading images and returning them as base64 encoded data
432func (b *BrowseTools) NewReadImageTool() *llm.Tool {
433 return &llm.Tool{
Philip Zeyliger542bda32025-06-11 18:31:03 -0700434 Name: "read_image",
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700435 Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
436 InputSchema: json.RawMessage(`{
437 "type": "object",
438 "properties": {
439 "path": {
440 "type": "string",
441 "description": "Path to the image file to read"
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700442 },
443 "timeout": {
444 "type": "string",
Josh Bleecher Snydera271a212025-07-30 23:08:00 +0000445 "description": "Timeout as a Go duration string (default: 15s)"
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700446 }
447 },
448 "required": ["path"]
449 }`),
450 Run: b.readImageRun,
451 }
452}
453
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700454func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700455 var input readImageInput
456 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700457 return llm.ErrorfToolOut("invalid input: %w", err)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700458 }
459
460 // Check if the path exists
461 if _, err := os.Stat(input.Path); os.IsNotExist(err) {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700462 return llm.ErrorfToolOut("image file not found: %s", input.Path)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700463 }
464
465 // Read the file
466 imageData, err := os.ReadFile(input.Path)
467 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700468 return llm.ErrorfToolOut("failed to read image file: %w", err)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700469 }
470
471 // Detect the image type
472 imageType := http.DetectContentType(imageData)
473 if !strings.HasPrefix(imageType, "image/") {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700474 return llm.ErrorfToolOut("file is not an image: %s", imageType)
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700475 }
476
477 // Encode the image as base64
478 base64Data := base64.StdEncoding.EncodeToString(imageData)
479
480 // Create a Content object that includes both text and the image
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700481 return llm.ToolOut{LLMContent: []llm.Content{
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700482 {
483 Type: llm.ContentTypeText,
484 Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType),
485 },
486 {
487 Type: llm.ContentTypeText, // Will be mapped to image in content array
488 MediaType: imageType,
489 Data: base64Data,
490 },
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700491 }}
Philip Zeyliger72252cb2025-05-10 17:00:08 -0700492}
Philip Zeyliger80b488d2025-05-10 18:21:54 -0700493
// parseTimeout converts a Go duration string (for example "30s" or "1m30s")
// into a time.Duration.
//
// It returns the 15-second default when timeout is empty, cannot be parsed,
// or is not strictly positive. A zero or negative duration is rejected because
// it would give context.WithTimeout a deadline that has already passed, making
// the operation fail before it starts.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 15 * time.Second

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}
	return dur
}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700503
504// captureConsoleLog captures a console log event and stores it
505func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
506 // Add to logs with mutex protection
507 b.consoleLogsMutex.Lock()
508 defer b.consoleLogsMutex.Unlock()
509
510 // Add the log and maintain max size
511 b.consoleLogs = append(b.consoleLogs, e)
512 if len(b.consoleLogs) > b.maxConsoleLogs {
513 b.consoleLogs = b.consoleLogs[len(b.consoleLogs)-b.maxConsoleLogs:]
514 }
515}
516
// recentConsoleLogsInput is the JSON payload decoded by the
// browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
	// Limit optionally caps how many entries are returned.
	Limit int `json:"limit,omitempty"`
}
521
522// NewRecentConsoleLogsTool creates a tool for retrieving recent console logs
523func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
524 return &llm.Tool{
525 Name: "browser_recent_console_logs",
526 Description: "Get recent browser console logs",
527 InputSchema: json.RawMessage(`{
528 "type": "object",
529 "properties": {
530 "limit": {
531 "type": "integer",
532 "description": "Maximum number of log entries to return (default: 100)"
533 }
534 }
535 }`),
536 Run: b.recentConsoleLogsRun,
537 }
538}
539
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700540func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
Philip Zeyliger18e33682025-05-13 16:34:21 -0700541 var input recentConsoleLogsInput
542 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700543 return llm.ErrorfToolOut("invalid input: %w", err)
Philip Zeyliger18e33682025-05-13 16:34:21 -0700544 }
545
546 // Ensure browser is initialized
547 _, err := b.GetBrowserContext()
548 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700549 return llm.ErrorToolOut(err)
Philip Zeyliger18e33682025-05-13 16:34:21 -0700550 }
551
552 // Apply limit (default to 100 if not specified)
553 limit := 100
554 if input.Limit > 0 {
555 limit = input.Limit
556 }
557
558 // Get console logs with mutex protection
559 b.consoleLogsMutex.Lock()
560 logs := make([]*runtime.EventConsoleAPICalled, 0, len(b.consoleLogs))
561 start := 0
562 if len(b.consoleLogs) > limit {
563 start = len(b.consoleLogs) - limit
564 }
565 logs = append(logs, b.consoleLogs[start:]...)
566 b.consoleLogsMutex.Unlock()
567
568 // Format the logs as JSON
569 logData, err := json.MarshalIndent(logs, "", " ")
570 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700571 return llm.ErrorfToolOut("failed to serialize logs: %w", err)
Philip Zeyliger18e33682025-05-13 16:34:21 -0700572 }
573
574 // Format the logs
575 var sb strings.Builder
576 sb.WriteString(fmt.Sprintf("Retrieved %d console log entries:\n\n", len(logs)))
577
578 if len(logs) == 0 {
579 sb.WriteString("No console logs captured.")
580 } else {
581 // Add the JSON data for full details
582 sb.WriteString(string(logData))
583 }
584
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700585 return llm.ToolOut{LLMContent: llm.TextContent(sb.String())}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700586}
587
// clearConsoleLogsInput is the (field-less) JSON payload decoded by the
// browser_clear_console_logs tool.
type clearConsoleLogsInput struct{}
590
591// NewClearConsoleLogsTool creates a tool for clearing console logs
592func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
593 return &llm.Tool{
594 Name: "browser_clear_console_logs",
595 Description: "Clear all captured browser console logs",
Josh Bleecher Snyder74d690e2025-05-14 18:16:03 -0700596 InputSchema: llm.EmptySchema(),
597 Run: b.clearConsoleLogsRun,
Philip Zeyliger18e33682025-05-13 16:34:21 -0700598 }
599}
600
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700601func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
Philip Zeyliger18e33682025-05-13 16:34:21 -0700602 var input clearConsoleLogsInput
603 if err := json.Unmarshal(m, &input); err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700604 return llm.ErrorfToolOut("invalid input: %w", err)
Philip Zeyliger18e33682025-05-13 16:34:21 -0700605 }
606
607 // Ensure browser is initialized
608 _, err := b.GetBrowserContext()
609 if err != nil {
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700610 return llm.ErrorToolOut(err)
Philip Zeyliger18e33682025-05-13 16:34:21 -0700611 }
612
613 // Clear console logs with mutex protection
614 b.consoleLogsMutex.Lock()
615 logCount := len(b.consoleLogs)
616 b.consoleLogs = make([]*runtime.EventConsoleAPICalled, 0)
617 b.consoleLogsMutex.Unlock()
618
Josh Bleecher Snyder43b60b92025-07-21 14:57:10 -0700619 return llm.ToolOut{LLMContent: llm.TextContent(fmt.Sprintf("Cleared %d console log entries.", logCount))}
Philip Zeyliger18e33682025-05-13 16:34:21 -0700620}