| // Package browse provides browser automation tools for the agent |
| package browse |
| |
| import ( |
| "context" |
| "encoding/base64" |
| "encoding/json" |
| "fmt" |
| "log" |
| "net/http" |
| "os" |
| "path/filepath" |
| "strings" |
| "sync" |
| "time" |
| |
| "github.com/chromedp/chromedp" |
| "github.com/google/uuid" |
| "sketch.dev/llm" |
| ) |
| |
// ScreenshotDir is the filesystem location under which captured screenshots
// are written.
const ScreenshotDir = "/tmp/sketch-screenshots"
| |
// BrowseTools bundles every browser tool together with the single browser
// instance they all share.
type BrowseTools struct {
	// ctx is the root context for this tool set; cancel releases it.
	ctx    context.Context
	cancel context.CancelFunc

	// browserCtx is the chromedp context used for browser operations and
	// browserCtxCancel tears it down.
	browserCtx       context.Context
	browserCtxCancel context.CancelFunc

	mux         sync.Mutex // held by Initialize and Close
	initOnce    sync.Once  // makes sure the browser start-up runs at most once
	initialized bool       // set after a successful start, cleared by Close
	initErr     error      // start-up failure recorded by Initialize, if any

	// screenshots maps a screenshot ID to the time it was saved.
	screenshots      map[string]time.Time
	screenshotsMutex sync.Mutex // guards screenshots
}
| |
| // NewBrowseTools creates a new set of browser automation tools |
| func NewBrowseTools(ctx context.Context) *BrowseTools { |
| ctx, cancel := context.WithCancel(ctx) |
| |
| // Ensure the screenshot directory exists |
| if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil { |
| log.Printf("Failed to create screenshot directory: %v", err) |
| } |
| |
| b := &BrowseTools{ |
| ctx: ctx, |
| cancel: cancel, |
| screenshots: make(map[string]time.Time), |
| } |
| |
| return b |
| } |
| |
| // Initialize starts the browser if it's not already running |
| func (b *BrowseTools) Initialize() error { |
| b.mux.Lock() |
| defer b.mux.Unlock() |
| |
| b.initOnce.Do(func() { |
| // ChromeDP.ExecPath has a list of common places to find Chrome... |
| opts := chromedp.DefaultExecAllocatorOptions[:] |
| allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...) |
| browserCtx, browserCancel := chromedp.NewContext( |
| allocCtx, |
| chromedp.WithLogf(log.Printf), |
| ) |
| |
| b.browserCtx = browserCtx |
| b.browserCtxCancel = browserCancel |
| |
| // Ensure the browser starts |
| if err := chromedp.Run(browserCtx); err != nil { |
| b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err) |
| return |
| } |
| b.initialized = true |
| }) |
| |
| return b.initErr |
| } |
| |
| // Close shuts down the browser |
| func (b *BrowseTools) Close() { |
| b.mux.Lock() |
| defer b.mux.Unlock() |
| |
| if b.browserCtxCancel != nil { |
| b.browserCtxCancel() |
| b.browserCtxCancel = nil |
| } |
| |
| if b.cancel != nil { |
| b.cancel() |
| } |
| |
| b.initialized = false |
| log.Println("Browser closed") |
| } |
| |
| // GetBrowserContext returns the context for browser operations |
| func (b *BrowseTools) GetBrowserContext() (context.Context, error) { |
| if err := b.Initialize(); err != nil { |
| return nil, err |
| } |
| return b.browserCtx, nil |
| } |
| |
// baseResponse is the response shape all tools return when successful.
type baseResponse struct {
	// Status is omitted from the JSON encoding when empty.
	Status string `json:"status,omitempty"`
}
| |
// successResponse returns the fixed JSON document tools send back when an
// action completed without error.
func successResponse() string {
	const body = `{"status":"success"}`
	return body
}
| |
// errorResponse renders err as a JSON object of the form
// {"status":"error","error":"<message>"}.
//
// The message is JSON-encoded rather than spliced into a template, so quotes,
// backslashes and newlines in the error text can no longer produce invalid
// JSON. HTML escaping is disabled so characters such as '<', '>' and '&'
// (common in selectors and URLs) stay readable.
func errorResponse(err error) string {
	payload := struct {
		Status string `json:"status"`
		Error  string `json:"error"`
	}{
		Status: "error",
		Error:  err.Error(),
	}

	var sb strings.Builder
	enc := json.NewEncoder(&sb)
	enc.SetEscapeHTML(false)
	if encErr := enc.Encode(payload); encErr != nil {
		// Not expected for a struct of two strings; still return valid JSON.
		return `{"status":"error","error":"failed to encode error message"}`
	}

	// Encode terminates its output with a newline; callers expect none.
	return strings.TrimSuffix(sb.String(), "\n")
}
| |
| // NavigateTool definition |
| type navigateInput struct { |
| URL string `json:"url"` |
| Timeout string `json:"timeout,omitempty"` |
| } // NewNavigateTool creates a tool for navigating to URLs |
| func (b *BrowseTools) NewNavigateTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_navigate", |
| Description: "Navigate the browser to a specific URL and wait for page to load", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "url": { |
| "type": "string", |
| "description": "The URL to navigate to" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["url"] |
| }`), |
| Run: b.navigateRun, |
| } |
| } |
| |
| func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input navigateInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Create a timeout context for this operation |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| err = chromedp.Run(timeoutCtx, |
| chromedp.Navigate(input.URL), |
| chromedp.WaitReady("body"), |
| ) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| return llm.TextContent(successResponse()), nil |
| } |
| |
// clickInput is the JSON payload accepted by the browser_click tool.
type clickInput struct {
	Selector    string `json:"selector"`               // CSS selector of the element to click
	WaitVisible bool   `json:"wait_visible,omitempty"` // also wait for visibility before clicking
	Timeout     string `json:"timeout,omitempty"`      // optional Go duration string
}
| |
| // NewClickTool creates a tool for clicking elements |
| func (b *BrowseTools) NewClickTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_click", |
| Description: "Click the first element matching a CSS selector", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "selector": { |
| "type": "string", |
| "description": "CSS selector for the element to click" |
| }, |
| "wait_visible": { |
| "type": "boolean", |
| "description": "Wait for the element to be visible before clicking" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["selector"] |
| }`), |
| Run: b.clickRun, |
| } |
| } |
| |
| func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input clickInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Create a timeout context for this operation |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| actions := []chromedp.Action{ |
| chromedp.WaitReady(input.Selector), |
| } |
| |
| if input.WaitVisible { |
| actions = append(actions, chromedp.WaitVisible(input.Selector)) |
| } |
| |
| actions = append(actions, chromedp.Click(input.Selector)) |
| |
| err = chromedp.Run(timeoutCtx, actions...) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| return llm.TextContent(successResponse()), nil |
| } |
| |
// typeInput is the JSON payload accepted by the browser_type tool.
type typeInput struct {
	Selector string `json:"selector"`          // CSS selector of the input element
	Text     string `json:"text"`              // text to type into the element
	Clear    bool   `json:"clear,omitempty"`   // clear the field before typing
	Timeout  string `json:"timeout,omitempty"` // optional Go duration string
}
| |
| // NewTypeTool creates a tool for typing into input elements |
| func (b *BrowseTools) NewTypeTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_type", |
| Description: "Type text into an input or textarea element", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "selector": { |
| "type": "string", |
| "description": "CSS selector for the input element" |
| }, |
| "text": { |
| "type": "string", |
| "description": "Text to type into the element" |
| }, |
| "clear": { |
| "type": "boolean", |
| "description": "Clear the input field before typing" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["selector", "text"] |
| }`), |
| Run: b.typeRun, |
| } |
| } |
| |
| func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input typeInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Create a timeout context for this operation |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| actions := []chromedp.Action{ |
| chromedp.WaitReady(input.Selector), |
| chromedp.WaitVisible(input.Selector), |
| } |
| |
| if input.Clear { |
| actions = append(actions, chromedp.Clear(input.Selector)) |
| } |
| |
| actions = append(actions, chromedp.SendKeys(input.Selector, input.Text)) |
| |
| err = chromedp.Run(timeoutCtx, actions...) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| return llm.TextContent(successResponse()), nil |
| } |
| |
// waitForInput is the JSON payload accepted by the browser_wait_for tool.
type waitForInput struct {
	Selector string `json:"selector"`          // CSS selector of the element to wait for
	Timeout  string `json:"timeout,omitempty"` // optional Go duration string
}
| |
| // NewWaitForTool creates a tool for waiting for elements |
| func (b *BrowseTools) NewWaitForTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_wait_for", |
| Description: "Wait for an element to be present in the DOM", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "selector": { |
| "type": "string", |
| "description": "CSS selector for the element to wait for" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["selector"] |
| }`), |
| Run: b.waitForRun, |
| } |
| } |
| |
| func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input waitForInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector)) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| return llm.TextContent(successResponse()), nil |
| } |
| |
// getTextInput is the JSON payload accepted by the browser_get_text tool.
type getTextInput struct {
	Selector string `json:"selector"`          // CSS selector of the element to read
	Timeout  string `json:"timeout,omitempty"` // optional Go duration string
}
| |
// getTextOutput is the JSON result returned by the browser_get_text tool.
type getTextOutput struct {
	Text string `json:"text"` // text read from the matched element
}
| |
| // NewGetTextTool creates a tool for getting text from elements |
| func (b *BrowseTools) NewGetTextTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_get_text", |
| Description: "Get the innerText of an element. Can be used to read the web page.", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "selector": { |
| "type": "string", |
| "description": "CSS selector for the element to get text from" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["selector"] |
| }`), |
| Run: b.getTextRun, |
| } |
| } |
| |
| func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input getTextInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Create a timeout context for this operation |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| var text string |
| err = chromedp.Run(timeoutCtx, |
| chromedp.WaitReady(input.Selector), |
| chromedp.Text(input.Selector, &text), |
| ) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| output := getTextOutput{Text: text} |
| result, err := json.Marshal(output) |
| if err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil |
| } |
| |
| return llm.TextContent(string(result)), nil |
| } |
| |
// evalInput is the JSON payload accepted by the browser_eval tool.
type evalInput struct {
	Expression string `json:"expression"`        // JavaScript expression to evaluate
	Timeout    string `json:"timeout,omitempty"` // optional Go duration string
}
| |
// evalOutput is the JSON result returned by the browser_eval tool.
type evalOutput struct {
	Result any `json:"result"` // value the evaluated expression produced
}
| |
| // NewEvalTool creates a tool for evaluating JavaScript |
| func (b *BrowseTools) NewEvalTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_eval", |
| Description: "Evaluate JavaScript in the browser context", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "expression": { |
| "type": "string", |
| "description": "JavaScript expression to evaluate" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["expression"] |
| }`), |
| Run: b.evalRun, |
| } |
| } |
| |
| func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input evalInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Create a timeout context for this operation |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| var result any |
| err = chromedp.Run(timeoutCtx, chromedp.Evaluate(input.Expression, &result)) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| output := evalOutput{Result: result} |
| response, err := json.Marshal(output) |
| if err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("failed to marshal response: %w", err))), nil |
| } |
| |
| return llm.TextContent(string(response)), nil |
| } |
| |
// screenshotInput is the JSON payload accepted by the
// browser_take_screenshot tool. Every field is optional.
type screenshotInput struct {
	Selector string `json:"selector,omitempty"` // element to capture; empty means no specific element
	Format   string `json:"format,omitempty"`   // requested output format
	Timeout  string `json:"timeout,omitempty"`  // optional Go duration string
}
| |
// screenshotOutput carries the identifier of a saved screenshot.
type screenshotOutput struct {
	ID string `json:"id"` // screenshot identifier
}
| |
| // NewScreenshotTool creates a tool for taking screenshots |
| func (b *BrowseTools) NewScreenshotTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_take_screenshot", |
| Description: "Take a screenshot of the page or a specific element", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "selector": { |
| "type": "string", |
| "description": "CSS selector for the element to screenshot (optional)" |
| }, |
| "format": { |
| "type": "string", |
| "description": "Output format ('base64' or 'png'), defaults to 'base64'", |
| "enum": ["base64", "png"] |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| } |
| }`), |
| Run: b.screenshotRun, |
| } |
| } |
| |
| func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input screenshotInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Create a timeout context for this operation |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| var buf []byte |
| var actions []chromedp.Action |
| |
| if input.Selector != "" { |
| // Take screenshot of specific element |
| actions = append(actions, |
| chromedp.WaitReady(input.Selector), |
| chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible), |
| ) |
| } else { |
| // Take full page screenshot |
| actions = append(actions, chromedp.CaptureScreenshot(&buf)) |
| } |
| |
| err = chromedp.Run(timeoutCtx, actions...) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Save the screenshot and get its ID |
| id := b.SaveScreenshot(buf) |
| if id == "" { |
| return llm.TextContent(errorResponse(fmt.Errorf("failed to save screenshot"))), nil |
| } |
| |
| // Get the full path to the screenshot |
| screenshotPath := GetScreenshotPath(id) |
| |
| // Return the ID and instructions on how to view the screenshot |
| result := fmt.Sprintf(`{ |
| "id": "%s", |
| "path": "%s", |
| "message": "Screenshot saved. To view this screenshot in the conversation, use the read_image tool with the path provided." |
| }`, id, screenshotPath) |
| |
| return llm.TextContent(result), nil |
| } |
| |
// scrollIntoViewInput is the JSON payload accepted by the
// browser_scroll_into_view tool.
type scrollIntoViewInput struct {
	Selector string `json:"selector"`          // CSS selector of the element to scroll to
	Timeout  string `json:"timeout,omitempty"` // optional Go duration string
}
| |
| // NewScrollIntoViewTool creates a tool for scrolling elements into view |
| func (b *BrowseTools) NewScrollIntoViewTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_scroll_into_view", |
| Description: "Scroll an element into view if it's not visible", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "selector": { |
| "type": "string", |
| "description": "CSS selector for the element to scroll into view" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["selector"] |
| }`), |
| Run: b.scrollIntoViewRun, |
| } |
| } |
| |
| func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input scrollIntoViewInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Create a timeout context for this operation |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| script := fmt.Sprintf(` |
| const el = document.querySelector('%s'); |
| if (el) { |
| el.scrollIntoView({behavior: 'smooth', block: 'center'}); |
| return true; |
| } |
| return false; |
| `, input.Selector) |
| |
| var result bool |
| err = chromedp.Run(timeoutCtx, |
| chromedp.WaitReady(input.Selector), |
| chromedp.Evaluate(script, &result), |
| ) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| if !result { |
| return llm.TextContent(errorResponse(fmt.Errorf("element not found: %s", input.Selector))), nil |
| } |
| |
| return llm.TextContent(successResponse()), nil |
| } |
| |
// resizeInput is the JSON payload accepted by the browser_resize tool.
type resizeInput struct {
	Width   int    `json:"width"`             // width in pixels
	Height  int    `json:"height"`            // height in pixels
	Timeout string `json:"timeout,omitempty"` // optional Go duration string
}
| |
| // NewResizeTool creates a tool for resizing the browser window |
| func (b *BrowseTools) NewResizeTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_resize", |
| Description: "Resize the browser window to a specific width and height", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "width": { |
| "type": "integer", |
| "description": "Window width in pixels" |
| }, |
| "height": { |
| "type": "integer", |
| "description": "Window height in pixels" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["width", "height"] |
| }`), |
| Run: b.resizeRun, |
| } |
| } |
| |
| func (b *BrowseTools) resizeRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input resizeInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| browserCtx, err := b.GetBrowserContext() |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| // Create a timeout context for this operation |
| timeoutCtx, cancel := context.WithTimeout(browserCtx, parseTimeout(input.Timeout)) |
| defer cancel() |
| |
| // Validate dimensions |
| if input.Width <= 0 || input.Height <= 0 { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid dimensions: width and height must be positive"))), nil |
| } |
| |
| // Resize the browser window |
| err = chromedp.Run(timeoutCtx, |
| chromedp.EmulateViewport(int64(input.Width), int64(input.Height)), |
| ) |
| if err != nil { |
| return llm.TextContent(errorResponse(err)), nil |
| } |
| |
| return llm.TextContent(successResponse()), nil |
| } |
| |
| // GetTools returns browser tools, optionally filtering out screenshot-related tools |
| func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool { |
| tools := []*llm.Tool{ |
| b.NewNavigateTool(), |
| b.NewClickTool(), |
| b.NewTypeTool(), |
| b.NewWaitForTool(), |
| b.NewGetTextTool(), |
| b.NewEvalTool(), |
| b.NewScrollIntoViewTool(), |
| b.NewResizeTool(), |
| } |
| |
| // Add screenshot-related tools if supported |
| if includeScreenshotTools { |
| tools = append(tools, b.NewScreenshotTool()) |
| tools = append(tools, b.NewReadImageTool()) |
| } |
| |
| return tools |
| } |
| |
| // SaveScreenshot saves a screenshot to disk and returns its ID |
| func (b *BrowseTools) SaveScreenshot(data []byte) string { |
| // Generate a unique ID |
| id := uuid.New().String() |
| |
| // Save the file |
| filePath := filepath.Join(ScreenshotDir, id+".png") |
| if err := os.WriteFile(filePath, data, 0o644); err != nil { |
| log.Printf("Failed to save screenshot: %v", err) |
| return "" |
| } |
| |
| // Track this screenshot |
| b.screenshotsMutex.Lock() |
| b.screenshots[id] = time.Now() |
| b.screenshotsMutex.Unlock() |
| |
| return id |
| } |
| |
| // GetScreenshotPath returns the full path to a screenshot by ID |
| func GetScreenshotPath(id string) string { |
| return filepath.Join(ScreenshotDir, id+".png") |
| } |
| |
// readImageInput is the JSON payload accepted by the browser_read_image tool.
type readImageInput struct {
	Path    string `json:"path"`              // location of the image file to read
	Timeout string `json:"timeout,omitempty"` // optional Go duration string
}
| |
| // NewReadImageTool creates a tool for reading images and returning them as base64 encoded data |
| func (b *BrowseTools) NewReadImageTool() *llm.Tool { |
| return &llm.Tool{ |
| Name: "browser_read_image", |
| Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM", |
| InputSchema: json.RawMessage(`{ |
| "type": "object", |
| "properties": { |
| "path": { |
| "type": "string", |
| "description": "Path to the image file to read" |
| }, |
| "timeout": { |
| "type": "string", |
| "description": "Timeout as a Go duration string (default: 5s)" |
| } |
| }, |
| "required": ["path"] |
| }`), |
| Run: b.readImageRun, |
| } |
| } |
| |
| func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) ([]llm.Content, error) { |
| var input readImageInput |
| if err := json.Unmarshal(m, &input); err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("invalid input: %w", err))), nil |
| } |
| |
| // Check if the path exists |
| if _, err := os.Stat(input.Path); os.IsNotExist(err) { |
| return llm.TextContent(errorResponse(fmt.Errorf("image file not found: %s", input.Path))), nil |
| } |
| |
| // Read the file |
| imageData, err := os.ReadFile(input.Path) |
| if err != nil { |
| return llm.TextContent(errorResponse(fmt.Errorf("failed to read image file: %w", err))), nil |
| } |
| |
| // Detect the image type |
| imageType := http.DetectContentType(imageData) |
| if !strings.HasPrefix(imageType, "image/") { |
| return llm.TextContent(errorResponse(fmt.Errorf("file is not an image: %s", imageType))), nil |
| } |
| |
| // Encode the image as base64 |
| base64Data := base64.StdEncoding.EncodeToString(imageData) |
| |
| // Create a Content object that includes both text and the image |
| return []llm.Content{ |
| { |
| Type: llm.ContentTypeText, |
| Text: fmt.Sprintf("Image from %s (type: %s)", input.Path, imageType), |
| }, |
| { |
| Type: llm.ContentTypeText, // Will be mapped to image in content array |
| MediaType: imageType, |
| Data: base64Data, |
| }, |
| }, nil |
| } |
| |
// parseTimeout converts a Go duration string such as "10s" or "250ms" into a
// time.Duration.
//
// It returns a default of 5 seconds when timeout is empty, cannot be parsed,
// or is not strictly positive. A zero or negative value would otherwise
// produce a context that has already expired, so every operation using it
// would fail immediately.
func parseTimeout(timeout string) time.Duration {
	const fallback = 5 * time.Second

	if timeout == "" {
		return fallback
	}

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return fallback
	}

	return dur
}