blob: 81ae105bd94756ed8c9349000af26c07798b51a4 [file] [log] [blame]
// Package browse provides browser automation tools for the agent.
package browse
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"log"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/chromedp/cdproto/runtime"
"github.com/chromedp/chromedp"
"github.com/google/uuid"
"sketch.dev/llm"
)
// ScreenshotDir is the directory where screenshots are stored.
// NewBrowseTools attempts to create it, and SaveScreenshot and
// GetScreenshotPath build "<id>.png" file paths beneath it.
const ScreenshotDir = "/tmp/sketch-screenshots"
// BrowseTools contains all browser tools and manages a shared browser instance.
// A value is created by NewBrowseTools; the browser itself is started lazily
// by Initialize and shut down by Close.
type BrowseTools struct {
// ctx is the cancelable context created in NewBrowseTools; the browser
// allocator in Initialize is derived from it. cancel cancels ctx and is
// called by Close.
ctx context.Context
cancel context.CancelFunc
// browserCtx is the chromedp context assigned in Initialize and handed out
// by GetBrowserContext. browserCtxCancel is the cancel function stored
// alongside it; Close invokes it and then clears it.
browserCtx context.Context
browserCtxCancel context.CancelFunc
// mux is held for the duration of Initialize and Close.
mux sync.Mutex
// initOnce makes the startup sequence in Initialize run a single time.
initOnce sync.Once
// initialized is set to true when startup succeeds and reset to false by Close.
initialized bool
// initErr records the startup failure, if any; Initialize returns it on every call.
initErr error
// screenshots maps a screenshot ID to the time SaveScreenshot recorded it.
screenshots map[string]time.Time
// screenshotsMutex is held while SaveScreenshot writes to screenshots.
screenshotsMutex sync.Mutex
// consoleLogs holds the console events appended by captureConsoleLog.
consoleLogs []*runtime.EventConsoleAPICalled
// consoleLogsMutex is held whenever consoleLogs is read or replaced.
consoleLogsMutex sync.Mutex
// maxConsoleLogs is the number of most recent entries captureConsoleLog
// keeps; NewBrowseTools sets it to 100.
maxConsoleLogs int
}
// NewBrowseTools returns a BrowseTools bound to a cancelable child of ctx.
// The browser is not started here; that happens on the first Initialize call.
// A failure to create ScreenshotDir is logged and otherwise ignored.
func NewBrowseTools(ctx context.Context) *BrowseTools {
	childCtx, stop := context.WithCancel(ctx)

	// Best effort: make sure the screenshot directory is present.
	mkErr := os.MkdirAll(ScreenshotDir, 0o755)
	if mkErr != nil {
		log.Printf("Failed to create screenshot directory: %v", mkErr)
	}

	tools := new(BrowseTools)
	tools.ctx = childCtx
	tools.cancel = stop
	tools.screenshots = map[string]time.Time{}
	tools.consoleLogs = []*runtime.EventConsoleAPICalled{}
	tools.maxConsoleLogs = 100
	return tools
}
// Initialize starts the browser if it's not already running.
//
// The startup sequence runs at most once per BrowseTools; every later call
// returns the outcome of that first attempt without retrying. It returns nil
// once the browser has started and the default viewport has been applied, or
// the wrapped startup error otherwise. When startup fails, the browser and
// allocator contexts created during the attempt are cancelled immediately
// instead of lingering until Close.
func (b *BrowseTools) Initialize() error {
	b.mux.Lock()
	defer b.mux.Unlock()
	b.initOnce.Do(func() {
		// ChromeDP.ExecPath has a list of common places to find Chrome...
		opts := chromedp.DefaultExecAllocatorOptions[:]
		// This is the default when running as root, but we generally need it
		// when running in a container, even when we aren't root (which is largely
		// the case for tests).
		opts = append(opts, chromedp.NoSandbox)
		// Setting 'DBUS_SESSION_BUS_ADDRESS=""' or this flag allows tests to pass
		// in GitHub runner contexts. It's a mystery why the failure isn't clear when this fails.
		opts = append(opts, chromedp.Flag("--disable-dbus", true))
		// This can be pretty slow in tests
		opts = append(opts, chromedp.WSURLReadTimeout(60*time.Second))
		// Add environment variable to mark this as a sketch internal process
		opts = append(opts, chromedp.Env("SKETCH_IGNORE_PORTS=1"))

		allocCtx, allocCancel := chromedp.NewExecAllocator(b.ctx, opts...)
		browserCtx, browserCancel := chromedp.NewContext(
			allocCtx,
			chromedp.WithLogf(log.Printf), chromedp.WithErrorf(log.Printf), chromedp.WithBrowserOption(chromedp.WithDialTimeout(60*time.Second)),
		)
		// Keep both cancel functions: the browser context is released first,
		// then the allocator it was derived from.
		release := func() {
			browserCancel()
			allocCancel()
		}
		b.browserCtx = browserCtx
		b.browserCtxCancel = release

		// fail records the startup error and frees what was created so far.
		// The cancel function is cleared so Close does not invoke it again.
		fail := func(err error) {
			release()
			b.browserCtxCancel = nil
			b.initErr = err
		}

		// Set up console log listener
		chromedp.ListenTarget(browserCtx, func(ev any) {
			switch e := ev.(type) {
			case *runtime.EventConsoleAPICalled:
				b.captureConsoleLog(e)
			}
		})

		// Ensure the browser starts
		if err := chromedp.Run(browserCtx); err != nil {
			fail(fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err))
			return
		}
		// Set default viewport size to 1280x720 (16:9 widescreen)
		if err := chromedp.Run(browserCtx, chromedp.EmulateViewport(1280, 720)); err != nil {
			fail(fmt.Errorf("failed to set default viewport: %w", err))
			return
		}
		b.initialized = true
	})
	return b.initErr
}
// Close shuts down the browser: it invokes and clears the stored browser
// cancel function, cancels the context created in NewBrowseTools, marks the
// tools as uninitialized and logs the shutdown. It holds mux while doing so.
func (b *BrowseTools) Close() {
	b.mux.Lock()
	defer b.mux.Unlock()

	if stopBrowser := b.browserCtxCancel; stopBrowser != nil {
		stopBrowser()
		b.browserCtxCancel = nil
	}
	if stopAll := b.cancel; stopAll != nil {
		stopAll()
	}

	b.initialized = false
	log.Println("Browser closed")
}
// GetBrowserContext returns the context for browser operations, calling
// Initialize first. If initialization reports an error, that error is
// returned together with a nil context.
func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
	initErr := b.Initialize()
	if initErr == nil {
		return b.browserCtx, nil
	}
	return nil, initErr
}
// navigateInput is the JSON input decoded by navigateRun for the
// browser_navigate tool.
type navigateInput struct {
// URL is the address to navigate to (required by the tool schema).
URL string `json:"url"`
// Timeout is an optional Go duration string; navigateRun passes it to parseTimeout.
Timeout string `json:"timeout,omitempty"`
}
// isPort80 reports whether urlStr definitely uses port 80.
//
// A URL counts as port 80 when its explicit port is 80, including zero-padded
// spellings such as ":080", or when it has no explicit port and its scheme is
// "http". URLs that fail to parse are reported as false.
func isPort80(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	port := u.Port()
	if port == "" {
		// No explicit port: only plain http defaults to 80.
		return u.Scheme == "http"
	}
	// Strip leading zeros so "080" is recognised as 80. An all-zero port
	// trims to the empty string and therefore does not match.
	return strings.TrimLeft(port, "0") == "80"
}
// NewNavigateTool builds the "browser_navigate" tool, whose Run handler is
// navigateRun. The schema requires "url" and accepts an optional "timeout".
func (b *BrowseTools) NewNavigateTool() *llm.Tool {
	const schema = `{
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The URL to navigate to"
},
"timeout": {
"type": "string",
"description": "Timeout as a Go duration string (default: 15s)"
}
},
"required": ["url"]
}`
	tool := &llm.Tool{
		Name:        "browser_navigate",
		Description: "Navigate the browser to a specific URL and wait for page to load",
		InputSchema: json.RawMessage(schema),
		Run:         b.navigateRun,
	}
	return tool
}
// navigateRun handles a browser_navigate call. It decodes m into a
// navigateInput, refuses URLs that isPort80 flags, then navigates and waits
// for the "body" element to be ready, bounded by the requested timeout.
// It returns the text "done" on success and an error ToolOut otherwise.
func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
	var req navigateInput
	if decodeErr := json.Unmarshal(m, &req); decodeErr != nil {
		return llm.ErrorfToolOut("invalid input: %w", decodeErr)
	}

	if isPort80(req.URL) {
		return llm.ErrorToolOut(fmt.Errorf("port 80 is not the port you're looking for--port 80 is the main sketch server"))
	}

	browserCtx, err := b.GetBrowserContext()
	if err != nil {
		return llm.ErrorToolOut(err)
	}

	// The deadline is derived from the browser context.
	deadlineCtx, release := context.WithTimeout(browserCtx, parseTimeout(req.Timeout))
	defer release()

	steps := []chromedp.Action{
		chromedp.Navigate(req.URL),
		chromedp.WaitReady("body"),
	}
	if runErr := chromedp.Run(deadlineCtx, steps...); runErr != nil {
		return llm.ErrorToolOut(runErr)
	}
	return llm.ToolOut{LLMContent: llm.TextContent("done")}
}
// evalInput is the JSON input decoded by evalRun for the browser_eval tool.
type evalInput struct {
// Expression is the JavaScript to evaluate (required by the tool schema).
Expression string `json:"expression"`
// Timeout is an optional Go duration string; evalRun passes it to parseTimeout.
Timeout string `json:"timeout,omitempty"`
// Await selects whether promises are awaited; evalRun treats a nil value as true.
Await *bool `json:"await,omitempty"`
}
// NewEvalTool builds the "browser_eval" tool, whose Run handler is evalRun.
// The schema requires "expression" and accepts optional "timeout" and "await".
func (b *BrowseTools) NewEvalTool() *llm.Tool {
	const description = `Evaluate JavaScript in the browser context.
Your go-to tool for interacting with content: clicking buttons, typing, getting content, scrolling, resizing, waiting for content/selector to be ready, etc.`
	const schema = `{
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "JavaScript expression to evaluate"
},
"timeout": {
"type": "string",
"description": "Timeout as a Go duration string (default: 15s)"
},
"await": {
"type": "boolean",
"description": "If true, wait for promises to resolve and return their resolved value (default: true)"
}
},
"required": ["expression"]
}`
	tool := &llm.Tool{
		Name:        "browser_eval",
		Description: description,
		InputSchema: json.RawMessage(schema),
		Run:         b.evalRun,
	}
	return tool
}
// evalRun handles a browser_eval call. It decodes m into an evalInput,
// evaluates the expression in the browser within the requested timeout, and
// returns the JSON-encoded result wrapped in <javascript_result> tags.
// Promises are awaited unless the input explicitly sets "await" to false.
func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
	var req evalInput
	if decodeErr := json.Unmarshal(m, &req); decodeErr != nil {
		return llm.ErrorfToolOut("invalid input: %w", decodeErr)
	}

	browserCtx, err := b.GetBrowserContext()
	if err != nil {
		return llm.ErrorToolOut(err)
	}

	// The deadline is derived from the browser context.
	deadlineCtx, release := context.WithTimeout(browserCtx, parseTimeout(req.Timeout))
	defer release()

	// A missing "await" field means true.
	awaitPromise := req.Await == nil || *req.Await

	var opts []chromedp.EvaluateOption
	if awaitPromise {
		withAwait := func(p *runtime.EvaluateParams) *runtime.EvaluateParams {
			return p.WithAwaitPromise(true)
		}
		opts = append(opts, withAwait)
	}

	var value any
	if runErr := chromedp.Run(deadlineCtx, chromedp.Evaluate(req.Expression, &value, opts...)); runErr != nil {
		return llm.ErrorToolOut(runErr)
	}

	// Hand the value back as JSON.
	encoded, err := json.Marshal(value)
	if err != nil {
		return llm.ErrorfToolOut("failed to marshal response: %w", err)
	}
	text := "<javascript_result>" + string(encoded) + "</javascript_result>"
	return llm.ToolOut{LLMContent: llm.TextContent(text)}
}
// screenshotInput is the JSON input decoded by screenshotRun for the
// browser_take_screenshot tool.
type screenshotInput struct {
// Selector is an optional CSS selector; when empty, screenshotRun uses
// chromedp.CaptureScreenshot instead of an element screenshot.
Selector string `json:"selector,omitempty"`
// Timeout is an optional Go duration string; screenshotRun passes it to parseTimeout.
Timeout string `json:"timeout,omitempty"`
}
// NewScreenshotTool builds the "browser_take_screenshot" tool, whose Run
// handler is screenshotRun. Both schema properties, "selector" and "timeout",
// are optional.
func (b *BrowseTools) NewScreenshotTool() *llm.Tool {
	const schema = `{
"type": "object",
"properties": {
"selector": {
"type": "string",
"description": "CSS selector for the element to screenshot (optional)"
},
"timeout": {
"type": "string",
"description": "Timeout as a Go duration string (default: 15s)"
}
}
}`
	tool := &llm.Tool{
		Name:        "browser_take_screenshot",
		Description: "Take a screenshot of the page or a specific element",
		InputSchema: json.RawMessage(schema),
		Run:         b.screenshotRun,
	}
	return tool
}
// screenshotRun handles a browser_take_screenshot call. It decodes m into a
// screenshotInput, captures either the selected element or the page within
// the requested timeout, stores the PNG through SaveScreenshot, and returns a
// text note with the saved path followed by the base64-encoded image.
func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
	var req screenshotInput
	if decodeErr := json.Unmarshal(m, &req); decodeErr != nil {
		return llm.ErrorfToolOut("invalid input: %w", decodeErr)
	}

	browserCtx, err := b.GetBrowserContext()
	if err != nil {
		return llm.ErrorToolOut(err)
	}

	// The deadline is derived from the browser context.
	deadlineCtx, release := context.WithTimeout(browserCtx, parseTimeout(req.Timeout))
	defer release()

	var png []byte
	var steps []chromedp.Action
	switch sel := req.Selector; sel {
	case "":
		// No selector: capture via chromedp.CaptureScreenshot.
		steps = append(steps, chromedp.CaptureScreenshot(&png))
	default:
		// Wait for the element, then capture just that element.
		steps = append(steps,
			chromedp.WaitReady(sel),
			chromedp.Screenshot(sel, &png, chromedp.NodeVisible),
		)
	}

	if runErr := chromedp.Run(deadlineCtx, steps...); runErr != nil {
		return llm.ErrorToolOut(runErr)
	}

	// Persist the image; an empty ID signals that saving failed.
	id := b.SaveScreenshot(png)
	if id == "" {
		return llm.ErrorToolOut(fmt.Errorf("failed to save screenshot"))
	}
	savedAt := GetScreenshotPath(id)

	note := llm.Content{
		Type: llm.ContentTypeText,
		Text: fmt.Sprintf("Screenshot taken (saved as %s)", savedAt),
	}
	image := llm.Content{
		Type:      llm.ContentTypeText, // Will be mapped to image in content array
		MediaType: "image/png",
		Data:      base64.StdEncoding.EncodeToString(png),
	}
	return llm.ToolOut{LLMContent: []llm.Content{note, image}}
}
// GetTools returns the navigate, eval, recent-console-logs and
// clear-console-logs tools, in that order. When includeScreenshotTools is
// true, the screenshot and read-image tools are appended after them.
func (b *BrowseTools) GetTools(includeScreenshotTools bool) []*llm.Tool {
	base := []*llm.Tool{
		b.NewNavigateTool(),
		b.NewEvalTool(),
		b.NewRecentConsoleLogsTool(),
		b.NewClearConsoleLogsTool(),
	}
	if !includeScreenshotTools {
		return base
	}
	return append(base, b.NewScreenshotTool(), b.NewReadImageTool())
}
// SaveScreenshot writes data to "<id>.png" under ScreenshotDir using a freshly
// generated UUID as the ID, records the ID with the current time, and returns
// the ID. If the file cannot be written, the error is logged and the empty
// string is returned.
func (b *BrowseTools) SaveScreenshot(data []byte) string {
	id := uuid.New().String()
	target := filepath.Join(ScreenshotDir, id+".png")

	writeErr := os.WriteFile(target, data, 0o644)
	if writeErr != nil {
		log.Printf("Failed to save screenshot: %v", writeErr)
		return ""
	}

	// Remember when this screenshot was taken.
	savedAt := time.Now()
	b.screenshotsMutex.Lock()
	b.screenshots[id] = savedAt
	b.screenshotsMutex.Unlock()

	return id
}
// GetScreenshotPath maps a screenshot ID to its file path, which is
// "<id>.png" inside ScreenshotDir. It does not check that the file exists.
func GetScreenshotPath(id string) string {
	fileName := id + ".png"
	return filepath.Join(ScreenshotDir, fileName)
}
// readImageInput is the JSON input decoded by readImageRun for the
// read_image tool.
type readImageInput struct {
// Path is the image file to read (required by the tool schema).
Path string `json:"path"`
// Timeout is accepted by the tool schema as a Go duration string.
// NOTE(review): readImageRun does not appear to read this field.
Timeout string `json:"timeout,omitempty"`
}
// NewReadImageTool builds the "read_image" tool, whose Run handler is
// readImageRun. The schema requires "path" and accepts an optional "timeout".
func (b *BrowseTools) NewReadImageTool() *llm.Tool {
	const schema = `{
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Path to the image file to read"
},
"timeout": {
"type": "string",
"description": "Timeout as a Go duration string (default: 15s)"
}
},
"required": ["path"]
}`
	tool := &llm.Tool{
		Name:        "read_image",
		Description: "Read an image file (such as a screenshot) and encode it for sending to the LLM",
		InputSchema: json.RawMessage(schema),
		Run:         b.readImageRun,
	}
	return tool
}
// readImageRun handles a read_image call. It decodes m into a readImageInput,
// reads the file at the given path, verifies via content sniffing that it is
// an image, and returns a text note followed by the base64-encoded image data.
// Missing files, read failures and non-image content yield an error ToolOut.
func (b *BrowseTools) readImageRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
	var req readImageInput
	if decodeErr := json.Unmarshal(m, &req); decodeErr != nil {
		return llm.ErrorfToolOut("invalid input: %w", decodeErr)
	}
	path := req.Path

	// Report a missing file with a dedicated message.
	_, statErr := os.Stat(path)
	if os.IsNotExist(statErr) {
		return llm.ErrorfToolOut("image file not found: %s", path)
	}

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return llm.ErrorfToolOut("failed to read image file: %w", readErr)
	}

	// Sniff the media type from the file contents.
	mediaType := http.DetectContentType(raw)
	if isImage := strings.HasPrefix(mediaType, "image/"); !isImage {
		return llm.ErrorfToolOut("file is not an image: %s", mediaType)
	}

	note := llm.Content{
		Type: llm.ContentTypeText,
		Text: fmt.Sprintf("Image from %s (type: %s)", path, mediaType),
	}
	image := llm.Content{
		Type:      llm.ContentTypeText, // Will be mapped to image in content array
		MediaType: mediaType,
		Data:      base64.StdEncoding.EncodeToString(raw),
	}
	return llm.ToolOut{LLMContent: []llm.Content{note, image}}
}
// parseTimeout parses a Go duration string such as "30s" or "1500ms" and
// returns it as a time.Duration.
//
// It returns a default of 15 seconds when timeout is empty, cannot be parsed,
// or is zero or negative. A non-positive duration would otherwise produce a
// context that has already expired when passed to context.WithTimeout.
func parseTimeout(timeout string) time.Duration {
	const defaultTimeout = 15 * time.Second

	dur, err := time.ParseDuration(timeout)
	if err != nil || dur <= 0 {
		return defaultTimeout
	}
	return dur
}
// captureConsoleLog appends e to the stored console logs and, if the total
// then exceeds maxConsoleLogs, drops the oldest entries so only the most
// recent maxConsoleLogs remain. consoleLogsMutex is held throughout.
func (b *BrowseTools) captureConsoleLog(e *runtime.EventConsoleAPICalled) {
	b.consoleLogsMutex.Lock()
	defer b.consoleLogsMutex.Unlock()

	entries := append(b.consoleLogs, e)
	if excess := len(entries) - b.maxConsoleLogs; excess > 0 {
		entries = entries[excess:]
	}
	b.consoleLogs = entries
}
// recentConsoleLogsInput is the JSON input decoded by recentConsoleLogsRun
// for the browser_recent_console_logs tool.
type recentConsoleLogsInput struct {
// Limit is the maximum number of entries to return; recentConsoleLogsRun
// falls back to 100 when it is not greater than zero.
Limit int `json:"limit,omitempty"`
}
// NewRecentConsoleLogsTool builds the "browser_recent_console_logs" tool,
// whose Run handler is recentConsoleLogsRun. Its only schema property is the
// optional integer "limit".
func (b *BrowseTools) NewRecentConsoleLogsTool() *llm.Tool {
	const schema = `{
"type": "object",
"properties": {
"limit": {
"type": "integer",
"description": "Maximum number of log entries to return (default: 100)"
}
}
}`
	tool := &llm.Tool{
		Name:        "browser_recent_console_logs",
		Description: "Get recent browser console logs",
		InputSchema: json.RawMessage(schema),
		Run:         b.recentConsoleLogsRun,
	}
	return tool
}
// recentConsoleLogsRun handles a browser_recent_console_logs call. It decodes
// m into a recentConsoleLogsInput, makes sure the browser is initialized,
// copies at most the requested number of newest console entries, and returns
// them as indented JSON after a one-line summary. With no entries, the
// summary is followed by a short notice instead of JSON.
func (b *BrowseTools) recentConsoleLogsRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
	var req recentConsoleLogsInput
	if decodeErr := json.Unmarshal(m, &req); decodeErr != nil {
		return llm.ErrorfToolOut("invalid input: %w", decodeErr)
	}

	// The browser must be up before logs are reported.
	if _, initErr := b.GetBrowserContext(); initErr != nil {
		return llm.ErrorToolOut(initErr)
	}

	// A non-positive limit selects the default of 100.
	limit := req.Limit
	if limit <= 0 {
		limit = 100
	}

	// Copy the newest entries while holding the lock.
	b.consoleLogsMutex.Lock()
	from := len(b.consoleLogs) - limit
	if from < 0 {
		from = 0
	}
	snapshot := make([]*runtime.EventConsoleAPICalled, len(b.consoleLogs)-from)
	copy(snapshot, b.consoleLogs[from:])
	b.consoleLogsMutex.Unlock()

	payload, err := json.MarshalIndent(snapshot, "", " ")
	if err != nil {
		return llm.ErrorfToolOut("failed to serialize logs: %w", err)
	}

	var out strings.Builder
	fmt.Fprintf(&out, "Retrieved %d console log entries:\n\n", len(snapshot))
	switch len(snapshot) {
	case 0:
		out.WriteString("No console logs captured.")
	default:
		// Include the JSON for full details.
		out.Write(payload)
	}
	return llm.ToolOut{LLMContent: llm.TextContent(out.String())}
}
// clearConsoleLogsInput is the JSON input decoded by clearConsoleLogsRun for
// the browser_clear_console_logs tool; it has no fields.
type clearConsoleLogsInput struct{}
// NewClearConsoleLogsTool builds the "browser_clear_console_logs" tool, whose
// Run handler is clearConsoleLogsRun. Its input schema is llm.EmptySchema().
func (b *BrowseTools) NewClearConsoleLogsTool() *llm.Tool {
	tool := &llm.Tool{
		Name:        "browser_clear_console_logs",
		Description: "Clear all captured browser console logs",
		InputSchema: llm.EmptySchema(),
		Run:         b.clearConsoleLogsRun,
	}
	return tool
}
// clearConsoleLogsRun handles a browser_clear_console_logs call. It decodes m,
// makes sure the browser is initialized, replaces the stored console logs
// with an empty slice, and reports how many entries were discarded.
func (b *BrowseTools) clearConsoleLogsRun(ctx context.Context, m json.RawMessage) llm.ToolOut {
	var req clearConsoleLogsInput
	if decodeErr := json.Unmarshal(m, &req); decodeErr != nil {
		return llm.ErrorfToolOut("invalid input: %w", decodeErr)
	}

	// The browser must be up before logs are cleared.
	if _, initErr := b.GetBrowserContext(); initErr != nil {
		return llm.ErrorToolOut(initErr)
	}

	// Swap in a fresh slice while holding the lock.
	b.consoleLogsMutex.Lock()
	cleared := len(b.consoleLogs)
	b.consoleLogs = []*runtime.EventConsoleAPICalled{}
	b.consoleLogsMutex.Unlock()

	summary := fmt.Sprintf("Cleared %d console log entries.", cleared)
	return llm.ToolOut{LLMContent: llm.TextContent(summary)}
}