Blame - claudetool/browse/browse.go - sketch

blob: 52248b8c9784cb3f826e3610dabcd90e22d16197 [file] [log] [blame]

Philip Zeyliger	33d282f	2025-05-03 04:01:54 +0000	[diff] [blame]	1	// Package browse provides browser automation tools for the agent
				2	package browse
				3
				4	import (
				5	"context"
				6	"encoding/json"
				7	"fmt"
				8	"log"
				9	"os"
				10	"path/filepath"
				11	"sync"
				12	"time"
				13
				14	"github.com/chromedp/chromedp"
				15	"github.com/google/uuid"
				16	"sketch.dev/llm"
				17	)
				18
				19	// ScreenshotDir is the directory where screenshots are stored
				20	const ScreenshotDir = "/tmp/sketch-screenshots"
				21
				22	// BrowseTools contains all browser tools and manages a shared browser instance
				23	type BrowseTools struct {
				24	ctx context.Context
				25	cancel context.CancelFunc
				26	browserCtx context.Context
				27	browserCtxCancel context.CancelFunc
				28	mux sync.Mutex
				29	initOnce sync.Once
				30	initialized bool
				31	initErr error
				32	// Map to track screenshots by ID and their creation time
				33	screenshots map[string]time.Time
				34	screenshotsMutex sync.Mutex
				35	}
				36
				37	// NewBrowseTools creates a new set of browser automation tools
				38	func NewBrowseTools(ctx context.Context) *BrowseTools {
				39	ctx, cancel := context.WithCancel(ctx)
				40
				41	// Ensure the screenshot directory exists
Autoformatter	4962f15	2025-05-06 17:24:20 +0000	[diff] [blame]	42	if err := os.MkdirAll(ScreenshotDir, 0o755); err != nil {
Philip Zeyliger	33d282f	2025-05-03 04:01:54 +0000	[diff] [blame]	43	log.Printf("Failed to create screenshot directory: %v", err)
				44	}
				45
				46	b := &BrowseTools{
				47	ctx: ctx,
				48	cancel: cancel,
				49	screenshots: make(map[string]time.Time),
				50	}
				51
				52	return b
				53	}
				54
				55	// Initialize starts the browser if it's not already running
				56	func (b *BrowseTools) Initialize() error {
				57	b.mux.Lock()
				58	defer b.mux.Unlock()
				59
				60	b.initOnce.Do(func() {
				61	// ChromeDP.ExecPath has a list of common places to find Chrome...
				62	opts := chromedp.DefaultExecAllocatorOptions[:]
				63	allocCtx, _ := chromedp.NewExecAllocator(b.ctx, opts...)
				64	browserCtx, browserCancel := chromedp.NewContext(
				65	allocCtx,
				66	chromedp.WithLogf(log.Printf),
				67	)
				68
				69	b.browserCtx = browserCtx
				70	b.browserCtxCancel = browserCancel
				71
				72	// Ensure the browser starts
				73	if err := chromedp.Run(browserCtx); err != nil {
				74	b.initErr = fmt.Errorf("failed to start browser (please apt get chromium or equivalent): %w", err)
				75	return
				76	}
				77	b.initialized = true
				78	})
				79
				80	return b.initErr
				81	}
				82
				83	// Close shuts down the browser
				84	func (b *BrowseTools) Close() {
				85	b.mux.Lock()
				86	defer b.mux.Unlock()
				87
				88	if b.browserCtxCancel != nil {
				89	b.browserCtxCancel()
				90	b.browserCtxCancel = nil
				91	}
				92
				93	if b.cancel != nil {
				94	b.cancel()
				95	}
				96
				97	b.initialized = false
				98	log.Println("Browser closed")
				99	}
				100
				101	// GetBrowserContext returns the context for browser operations
				102	func (b *BrowseTools) GetBrowserContext() (context.Context, error) {
				103	if err := b.Initialize(); err != nil {
				104	return nil, err
				105	}
				106	return b.browserCtx, nil
				107	}
				108
				109	// All tools return this as a response when successful
				110	type baseResponse struct {
				111	Status string `json:"status,omitempty"`
				112	}
				113
				114	func successResponse() string {
				115	return `{"status":"success"}`
				116	}
				117
				118	func errorResponse(err error) string {
				119	return fmt.Sprintf(`{"status":"error","error":"%s"}`, err.Error())
				120	}
				121
				122	// NavigateTool definition
				123	type navigateInput struct {
				124	URL string `json:"url"`
				125	}
				126
				127	// NewNavigateTool creates a tool for navigating to URLs
				128	func (b BrowseTools) NewNavigateTool() llm.Tool {
				129	return &llm.Tool{
				130	Name: "browser_navigate",
				131	Description: "Navigate the browser to a specific URL and wait for page to load",
				132	InputSchema: json.RawMessage(`{
				133	"type": "object",
				134	"properties": {
				135	"url": {
				136	"type": "string",
				137	"description": "The URL to navigate to"
				138	}
				139	},
				140	"required": ["url"]
				141	}`),
				142	Run: b.navigateRun,
				143	}
				144	}
				145
				146	func (b *BrowseTools) navigateRun(ctx context.Context, m json.RawMessage) (string, error) {
				147	var input navigateInput
				148	if err := json.Unmarshal(m, &input); err != nil {
				149	return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
				150	}
				151
				152	browserCtx, err := b.GetBrowserContext()
				153	if err != nil {
				154	return errorResponse(err), nil
				155	}
				156
				157	err = chromedp.Run(browserCtx,
				158	chromedp.Navigate(input.URL),
				159	chromedp.WaitReady("body"),
				160	)
				161	if err != nil {
				162	return errorResponse(err), nil
				163	}
				164
				165	return successResponse(), nil
				166	}
				167
				168	// ClickTool definition
				169	type clickInput struct {
				170	Selector string `json:"selector"`
				171	WaitVisible bool `json:"wait_visible,omitempty"`
				172	}
				173
				174	// NewClickTool creates a tool for clicking elements
				175	func (b BrowseTools) NewClickTool() llm.Tool {
				176	return &llm.Tool{
				177	Name: "browser_click",
				178	Description: "Click the first element matching a CSS selector",
				179	InputSchema: json.RawMessage(`{
				180	"type": "object",
				181	"properties": {
				182	"selector": {
				183	"type": "string",
				184	"description": "CSS selector for the element to click"
				185	},
				186	"wait_visible": {
				187	"type": "boolean",
				188	"description": "Wait for the element to be visible before clicking"
				189	}
				190	},
				191	"required": ["selector"]
				192	}`),
				193	Run: b.clickRun,
				194	}
				195	}
				196
				197	func (b *BrowseTools) clickRun(ctx context.Context, m json.RawMessage) (string, error) {
				198	var input clickInput
				199	if err := json.Unmarshal(m, &input); err != nil {
				200	return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
				201	}
				202
				203	browserCtx, err := b.GetBrowserContext()
				204	if err != nil {
				205	return errorResponse(err), nil
				206	}
				207
				208	actions := []chromedp.Action{
				209	chromedp.WaitReady(input.Selector),
				210	}
				211
				212	if input.WaitVisible {
				213	actions = append(actions, chromedp.WaitVisible(input.Selector))
				214	}
				215
				216	actions = append(actions, chromedp.Click(input.Selector))
				217
				218	err = chromedp.Run(browserCtx, actions...)
				219	if err != nil {
				220	return errorResponse(err), nil
				221	}
				222
				223	return successResponse(), nil
				224	}
				225
				226	// TypeTool definition
				227	type typeInput struct {
				228	Selector string `json:"selector"`
				229	Text string `json:"text"`
				230	Clear bool `json:"clear,omitempty"`
				231	}
				232
				233	// NewTypeTool creates a tool for typing into input elements
				234	func (b BrowseTools) NewTypeTool() llm.Tool {
				235	return &llm.Tool{
				236	Name: "browser_type",
				237	Description: "Type text into an input or textarea element",
				238	InputSchema: json.RawMessage(`{
				239	"type": "object",
				240	"properties": {
				241	"selector": {
				242	"type": "string",
				243	"description": "CSS selector for the input element"
				244	},
				245	"text": {
				246	"type": "string",
				247	"description": "Text to type into the element"
				248	},
				249	"clear": {
				250	"type": "boolean",
				251	"description": "Clear the input field before typing"
				252	}
				253	},
				254	"required": ["selector", "text"]
				255	}`),
				256	Run: b.typeRun,
				257	}
				258	}
				259
				260	func (b *BrowseTools) typeRun(ctx context.Context, m json.RawMessage) (string, error) {
				261	var input typeInput
				262	if err := json.Unmarshal(m, &input); err != nil {
				263	return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
				264	}
				265
				266	browserCtx, err := b.GetBrowserContext()
				267	if err != nil {
				268	return errorResponse(err), nil
				269	}
				270
				271	actions := []chromedp.Action{
				272	chromedp.WaitReady(input.Selector),
				273	chromedp.WaitVisible(input.Selector),
				274	}
				275
				276	if input.Clear {
				277	actions = append(actions, chromedp.Clear(input.Selector))
				278	}
				279
				280	actions = append(actions, chromedp.SendKeys(input.Selector, input.Text))
				281
				282	err = chromedp.Run(browserCtx, actions...)
				283	if err != nil {
				284	return errorResponse(err), nil
				285	}
				286
				287	return successResponse(), nil
				288	}
				289
				290	// WaitForTool definition
				291	type waitForInput struct {
				292	Selector string `json:"selector"`
				293	TimeoutMS int `json:"timeout_ms,omitempty"`
				294	}
				295
				296	// NewWaitForTool creates a tool for waiting for elements
				297	func (b BrowseTools) NewWaitForTool() llm.Tool {
				298	return &llm.Tool{
				299	Name: "browser_wait_for",
				300	Description: "Wait for an element to be present in the DOM",
				301	InputSchema: json.RawMessage(`{
				302	"type": "object",
				303	"properties": {
				304	"selector": {
				305	"type": "string",
				306	"description": "CSS selector for the element to wait for"
				307	},
				308	"timeout_ms": {
				309	"type": "integer",
				310	"description": "Maximum time to wait in milliseconds (default: 30000)"
				311	}
				312	},
				313	"required": ["selector"]
				314	}`),
				315	Run: b.waitForRun,
				316	}
				317	}
				318
				319	func (b *BrowseTools) waitForRun(ctx context.Context, m json.RawMessage) (string, error) {
				320	var input waitForInput
				321	if err := json.Unmarshal(m, &input); err != nil {
				322	return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
				323	}
				324
				325	timeout := 30000 // default timeout 30 seconds
				326	if input.TimeoutMS > 0 {
				327	timeout = input.TimeoutMS
				328	}
				329
				330	browserCtx, err := b.GetBrowserContext()
				331	if err != nil {
				332	return errorResponse(err), nil
				333	}
				334
				335	timeoutCtx, cancel := context.WithTimeout(browserCtx, time.Duration(timeout)*time.Millisecond)
				336	defer cancel()
				337
				338	err = chromedp.Run(timeoutCtx, chromedp.WaitReady(input.Selector))
				339	if err != nil {
				340	return errorResponse(err), nil
				341	}
				342
				343	return successResponse(), nil
				344	}
				345
				346	// GetTextTool definition
				347	type getTextInput struct {
				348	Selector string `json:"selector"`
				349	}
				350
				351	type getTextOutput struct {
				352	Text string `json:"text"`
				353	}
				354
				355	// NewGetTextTool creates a tool for getting text from elements
				356	func (b BrowseTools) NewGetTextTool() llm.Tool {
				357	return &llm.Tool{
				358	Name: "browser_get_text",
				359	Description: "Get the innerText of an element",
				360	InputSchema: json.RawMessage(`{
				361	"type": "object",
				362	"properties": {
				363	"selector": {
				364	"type": "string",
				365	"description": "CSS selector for the element to get text from"
				366	}
				367	},
				368	"required": ["selector"]
				369	}`),
				370	Run: b.getTextRun,
				371	}
				372	}
				373
				374	func (b *BrowseTools) getTextRun(ctx context.Context, m json.RawMessage) (string, error) {
				375	var input getTextInput
				376	if err := json.Unmarshal(m, &input); err != nil {
				377	return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
				378	}
				379
				380	browserCtx, err := b.GetBrowserContext()
				381	if err != nil {
				382	return errorResponse(err), nil
				383	}
				384
				385	var text string
				386	err = chromedp.Run(browserCtx,
				387	chromedp.WaitReady(input.Selector),
				388	chromedp.Text(input.Selector, &text),
				389	)
				390	if err != nil {
				391	return errorResponse(err), nil
				392	}
				393
				394	output := getTextOutput{Text: text}
				395	result, err := json.Marshal(output)
				396	if err != nil {
				397	return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
				398	}
				399
				400	return string(result), nil
				401	}
				402
				403	// EvalTool definition
				404	type evalInput struct {
				405	Expression string `json:"expression"`
				406	}
				407
				408	type evalOutput struct {
				409	Result any `json:"result"`
				410	}
				411
				412	// NewEvalTool creates a tool for evaluating JavaScript
				413	func (b BrowseTools) NewEvalTool() llm.Tool {
				414	return &llm.Tool{
				415	Name: "browser_eval",
				416	Description: "Evaluate JavaScript in the browser context",
				417	InputSchema: json.RawMessage(`{
				418	"type": "object",
				419	"properties": {
				420	"expression": {
				421	"type": "string",
				422	"description": "JavaScript expression to evaluate"
				423	}
				424	},
				425	"required": ["expression"]
				426	}`),
				427	Run: b.evalRun,
				428	}
				429	}
				430
				431	func (b *BrowseTools) evalRun(ctx context.Context, m json.RawMessage) (string, error) {
				432	var input evalInput
				433	if err := json.Unmarshal(m, &input); err != nil {
				434	return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
				435	}
				436
				437	browserCtx, err := b.GetBrowserContext()
				438	if err != nil {
				439	return errorResponse(err), nil
				440	}
				441
				442	var result any
				443	err = chromedp.Run(browserCtx, chromedp.Evaluate(input.Expression, &result))
				444	if err != nil {
				445	return errorResponse(err), nil
				446	}
				447
				448	output := evalOutput{Result: result}
				449	response, err := json.Marshal(output)
				450	if err != nil {
				451	return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
				452	}
				453
				454	return string(response), nil
				455	}
				456
				457	// ScreenshotTool definition
				458	type screenshotInput struct {
				459	Selector string `json:"selector,omitempty"`
				460	Format string `json:"format,omitempty"`
				461	}
				462
				463	type screenshotOutput struct {
				464	ID string `json:"id"`
				465	}
				466
				467	// NewScreenshotTool creates a tool for taking screenshots
				468	func (b BrowseTools) NewScreenshotTool() llm.Tool {
				469	return &llm.Tool{
				470	Name: "browser_screenshot",
				471	Description: "Take a screenshot of the page or a specific element",
				472	InputSchema: json.RawMessage(`{
				473	"type": "object",
				474	"properties": {
				475	"selector": {
				476	"type": "string",
				477	"description": "CSS selector for the element to screenshot (optional)"
				478	},
				479	"format": {
				480	"type": "string",
				481	"description": "Output format ('base64' or 'png'), defaults to 'base64'",
				482	"enum": ["base64", "png"]
				483	}
				484	}
				485	}`),
				486	Run: b.screenshotRun,
				487	}
				488	}
				489
				490	func (b *BrowseTools) screenshotRun(ctx context.Context, m json.RawMessage) (string, error) {
				491	var input screenshotInput
				492	if err := json.Unmarshal(m, &input); err != nil {
				493	return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
				494	}
				495
				496	browserCtx, err := b.GetBrowserContext()
				497	if err != nil {
				498	return errorResponse(err), nil
				499	}
				500
				501	var buf []byte
				502	var actions []chromedp.Action
				503
				504	if input.Selector != "" {
				505	// Take screenshot of specific element
				506	actions = append(actions,
				507	chromedp.WaitReady(input.Selector),
				508	chromedp.Screenshot(input.Selector, &buf, chromedp.NodeVisible),
				509	)
				510	} else {
				511	// Take full page screenshot
				512	actions = append(actions, chromedp.CaptureScreenshot(&buf))
				513	}
				514
				515	err = chromedp.Run(browserCtx, actions...)
				516	if err != nil {
				517	return errorResponse(err), nil
				518	}
				519
				520	// Save the screenshot and get its ID
				521	id := b.SaveScreenshot(buf)
				522	if id == "" {
				523	return errorResponse(fmt.Errorf("failed to save screenshot")), nil
				524	}
				525
				526	// Return the ID in the response
				527	output := screenshotOutput{ID: id}
				528	response, err := json.Marshal(output)
				529	if err != nil {
				530	return errorResponse(fmt.Errorf("failed to marshal response: %w", err)), nil
				531	}
				532
				533	return string(response), nil
				534	}
				535
				536	// ScrollIntoViewTool definition
				537	type scrollIntoViewInput struct {
				538	Selector string `json:"selector"`
				539	}
				540
				541	// NewScrollIntoViewTool creates a tool for scrolling elements into view
				542	func (b BrowseTools) NewScrollIntoViewTool() llm.Tool {
				543	return &llm.Tool{
				544	Name: "browser_scroll_into_view",
				545	Description: "Scroll an element into view if it's not visible",
				546	InputSchema: json.RawMessage(`{
				547	"type": "object",
				548	"properties": {
				549	"selector": {
				550	"type": "string",
				551	"description": "CSS selector for the element to scroll into view"
				552	}
				553	},
				554	"required": ["selector"]
				555	}`),
				556	Run: b.scrollIntoViewRun,
				557	}
				558	}
				559
				560	func (b *BrowseTools) scrollIntoViewRun(ctx context.Context, m json.RawMessage) (string, error) {
				561	var input scrollIntoViewInput
				562	if err := json.Unmarshal(m, &input); err != nil {
				563	return errorResponse(fmt.Errorf("invalid input: %w", err)), nil
				564	}
				565
				566	browserCtx, err := b.GetBrowserContext()
				567	if err != nil {
				568	return errorResponse(err), nil
				569	}
				570
				571	script := fmt.Sprintf(`
				572	const el = document.querySelector('%s');
				573	if (el) {
				574	el.scrollIntoView({behavior: 'smooth', block: 'center'});
				575	return true;
				576	}
				577	return false;
				578	`, input.Selector)
				579
				580	var result bool
				581	err = chromedp.Run(browserCtx,
				582	chromedp.WaitReady(input.Selector),
				583	chromedp.Evaluate(script, &result),
				584	)
				585	if err != nil {
				586	return errorResponse(err), nil
				587	}
				588
				589	if !result {
				590	return errorResponse(fmt.Errorf("element not found: %s", input.Selector)), nil
				591	}
				592
				593	return successResponse(), nil
				594	}
				595
				596	// GetAllTools returns all browser tools
				597	func (b BrowseTools) GetAllTools() []llm.Tool {
				598	return []*llm.Tool{
				599	b.NewNavigateTool(),
				600	b.NewClickTool(),
				601	b.NewTypeTool(),
				602	b.NewWaitForTool(),
				603	b.NewGetTextTool(),
				604	b.NewEvalTool(),
				605	b.NewScreenshotTool(),
				606	b.NewScrollIntoViewTool(),
				607	}
				608	}
				609
				610	// SaveScreenshot saves a screenshot to disk and returns its ID
				611	func (b *BrowseTools) SaveScreenshot(data []byte) string {
				612	// Generate a unique ID
				613	id := uuid.New().String()
				614
				615	// Save the file
				616	filePath := filepath.Join(ScreenshotDir, id+".png")
Autoformatter	4962f15	2025-05-06 17:24:20 +0000	[diff] [blame]	617	if err := os.WriteFile(filePath, data, 0o644); err != nil {
Philip Zeyliger	33d282f	2025-05-03 04:01:54 +0000	[diff] [blame]	618	log.Printf("Failed to save screenshot: %v", err)
				619	return ""
				620	}
				621
				622	// Track this screenshot
				623	b.screenshotsMutex.Lock()
				624	b.screenshots[id] = time.Now()
				625	b.screenshotsMutex.Unlock()
				626
				627	return id
				628	}
				629
				630	// GetScreenshotPath returns the full path to a screenshot by ID
				631	func GetScreenshotPath(id string) string {
				632	return filepath.Join(ScreenshotDir, id+".png")
				633	}