blob: 668605815eaa05b7181be7bbc17a53150749b461 [file] [log] [blame]
David Crawshaw5a234062025-05-04 17:52:08 +00001package gem
2
3import (
4 "cmp"
5 "context"
6 "encoding/json"
7 "fmt"
8 "log/slog"
9 "math/rand"
10 "net/http"
11 "strings"
12 "time"
13
14 "sketch.dev/llm"
15 "sketch.dev/llm/gem/gemini"
16)
17
const (
	// DefaultModel is the Gemini model used when Service.Model is empty.
	DefaultModel = "gemini-2.5-pro-preview-03-25"
	// GeminiAPIKeyEnv is the name of the environment variable conventionally
	// holding the Gemini API key.
	GeminiAPIKeyEnv = "GEMINI_API_KEY"
)
22
// Service provides Gemini completions.
// Fields should not be altered concurrently with calling any method on Service.
type Service struct {
	HTTPC  *http.Client // defaults to http.DefaultClient if nil
	URL    string       // Gemini API URL, uses the gemini package default if empty
	APIKey string       // must be non-empty
	Model  string       // defaults to DefaultModel if empty
}

// Compile-time check that *Service satisfies llm.Service.
var _ llm.Service = (*Service)(nil)
33
// These maps convert between Sketch's llm package and Gemini API formats.
//
// fromLLMRole maps llm message roles to the role strings the Gemini API
// expects: assistant output is "model", user input is "user". Roles absent
// from the map are unsupported (see buildGeminiRequest).
var fromLLMRole = map[llm.MessageRole]string{
	llm.MessageRoleAssistant: "model",
	llm.MessageRoleUser:      "user",
}
39
40// convertToolSchemas converts Sketch's llm.Tool schemas to Gemini's schema format
41func convertToolSchemas(tools []*llm.Tool) ([]gemini.FunctionDeclaration, error) {
42 if len(tools) == 0 {
43 return nil, nil
44 }
45
46 var decls []gemini.FunctionDeclaration
47 for _, tool := range tools {
48 // Parse the schema from raw JSON
49 var schemaJSON map[string]any
50 if err := json.Unmarshal(tool.InputSchema, &schemaJSON); err != nil {
51 return nil, fmt.Errorf("failed to unmarshal tool %s schema: %w", tool.Name, err)
52 }
53 decls = append(decls, gemini.FunctionDeclaration{
54 Name: tool.Name,
55 Description: tool.Description,
56 Parameters: convertJSONSchemaToGeminiSchema(schemaJSON),
57 })
58 }
59
60 return decls, nil
61}
62
// convertJSONSchemaToGeminiSchema converts a JSON schema to Gemini's schema format.
//
// Only a subset of JSON Schema is translated: "type", "description",
// "enum", "properties" (objects only), "required", "items" (arrays only),
// and "minItems"/"maxItems". The function recurses into object properties
// and array item schemas. Unknown or missing "type" values fall back to
// STRING / the zero Schema type respectively.
func convertJSONSchemaToGeminiSchema(schemaJSON map[string]any) gemini.Schema {
	schema := gemini.Schema{}

	// Set the type based on the JSON schema type.
	if typeVal, ok := schemaJSON["type"].(string); ok {
		switch typeVal {
		case "string":
			schema.Type = gemini.DataTypeSTRING
		case "number":
			schema.Type = gemini.DataTypeNUMBER
		case "integer":
			schema.Type = gemini.DataTypeINTEGER
		case "boolean":
			schema.Type = gemini.DataTypeBOOLEAN
		case "array":
			schema.Type = gemini.DataTypeARRAY
		case "object":
			schema.Type = gemini.DataTypeOBJECT
		default:
			schema.Type = gemini.DataTypeSTRING // Default to string for unknown types
		}
	}

	// Set description if available.
	if desc, ok := schemaJSON["description"].(string); ok {
		schema.Description = desc
	}

	// Handle enum values. Gemini's Enum field holds strings, so non-string
	// enum members are serialized to their JSON representation.
	if enumValues, ok := schemaJSON["enum"].([]any); ok {
		schema.Enum = make([]string, len(enumValues))
		for i, v := range enumValues {
			if strVal, ok := v.(string); ok {
				schema.Enum[i] = strVal
			} else {
				// Convert non-string values to string
				valBytes, _ := json.Marshal(v)
				schema.Enum[i] = string(valBytes)
			}
		}
	}

	// Handle object properties (each translated recursively). Non-map
	// property schemas are silently skipped.
	if properties, ok := schemaJSON["properties"].(map[string]any); ok && schema.Type == gemini.DataTypeOBJECT {
		schema.Properties = make(map[string]gemini.Schema)
		for propName, propSchema := range properties {
			if propSchemaMap, ok := propSchema.(map[string]any); ok {
				schema.Properties[propName] = convertJSONSchemaToGeminiSchema(propSchemaMap)
			}
		}
	}

	// Handle required properties.
	// NOTE(review): a non-string entry leaves an empty string at its slot in
	// schema.Required — confirm the Gemini API tolerates that.
	if required, ok := schemaJSON["required"].([]any); ok {
		schema.Required = make([]string, len(required))
		for i, r := range required {
			if strVal, ok := r.(string); ok {
				schema.Required[i] = strVal
			}
		}
	}

	// Handle array items (translated recursively).
	if items, ok := schemaJSON["items"].(map[string]any); ok && schema.Type == gemini.DataTypeARRAY {
		itemSchema := convertJSONSchemaToGeminiSchema(items)
		schema.Items = &itemSchema
	}

	// Handle minimum/maximum items for arrays. JSON numbers decode as
	// float64; Gemini's fields take decimal strings.
	if minItems, ok := schemaJSON["minItems"].(float64); ok {
		schema.MinItems = fmt.Sprintf("%d", int(minItems))
	}
	if maxItems, ok := schemaJSON["maxItems"].(float64); ok {
		schema.MaxItems = fmt.Sprintf("%d", int(maxItems))
	}

	return schema
}
142
143// buildGeminiRequest converts Sketch's llm.Request to Gemini's request format
144func (s *Service) buildGeminiRequest(req *llm.Request) (*gemini.Request, error) {
145 gemReq := &gemini.Request{}
146
147 // Add system instruction if provided
148 if len(req.System) > 0 {
149 // Combine all system messages into a single system instruction
150 systemText := ""
151 for i, sys := range req.System {
152 if i > 0 && systemText != "" && sys.Text != "" {
153 systemText += "\n"
154 }
155 systemText += sys.Text
156 }
157
158 if systemText != "" {
159 gemReq.SystemInstruction = &gemini.Content{
160 Parts: []gemini.Part{{Text: systemText}},
161 }
162 }
163 }
164
165 // Convert messages to Gemini content format
166 for _, msg := range req.Messages {
167 // Set the role based on the message role
168 role, ok := fromLLMRole[msg.Role]
169 if !ok {
170 return nil, fmt.Errorf("unsupported message role: %v", msg.Role)
171 }
172
173 content := gemini.Content{
174 Role: role,
175 }
176
177 // Store tool usage information to correlate tool uses with responses
178 toolNameToID := make(map[string]string)
179
180 // First pass: collect tool use IDs for correlation
181 for _, c := range msg.Content {
182 if c.Type == llm.ContentTypeToolUse && c.ID != "" {
183 toolNameToID[c.ToolName] = c.ID
184 }
185 }
186
187 // Map each content item to Gemini's format
188 for _, c := range msg.Content {
189 switch c.Type {
190 case llm.ContentTypeText, llm.ContentTypeThinking, llm.ContentTypeRedactedThinking:
191 // Simple text content
192 content.Parts = append(content.Parts, gemini.Part{
193 Text: c.Text,
194 })
195 case llm.ContentTypeToolUse:
196 // Tool use becomes a function call
197 var args map[string]any
198 if err := json.Unmarshal(c.ToolInput, &args); err != nil {
199 return nil, fmt.Errorf("failed to unmarshal tool input: %w", err)
200 }
201
202 // Make sure we have a valid ID for this tool use
203 if c.ID == "" {
204 c.ID = fmt.Sprintf("gemini_tool_%s_%d", c.ToolName, time.Now().UnixNano())
205 }
206
207 // Save the ID for this tool name for future correlation
208 toolNameToID[c.ToolName] = c.ID
209
210 slog.DebugContext(context.Background(), "gemini_preparing_tool_use",
211 "tool_name", c.ToolName,
212 "tool_id", c.ID,
213 "input", string(c.ToolInput))
214
215 content.Parts = append(content.Parts, gemini.Part{
216 FunctionCall: &gemini.FunctionCall{
217 Name: c.ToolName,
218 Args: args,
219 },
220 })
221 case llm.ContentTypeToolResult:
222 // Tool result becomes a function response
223 // Create a map for the response
224 response := map[string]any{
225 "result": c.ToolResult,
226 "error": c.ToolError,
227 }
228
229 // Determine the function name to use - this is critical
230 funcName := ""
231
232 // First try to find the function name from a stored toolUseID if we have one
233 if c.ToolUseID != "" {
234 // Try to derive the tool name from the previous tools we've seen
235 for name, id := range toolNameToID {
236 if id == c.ToolUseID {
237 funcName = name
238 break
239 }
240 }
241 }
242
243 // Fallback options if we couldn't find the tool name
244 if funcName == "" {
245 // Try the tool name directly
246 if c.ToolName != "" {
247 funcName = c.ToolName
248 } else {
249 // Last resort fallback
250 funcName = "default_tool"
251 }
252 }
253
254 slog.DebugContext(context.Background(), "gemini_preparing_tool_result",
255 "tool_use_id", c.ToolUseID,
256 "mapped_func_name", funcName,
257 "result_length", len(c.ToolResult))
258
259 content.Parts = append(content.Parts, gemini.Part{
260 FunctionResponse: &gemini.FunctionResponse{
261 Name: funcName,
262 Response: response,
263 },
264 })
265 }
266 }
267
268 gemReq.Contents = append(gemReq.Contents, content)
269 }
270
271 // Handle tools/functions
272 if len(req.Tools) > 0 {
273 // Convert tool schemas
274 decls, err := convertToolSchemas(req.Tools)
275 if err != nil {
276 return nil, fmt.Errorf("failed to convert tool schemas: %w", err)
277 }
278 if len(decls) > 0 {
279 gemReq.Tools = []gemini.Tool{{FunctionDeclarations: decls}}
280 }
281 }
282
283 return gemReq, nil
284}
285
286// convertGeminiResponsesToContent converts a Gemini response to llm.Content
287func convertGeminiResponseToContent(res *gemini.Response) []llm.Content {
288 if res == nil || len(res.Candidates) == 0 || len(res.Candidates[0].Content.Parts) == 0 {
289 return []llm.Content{{
290 Type: llm.ContentTypeText,
291 Text: "",
292 }}
293 }
294
295 var contents []llm.Content
296
297 // Process each part in the first candidate's content
298 for i, part := range res.Candidates[0].Content.Parts {
299 // Log the part type for debugging
300 slog.DebugContext(context.Background(), "processing_gemini_part",
301 "index", i,
302 "has_text", part.Text != "",
303 "has_function_call", part.FunctionCall != nil,
304 "has_function_response", part.FunctionResponse != nil)
305
306 if part.Text != "" {
307 // Simple text response
308 contents = append(contents, llm.Content{
309 Type: llm.ContentTypeText,
310 Text: part.Text,
311 })
312 } else if part.FunctionCall != nil {
313 // Function call (tool use)
314 args, err := json.Marshal(part.FunctionCall.Args)
315 if err != nil {
316 // If we can't marshal, use empty args
317 slog.DebugContext(context.Background(), "gemini_failed_to_markshal_args",
318 "tool_name", part.FunctionCall.Name,
319 "args", string(args),
320 "err", err.Error(),
321 )
322 args = []byte("{}")
323 }
324
325 // Generate a unique ID for this tool use that includes the function name
326 // to make it easier to correlate with responses
327 toolID := fmt.Sprintf("gemini_tool_%s_%d", part.FunctionCall.Name, time.Now().UnixNano())
328
329 contents = append(contents, llm.Content{
330 ID: toolID,
331 Type: llm.ContentTypeToolUse,
332 ToolName: part.FunctionCall.Name,
333 ToolInput: json.RawMessage(args),
334 })
335
336 slog.DebugContext(context.Background(), "gemini_tool_call",
337 "tool_id", toolID,
338 "tool_name", part.FunctionCall.Name,
339 "args", string(args))
340 } else if part.FunctionResponse != nil {
341 // We shouldn't normally get function responses from the model, but just in case
342 respData, _ := json.Marshal(part.FunctionResponse.Response)
343 slog.DebugContext(context.Background(), "unexpected_function_response",
344 "name", part.FunctionResponse.Name,
345 "response", string(respData))
346 }
347 }
348
349 // If no content was added, add an empty text content
350 if len(contents) == 0 {
351 slog.DebugContext(context.Background(), "empty_gemini_response", "adding_empty_text", true)
352 contents = append(contents, llm.Content{
353 Type: llm.ContentTypeText,
354 Text: "",
355 })
356 }
357
358 return contents
359}
360
361// Gemini doesn't provide usage info directly, so we need to estimate it
362// ensureToolIDs makes sure all tool uses have proper IDs
363func ensureToolIDs(contents []llm.Content) {
364 for i, content := range contents {
365 if content.Type == llm.ContentTypeToolUse && content.ID == "" {
366 // Generate a stable ID using the tool name and timestamp
367 contents[i].ID = fmt.Sprintf("gemini_tool_%s_%d", content.ToolName, time.Now().UnixNano())
368 slog.DebugContext(context.Background(), "assigned_missing_tool_id",
369 "tool_name", content.ToolName,
370 "new_id", contents[i].ID)
371 }
372 }
373}
374
375func calculateUsage(req *gemini.Request, res *gemini.Response) llm.Usage {
376 // Very rough estimation of token counts
377 var inputTokens uint64
378 var outputTokens uint64
379
380 // Count system tokens
381 if req.SystemInstruction != nil {
382 for _, part := range req.SystemInstruction.Parts {
383 if part.Text != "" {
384 // Very rough estimation: 1 token per 4 characters
385 inputTokens += uint64(len(part.Text)) / 4
386 }
387 }
388 }
389
390 // Count input tokens
391 for _, content := range req.Contents {
392 for _, part := range content.Parts {
393 if part.Text != "" {
394 inputTokens += uint64(len(part.Text)) / 4
395 } else if part.FunctionCall != nil {
396 // Estimate function call tokens
397 argBytes, _ := json.Marshal(part.FunctionCall.Args)
398 inputTokens += uint64(len(part.FunctionCall.Name)+len(argBytes)) / 4
399 } else if part.FunctionResponse != nil {
400 // Estimate function response tokens
401 resBytes, _ := json.Marshal(part.FunctionResponse.Response)
402 inputTokens += uint64(len(part.FunctionResponse.Name)+len(resBytes)) / 4
403 }
404 }
405 }
406
407 // Count output tokens
408 if res != nil && len(res.Candidates) > 0 {
409 for _, part := range res.Candidates[0].Content.Parts {
410 if part.Text != "" {
411 outputTokens += uint64(len(part.Text)) / 4
412 } else if part.FunctionCall != nil {
413 // Estimate function call tokens
414 argBytes, _ := json.Marshal(part.FunctionCall.Args)
415 outputTokens += uint64(len(part.FunctionCall.Name)+len(argBytes)) / 4
416 }
417 }
418 }
419
420 // For Gemini 2.5 Pro Preview pricing: $1.25 per 1M input tokens, $10 per 1M output tokens
421 // Convert to dollars
422 costUSD := float64(inputTokens)*1.25/1_000_000.0 + float64(outputTokens)*10/1_000_000.0
423
424 return llm.Usage{
425 InputTokens: inputTokens,
426 OutputTokens: outputTokens,
427 CostUSD: costUSD,
428 }
429}
430
431// Do sends a request to Gemini.
432func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error) {
433 // Log the incoming request for debugging
434 slog.DebugContext(ctx, "gemini_request",
435 "message_count", len(ir.Messages),
436 "tool_count", len(ir.Tools),
437 "system_count", len(ir.System))
438
439 // Log tool-related information if any tools are present
440 if len(ir.Tools) > 0 {
441 var toolNames []string
442 for _, tool := range ir.Tools {
443 toolNames = append(toolNames, tool.Name)
444 }
445 slog.DebugContext(ctx, "gemini_tools", "tools", toolNames)
446 }
447
448 // Log details about the messages being sent
449 for i, msg := range ir.Messages {
450 contentTypes := make([]string, len(msg.Content))
451 for j, c := range msg.Content {
452 contentTypes[j] = c.Type.String()
453
454 // Log tool-related content with more details
455 if c.Type == llm.ContentTypeToolUse {
456 slog.DebugContext(ctx, "gemini_tool_use",
457 "message_idx", i,
458 "content_idx", j,
459 "tool_name", c.ToolName,
460 "tool_input", string(c.ToolInput))
461 } else if c.Type == llm.ContentTypeToolResult {
462 slog.DebugContext(ctx, "gemini_tool_result",
463 "message_idx", i,
464 "content_idx", j,
465 "tool_use_id", c.ToolUseID,
466 "tool_error", c.ToolError,
467 "result_length", len(c.ToolResult))
468 }
469 }
470 slog.DebugContext(ctx, "gemini_message",
471 "idx", i,
472 "role", msg.Role.String(),
473 "content_types", contentTypes)
474 }
475 // Build the Gemini request
476 gemReq, err := s.buildGeminiRequest(ir)
477 if err != nil {
478 return nil, fmt.Errorf("failed to build Gemini request: %w", err)
479 }
480
481 // Log the structured Gemini request for debugging
482 if reqJSON, err := json.MarshalIndent(gemReq, "", " "); err == nil {
483 slog.DebugContext(ctx, "gemini_request_json", "request", string(reqJSON))
484 }
485
486 // Create a Gemini model instance
487 model := gemini.Model{
David Crawshaw3659d872025-05-05 17:52:23 -0700488 Model: "models/" + cmp.Or(s.Model, DefaultModel),
489 Endpoint: s.URL,
490 APIKey: s.APIKey,
491 HTTPC: cmp.Or(s.HTTPC, http.DefaultClient),
David Crawshaw5a234062025-05-04 17:52:08 +0000492 }
493
494 // Send the request to Gemini with retry logic
495 startTime := time.Now()
496 endTime := startTime // Initialize endTime
497 var gemRes *gemini.Response
498
499 // Retry mechanism for handling server errors and rate limiting
500 backoff := []time.Duration{1 * time.Second, 3 * time.Second, 5 * time.Second, 10 * time.Second}
501 for attempts := 0; attempts <= len(backoff); attempts++ {
502 gemApiErr := error(nil)
503 gemRes, gemApiErr = model.GenerateContent(ctx, gemReq)
504 endTime = time.Now()
505
506 if gemApiErr == nil {
507 // Successful response
508 // Log the structured Gemini response
509 if resJSON, err := json.MarshalIndent(gemRes, "", " "); err == nil {
510 slog.DebugContext(ctx, "gemini_response_json", "response", string(resJSON))
511 }
512 break
513 }
514
515 if attempts == len(backoff) {
516 // We've exhausted all retry attempts
517 return nil, fmt.Errorf("gemini: API error after %d attempts: %w", attempts, gemApiErr)
518 }
519
520 // Check if the error is retryable (e.g., server error or rate limiting)
521 if strings.Contains(gemApiErr.Error(), "429") || strings.Contains(gemApiErr.Error(), "5") {
522 // Rate limited or server error - wait and retry
523 random := time.Duration(rand.Int63n(int64(time.Second)))
524 sleep := backoff[attempts] + random
525 slog.WarnContext(ctx, "gemini_request_retry", "error", gemApiErr.Error(), "attempt", attempts+1, "sleep", sleep)
526 time.Sleep(sleep)
527 continue
528 }
529
530 // Non-retryable error
531 return nil, fmt.Errorf("gemini: API error: %w", gemApiErr)
532 }
533
534 content := convertGeminiResponseToContent(gemRes)
535
536 ensureToolIDs(content)
537
538 usage := calculateUsage(gemReq, gemRes)
539
540 stopReason := llm.StopReasonEndTurn
541 for _, part := range content {
542 if part.Type == llm.ContentTypeToolUse {
543 stopReason = llm.StopReasonToolUse
544 slog.DebugContext(ctx, "gemini_tool_use_detected",
545 "setting_stop_reason", "llm.StopReasonToolUse",
546 "tool_name", part.ToolName)
547 break
548 }
549 }
550
551 return &llm.Response{
552 Role: llm.MessageRoleAssistant,
553 Model: s.Model,
554 Content: content,
555 StopReason: stopReason,
556 Usage: usage,
557 StartTime: &startTime,
558 EndTime: &endTime,
559 }, nil
560}