package gem
2
3import (
4 "cmp"
5 "context"
6 "encoding/json"
7 "fmt"
8 "log/slog"
9 "math/rand"
10 "net/http"
11 "strings"
12 "time"
13
14 "sketch.dev/llm"
15 "sketch.dev/llm/gem/gemini"
16)
17
const (
	// DefaultModel is the Gemini model used when Service.Model is empty.
	DefaultModel = "gemini-2.5-pro-preview-03-25"
	// DefaultMaxTokens is the token budget used when Service.MaxTokens is zero.
	DefaultMaxTokens = 8192
	// GeminiAPIKeyEnv is the environment variable conventionally holding the API key.
	GeminiAPIKeyEnv = "GEMINI_API_KEY"
)
23
// Service provides Gemini completions.
// Fields should not be altered concurrently with calling any method on Service.
type Service struct {
	HTTPC     *http.Client // defaults to http.DefaultClient if nil
	APIKey    string       // must be non-empty
	Model     string       // defaults to DefaultModel if empty
	MaxTokens int          // defaults to DefaultMaxTokens if zero
}

// Compile-time check that *Service satisfies llm.Service.
var _ llm.Service = (*Service)(nil)
34
// fromLLMRole maps Sketch's llm message roles to the role strings the
// Gemini API expects ("model" for assistant output, "user" for input).
// Roles absent from this map are unsupported by this adapter.
var fromLLMRole = map[llm.MessageRole]string{
	llm.MessageRoleAssistant: "model",
	llm.MessageRoleUser:      "user",
}
40
41// convertToolSchemas converts Sketch's llm.Tool schemas to Gemini's schema format
42func convertToolSchemas(tools []*llm.Tool) ([]gemini.FunctionDeclaration, error) {
43 if len(tools) == 0 {
44 return nil, nil
45 }
46
47 var decls []gemini.FunctionDeclaration
48 for _, tool := range tools {
49 // Parse the schema from raw JSON
50 var schemaJSON map[string]any
51 if err := json.Unmarshal(tool.InputSchema, &schemaJSON); err != nil {
52 return nil, fmt.Errorf("failed to unmarshal tool %s schema: %w", tool.Name, err)
53 }
54 decls = append(decls, gemini.FunctionDeclaration{
55 Name: tool.Name,
56 Description: tool.Description,
57 Parameters: convertJSONSchemaToGeminiSchema(schemaJSON),
58 })
59 }
60
61 return decls, nil
62}
63
64// convertJSONSchemaToGeminiSchema converts a JSON schema to Gemini's schema format
65func convertJSONSchemaToGeminiSchema(schemaJSON map[string]any) gemini.Schema {
66 schema := gemini.Schema{}
67
68 // Set the type based on the JSON schema type
69 if typeVal, ok := schemaJSON["type"].(string); ok {
70 switch typeVal {
71 case "string":
72 schema.Type = gemini.DataTypeSTRING
73 case "number":
74 schema.Type = gemini.DataTypeNUMBER
75 case "integer":
76 schema.Type = gemini.DataTypeINTEGER
77 case "boolean":
78 schema.Type = gemini.DataTypeBOOLEAN
79 case "array":
80 schema.Type = gemini.DataTypeARRAY
81 case "object":
82 schema.Type = gemini.DataTypeOBJECT
83 default:
84 schema.Type = gemini.DataTypeSTRING // Default to string for unknown types
85 }
86 }
87
88 // Set description if available
89 if desc, ok := schemaJSON["description"].(string); ok {
90 schema.Description = desc
91 }
92
93 // Handle enum values
94 if enumValues, ok := schemaJSON["enum"].([]any); ok {
95 schema.Enum = make([]string, len(enumValues))
96 for i, v := range enumValues {
97 if strVal, ok := v.(string); ok {
98 schema.Enum[i] = strVal
99 } else {
100 // Convert non-string values to string
101 valBytes, _ := json.Marshal(v)
102 schema.Enum[i] = string(valBytes)
103 }
104 }
105 }
106
107 // Handle object properties
108 if properties, ok := schemaJSON["properties"].(map[string]any); ok && schema.Type == gemini.DataTypeOBJECT {
109 schema.Properties = make(map[string]gemini.Schema)
110 for propName, propSchema := range properties {
111 if propSchemaMap, ok := propSchema.(map[string]any); ok {
112 schema.Properties[propName] = convertJSONSchemaToGeminiSchema(propSchemaMap)
113 }
114 }
115 }
116
117 // Handle required properties
118 if required, ok := schemaJSON["required"].([]any); ok {
119 schema.Required = make([]string, len(required))
120 for i, r := range required {
121 if strVal, ok := r.(string); ok {
122 schema.Required[i] = strVal
123 }
124 }
125 }
126
127 // Handle array items
128 if items, ok := schemaJSON["items"].(map[string]any); ok && schema.Type == gemini.DataTypeARRAY {
129 itemSchema := convertJSONSchemaToGeminiSchema(items)
130 schema.Items = &itemSchema
131 }
132
133 // Handle minimum/maximum items for arrays
134 if minItems, ok := schemaJSON["minItems"].(float64); ok {
135 schema.MinItems = fmt.Sprintf("%d", int(minItems))
136 }
137 if maxItems, ok := schemaJSON["maxItems"].(float64); ok {
138 schema.MaxItems = fmt.Sprintf("%d", int(maxItems))
139 }
140
141 return schema
142}
143
144// buildGeminiRequest converts Sketch's llm.Request to Gemini's request format
145func (s *Service) buildGeminiRequest(req *llm.Request) (*gemini.Request, error) {
146 gemReq := &gemini.Request{}
147
148 // Add system instruction if provided
149 if len(req.System) > 0 {
150 // Combine all system messages into a single system instruction
151 systemText := ""
152 for i, sys := range req.System {
153 if i > 0 && systemText != "" && sys.Text != "" {
154 systemText += "\n"
155 }
156 systemText += sys.Text
157 }
158
159 if systemText != "" {
160 gemReq.SystemInstruction = &gemini.Content{
161 Parts: []gemini.Part{{Text: systemText}},
162 }
163 }
164 }
165
166 // Convert messages to Gemini content format
167 for _, msg := range req.Messages {
168 // Set the role based on the message role
169 role, ok := fromLLMRole[msg.Role]
170 if !ok {
171 return nil, fmt.Errorf("unsupported message role: %v", msg.Role)
172 }
173
174 content := gemini.Content{
175 Role: role,
176 }
177
178 // Store tool usage information to correlate tool uses with responses
179 toolNameToID := make(map[string]string)
180
181 // First pass: collect tool use IDs for correlation
182 for _, c := range msg.Content {
183 if c.Type == llm.ContentTypeToolUse && c.ID != "" {
184 toolNameToID[c.ToolName] = c.ID
185 }
186 }
187
188 // Map each content item to Gemini's format
189 for _, c := range msg.Content {
190 switch c.Type {
191 case llm.ContentTypeText, llm.ContentTypeThinking, llm.ContentTypeRedactedThinking:
192 // Simple text content
193 content.Parts = append(content.Parts, gemini.Part{
194 Text: c.Text,
195 })
196 case llm.ContentTypeToolUse:
197 // Tool use becomes a function call
198 var args map[string]any
199 if err := json.Unmarshal(c.ToolInput, &args); err != nil {
200 return nil, fmt.Errorf("failed to unmarshal tool input: %w", err)
201 }
202
203 // Make sure we have a valid ID for this tool use
204 if c.ID == "" {
205 c.ID = fmt.Sprintf("gemini_tool_%s_%d", c.ToolName, time.Now().UnixNano())
206 }
207
208 // Save the ID for this tool name for future correlation
209 toolNameToID[c.ToolName] = c.ID
210
211 slog.DebugContext(context.Background(), "gemini_preparing_tool_use",
212 "tool_name", c.ToolName,
213 "tool_id", c.ID,
214 "input", string(c.ToolInput))
215
216 content.Parts = append(content.Parts, gemini.Part{
217 FunctionCall: &gemini.FunctionCall{
218 Name: c.ToolName,
219 Args: args,
220 },
221 })
222 case llm.ContentTypeToolResult:
223 // Tool result becomes a function response
224 // Create a map for the response
225 response := map[string]any{
226 "result": c.ToolResult,
227 "error": c.ToolError,
228 }
229
230 // Determine the function name to use - this is critical
231 funcName := ""
232
233 // First try to find the function name from a stored toolUseID if we have one
234 if c.ToolUseID != "" {
235 // Try to derive the tool name from the previous tools we've seen
236 for name, id := range toolNameToID {
237 if id == c.ToolUseID {
238 funcName = name
239 break
240 }
241 }
242 }
243
244 // Fallback options if we couldn't find the tool name
245 if funcName == "" {
246 // Try the tool name directly
247 if c.ToolName != "" {
248 funcName = c.ToolName
249 } else {
250 // Last resort fallback
251 funcName = "default_tool"
252 }
253 }
254
255 slog.DebugContext(context.Background(), "gemini_preparing_tool_result",
256 "tool_use_id", c.ToolUseID,
257 "mapped_func_name", funcName,
258 "result_length", len(c.ToolResult))
259
260 content.Parts = append(content.Parts, gemini.Part{
261 FunctionResponse: &gemini.FunctionResponse{
262 Name: funcName,
263 Response: response,
264 },
265 })
266 }
267 }
268
269 gemReq.Contents = append(gemReq.Contents, content)
270 }
271
272 // Handle tools/functions
273 if len(req.Tools) > 0 {
274 // Convert tool schemas
275 decls, err := convertToolSchemas(req.Tools)
276 if err != nil {
277 return nil, fmt.Errorf("failed to convert tool schemas: %w", err)
278 }
279 if len(decls) > 0 {
280 gemReq.Tools = []gemini.Tool{{FunctionDeclarations: decls}}
281 }
282 }
283
284 return gemReq, nil
285}
286
287// convertGeminiResponsesToContent converts a Gemini response to llm.Content
288func convertGeminiResponseToContent(res *gemini.Response) []llm.Content {
289 if res == nil || len(res.Candidates) == 0 || len(res.Candidates[0].Content.Parts) == 0 {
290 return []llm.Content{{
291 Type: llm.ContentTypeText,
292 Text: "",
293 }}
294 }
295
296 var contents []llm.Content
297
298 // Process each part in the first candidate's content
299 for i, part := range res.Candidates[0].Content.Parts {
300 // Log the part type for debugging
301 slog.DebugContext(context.Background(), "processing_gemini_part",
302 "index", i,
303 "has_text", part.Text != "",
304 "has_function_call", part.FunctionCall != nil,
305 "has_function_response", part.FunctionResponse != nil)
306
307 if part.Text != "" {
308 // Simple text response
309 contents = append(contents, llm.Content{
310 Type: llm.ContentTypeText,
311 Text: part.Text,
312 })
313 } else if part.FunctionCall != nil {
314 // Function call (tool use)
315 args, err := json.Marshal(part.FunctionCall.Args)
316 if err != nil {
317 // If we can't marshal, use empty args
318 slog.DebugContext(context.Background(), "gemini_failed_to_markshal_args",
319 "tool_name", part.FunctionCall.Name,
320 "args", string(args),
321 "err", err.Error(),
322 )
323 args = []byte("{}")
324 }
325
326 // Generate a unique ID for this tool use that includes the function name
327 // to make it easier to correlate with responses
328 toolID := fmt.Sprintf("gemini_tool_%s_%d", part.FunctionCall.Name, time.Now().UnixNano())
329
330 contents = append(contents, llm.Content{
331 ID: toolID,
332 Type: llm.ContentTypeToolUse,
333 ToolName: part.FunctionCall.Name,
334 ToolInput: json.RawMessage(args),
335 })
336
337 slog.DebugContext(context.Background(), "gemini_tool_call",
338 "tool_id", toolID,
339 "tool_name", part.FunctionCall.Name,
340 "args", string(args))
341 } else if part.FunctionResponse != nil {
342 // We shouldn't normally get function responses from the model, but just in case
343 respData, _ := json.Marshal(part.FunctionResponse.Response)
344 slog.DebugContext(context.Background(), "unexpected_function_response",
345 "name", part.FunctionResponse.Name,
346 "response", string(respData))
347 }
348 }
349
350 // If no content was added, add an empty text content
351 if len(contents) == 0 {
352 slog.DebugContext(context.Background(), "empty_gemini_response", "adding_empty_text", true)
353 contents = append(contents, llm.Content{
354 Type: llm.ContentTypeText,
355 Text: "",
356 })
357 }
358
359 return contents
360}
361
// Gemini doesn't provide usage info directly; calculateUsage (below) estimates it.
363// ensureToolIDs makes sure all tool uses have proper IDs
364func ensureToolIDs(contents []llm.Content) {
365 for i, content := range contents {
366 if content.Type == llm.ContentTypeToolUse && content.ID == "" {
367 // Generate a stable ID using the tool name and timestamp
368 contents[i].ID = fmt.Sprintf("gemini_tool_%s_%d", content.ToolName, time.Now().UnixNano())
369 slog.DebugContext(context.Background(), "assigned_missing_tool_id",
370 "tool_name", content.ToolName,
371 "new_id", contents[i].ID)
372 }
373 }
374}
375
376func calculateUsage(req *gemini.Request, res *gemini.Response) llm.Usage {
377 // Very rough estimation of token counts
378 var inputTokens uint64
379 var outputTokens uint64
380
381 // Count system tokens
382 if req.SystemInstruction != nil {
383 for _, part := range req.SystemInstruction.Parts {
384 if part.Text != "" {
385 // Very rough estimation: 1 token per 4 characters
386 inputTokens += uint64(len(part.Text)) / 4
387 }
388 }
389 }
390
391 // Count input tokens
392 for _, content := range req.Contents {
393 for _, part := range content.Parts {
394 if part.Text != "" {
395 inputTokens += uint64(len(part.Text)) / 4
396 } else if part.FunctionCall != nil {
397 // Estimate function call tokens
398 argBytes, _ := json.Marshal(part.FunctionCall.Args)
399 inputTokens += uint64(len(part.FunctionCall.Name)+len(argBytes)) / 4
400 } else if part.FunctionResponse != nil {
401 // Estimate function response tokens
402 resBytes, _ := json.Marshal(part.FunctionResponse.Response)
403 inputTokens += uint64(len(part.FunctionResponse.Name)+len(resBytes)) / 4
404 }
405 }
406 }
407
408 // Count output tokens
409 if res != nil && len(res.Candidates) > 0 {
410 for _, part := range res.Candidates[0].Content.Parts {
411 if part.Text != "" {
412 outputTokens += uint64(len(part.Text)) / 4
413 } else if part.FunctionCall != nil {
414 // Estimate function call tokens
415 argBytes, _ := json.Marshal(part.FunctionCall.Args)
416 outputTokens += uint64(len(part.FunctionCall.Name)+len(argBytes)) / 4
417 }
418 }
419 }
420
421 // For Gemini 2.5 Pro Preview pricing: $1.25 per 1M input tokens, $10 per 1M output tokens
422 // Convert to dollars
423 costUSD := float64(inputTokens)*1.25/1_000_000.0 + float64(outputTokens)*10/1_000_000.0
424
425 return llm.Usage{
426 InputTokens: inputTokens,
427 OutputTokens: outputTokens,
428 CostUSD: costUSD,
429 }
430}
431
432// Do sends a request to Gemini.
433func (s *Service) Do(ctx context.Context, ir *llm.Request) (*llm.Response, error) {
434 // Log the incoming request for debugging
435 slog.DebugContext(ctx, "gemini_request",
436 "message_count", len(ir.Messages),
437 "tool_count", len(ir.Tools),
438 "system_count", len(ir.System))
439
440 // Log tool-related information if any tools are present
441 if len(ir.Tools) > 0 {
442 var toolNames []string
443 for _, tool := range ir.Tools {
444 toolNames = append(toolNames, tool.Name)
445 }
446 slog.DebugContext(ctx, "gemini_tools", "tools", toolNames)
447 }
448
449 // Log details about the messages being sent
450 for i, msg := range ir.Messages {
451 contentTypes := make([]string, len(msg.Content))
452 for j, c := range msg.Content {
453 contentTypes[j] = c.Type.String()
454
455 // Log tool-related content with more details
456 if c.Type == llm.ContentTypeToolUse {
457 slog.DebugContext(ctx, "gemini_tool_use",
458 "message_idx", i,
459 "content_idx", j,
460 "tool_name", c.ToolName,
461 "tool_input", string(c.ToolInput))
462 } else if c.Type == llm.ContentTypeToolResult {
463 slog.DebugContext(ctx, "gemini_tool_result",
464 "message_idx", i,
465 "content_idx", j,
466 "tool_use_id", c.ToolUseID,
467 "tool_error", c.ToolError,
468 "result_length", len(c.ToolResult))
469 }
470 }
471 slog.DebugContext(ctx, "gemini_message",
472 "idx", i,
473 "role", msg.Role.String(),
474 "content_types", contentTypes)
475 }
476 // Build the Gemini request
477 gemReq, err := s.buildGeminiRequest(ir)
478 if err != nil {
479 return nil, fmt.Errorf("failed to build Gemini request: %w", err)
480 }
481
482 // Log the structured Gemini request for debugging
483 if reqJSON, err := json.MarshalIndent(gemReq, "", " "); err == nil {
484 slog.DebugContext(ctx, "gemini_request_json", "request", string(reqJSON))
485 }
486
487 // Create a Gemini model instance
488 model := gemini.Model{
489 Model: "models/" + cmp.Or(s.Model, DefaultModel),
490 APIKey: s.APIKey,
491 HTTPC: cmp.Or(s.HTTPC, http.DefaultClient),
492 }
493
494 // Send the request to Gemini with retry logic
495 startTime := time.Now()
496 endTime := startTime // Initialize endTime
497 var gemRes *gemini.Response
498
499 // Retry mechanism for handling server errors and rate limiting
500 backoff := []time.Duration{1 * time.Second, 3 * time.Second, 5 * time.Second, 10 * time.Second}
501 for attempts := 0; attempts <= len(backoff); attempts++ {
502 gemApiErr := error(nil)
503 gemRes, gemApiErr = model.GenerateContent(ctx, gemReq)
504 endTime = time.Now()
505
506 if gemApiErr == nil {
507 // Successful response
508 // Log the structured Gemini response
509 if resJSON, err := json.MarshalIndent(gemRes, "", " "); err == nil {
510 slog.DebugContext(ctx, "gemini_response_json", "response", string(resJSON))
511 }
512 break
513 }
514
515 if attempts == len(backoff) {
516 // We've exhausted all retry attempts
517 return nil, fmt.Errorf("gemini: API error after %d attempts: %w", attempts, gemApiErr)
518 }
519
520 // Check if the error is retryable (e.g., server error or rate limiting)
521 if strings.Contains(gemApiErr.Error(), "429") || strings.Contains(gemApiErr.Error(), "5") {
522 // Rate limited or server error - wait and retry
523 random := time.Duration(rand.Int63n(int64(time.Second)))
524 sleep := backoff[attempts] + random
525 slog.WarnContext(ctx, "gemini_request_retry", "error", gemApiErr.Error(), "attempt", attempts+1, "sleep", sleep)
526 time.Sleep(sleep)
527 continue
528 }
529
530 // Non-retryable error
531 return nil, fmt.Errorf("gemini: API error: %w", gemApiErr)
532 }
533
534 content := convertGeminiResponseToContent(gemRes)
535
536 ensureToolIDs(content)
537
538 usage := calculateUsage(gemReq, gemRes)
539
540 stopReason := llm.StopReasonEndTurn
541 for _, part := range content {
542 if part.Type == llm.ContentTypeToolUse {
543 stopReason = llm.StopReasonToolUse
544 slog.DebugContext(ctx, "gemini_tool_use_detected",
545 "setting_stop_reason", "llm.StopReasonToolUse",
546 "tool_name", part.ToolName)
547 break
548 }
549 }
550
551 return &llm.Response{
552 Role: llm.MessageRoleAssistant,
553 Model: s.Model,
554 Content: content,
555 StopReason: stopReason,
556 Usage: usage,
557 StartTime: &startTime,
558 EndTime: &endTime,
559 }, nil
560}