gadget/packages/ai/src/ollama.test.ts
Rob Colbert 07a760c7b5 feat: add numPredict, numCtx, maxCompletionTokens to model config pipeline
Fixes premature AI API response truncation by propagating inference
parameters through the entire probe → storage → runtime → API call chain.

Root cause: Ollama defaults num_predict to 128 tokens and num_ctx to
4096, silently truncating output and context. We never overrode these.
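A minimal sketch of the override this fix relies on, using the ollama JavaScript
client directly (model name, host, and prompt are placeholders; this is not the
project's adapter code):

  import { Ollama } from 'ollama';

  const ollama = new Ollama({ host: 'http://localhost:11434' });

  // Without an explicit options block, the server applies its defaults
  // (num_predict 128, num_ctx 4096), which is what truncated responses.
  const stream = await ollama.chat({
    model: 'llama3.2',
    messages: [{ role: 'user', content: 'Summarize this repository.' }],
    stream: true,
    options: {
      num_ctx: 131072,  // full context window
      num_predict: -1,  // -1 = generate until the model stops on its own
    },
  });
  for await (const chunk of stream) {
    process.stdout.write(chunk.message.content);
  }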

Changes:
- IAiModelSettings: add numPredict, maxCompletionTokens fields
- IDroneModelConfig: moved from gadget-drone to @gadget/api (shared),
  expanded with numPredict, numCtx, maxCompletionTokens params
- IAiModelConfig.params: add numPredict, numCtx, maxCompletionTokens
- IAiModelProbeResult.settings: add numPredict, maxCompletionTokens
- AiModelSettingsSchema (Mongoose): add numPredict, maxCompletionTokens
- Ollama extractSettings(): extract num_predict from model parameters
  (see the sketch after this list)
- Ollama generate()/chat(): pass options: { num_ctx, num_predict }
- OpenAI: add max_completion_tokens to all three create() calls
- web-cli.ts onProviderProbe(): compute numPredict (-1 for Ollama)
  and maxCompletionTokens (contextWindow for OpenAI) during probe
- agent.ts main + subagent loops: read model settings from provider
  cached models, build IDroneModelConfig with stored params
- ai.ts: remove local IDroneModelConfig, import from @gadget/api
- chat-session.ts: add new params to title generation call
- Tests: update all fixtures with new params, all 19 tests pass
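
Rough sketch of the Modelfile-parameter parsing behind the extractSettings()
item above (a hypothetical standalone helper based on the test fixtures; the
real method lives on the Ollama adapter and may differ):

  // Parses the plain-text `parameters` blob returned by ollama.show(), e.g.
  // "temperature 0.7\ntop_k 40\ntop_p 0.9\nnum_ctx 4096\nnum_predict -1".
  function extractSettingsFromParameters(parameters: string): Record<string, number> {
    const keyMap: Record<string, string> = {
      temperature: 'temperature',
      top_k: 'topK',
      top_p: 'topP',
      num_ctx: 'numCtx',
      num_predict: 'numPredict',
    };
    const settings: Record<string, number> = {};
    for (const line of parameters.split('\n')) {
      const [key, value] = line.trim().split(/\s+/);
      const mapped = keyMap[key];
      if (mapped && value !== undefined && !Number.isNaN(Number(value))) {
        settings[mapped] = Number(value);
      }
    }
    return settings;
  }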

Defaults when model settings unavailable:
- numPredict: -1 (Ollama unlimited - generate until natural stop)
- numCtx: 131072 (128k - covers most modern models)
- maxCompletionTokens: 16384 (16k - reasonable OpenAI default)
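
Illustrative fallback logic (helper and constant names are made up for this
sketch; the real wiring goes through IDroneModelConfig in agent.ts):

  const DEFAULT_NUM_PREDICT = -1;               // Ollama: no cap, stop naturally
  const DEFAULT_NUM_CTX = 131072;               // 128k context window
  const DEFAULT_MAX_COMPLETION_TOKENS = 16384;  // 16k OpenAI completion cap

  interface ModelSettingsLike {
    numPredict?: number;
    numCtx?: number;
    maxCompletionTokens?: number;
  }

  function resolveInferenceParams(settings?: ModelSettingsLike) {
    return {
      numPredict: settings?.numPredict ?? DEFAULT_NUM_PREDICT,
      numCtx: settings?.numCtx ?? DEFAULT_NUM_CTX,
      maxCompletionTokens: settings?.maxCompletionTokens ?? DEFAULT_MAX_COMPLETION_TOKENS,
    };
  }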
2026-05-11 13:50:19 -04:00

import { describe, it, expect, vi, beforeEach } from 'vitest';
import type { ChatResponseStream } from 'ollama';

// Mock the Ollama client BEFORE importing the module
const mockOllamaClient = {
  chat: vi.fn(),
  generate: vi.fn(),
  list: vi.fn(),
  show: vi.fn(),
};

vi.mock('ollama', () => {
  return {
    Ollama: class MockOllama {
      constructor() {
        return mockOllamaClient;
      }
    },
  };
});

import { OllamaAiApi } from './ollama';

// Mock logger
const mockLogger = {
  debug: vi.fn(),
  info: vi.fn(),
  warn: vi.fn(),
  error: vi.fn(),
};

// Mock environment and provider
const mockEnv = {
  NODE_ENV: 'test',
  services: {
    google: {
      cse: {
        apiKey: 'test-key',
        engineId: 'test-engine',
      },
    },
  },
};

const mockProvider = {
  _id: 'test-provider',
  name: 'Test Ollama',
  sdk: 'ollama' as const,
  baseUrl: 'http://localhost:11434',
  apiKey: 'test-key',
};

describe('OllamaAiApi', () => {
  let api: OllamaAiApi;

  beforeEach(() => {
    vi.clearAllMocks();
    api = new OllamaAiApi(mockEnv as any, mockProvider as any, mockLogger as any);
  });
  describe('chat', () => {
    it('should handle normal response streaming', async () => {
      // Mock streaming response
      const mockStream = async function* () {
        yield {
          message: { content: 'Hello' },
          done: false,
        };
        yield {
          message: { content: ' world' },
          done: false,
        };
        yield {
          message: { content: '!' },
          done: true,
          done_reason: 'stop',
          total_duration: 100,
          prompt_eval_count: 10,
          eval_count: 3,
        };
      };
      mockOllamaClient.chat.mockResolvedValue(mockStream());

      const streamCallback = vi.fn();
      const response = await api.chat(
        {
          provider: mockProvider as any,
          modelId: 'test-model',
          params: { reasoning: false, temperature: 0.8, topP: 0.9, topK: 40, numPredict: -1, numCtx: 131072, maxCompletionTokens: 16384 },
        },
        {
          userPrompt: 'Test prompt',
          context: [],
        },
        streamCallback,
      );

      // Verify stream callback was called for each chunk
      expect(streamCallback).toHaveBeenCalledTimes(3);
      expect(streamCallback).toHaveBeenCalledWith({
        type: 'response',
        data: 'Hello',
      });
      expect(streamCallback).toHaveBeenCalledWith({
        type: 'response',
        data: ' world',
      });
      expect(streamCallback).toHaveBeenCalledWith({
        type: 'response',
        data: '!',
      });

      // Verify response
      expect(response.done).toBe(true);
      expect(response.doneReason).toBe('stop');
      expect(response.response).toBe('Hello world!');
    });

    it('should handle tool calls', async () => {
      const mockStream = async function* () {
        yield {
          message: {
            content: '',
            tool_calls: [
              {
                function: {
                  name: 'search_google',
                  arguments: { query: 'test query' },
                },
              },
            ],
          },
          done: false,
        };
        yield {
          message: { content: '' },
          done: true,
          done_reason: 'stop',
          total_duration: 100,
          prompt_eval_count: 10,
          eval_count: 1,
        };
      };
      mockOllamaClient.chat.mockResolvedValue(mockStream());

      const streamCallback = vi.fn();
      const response = await api.chat(
        {
          provider: mockProvider as any,
          modelId: 'test-model',
          params: { reasoning: false, temperature: 0.8, topP: 0.9, topK: 40, numPredict: -1, numCtx: 131072, maxCompletionTokens: 16384 },
        },
        {
          userPrompt: 'Test prompt',
          context: [],
        },
        streamCallback,
      );

      // Verify tool calls are returned, not executed
      expect(response.toolCalls).toBeDefined();
      expect(response.toolCalls!.length).toBe(1);
      expect(response.toolCalls![0].function.name).toBe('search_google');

      // chat() should only be called once (no internal loop)
      expect(mockOllamaClient.chat).toHaveBeenCalledTimes(1);
    });
    it('should handle thinking content when reasoning is enabled', async () => {
      const mockStream = async function* () {
        yield {
          message: {
            thinking: 'Let me think about this...',
            content: '',
          },
          done: false,
        };
        yield {
          message: {
            thinking: ' The answer is',
            content: '',
          },
          done: false,
        };
        yield {
          message: { content: '42' },
          done: true,
          done_reason: 'stop',
          total_duration: 100,
          prompt_eval_count: 10,
          eval_count: 1,
        };
      };
      mockOllamaClient.chat.mockResolvedValue(mockStream());

      const streamCallback = vi.fn();
      const response = await api.chat(
        {
          provider: mockProvider as any,
          modelId: 'test-model',
          params: { reasoning: true, temperature: 0.8, topP: 0.9, topK: 40, numPredict: -1, numCtx: 131072, maxCompletionTokens: 16384 },
        },
        {
          userPrompt: 'What is the answer?',
          context: [],
        },
        streamCallback,
      );

      expect(streamCallback).toHaveBeenCalledWith({
        type: 'thinking',
        data: 'Let me think about this...',
      });
      expect(streamCallback).toHaveBeenCalledWith({
        type: 'thinking',
        data: ' The answer is',
      });
      expect(streamCallback).toHaveBeenCalledWith({
        type: 'response',
        data: '42',
      });
      expect(response.thinking).toBe('Let me think about this... The answer is');
    });

    it('should reject empty response on load failure', async () => {
      const mockStream = async function* () {
        yield {
          message: { content: '' },
          done: true,
          done_reason: 'load',
          total_duration: 5000,
          prompt_eval_count: 0,
          eval_count: 0,
        };
      };
      mockOllamaClient.chat.mockResolvedValue(mockStream());

      await expect(api.chat(
        {
          provider: mockProvider as any,
          modelId: 'test-model',
          params: { reasoning: false, temperature: 0.8, topP: 0.9, topK: 40, numPredict: -1, numCtx: 131072, maxCompletionTokens: 16384 },
        },
        {
          userPrompt: 'Test prompt',
          context: [],
        },
        vi.fn(),
      )).rejects.toThrow('Provider returned an empty chat response');
    });
  });
  describe('probeModel', () => {
    it('should detect thinking capability from "thinking" (Ollama convention)', async () => {
      mockOllamaClient.show.mockResolvedValue({
        capabilities: ['completion', 'vision', 'tools', 'thinking'],
        details: { family: 'gemma4' },
        model_info: {},
        modified_at: '2026-04-04T06:20:40.211Z',
      });

      const result = await api.probeModel('gemma4:e4b');
      expect(result.capabilities.hasThinking).toBe(true);
      expect(result.capabilities.canCallTools).toBe(true);
      expect(result.capabilities.hasVision).toBe(true);
    });

    it('should detect thinking capability from "reasoning" (OpenAI convention)', async () => {
      mockOllamaClient.show.mockResolvedValue({
        capabilities: ['completion', 'reasoning'],
        details: { family: 'deepseek' },
        model_info: {},
        modified_at: '2026-04-04T06:20:40.211Z',
      });

      const result = await api.probeModel('deepseek-r1');
      expect(result.capabilities.hasThinking).toBe(true);
    });

    it('should set hasThinking false when neither thinking nor reasoning in capabilities', async () => {
      mockOllamaClient.show.mockResolvedValue({
        capabilities: ['completion'],
        details: { family: 'llama' },
        model_info: {},
        modified_at: '2026-04-04T06:20:40.211Z',
      });

      const result = await api.probeModel('llama3.2');
      expect(result.capabilities.hasThinking).toBe(false);
    });

    it('should detect vision, tools, and embedding capabilities', async () => {
      mockOllamaClient.show.mockResolvedValue({
        capabilities: ['completion', 'vision', 'tools', 'embeddings'],
        details: { family: 'llama' },
        model_info: {},
        modified_at: '2026-04-04T06:20:40.211Z',
      });

      const result = await api.probeModel('some-model');
      expect(result.capabilities.hasVision).toBe(true);
      expect(result.capabilities.canCallTools).toBe(true);
      expect(result.capabilities.hasEmbedding).toBe(true);
    });

    it('should extract settings from Modelfile parameters', async () => {
      mockOllamaClient.show.mockResolvedValue({
        capabilities: ['completion'],
        details: { family: 'llama' },
        model_info: {},
        parameters: 'temperature 0.7\ntop_k 40\ntop_p 0.9\nnum_ctx 4096',
        modified_at: '2026-04-04T06:20:40.211Z',
      });

      const result = await api.probeModel('llama3.2');
      expect(result.settings).toBeDefined();
      expect(result.settings!.temperature).toBe(0.7);
      expect(result.settings!.topK).toBe(40);
      expect(result.settings!.topP).toBe(0.9);
      expect(result.settings!.numCtx).toBe(4096);
    });
  });
});