test: integration test hardening (#13532)

* test: integration test hardening

Improve reliability on slower systems and fix some flakes. Fix a few
logic flaws in the newer tests, plus general hardening.

* tighten up vision logging

* add new models

* remove some older models - still covered by library scenarios
Daniel Hiltgen 2026-05-08 15:54:17 -07:00 committed by GitHub
parent 1e1b34dada
commit c2f2d90a67
14 changed files with 315 additions and 69 deletions


@@ -289,7 +289,7 @@ func TestAPIChat(t *testing.T) {
             }
         case <-done:
             if genErr != nil {
-                t.Fatalf("failed with %s request prompt %v", req.Model, req.Messages)
+                t.Fatalf("failed with %s request prompt %s", req.Model, summarizeMessages(req.Messages))
             }
             // Verify the response contains the expected data
             response := buf.String()


@@ -19,6 +19,7 @@ import (
 )

 var defaultAudioModels = []string{
+    "nemotron3:33b",
     "gemma4:e2b",
     "gemma4:e4b",
 }
@@ -36,8 +37,11 @@ func decodeTestAudio(t *testing.T) api.ImageData {
 // setupAudioModel pulls the model, preloads it, and skips if it doesn't support audio.
 func setupAudioModel(ctx context.Context, t *testing.T, client *api.Client, model string) {
     t.Helper()
-    requireCapability(ctx, t, client, model, "audio")
-    pullOrSkip(ctx, t, client, model)
+    if testModel == "" {
+        pullOrSkip(ctx, t, client, model)
+    }
+    skipIfModelTooLargeForVRAM(ctx, t, client, model)
+    requireCapability(ctx, t, client, model, "audio")
     err := client.Generate(ctx, &api.GenerateRequest{Model: model}, func(response api.GenerateResponse) error { return nil })
     if err != nil {
         t.Fatalf("failed to load model %s: %s", model, err)


@@ -38,8 +38,8 @@ func TestUnicode(t *testing.T) {
     if testModel != "" {
         t.Skip("uses hardcoded model, not applicable with model override")
     }
-    skipUnderMinVRAM(t, 6)
-    ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
+    skipUnderMinVRAM(t, 12) // Actual model load is ~26G
+    ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute)
     defer cancel()
     // Set up the test data
     req := api.ChatRequest{
@@ -78,7 +78,7 @@ func TestUnicode(t *testing.T) {
     DoChat(ctx, t, client, req, []string{
         "散射", // scattering
         "频率", // frequency
-    }, 120*time.Second, 120*time.Second)
+    }, 180*time.Second, 30*time.Second)
 }

 func TestExtendedUnicodeOutput(t *testing.T) {


@@ -126,7 +126,7 @@ func TestMultiModelStress(t *testing.T) {
     slog.Info("Loading models to find how many can fit in VRAM before overflowing")
 chooseModels:
     for i, model := range chosenModels {
-        req := &api.GenerateRequest{Model: model}
+        req := &api.GenerateRequest{Model: model} // Leave KeepAlive unset so they stay loaded until the scheduler decides to unload them
         slog.Info("loading", "model", model)
         err = client.Generate(ctx, req, func(response api.GenerateResponse) error { return nil })
         if err != nil {
@@ -162,8 +162,22 @@ chooseModels:
         slog.Warn("all models being used without exceeding VRAM, set OLLAMA_MAX_VRAM so test can pick larger models")
     }

+    // For some iGPU/CPU systems we may end up with lingering 5 minute load timeouts chewing up memory - force unload everything we tried
+    slog.Info("unloading test models")
+    for _, model := range chosenModels {
+        client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+    }
+    defer func() {
+        // best effort unload once we're done with the real test
+        for _, model := range chosenModels {
+            client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+        }
+    }()
+
     r := rand.New(rand.NewSource(0))
     var wg sync.WaitGroup
+    slog.Info("Starting main test...")
     for i := range targetLoadCount {
         wg.Add(1)
         go func(i int) {
@@ -176,6 +190,8 @@ chooseModels:
             }
             k := r.Int() % len(reqs)
             reqs[k].Model = chosenModels[i]
+            // Set a default timeout to ensure the scheduler unloads for resource needs, not expiration
+            reqs[k].KeepAlive = nil
             slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Messages[0].Content)
             DoChat(ctx, t, client, reqs[k], resps[k], initialTimeout, streamTimeout)
         }


@@ -4,7 +4,9 @@ package integration

 import (
     "context"
+    "errors"
     "log/slog"
+    "strings"
     "sync"
     "testing"
     "time"
@@ -14,19 +16,20 @@ import (
 func TestLongInputContext(t *testing.T) {
     // Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
-    // we asked for and there is nothing extra that we could spill over into
+    // we asked for and there is nothing extra that we could spill over into.
+    // Older runners silently truncate oversized prompts, while llama-server
+    // rejects them with a client error. Accept both behaviors so this test can
+    // run against main and the llama-server branch.
     t.Setenv("OLLAMA_NUM_PARALLEL", "1")
-    // Longer needed for small footprint GPUs
-    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
     defer cancel()
-    // Set up the test data
     req := api.ChatRequest{
         Model: smol,
         Messages: []api.Message{
             {
                 Role: "user",
-                Content: "Oh, dont speak to me of Austria. Perhaps I dont understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexanders loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I dont believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
+                Content: "What country is this passage referring to?\nOh, dont speak to me of Austria. Perhaps I dont understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexanders loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I dont believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe!",
             },
         },
         Stream: &stream,
@@ -39,7 +42,39 @@ func TestLongInputContext(t *testing.T) {
     client, _, cleanup := InitServerConnection(ctx, t)
     defer cleanup()
     pullOrSkip(ctx, t, client, req.Model)
-    DoChat(ctx, t, client, req, []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
+
+    var response strings.Builder
+    err := client.Chat(ctx, &req, func(resp api.ChatResponse) error {
+        response.WriteString(resp.Message.Content)
+        return nil
+    })
+    if err != nil {
+        var statusErr api.StatusError
+        if errors.As(err, &statusErr) &&
+            statusErr.StatusCode >= 400 && statusErr.StatusCode < 500 &&
+            isContextLimitError(err.Error()) {
+            slog.Info("runner rejected oversized prompt", "error", err)
+            return
+        }
+        t.Fatalf("unexpected error for long input context: %v", err)
+    }
+    anyResp := []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict", "napoleonic", "historical"}
+    got := strings.ToLower(response.String())
+    for _, want := range anyResp {
+        if strings.Contains(got, want) {
+            return
+        }
+    }
+    t.Fatalf("%s: none of %v found in %q -- request was:%s", req.Model, anyResp, response.String(), summarizeMessages(req.Messages))
+}
+
+func isContextLimitError(err string) bool {
+    err = strings.ToLower(err)
+    return strings.Contains(err, "context") &&
+        (strings.Contains(err, "exceed") ||
+            strings.Contains(err, "too large") ||
+            strings.Contains(err, "too long"))
 }

 func TestContextExhaustion(t *testing.T) {
@@ -71,7 +106,23 @@ func TestContextExhaustion(t *testing.T) {
     client, _, cleanup := InitServerConnection(ctx, t)
     defer cleanup()
     pullOrSkip(ctx, t, client, req.Model)
-    DoChat(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world"}, 120*time.Second, 10*time.Second)
+    resp := DoChat(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world", "story", "friend", "suddenly", "finally", "day", "rain", "walked", "looked", "smiled", "laughed"}, 120*time.Second, 10*time.Second)
+    if resp != nil && !containsEmoji(resp.Content) {
+        t.Fatalf("%s: expected story response to contain emoji, got %q", req.Model, resp.Content)
+    }
+}
+
+func containsEmoji(s string) bool {
+    for _, r := range s {
+        switch {
+        case r >= 0x1F000 && r <= 0x1FAFF:
+            return true
+        case r >= 0x2600 && r <= 0x27BF:
+            return true
+        }
+    }
+    return false
 }

 // Send multiple generate requests with prior context and ensure the response is coherant and expected
@@ -102,10 +153,14 @@ func TestParallelGenerateWithHistory(t *testing.T) {
         t.Fatalf("failed to load model %s: %s", modelName, err)
     }
     gpuPercent := getGPUPercent(ctx, t, client, modelName)
-    if gpuPercent < 80 {
+    if gpuPercent < 80 && gpuPercent > 50 {
         slog.Warn("Low GPU percentage - increasing timeouts", "percent", gpuPercent)
         initialTimeout = 240 * time.Second
         streamTimeout = 30 * time.Second
+    } else if gpuPercent < 50 {
+        slog.Warn("Very low GPU percentage - skipping test", "percent", gpuPercent)
+        client.Generate(ctx, &api.GenerateRequest{Model: modelName, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+        t.Skip("Very low GPU percentage")
     }

     var wg sync.WaitGroup
@@ -206,10 +261,14 @@ func TestParallelChatWithHistory(t *testing.T) {
         t.Fatalf("failed to load model %s: %s", modelName, err)
     }
     gpuPercent := getGPUPercent(ctx, t, client, modelName)
-    if gpuPercent < 80 {
+    if gpuPercent < 80 && gpuPercent > 50 {
         slog.Warn("Low GPU percentage - increasing timeouts", "percent", gpuPercent)
         initialTimeout = 240 * time.Second
         streamTimeout = 30 * time.Second
+    } else if gpuPercent < 50 {
+        slog.Warn("Very low GPU percentage - skipping test", "percent", gpuPercent)
+        client.Generate(ctx, &api.GenerateRequest{Model: modelName, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+        t.Skip("Very low GPU percentage")
     }

     var wg sync.WaitGroup


@@ -139,6 +139,9 @@ func runOllamaCreate(ctx context.Context, t *testing.T, args ...string) {
 }

 func TestCreateSafetensorsLLM(t *testing.T) {
+    if testModel != "" {
+        t.Skip("exercises create pipeline with a fixed source model, not applicable with model override")
+    }
     skipIfRemote(t)

     modelDir := filepath.Join(testdataModelsDir, "TinyLlama-1.1B")
@@ -214,6 +217,9 @@ func TestCreateSafetensorsLLM(t *testing.T) {
 }

 func TestCreateGGUF(t *testing.T) {
+    if testModel != "" {
+        t.Skip("exercises create pipeline with a fixed source model, not applicable with model override")
+    }
     modelDir := filepath.Join(testdataModelsDir, "Llama-3.2-1B-GGUF")
     downloadHFModel(t, "bartowski/Llama-3.2-1B-Instruct-GGUF", modelDir,
         "--include", "Llama-3.2-1B-Instruct-IQ3_M.gguf")


@@ -45,6 +45,22 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
     return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2))
 }

+func requireEmbedErrorContainsAny(t *testing.T, err error, substrings ...string) {
+    t.Helper()
+    if err == nil {
+        t.Fatalf("expected error containing one of %q, got nil", substrings)
+    }
+    for _, s := range substrings {
+        if strings.Contains(err.Error(), s) {
+            return
+        }
+    }
+    t.Fatalf("expected error containing one of %q, got: %v", substrings, err)
+}
+
 func euclideanDistance[V float32 | float64](v1, v2 []V) V {
     if len(v1) != len(v2) {
         return V(math.Inf(1))
@@ -73,7 +89,7 @@ func manhattanDistance[V float32 | float64](v1, v2 []V) V {
 }

 func TestEmbedCosineDistanceCorrelation(t *testing.T) {
-    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+    ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute)
     defer cancel()
     client, _, cleanup := InitServerConnection(ctx, t)
     defer cleanup()
@@ -354,9 +370,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                 Options: map[string]any{"num_ctx": 3},
             },
             check: func(t *testing.T, res *api.EmbedResponse, err error) {
-                if err.Error() != "the input length exceeds the context length" {
-                    t.Fatalf("expected truncation error, got: %v", err)
-                }
+                requireEmbedErrorContainsAny(t, err, "input length exceeds the context length", "exceeds maximum context length")
             },
         },
         {
@@ -368,9 +382,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                 Options: map[string]any{"num_ctx": 1},
             },
             check: func(t *testing.T, res *api.EmbedResponse, err error) {
-                if err.Error() != "input after truncation exceeds maximum context length" {
-                    t.Fatalf("expected truncation error, got: %v", err)
-                }
+                requireEmbedErrorContainsAny(t, err, "input after truncation exceeds maximum context length", "input exceeds maximum context length and cannot be truncated further")
             },
         },
         {
@@ -382,9 +394,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                 Options: map[string]any{"num_ctx": 0},
             },
             check: func(t *testing.T, res *api.EmbedResponse, err error) {
-                if err.Error() != "input after truncation exceeds maximum context length" {
-                    t.Fatalf("expected truncation error, got: %v", err)
-                }
+                requireEmbedErrorContainsAny(t, err, "input after truncation exceeds maximum context length", "input exceeds maximum context length and cannot be truncated further")
             },
         },
         {
@@ -591,7 +601,7 @@ func TestEmbedStatusCode(t *testing.T) {
     t.Run("truncation error status code", func(t *testing.T) {
         truncFalse := false
-        longInput := strings.Repeat("word ", 100)
+        longInput := strings.Repeat("very long input ", 100)

         req := api.EmbedRequest{
             Model: model,
@@ -618,9 +628,7 @@ func TestEmbedStatusCode(t *testing.T) {
         }

         // Verify the error message is meaningful
-        if !strings.Contains(err.Error(), "context length") {
-            t.Errorf("expected error message to mention context length, got: %v", err)
-        }
+        requireEmbedErrorContainsAny(t, err, "context length", "too large", "exceed_context_size")
     })

     t.Run("batch truncation error status code", func(t *testing.T) {


@@ -17,7 +17,7 @@ func TestImageGeneration(t *testing.T) {
     if testModel != "" {
         t.Skip("uses hardcoded models, not applicable with model override")
     }
-    skipUnderMinVRAM(t, 8)
+    skipUnderMinVRAM(t, 32)

     type testCase struct {
         imageGenModel string
@@ -64,6 +64,10 @@ func TestImageGeneration(t *testing.T) {
             } else if strings.Contains(err.Error(), "ollama-mlx: no such file or directory") {
                 // most likely linux arm - not supported yet
                 t.Skip("unsupported architecture")
+            } else if strings.Contains(err.Error(), "are available") {
+                t.Skip("insufficient VRAM for image generation model")
+            } else if strings.Contains(err.Error(), "failed to create server") {
+                t.Skip("image generation server failed to start")
             }
             t.Fatalf("failed to generate image: %v", err)
         }


@@ -56,6 +56,7 @@ func TestVisionModels(t *testing.T) {
             "seed":        42,
             "temperature": 0.0,
         },
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
     }

     // Preload to skip if we're less than 80% on GPU to avoid extremely slow tests


@@ -41,9 +41,10 @@ func TestModelsChat(t *testing.T) {
     var chatModels []string
     if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
-        chatModels = ollamaEngineChatModels
+        chatModels = append(ollamaEngineChatModels, mlxEngineChatModels...)
     } else {
         chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
+        chatModels = append(chatModels, mlxEngineChatModels...)
     }

     for _, model := range testModels(chatModels) {
@@ -71,6 +72,7 @@ func TestModelsChat(t *testing.T) {
                 func(response api.GenerateResponse) error { return nil },
             )
             if err != nil {
+                skipIfMLXUnsupported(t, err)
                 t.Fatalf("failed to load model %s: %s", model, err)
             }
             gpuPercent := getGPUPercent(ctx, t, client, model)


@@ -60,8 +60,14 @@ func TestAPIToolCallingStress(t *testing.T) {
     models := testModels(libraryToolsModels)
+    softTimeout, _ := getTimeouts(t)

     for _, model := range models {
         t.Run(model, func(t *testing.T) {
+            if time.Since(started) > softTimeout {
+                t.Skip("skipping remaining tests to avoid excessive runtime")
+                return
+            }
             // Skip known-bad models unless explicitly requested via env var
             if reason, ok := skipModels[model]; ok && testModel == "" {
                 t.Skipf("skipping: %s", reason)
@@ -75,6 +81,13 @@ func TestAPIToolCallingStress(t *testing.T) {
             pullOrSkip(ctx, t, client, model)

+            // Preload and skip if not sufficiently GPU-loaded to avoid timeouts
+            err := client.Generate(ctx, &api.GenerateRequest{Model: model}, func(response api.GenerateResponse) error { return nil })
+            if err != nil {
+                t.Fatalf("failed to load model %s: %s", model, err)
+            }
+            skipIfNotGPULoaded(ctx, t, client, model, 80)
+
             tools := stressTestTools()

             // Large system prompt that mimics real coding agents (opencode, Claude Code, etc.)
@@ -343,6 +356,7 @@ func testToolCall(t *testing.T, ctx context.Context, client *api.Client, model,
             {Role: "user", Content: userMessage},
         },
         Tools: tools,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{
             "temperature": 0,
             "num_ctx":     contextLength(16384),
@@ -426,6 +440,7 @@ func testToolCallMultiTurn(t *testing.T, ctx context.Context, client *api.Client
             // The model should now respond with content or another tool call
         },
         Tools: tools,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{
             "temperature": 0,
             "num_ctx":     contextLength(16384),


@@ -23,7 +23,8 @@ func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
 func TestAPIToolCalling(t *testing.T) {
     initialTimeout := 60 * time.Second
     streamTimeout := 60 * time.Second
-    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+    softTimeout, hardTimeout := getTimeouts(t)
+    ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
     defer cancel()

     client, _, cleanup := InitServerConnection(ctx, t)
@@ -52,6 +53,11 @@ func TestAPIToolCalling(t *testing.T) {
     for _, model := range models {
         t.Run(model, func(t *testing.T) {
+            if time.Now().Sub(started) > softTimeout {
+                t.Skip("skipping remaining tests to avoid excessive runtime")
+                return
+            }
+
             if testModel != "" {
                 requireCapability(ctx, t, client, model, "tools")
             }
@@ -93,6 +99,7 @@ func TestAPIToolCalling(t *testing.T) {
                 Options: map[string]any{
                     "temperature": 0,
                 },
+                KeepAlive: &api.Duration{Duration: 10 * time.Second},
             }

             stallTimer := time.NewTimer(initialTimeout)


@@ -45,6 +45,8 @@ var (
     // Note: add newer models at the top of the list to test them first
     ollamaEngineChatModels = []string{
+        "nemotron3:33b",
+        "laguna-xs.2:q4_K_M",
         "gemma4",
         "lfm2.5-thinking",
         "ministral-3",
@@ -66,6 +68,15 @@ var (
         "minicpm-v:latest",    // arch=qwen2
         "granite-code:latest", // arch=llama
     }
+    // MLX-backed safetensors tags. These exercise the mlxrunner subprocess
+    // on platforms where MLX is available (today: macOS; Linux/Windows CUDA
+    // coming). On other platforms, skipIfMLXUnsupported turns the load
+    // failure into a test skip.
+    mlxEngineChatModels = []string{
+        "laguna-xs.2:nvfp4",
+        "qwen3.5:2b-nvfp4", // ~2.5GB, Qwen3_5 arch
+        "gemma4:e2b-nvfp4", // ~7.1GB, Gemma4 arch (skipped under low VRAM)
+    }
     llamaRunnerChatModels = []string{
         "mistral:latest",
         "falcon3:latest",
@@ -77,14 +88,6 @@ var (
         "internlm2:latest",
         "codellama:latest", // arch=llama
         "phi3:latest",
-        "falcon2:latest",
-        "gemma:latest",
-        "llama2:latest",
-        "nous-hermes:latest",
-        "orca-mini:latest",
-        "qwen:latest",
-        "stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
-        "falcon:latest",
     }

     // Some library models are quite large - ensure large VRAM and sufficient disk space
@@ -261,7 +264,6 @@ var (
         "zephyr",
     }
     libraryEmbedModels = []string{
-        "qwen3-embedding",
         "embeddinggemma",
         "nomic-embed-text",
         "all-minilm",
@@ -272,8 +274,11 @@ var (
         "paraphrase-multilingual",
         "snowflake-arctic-embed",
         "snowflake-arctic-embed2",
+        "qwen3-embedding",
     }
     libraryToolsModels = []string{
+        "nemotron3:33b",
+        "laguna-xs.2",
         "gemma4",
         "lfm2.5-thinking",
         "qwen3-vl",
@@ -284,7 +289,6 @@ var (
         "llama3.2",
         "mistral",
         "qwen2.5",
-        "qwen2",
         "ministral-3",
         "mistral-nemo",
         "mistral-small",
@@ -329,13 +333,23 @@ func testModels(defaults []string) []string {
 }

 // requireCapability skips the test if the model does not advertise the
-// given capability. It queries the server via Show and caches nothing —
-// call it once per subtest. For local-only models where Show may not
-// return capabilities (e.g. models created via ollama create), this is
-// a best-effort check.
+// given capability. If the model is missing locally, it first goes through
+// the normal pull-if-missing path so tests still behave correctly on cold
+// hosts. For local-only models where Show may not return capabilities
+// (e.g. models created via ollama create), this is a best-effort check.
 func requireCapability(ctx context.Context, t *testing.T, client *api.Client, modelName string, cap model.Capability) {
     t.Helper()
     resp, err := client.Show(ctx, &api.ShowRequest{Name: modelName})
+    var statusError api.StatusError
+    if errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound {
+        if err := PullIfMissing(ctx, client, modelName); err != nil {
+            t.Skipf("model %s not available: %v", modelName, err)
+        }
+        resp, err = client.Show(ctx, &api.ShowRequest{Name: modelName})
+    }
     if err != nil {
         t.Fatalf("failed to show model %s: %v", modelName, err)
     }
@@ -699,6 +713,45 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
     }
 }

+// summarizeMessages returns a compact string form of the messages suitable
+// for logs and error output. Image byte payloads are replaced with a
+// "<image: N bytes>" marker so vision tests don't dump huge integer arrays.
+func summarizeMessages(msgs []api.Message) string {
+    var b strings.Builder
+    b.WriteByte('[')
+    for i, m := range msgs {
+        if i > 0 {
+            b.WriteString(", ")
+        }
+        fmt.Fprintf(&b, "{Role:%s Content:%q", m.Role, m.Content)
+        if m.Thinking != "" {
+            fmt.Fprintf(&b, " Thinking:%q", m.Thinking)
+        }
+        if len(m.Images) > 0 {
+            b.WriteString(" Images:[")
+            for j, img := range m.Images {
+                if j > 0 {
+                    b.WriteString(", ")
+                }
+                fmt.Fprintf(&b, "<image: %d bytes>", len(img))
+            }
+            b.WriteByte(']')
+        }
+        if len(m.ToolCalls) > 0 {
+            fmt.Fprintf(&b, " ToolCalls:%+v", m.ToolCalls)
+        }
+        if m.ToolName != "" {
+            fmt.Fprintf(&b, " ToolName:%s", m.ToolName)
+        }
+        if m.ToolCallID != "" {
+            fmt.Fprintf(&b, " ToolCallID:%s", m.ToolCallID)
+        }
+        b.WriteByte('}')
+    }
+    b.WriteByte(']')
+    return b.String()
+}
+
 func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) *api.Message {
     stallTimer := time.NewTimer(initialTimeout)
     var buf bytes.Buffer
@@ -734,7 +787,7 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR
             }
         }
         if !atLeastOne {
-            t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages)
+            t.Fatalf("%s: none of %v found in \"%s\" -- request was:%s", req.Model, anyResp, response, summarizeMessages(req.Messages))
         }
     }
@@ -751,10 +804,10 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR
             return nil
         }
         if genErr != nil {
-            t.Fatalf("%s failed with %s request prompt %v", genErr, req.Model, req.Messages)
+            t.Fatalf("%s failed with %s request prompt %s", genErr, req.Model, summarizeMessages(req.Messages))
         }
         verify()
-        slog.Info("test pass", "model", req.Model, "messages", req.Messages, "contains", anyResp, "response", response)
+        slog.Info("test pass", "model", req.Model, "messages", summarizeMessages(req.Messages), "contains", anyResp, "response", response)
     case <-ctx.Done():
         // On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
         // if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
@@ -784,6 +837,66 @@ func ChatRequests() ([]api.ChatRequest, [][]string) {
     return reqs, results
 }

+// skipIfMLXUnsupported converts an MLX runner startup error into a test skip
+// when the fingerprint matches "the MLX stack is not wired up on this host",
+// and only on platforms where MLX is not yet expected to work. On Apple
+// Silicon (darwin/arm64) MLX must work, so the same errors there fall
+// through and fail the test — we never want to mask a real Mac regression.
+//
+// The fingerprints are the exact wrapper strings produced by the MLX code
+// paths (see x/mlxrunner/server.go, x/mlxrunner/mlx/dynamic.go,
+// x/imagegen/mlx/mlx.go, x/imagegen/memory.go). Model-level errors
+// (unsupported architecture, tensor mismatches, runtime failures) do not
+// contain these strings, so this helper will not mask them.
+func skipIfMLXUnsupported(t *testing.T, err error) {
+    t.Helper()
+    if err == nil {
+        return
+    }
+    if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+        return
+    }
+    msg := err.Error()
+    for _, s := range []string{
+        "MLX not available:",
+        "failed to load MLX dynamic library",
+        "failed to load MLX function symbols",
+        "image generation on macOS requires Apple Silicon",
+        "image generation is not supported on",
+    } {
+        if strings.Contains(msg, s) {
+            t.Skipf("MLX not available on %s/%s: %v", runtime.GOOS, runtime.GOARCH, err)
+        }
+    }
+}
+
+// skipIfModelTooLargeForVRAM skips the test when the model's on-disk size
+// is larger than OLLAMA_MAX_VRAM by enough that even partial GPU offload
+// won't help. Uses the same 0.75x gate as TestPerfModels (model_perf_test.go)
+// so vision/audio tests stay runnable on systems where the model is slightly
+// over VRAM and a portion legitimately spills to CPU. No-op when
+// OLLAMA_MAX_VRAM is unset.
+func skipIfModelTooLargeForVRAM(ctx context.Context, t *testing.T, client *api.Client, modelName string) {
+    t.Helper()
+    s := os.Getenv("OLLAMA_MAX_VRAM")
+    if s == "" {
+        return
+    }
+    maxVram, err := strconv.ParseUint(s, 10, 64)
+    if err != nil {
+        t.Fatalf("invalid OLLAMA_MAX_VRAM %v", err)
+    }
+    resp, err := client.List(ctx)
+    if err != nil {
+        t.Fatalf("list models failed %v", err)
+    }
+    for _, m := range resp.Models {
+        if m.Name == modelName && float32(m.Size)*0.75 > float32(maxVram) {
+            t.Skipf("model %s is too large %s for available VRAM %s", modelName, format.HumanBytes(m.Size), format.HumanBytes(int64(maxVram)))
+        }
+    }
+}
+
 func skipUnderMinVRAM(t *testing.T, gb uint64) {
     // TODO use info API in the future
     if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
@@ -802,6 +915,8 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
 func skipIfNotGPULoaded(ctx context.Context, t *testing.T, client *api.Client, model string, minPercent int) {
     gpuPercent := getGPUPercent(ctx, t, client, model)
     if gpuPercent < minPercent {
+        // Unload the model if we're going to skip
+        client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
         t.Skip(fmt.Sprintf("test requires minimum %d%% GPU load, but model %s only has %d%%", minPercent, model, gpuPercent))
     }
 }


@@ -16,6 +16,7 @@ import (
 // Default set of vision models to test. When OLLAMA_TEST_MODEL is set,
 // only that model is tested (with a capability check for vision).
 var defaultVisionModels = []string{
+    "nemotron3:33b",
     "gemma4",
     "gemma3",
     "llama3.2-vision",
@@ -67,10 +68,11 @@ func skipIfNoVisionOverride(t *testing.T) {
 // setupVisionModel pulls the model, preloads it, and skips if not GPU-loaded.
 func setupVisionModel(ctx context.Context, t *testing.T, client *api.Client, model string) {
     t.Helper()
-    if testModel != "" {
-        requireCapability(ctx, t, client, model, "vision")
-    }
-    pullOrSkip(ctx, t, client, model)
+    if testModel == "" {
+        pullOrSkip(ctx, t, client, model)
+    }
+    skipIfModelTooLargeForVRAM(ctx, t, client, model)
+    requireCapability(ctx, t, client, model, "vision")
     err := client.Generate(ctx, &api.GenerateRequest{Model: model}, func(response api.GenerateResponse) error { return nil })
     if err != nil {
         t.Fatalf("failed to load model %s: %s", model, err)
@@ -82,7 +84,7 @@ func setupVisionModel(ctx context.Context, t *testing.T, client *api.Client, mod
 // questions about the same image. This verifies that the KV cache correctly
 // handles cached image tokens across turns.
 func TestVisionMultiTurn(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     // Models that fail on multi-turn detail questions (e.g. misidentifying objects).
@@ -115,6 +117,7 @@ func TestVisionMultiTurn(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     resp1 := DoChat(ctx, t, client, req, []string{
@@ -150,7 +153,7 @@ func TestVisionMultiTurn(t *testing.T) {

 // TestVisionObjectCounting asks the model to count objects in an image.
 func TestVisionObjectCounting(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     skipModels := map[string]string{
@@ -180,6 +183,7 @@ func TestVisionObjectCounting(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     DoChat(ctx, t, client, req, []string{"4", "four"}, 120*time.Second, 30*time.Second)
@@ -190,7 +194,7 @@ func TestVisionObjectCounting(t *testing.T) {

 // TestVisionSceneUnderstanding tests whether the model can identify
 // cultural references and scene context from an image.
 func TestVisionSceneUnderstanding(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     // Models known to be too small or not capable enough for cultural reference detection.
@@ -222,6 +226,7 @@ func TestVisionSceneUnderstanding(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     DoChat(ctx, t, client, req, []string{
@@ -234,7 +239,7 @@ func TestVisionSceneUnderstanding(t *testing.T) {

 // TestVisionSpatialReasoning tests the model's ability to identify
 // objects based on their spatial position in the image.
 func TestVisionSpatialReasoning(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     for _, model := range testModels(defaultVisionModels) {
@@ -259,10 +264,11 @@ func TestVisionSpatialReasoning(t *testing.T) {
                 },
             },
             Stream: &stream,
+            KeepAlive: &api.Duration{Duration: 10 * time.Second},
             Options: map[string]any{"temperature": 0.0, "seed": 42},
         }
         DoChat(ctx, t, client, req, []string{
-            "laptop", "computer", "typing", "working",
+            "laptop", "computer", "typing", "working", "desk", "writing", "pen", "glasses", "reading",
         }, 120*time.Second, 30*time.Second)
     })
 }
@@ -271,7 +277,7 @@ func TestVisionSpatialReasoning(t *testing.T) {

 // TestVisionDetailRecognition tests whether the model can identify
 // small details like accessories in an image.
 func TestVisionDetailRecognition(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     for _, model := range testModels(defaultVisionModels) {
@@ -294,6 +300,7 @@ func TestVisionDetailRecognition(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     DoChat(ctx, t, client, req, []string{
@@ -307,7 +314,7 @@ func TestVisionDetailRecognition(t *testing.T) {
 // the model to compare and contrast them. This exercises multi-image
 // encoding and cross-image reasoning.
 func TestVisionMultiImage(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     // Multi-image support varies across models.
@@ -338,6 +345,7 @@ func TestVisionMultiImage(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     // Both images feature cartoon llamas/alpacas — the model should
@@ -353,7 +361,7 @@ func TestVisionMultiImage(t *testing.T) {
 // of the ollama homepage image (a cartoon llama with "Start building with
 // open models" text). Basic sanity check that the vision pipeline works.
 func TestVisionImageDescription(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     for _, model := range testModels(defaultVisionModels) {
@@ -376,6 +384,7 @@ func TestVisionImageDescription(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     DoChat(ctx, t, client, req, []string{