test: integration test hardening (#13532)

* test: integration test hardening

Improve reliability on slower systems and fix some flakes. Fix a few
logic flaws in the newer tests, plus general hardening.

* tighten up vision logging

* add new models

* remove some older models - still covered by library scenarios
Daniel Hiltgen 2026-05-08 15:54:17 -07:00 committed by GitHub
parent 1e1b34dada
commit c2f2d90a67
14 changed files with 315 additions and 69 deletions


@@ -289,7 +289,7 @@ func TestAPIChat(t *testing.T) {
             }
         case <-done:
             if genErr != nil {
-                t.Fatalf("failed with %s request prompt %v", req.Model, req.Messages)
+                t.Fatalf("failed with %s request prompt %s", req.Model, summarizeMessages(req.Messages))
             }
             // Verify the response contains the expected data
             response := buf.String()


@@ -19,6 +19,7 @@ import (
 )

 var defaultAudioModels = []string{
+    "nemotron3:33b",
     "gemma4:e2b",
     "gemma4:e4b",
 }
@@ -36,8 +37,11 @@ func decodeTestAudio(t *testing.T) api.ImageData {
 // setupAudioModel pulls the model, preloads it, and skips if it doesn't support audio.
 func setupAudioModel(ctx context.Context, t *testing.T, client *api.Client, model string) {
     t.Helper()
-    requireCapability(ctx, t, client, model, "audio")
-    pullOrSkip(ctx, t, client, model)
+    if testModel == "" {
+        pullOrSkip(ctx, t, client, model)
+    }
+    skipIfModelTooLargeForVRAM(ctx, t, client, model)
+    requireCapability(ctx, t, client, model, "audio")
     err := client.Generate(ctx, &api.GenerateRequest{Model: model}, func(response api.GenerateResponse) error { return nil })
     if err != nil {
         t.Fatalf("failed to load model %s: %s", model, err)


@@ -38,8 +38,8 @@ func TestUnicode(t *testing.T) {
     if testModel != "" {
         t.Skip("uses hardcoded model, not applicable with model override")
     }
-    skipUnderMinVRAM(t, 6)
-    ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
+    skipUnderMinVRAM(t, 12) // Actual model load is ~26G
+    ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute)
     defer cancel()
     // Set up the test data
     req := api.ChatRequest{
@@ -78,7 +78,7 @@ func TestUnicode(t *testing.T) {
     DoChat(ctx, t, client, req, []string{
         "散射", // scattering
         "频率", // frequency
-    }, 120*time.Second, 120*time.Second)
+    }, 180*time.Second, 30*time.Second)
 }

 func TestExtendedUnicodeOutput(t *testing.T) {


@@ -126,7 +126,7 @@ func TestMultiModelStress(t *testing.T) {
     slog.Info("Loading models to find how many can fit in VRAM before overflowing")
 chooseModels:
     for i, model := range chosenModels {
-        req := &api.GenerateRequest{Model: model}
+        req := &api.GenerateRequest{Model: model} // Leave KeepAlive unset so they stay loaded until the scheduler decides to unload them
         slog.Info("loading", "model", model)
         err = client.Generate(ctx, req, func(response api.GenerateResponse) error { return nil })
         if err != nil {
@@ -162,8 +162,22 @@ chooseModels:
         slog.Warn("all models being used without exceeding VRAM, set OLLAMA_MAX_VRAM so test can pick larger models")
     }

+    // For some iGPU/CPU systems we may end up with lingering 5 minute load timeouts chewing up memory - force unload everything we tried
+    slog.Info("unloading test models")
+    for _, model := range chosenModels {
+        client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+    }
+    defer func() {
+        // best effort unload once we're done with the real test
+        for _, model := range chosenModels {
+            client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+        }
+    }()
+
     r := rand.New(rand.NewSource(0))
     var wg sync.WaitGroup
+    slog.Info("Starting main test...")
     for i := range targetLoadCount {
         wg.Add(1)
         go func(i int) {
@@ -176,6 +190,8 @@ chooseModels:
             }
             k := r.Int() % len(reqs)
             reqs[k].Model = chosenModels[i]
+            // Set a default timeout to ensure the scheduler unloads for resource needs, not expiration
+            reqs[k].KeepAlive = nil
             slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Messages[0].Content)
             DoChat(ctx, t, client, reqs[k], resps[k], initialTimeout, streamTimeout)
         }


@@ -4,7 +4,9 @@ package integration

 import (
     "context"
+    "errors"
     "log/slog"
+    "strings"
     "sync"
     "testing"
     "time"
@@ -14,19 +16,20 @@ import (
 func TestLongInputContext(t *testing.T) {
     // Setting NUM_PARALLEL to 1 ensures the allocated context is exactly what
-    // we asked for and there is nothing extra that we could spill over into
+    // we asked for and there is nothing extra that we could spill over into.
+    // Older runners silently truncate oversized prompts, while llama-server
+    // rejects them with a client error. Accept both behaviors so this test can
+    // run against main and the llama-server branch.
     t.Setenv("OLLAMA_NUM_PARALLEL", "1")
-    // Longer needed for small footprint GPUs
-    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
     defer cancel()
-    // Set up the test data
     req := api.ChatRequest{
         Model: smol,
         Messages: []api.Message{
             {
                 Role: "user",
-                Content: "Oh, dont speak to me of Austria. Perhaps I dont understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexanders loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I dont believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe! What country is this referring to?",
+                Content: "What country is this passage referring to?\nOh, dont speak to me of Austria. Perhaps I dont understand things, but Austria never has wished, and does not wish, for war. She is betraying us! Russia alone must save Europe. Our gracious sovereign recognizes his high vocation and will be true to it. That is the one thing I have faith in! Our good and wonderful sovereign has to perform the noblest role on earth, and he is so virtuous and noble that God will not forsake him. He will fulfill his vocation and crush the hydra of revolution, which has become more terrible than ever in the person of this murderer and villain! We alone must avenge the blood of the just one.... Whom, I ask you, can we rely on?... England with her commercial spirit will not and cannot understand the Emperor Alexanders loftiness of soul. She has refused to evacuate Malta. She wanted to find, and still seeks, some secret motive in our actions. What answer did Novosíltsev get? None. The English have not understood and cannot understand the self-abnegation of our Emperor who wants nothing for himself, but only desires the good of mankind. And what have they promised? Nothing! And what little they have promised they will not perform! Prussia has always declared that Buonaparte is invincible, and that all Europe is powerless before him.... And I dont believe a word that Hardenburg says, or Haugwitz either. This famous Prussian neutrality is just a trap. I have faith only in God and the lofty destiny of our adored monarch. He will save Europe!",
             },
         },
         Stream: &stream,
@@ -39,7 +42,39 @@ func TestLongInputContext(t *testing.T) {
     client, _, cleanup := InitServerConnection(ctx, t)
     defer cleanup()
     pullOrSkip(ctx, t, client, req.Model)
-    DoChat(ctx, t, client, req, []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
+
+    var response strings.Builder
+    err := client.Chat(ctx, &req, func(resp api.ChatResponse) error {
+        response.WriteString(resp.Message.Content)
+        return nil
+    })
+    if err != nil {
+        var statusErr api.StatusError
+        if errors.As(err, &statusErr) &&
+            statusErr.StatusCode >= 400 && statusErr.StatusCode < 500 &&
+            isContextLimitError(err.Error()) {
+            slog.Info("runner rejected oversized prompt", "error", err)
+            return
+        }
+        t.Fatalf("unexpected error for long input context: %v", err)
+    }
+    anyResp := []string{"russia", "german", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict", "napoleonic", "historical"}
+    got := strings.ToLower(response.String())
+    for _, want := range anyResp {
+        if strings.Contains(got, want) {
+            return
+        }
+    }
+    t.Fatalf("%s: none of %v found in %q -- request was:%s", req.Model, anyResp, response.String(), summarizeMessages(req.Messages))
+}
+
+func isContextLimitError(err string) bool {
+    err = strings.ToLower(err)
+    return strings.Contains(err, "context") &&
+        (strings.Contains(err, "exceed") ||
+            strings.Contains(err, "too large") ||
+            strings.Contains(err, "too long"))
 }

 func TestContextExhaustion(t *testing.T) {
@@ -71,7 +106,23 @@ func TestContextExhaustion(t *testing.T) {
     client, _, cleanup := InitServerConnection(ctx, t)
     defer cleanup()
     pullOrSkip(ctx, t, client, req.Model)
-    DoChat(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world"}, 120*time.Second, 10*time.Second)
+    resp := DoChat(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world", "story", "friend", "suddenly", "finally", "day", "rain", "walked", "looked", "smiled", "laughed"}, 120*time.Second, 10*time.Second)
+    if resp != nil && !containsEmoji(resp.Content) {
+        t.Fatalf("%s: expected story response to contain emoji, got %q", req.Model, resp.Content)
+    }
+}
+
+func containsEmoji(s string) bool {
+    for _, r := range s {
+        switch {
+        case r >= 0x1F000 && r <= 0x1FAFF:
+            return true
+        case r >= 0x2600 && r <= 0x27BF:
+            return true
+        }
+    }
+    return false
 }

 // Send multiple generate requests with prior context and ensure the response is coherant and expected
@@ -102,10 +153,14 @@ func TestParallelGenerateWithHistory(t *testing.T) {
         t.Fatalf("failed to load model %s: %s", modelName, err)
     }
     gpuPercent := getGPUPercent(ctx, t, client, modelName)
-    if gpuPercent < 80 {
+    if gpuPercent < 80 && gpuPercent > 50 {
         slog.Warn("Low GPU percentage - increasing timeouts", "percent", gpuPercent)
         initialTimeout = 240 * time.Second
         streamTimeout = 30 * time.Second
+    } else if gpuPercent < 50 {
+        slog.Warn("Very low GPU percentage - skipping test", "percent", gpuPercent)
+        client.Generate(ctx, &api.GenerateRequest{Model: modelName, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+        t.Skip("Very low GPU percentage")
     }

     var wg sync.WaitGroup
@@ -206,10 +261,14 @@ func TestParallelChatWithHistory(t *testing.T) {
         t.Fatalf("failed to load model %s: %s", modelName, err)
     }
     gpuPercent := getGPUPercent(ctx, t, client, modelName)
-    if gpuPercent < 80 {
+    if gpuPercent < 80 && gpuPercent > 50 {
         slog.Warn("Low GPU percentage - increasing timeouts", "percent", gpuPercent)
         initialTimeout = 240 * time.Second
         streamTimeout = 30 * time.Second
+    } else if gpuPercent < 50 {
+        slog.Warn("Very low GPU percentage - skipping test", "percent", gpuPercent)
+        client.Generate(ctx, &api.GenerateRequest{Model: modelName, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
+        t.Skip("Very low GPU percentage")
     }

     var wg sync.WaitGroup


@@ -139,6 +139,9 @@ func runOllamaCreate(ctx context.Context, t *testing.T, args ...string) {
 }

 func TestCreateSafetensorsLLM(t *testing.T) {
+    if testModel != "" {
+        t.Skip("exercises create pipeline with a fixed source model, not applicable with model override")
+    }
     skipIfRemote(t)

     modelDir := filepath.Join(testdataModelsDir, "TinyLlama-1.1B")
@@ -214,6 +217,9 @@ func TestCreateSafetensorsLLM(t *testing.T) {
 }

 func TestCreateGGUF(t *testing.T) {
+    if testModel != "" {
+        t.Skip("exercises create pipeline with a fixed source model, not applicable with model override")
+    }
     modelDir := filepath.Join(testdataModelsDir, "Llama-3.2-1B-GGUF")
     downloadHFModel(t, "bartowski/Llama-3.2-1B-Instruct-GGUF", modelDir,
         "--include", "Llama-3.2-1B-Instruct-IQ3_M.gguf")


@@ -45,6 +45,22 @@ func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
     return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2))
 }

+func requireEmbedErrorContainsAny(t *testing.T, err error, substrings ...string) {
+    t.Helper()
+    if err == nil {
+        t.Fatalf("expected error containing one of %q, got nil", substrings)
+    }
+    for _, s := range substrings {
+        if strings.Contains(err.Error(), s) {
+            return
+        }
+    }
+    t.Fatalf("expected error containing one of %q, got: %v", substrings, err)
+}
+
 func euclideanDistance[V float32 | float64](v1, v2 []V) V {
     if len(v1) != len(v2) {
         return V(math.Inf(1))
@@ -73,7 +89,7 @@ func manhattanDistance[V float32 | float64](v1, v2 []V) V {
 }

 func TestEmbedCosineDistanceCorrelation(t *testing.T) {
-    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+    ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute)
     defer cancel()
     client, _, cleanup := InitServerConnection(ctx, t)
     defer cleanup()
@@ -354,9 +370,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                 Options: map[string]any{"num_ctx": 3},
             },
             check: func(t *testing.T, res *api.EmbedResponse, err error) {
-                if err.Error() != "the input length exceeds the context length" {
-                    t.Fatalf("expected truncation error, got: %v", err)
-                }
+                requireEmbedErrorContainsAny(t, err, "input length exceeds the context length", "exceeds maximum context length")
             },
         },
         {
@@ -368,9 +382,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                 Options: map[string]any{"num_ctx": 1},
             },
             check: func(t *testing.T, res *api.EmbedResponse, err error) {
-                if err.Error() != "input after truncation exceeds maximum context length" {
-                    t.Fatalf("expected truncation error, got: %v", err)
-                }
+                requireEmbedErrorContainsAny(t, err, "input after truncation exceeds maximum context length", "input exceeds maximum context length and cannot be truncated further")
             },
         },
         {
@@ -382,9 +394,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                 Options: map[string]any{"num_ctx": 0},
             },
             check: func(t *testing.T, res *api.EmbedResponse, err error) {
-                if err.Error() != "input after truncation exceeds maximum context length" {
-                    t.Fatalf("expected truncation error, got: %v", err)
-                }
+                requireEmbedErrorContainsAny(t, err, "input after truncation exceeds maximum context length", "input exceeds maximum context length and cannot be truncated further")
             },
         },
         {
@@ -591,7 +601,7 @@ func TestEmbedStatusCode(t *testing.T) {
     t.Run("truncation error status code", func(t *testing.T) {
         truncFalse := false
-        longInput := strings.Repeat("word ", 100)
+        longInput := strings.Repeat("very long input ", 100)

         req := api.EmbedRequest{
             Model: model,
@@ -618,9 +628,7 @@ func TestEmbedStatusCode(t *testing.T) {
         }

         // Verify the error message is meaningful
-        if !strings.Contains(err.Error(), "context length") {
-            t.Errorf("expected error message to mention context length, got: %v", err)
-        }
+        requireEmbedErrorContainsAny(t, err, "context length", "too large", "exceed_context_size")
     })

     t.Run("batch truncation error status code", func(t *testing.T) {


@@ -17,7 +17,7 @@ func TestImageGeneration(t *testing.T) {
     if testModel != "" {
         t.Skip("uses hardcoded models, not applicable with model override")
     }
-    skipUnderMinVRAM(t, 8)
+    skipUnderMinVRAM(t, 32)

     type testCase struct {
         imageGenModel string
@@ -64,6 +64,10 @@ func TestImageGeneration(t *testing.T) {
             } else if strings.Contains(err.Error(), "ollama-mlx: no such file or directory") {
                 // most likely linux arm - not supported yet
                 t.Skip("unsupported architecture")
+            } else if strings.Contains(err.Error(), "are available") {
+                t.Skip("insufficient VRAM for image generation model")
+            } else if strings.Contains(err.Error(), "failed to create server") {
+                t.Skip("image generation server failed to start")
             }
             t.Fatalf("failed to generate image: %v", err)
         }


@@ -56,6 +56,7 @@ func TestVisionModels(t *testing.T) {
             "seed":        42,
             "temperature": 0.0,
         },
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
     }

     // Preload to skip if we're less than 80% on GPU to avoid extremely slow tests


@@ -41,9 +41,10 @@ func TestModelsChat(t *testing.T) {
     var chatModels []string
     if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
-        chatModels = ollamaEngineChatModels
+        chatModels = append(ollamaEngineChatModels, mlxEngineChatModels...)
     } else {
         chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
+        chatModels = append(chatModels, mlxEngineChatModels...)
     }

     for _, model := range testModels(chatModels) {
@@ -71,6 +72,7 @@ func TestModelsChat(t *testing.T) {
                 func(response api.GenerateResponse) error { return nil },
             )
             if err != nil {
+                skipIfMLXUnsupported(t, err)
                 t.Fatalf("failed to load model %s: %s", model, err)
             }
             gpuPercent := getGPUPercent(ctx, t, client, model)


@@ -60,8 +60,14 @@ func TestAPIToolCallingStress(t *testing.T) {
     models := testModels(libraryToolsModels)
+    softTimeout, _ := getTimeouts(t)

     for _, model := range models {
         t.Run(model, func(t *testing.T) {
+            if time.Since(started) > softTimeout {
+                t.Skip("skipping remaining tests to avoid excessive runtime")
+                return
+            }
             // Skip known-bad models unless explicitly requested via env var
             if reason, ok := skipModels[model]; ok && testModel == "" {
                 t.Skipf("skipping: %s", reason)
@@ -75,6 +81,13 @@ func TestAPIToolCallingStress(t *testing.T) {
             pullOrSkip(ctx, t, client, model)

+            // Preload and skip if not sufficiently GPU-loaded to avoid timeouts
+            err := client.Generate(ctx, &api.GenerateRequest{Model: model}, func(response api.GenerateResponse) error { return nil })
+            if err != nil {
+                t.Fatalf("failed to load model %s: %s", model, err)
+            }
+            skipIfNotGPULoaded(ctx, t, client, model, 80)
+
             tools := stressTestTools()

             // Large system prompt that mimics real coding agents (opencode, Claude Code, etc.)
@@ -343,6 +356,7 @@ func testToolCall(t *testing.T, ctx context.Context, client *api.Client, model,
             {Role: "user", Content: userMessage},
         },
         Tools: tools,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{
             "temperature": 0,
             "num_ctx":     contextLength(16384),
@@ -426,6 +440,7 @@ func testToolCallMultiTurn(t *testing.T, ctx context.Context, client *api.Client
             // The model should now respond with content or another tool call
         },
         Tools: tools,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{
             "temperature": 0,
             "num_ctx":     contextLength(16384),


@@ -23,7 +23,8 @@ func testPropsMap(m map[string]api.ToolProperty) *api.ToolPropertiesMap {
 func TestAPIToolCalling(t *testing.T) {
     initialTimeout := 60 * time.Second
     streamTimeout := 60 * time.Second
-    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+    softTimeout, hardTimeout := getTimeouts(t)
+    ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
     defer cancel()

     client, _, cleanup := InitServerConnection(ctx, t)
@@ -52,6 +53,11 @@ func TestAPIToolCalling(t *testing.T) {
     for _, model := range models {
         t.Run(model, func(t *testing.T) {
+            if time.Now().Sub(started) > softTimeout {
+                t.Skip("skipping remaining tests to avoid excessive runtime")
+                return
+            }
+
             if testModel != "" {
                 requireCapability(ctx, t, client, model, "tools")
             }
@@ -93,6 +99,7 @@ func TestAPIToolCalling(t *testing.T) {
                 Options: map[string]any{
                     "temperature": 0,
                 },
+                KeepAlive: &api.Duration{Duration: 10 * time.Second},
             }

             stallTimer := time.NewTimer(initialTimeout)


@@ -45,6 +45,8 @@ var (
     // Note: add newer models at the top of the list to test them first
     ollamaEngineChatModels = []string{
+        "nemotron3:33b",
+        "laguna-xs.2:q4_K_M",
         "gemma4",
         "lfm2.5-thinking",
         "ministral-3",
@@ -66,6 +68,15 @@ var (
         "minicpm-v:latest",    // arch=qwen2
         "granite-code:latest", // arch=llama
     }
+    // MLX-backed safetensors tags. These exercise the mlxrunner subprocess
+    // on platforms where MLX is available (today: macOS; Linux/Windows CUDA
+    // coming). On other platforms, skipIfMLXUnsupported turns the load
+    // failure into a test skip.
+    mlxEngineChatModels = []string{
+        "laguna-xs.2:nvfp4",
+        "qwen3.5:2b-nvfp4", // ~2.5GB, Qwen3_5 arch
+        "gemma4:e2b-nvfp4", // ~7.1GB, Gemma4 arch (skipped under low VRAM)
+    }
     llamaRunnerChatModels = []string{
         "mistral:latest",
         "falcon3:latest",
@@ -77,14 +88,6 @@ var (
         "internlm2:latest",
         "codellama:latest", // arch=llama
         "phi3:latest",
-        "falcon2:latest",
-        "gemma:latest",
-        "llama2:latest",
-        "nous-hermes:latest",
-        "orca-mini:latest",
-        "qwen:latest",
-        "stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
-        "falcon:latest",
     }

     // Some library models are quite large - ensure large VRAM and sufficient disk space
@@ -261,7 +264,6 @@ var (
         "zephyr",
     }
     libraryEmbedModels = []string{
-        "qwen3-embedding",
         "embeddinggemma",
         "nomic-embed-text",
         "all-minilm",
@@ -272,8 +274,11 @@ var (
         "paraphrase-multilingual",
         "snowflake-arctic-embed",
         "snowflake-arctic-embed2",
+        "qwen3-embedding",
     }
     libraryToolsModels = []string{
+        "nemotron3:33b",
+        "laguna-xs.2",
         "gemma4",
         "lfm2.5-thinking",
         "qwen3-vl",
@@ -284,7 +289,6 @@ var (
         "llama3.2",
         "mistral",
         "qwen2.5",
-        "qwen2",
         "ministral-3",
         "mistral-nemo",
         "mistral-small",
@@ -329,13 +333,23 @@ func testModels(defaults []string) []string {
 }

 // requireCapability skips the test if the model does not advertise the
-// given capability. It queries the server via Show and caches nothing —
-// call it once per subtest. For local-only models where Show may not
-// return capabilities (e.g. models created via ollama create), this is
-// a best-effort check.
+// given capability. If the model is missing locally, it first goes through
+// the normal pull-if-missing path so tests still behave correctly on cold
+// hosts. For local-only models where Show may not return capabilities
+// (e.g. models created via ollama create), this is a best-effort check.
 func requireCapability(ctx context.Context, t *testing.T, client *api.Client, modelName string, cap model.Capability) {
     t.Helper()
     resp, err := client.Show(ctx, &api.ShowRequest{Name: modelName})
+    var statusError api.StatusError
+    if errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound {
+        if err := PullIfMissing(ctx, client, modelName); err != nil {
+            t.Skipf("model %s not available: %v", modelName, err)
+        }
+        resp, err = client.Show(ctx, &api.ShowRequest{Name: modelName})
+    }
     if err != nil {
         t.Fatalf("failed to show model %s: %v", modelName, err)
     }
@@ -699,6 +713,45 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
     }
 }

+// summarizeMessages returns a compact string form of the messages suitable
+// for logs and error output. Image byte payloads are replaced with a
+// "<image: N bytes>" marker so vision tests don't dump huge integer arrays.
+func summarizeMessages(msgs []api.Message) string {
+    var b strings.Builder
+    b.WriteByte('[')
+    for i, m := range msgs {
+        if i > 0 {
+            b.WriteString(", ")
+        }
+        fmt.Fprintf(&b, "{Role:%s Content:%q", m.Role, m.Content)
+        if m.Thinking != "" {
+            fmt.Fprintf(&b, " Thinking:%q", m.Thinking)
+        }
+        if len(m.Images) > 0 {
+            b.WriteString(" Images:[")
+            for j, img := range m.Images {
+                if j > 0 {
+                    b.WriteString(", ")
+                }
+                fmt.Fprintf(&b, "<image: %d bytes>", len(img))
+            }
+            b.WriteByte(']')
+        }
+        if len(m.ToolCalls) > 0 {
+            fmt.Fprintf(&b, " ToolCalls:%+v", m.ToolCalls)
+        }
+        if m.ToolName != "" {
+            fmt.Fprintf(&b, " ToolName:%s", m.ToolName)
+        }
+        if m.ToolCallID != "" {
+            fmt.Fprintf(&b, " ToolCallID:%s", m.ToolCallID)
+        }
+        b.WriteByte('}')
+    }
+    b.WriteByte(']')
+    return b.String()
+}
+
 func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatRequest, anyResp []string, initialTimeout, streamTimeout time.Duration) *api.Message {
     stallTimer := time.NewTimer(initialTimeout)
     var buf bytes.Buffer
@@ -734,7 +787,7 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR
             }
         }
         if !atLeastOne {
-            t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages)
+            t.Fatalf("%s: none of %v found in \"%s\" -- request was:%s", req.Model, anyResp, response, summarizeMessages(req.Messages))
         }
     }
@@ -751,10 +804,10 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR
             return nil
         }
         if genErr != nil {
-            t.Fatalf("%s failed with %s request prompt %v", genErr, req.Model, req.Messages)
+            t.Fatalf("%s failed with %s request prompt %s", genErr, req.Model, summarizeMessages(req.Messages))
         }
         verify()
-        slog.Info("test pass", "model", req.Model, "messages", req.Messages, "contains", anyResp, "response", response)
+        slog.Info("test pass", "model", req.Model, "messages", summarizeMessages(req.Messages), "contains", anyResp, "response", response)
     case <-ctx.Done():
         // On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
         // if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
@@ -784,6 +837,66 @@ func ChatRequests() ([]api.ChatRequest, [][]string) {
     return reqs, results
 }

+// skipIfMLXUnsupported converts an MLX runner startup error into a test skip
+// when the fingerprint matches "the MLX stack is not wired up on this host",
+// and only on platforms where MLX is not yet expected to work. On Apple
+// Silicon (darwin/arm64) MLX must work, so the same errors there fall
+// through and fail the test — we never want to mask a real Mac regression.
+//
+// The fingerprints are the exact wrapper strings produced by the MLX code
+// paths (see x/mlxrunner/server.go, x/mlxrunner/mlx/dynamic.go,
+// x/imagegen/mlx/mlx.go, x/imagegen/memory.go). Model-level errors
+// (unsupported architecture, tensor mismatches, runtime failures) do not
+// contain these strings, so this helper will not mask them.
+func skipIfMLXUnsupported(t *testing.T, err error) {
+    t.Helper()
+    if err == nil {
+        return
+    }
+    if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+        return
+    }
+    msg := err.Error()
+    for _, s := range []string{
+        "MLX not available:",
+        "failed to load MLX dynamic library",
+        "failed to load MLX function symbols",
+        "image generation on macOS requires Apple Silicon",
+        "image generation is not supported on",
+    } {
+        if strings.Contains(msg, s) {
+            t.Skipf("MLX not available on %s/%s: %v", runtime.GOOS, runtime.GOARCH, err)
+        }
+    }
+}
+
+// skipIfModelTooLargeForVRAM skips the test when the model's on-disk size
+// is larger than OLLAMA_MAX_VRAM by enough that even partial GPU offload
+// won't help. Uses the same 0.75x gate as TestPerfModels (model_perf_test.go)
+// so vision/audio tests stay runnable on systems where the model is slightly
+// over VRAM and a portion legitimately spills to CPU. No-op when
+// OLLAMA_MAX_VRAM is unset.
+func skipIfModelTooLargeForVRAM(ctx context.Context, t *testing.T, client *api.Client, modelName string) {
+    t.Helper()
+    s := os.Getenv("OLLAMA_MAX_VRAM")
+    if s == "" {
+        return
+    }
+    maxVram, err := strconv.ParseUint(s, 10, 64)
+    if err != nil {
+        t.Fatalf("invalid OLLAMA_MAX_VRAM %v", err)
+    }
+    resp, err := client.List(ctx)
+    if err != nil {
+        t.Fatalf("list models failed %v", err)
+    }
+    for _, m := range resp.Models {
+        if m.Name == modelName && float32(m.Size)*0.75 > float32(maxVram) {
+            t.Skipf("model %s is too large %s for available VRAM %s", modelName, format.HumanBytes(m.Size), format.HumanBytes(int64(maxVram)))
+        }
+    }
+}
+
 func skipUnderMinVRAM(t *testing.T, gb uint64) {
     // TODO use info API in the future
     if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
@@ -802,6 +915,8 @@ func skipUnderMinVRAM(t *testing.T, gb uint64) {
 func skipIfNotGPULoaded(ctx context.Context, t *testing.T, client *api.Client, model string, minPercent int) {
     gpuPercent := getGPUPercent(ctx, t, client, model)
     if gpuPercent < minPercent {
+        // Unload the model if we're going to skip
+        client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
         t.Skip(fmt.Sprintf("test requires minimum %d%% GPU load, but model %s only has %d%%", minPercent, model, gpuPercent))
     }
 }


@@ -16,6 +16,7 @@ import (
 // Default set of vision models to test. When OLLAMA_TEST_MODEL is set,
 // only that model is tested (with a capability check for vision).
 var defaultVisionModels = []string{
+    "nemotron3:33b",
     "gemma4",
     "gemma3",
     "llama3.2-vision",
@@ -67,10 +68,11 @@ func skipIfNoVisionOverride(t *testing.T) {
 // setupVisionModel pulls the model, preloads it, and skips if not GPU-loaded.
 func setupVisionModel(ctx context.Context, t *testing.T, client *api.Client, model string) {
     t.Helper()
-    if testModel != "" {
-        requireCapability(ctx, t, client, model, "vision")
-    }
-    pullOrSkip(ctx, t, client, model)
+    if testModel == "" {
+        pullOrSkip(ctx, t, client, model)
+    }
+    skipIfModelTooLargeForVRAM(ctx, t, client, model)
+    requireCapability(ctx, t, client, model, "vision")
     err := client.Generate(ctx, &api.GenerateRequest{Model: model}, func(response api.GenerateResponse) error { return nil })
     if err != nil {
         t.Fatalf("failed to load model %s: %s", model, err)
@@ -82,7 +84,7 @@ func setupVisionModel(ctx context.Context, t *testing.T, client *api.Client, mod
 // questions about the same image. This verifies that the KV cache correctly
 // handles cached image tokens across turns.
 func TestVisionMultiTurn(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     // Models that fail on multi-turn detail questions (e.g. misidentifying objects).
@@ -115,6 +117,7 @@ func TestVisionMultiTurn(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     resp1 := DoChat(ctx, t, client, req, []string{
@@ -150,7 +153,7 @@ func TestVisionMultiTurn(t *testing.T) {

 // TestVisionObjectCounting asks the model to count objects in an image.
 func TestVisionObjectCounting(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     skipModels := map[string]string{
@@ -180,6 +183,7 @@ func TestVisionObjectCounting(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     DoChat(ctx, t, client, req, []string{"4", "four"}, 120*time.Second, 30*time.Second)
@@ -190,7 +194,7 @@ func TestVisionObjectCounting(t *testing.T) {

 // TestVisionSceneUnderstanding tests whether the model can identify
 // cultural references and scene context from an image.
 func TestVisionSceneUnderstanding(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     // Models known to be too small or not capable enough for cultural reference detection.
@@ -222,6 +226,7 @@ func TestVisionSceneUnderstanding(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     DoChat(ctx, t, client, req, []string{
@@ -234,7 +239,7 @@ func TestVisionSceneUnderstanding(t *testing.T) {

 // TestVisionSpatialReasoning tests the model's ability to identify
 // objects based on their spatial position in the image.
 func TestVisionSpatialReasoning(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     for _, model := range testModels(defaultVisionModels) {
@@ -259,10 +264,11 @@ func TestVisionSpatialReasoning(t *testing.T) {
                 },
             },
             Stream: &stream,
+            KeepAlive: &api.Duration{Duration: 10 * time.Second},
             Options: map[string]any{"temperature": 0.0, "seed": 42},
         }
         DoChat(ctx, t, client, req, []string{
-            "laptop", "computer", "typing", "working",
+            "laptop", "computer", "typing", "working", "desk", "writing", "pen", "glasses", "reading",
         }, 120*time.Second, 30*time.Second)
     })
 }
@@ -271,7 +277,7 @@ func TestVisionSpatialReasoning(t *testing.T) {

 // TestVisionDetailRecognition tests whether the model can identify
 // small details like accessories in an image.
 func TestVisionDetailRecognition(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     for _, model := range testModels(defaultVisionModels) {
@@ -294,6 +300,7 @@ func TestVisionDetailRecognition(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     DoChat(ctx, t, client, req, []string{
@@ -307,7 +314,7 @@ func TestVisionDetailRecognition(t *testing.T) {
 // the model to compare and contrast them. This exercises multi-image
 // encoding and cross-image reasoning.
 func TestVisionMultiImage(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     // Multi-image support varies across models.
@@ -338,6 +345,7 @@ func TestVisionMultiImage(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     // Both images feature cartoon llamas/alpacas — the model should
@@ -353,7 +361,7 @@ func TestVisionMultiImage(t *testing.T) {
 // of the ollama homepage image (a cartoon llama with "Start building with
 // open models" text). Basic sanity check that the vision pipeline works.
 func TestVisionImageDescription(t *testing.T) {
-    skipUnderMinVRAM(t, 6)
+    skipUnderMinVRAM(t, 16)
     skipIfNoVisionOverride(t)

     for _, model := range testModels(defaultVisionModels) {
@@ -376,6 +384,7 @@ func TestVisionImageDescription(t *testing.T) {
             },
         },
         Stream: &stream,
+        KeepAlive: &api.Duration{Duration: 10 * time.Second},
         Options: map[string]any{"temperature": 0.0, "seed": 42},
     }
     DoChat(ctx, t, client, req, []string{