diff --git a/anthropic/anthropic.go b/anthropic/anthropic.go
index fdd054a2f..46394cdd2 100755
--- a/anthropic/anthropic.go
+++ b/anthropic/anthropic.go
@@ -78,6 +78,11 @@ type MessagesRequest struct {
ToolChoice *ToolChoice `json:"tool_choice,omitempty"`
Thinking *ThinkingConfig `json:"thinking,omitempty"`
Metadata *Metadata `json:"metadata,omitempty"`
+ OutputConfig *OutputConfig `json:"output_config,omitempty"`
+}
+
+type OutputConfig struct {
+ Effort string `json:"effort,omitempty"`
}
// MessageParam represents a message in the request
@@ -161,7 +166,7 @@ type WebSearchToolResultError struct {
// ImageSource represents the source of an image
type ImageSource struct {
- Type string `json:"type"` // "base64" or "url"
+ Type string `json:"type"` // "base64"
MediaType string `json:"media_type,omitempty"`
Data string `json:"data,omitempty"`
URL string `json:"url,omitempty"`
@@ -373,9 +378,26 @@ func FromMessagesRequest(r MessagesRequest) (*api.ChatRequest, error) {
}
var think *api.ThinkValue
+ normalizedEffort := ""
+ if r.OutputConfig != nil {
+ normalizedEffort = strings.ToLower(strings.TrimSpace(r.OutputConfig.Effort))
+ if normalizedEffort == "xhigh" {
+ normalizedEffort = "high"
+ }
+ }
+
if r.Thinking != nil && r.Thinking.Type == "enabled" {
think = &api.ThinkValue{Value: true}
}
+ if r.Thinking != nil && r.Thinking.Type == "disabled" {
+ think = &api.ThinkValue{Value: false}
+ }
+ if think == nil && r.OutputConfig != nil {
+ switch normalizedEffort {
+ case "high", "medium", "low", "max":
+ think = &api.ThinkValue{Value: normalizedEffort}
+ }
+ }
stream := r.Stream
convertedRequest := &api.ChatRequest{
@@ -425,17 +447,12 @@ func convertMessage(msg MessageParam) ([]api.Message, error) {
return nil, errors.New("invalid image source")
}
- if block.Source.Type == "base64" {
- decoded, err := base64.StdEncoding.DecodeString(block.Source.Data)
- if err != nil {
- logutil.Trace("anthropic: invalid base64 image data", "role", role, "error", err)
- return nil, fmt.Errorf("invalid base64 image data: %w", err)
- }
- images = append(images, decoded)
- } else {
- logutil.Trace("anthropic: unsupported image source type", "role", role, "source_type", block.Source.Type)
- return nil, fmt.Errorf("invalid image source type: %s. Only base64 images are supported.", block.Source.Type)
+ decoded, err := resolveImageSource(block.Source)
+ if err != nil {
+ logutil.Trace("anthropic: unsupported image source", "role", role, "source_type", block.Source.Type, "error", err)
+ return nil, err
}
+ images = append(images, decoded)
case "tool_use":
toolUseBlocks++
@@ -457,26 +474,16 @@ func convertMessage(msg MessageParam) ([]api.Message, error) {
case "tool_result":
toolResultBlocks++
- var resultContent string
-
- switch c := block.Content.(type) {
- case string:
- resultContent = c
- case []any:
- for _, cb := range c {
- if cbMap, ok := cb.(map[string]any); ok {
- if cbMap["type"] == "text" {
- if text, ok := cbMap["text"].(string); ok {
- resultContent += text
- }
- }
- }
- }
+ resultContent, resultImages, err := convertToolResultContent(block.Content)
+ if err != nil {
+ logutil.Trace("anthropic: invalid tool_result content", "role", role, "error", err)
+ return nil, err
}
toolResults = append(toolResults, api.Message{
Role: "tool",
Content: resultContent,
+ Images: resultImages,
ToolCallID: block.ToolUseID,
})
@@ -508,6 +515,10 @@ func convertMessage(msg MessageParam) ([]api.Message, error) {
}
}
+ if role == "user" && len(toolResults) > 0 {
+ messages = append(messages, toolResults...)
+ }
+
if textContent.Len() > 0 || len(images) > 0 || len(toolCalls) > 0 || thinking != "" {
m := api.Message{
Role: role,
@@ -519,8 +530,10 @@ func convertMessage(msg MessageParam) ([]api.Message, error) {
messages = append(messages, m)
}
- // Add tool results as separate messages
- messages = append(messages, toolResults...)
+	// Add tool results here unless they were already inserted above (user-role tool results are emitted before the user's text message).
+ if role != "user" || len(toolResults) == 0 {
+ messages = append(messages, toolResults...)
+ }
logutil.Trace("anthropic: converted block message",
"role", role,
"blocks", len(msg.Content),
@@ -969,6 +982,71 @@ func GenerateMessageID() string {
return generateID("msg")
}
+func resolveImageSource(source *ImageSource) (api.ImageData, error) {
+ if source.Type != "base64" {
+ return nil, fmt.Errorf("invalid image source type: %s. Only base64 images are supported.", source.Type)
+ }
+
+ decoded, err := base64.StdEncoding.DecodeString(source.Data)
+ if err != nil {
+ return nil, fmt.Errorf("invalid base64 image data: %w", err)
+ }
+
+ return decoded, nil
+}
+
+func convertToolResultContent(content any) (string, []api.ImageData, error) {
+ switch c := content.(type) {
+ case nil:
+ return "", nil, nil
+ case string:
+ return c, nil, nil
+ case []any:
+ var text strings.Builder
+ var images []api.ImageData
+
+ for _, cb := range c {
+ cbMap, ok := cb.(map[string]any)
+ if !ok {
+ continue
+ }
+
+ switch cbMap["type"] {
+ case "text":
+ if t, ok := cbMap["text"].(string); ok {
+ text.WriteString(t)
+ }
+ case "image":
+ rawSource, ok := cbMap["source"].(map[string]any)
+ if !ok {
+ return "", nil, errors.New("invalid tool_result image source")
+ }
+
+ var source ImageSource
+ if rawType, ok := rawSource["type"].(string); ok {
+ source.Type = rawType
+ }
+ if rawMediaType, ok := rawSource["media_type"].(string); ok {
+ source.MediaType = rawMediaType
+ }
+ if rawData, ok := rawSource["data"].(string); ok {
+ source.Data = rawData
+ }
+
+ img, err := resolveImageSource(&source)
+ if err != nil {
+ return "", nil, err
+ }
+ images = append(images, img)
+ }
+ }
+
+ return text.String(), images, nil
+ default:
+ return "", nil, nil
+ }
+}
+
// ptr returns a pointer to the given string value
func ptr(s string) *string {
return &s
diff --git a/anthropic/anthropic_test.go b/anthropic/anthropic_test.go
index ea787fd53..27f9b9a7e 100755
--- a/anthropic/anthropic_test.go
+++ b/anthropic/anthropic_test.go
@@ -271,6 +271,241 @@ func TestFromMessagesRequest_WithToolResult(t *testing.T) {
}
}
+func TestFromMessagesRequest_WithToolResultImage(t *testing.T) {
+ imgData, _ := base64.StdEncoding.DecodeString(testImage)
+
+ req := MessagesRequest{
+ Model: "test-model",
+ MaxTokens: 1024,
+ Messages: []MessageParam{
+ {
+ Role: "user",
+ Content: []ContentBlock{
+ {
+ Type: "tool_result",
+ ToolUseID: "call_img",
+ Content: []any{
+ map[string]any{"type": "text", "text": "Attached image"},
+ map[string]any{
+ "type": "image",
+ "source": map[string]any{
+ "type": "base64",
+ "media_type": "image/png",
+ "data": testImage,
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ }
+
+ result, err := FromMessagesRequest(req)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.Messages) != 1 {
+ t.Fatalf("expected 1 message, got %d", len(result.Messages))
+ }
+
+ msg := result.Messages[0]
+ if msg.Role != "tool" {
+ t.Errorf("expected role 'tool', got %q", msg.Role)
+ }
+ if msg.ToolCallID != "call_img" {
+ t.Errorf("expected tool_call_id 'call_img', got %q", msg.ToolCallID)
+ }
+ if msg.Content != "Attached image" {
+ t.Errorf("unexpected content: %q", msg.Content)
+ }
+ if len(msg.Images) != 1 {
+ t.Fatalf("expected 1 image, got %d", len(msg.Images))
+ }
+ if string(msg.Images[0]) != string(imgData) {
+ t.Error("image data mismatch")
+ }
+}
+
+func TestFromMessagesRequest_WithToolResultFollowedByUserText(t *testing.T) {
+ req := MessagesRequest{
+ Model: "test-model",
+ MaxTokens: 1024,
+ Messages: []MessageParam{
+ {
+ Role: "assistant",
+ Content: []ContentBlock{
+ {
+ Type: "tool_use",
+ ID: "call_read",
+ Name: "Read",
+ Input: makeArgs("file_path", "/Users/hoyyeva/Desktop/aaa.png"),
+ },
+ },
+ },
+ {
+ Role: "user",
+ Content: []ContentBlock{
+ {
+ Type: "tool_result",
+ ToolUseID: "call_read",
+ Content: "Read image (311.5KB)",
+ },
+ {
+ Type: "text",
+ Text: ptr("Please describe it."),
+ },
+ },
+ },
+ },
+ }
+
+ result, err := FromMessagesRequest(req)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if len(result.Messages) != 3 {
+ t.Fatalf("expected 3 messages, got %d", len(result.Messages))
+ }
+
+ if result.Messages[1].Role != "tool" {
+ t.Fatalf("expected second message to be tool, got %q", result.Messages[1].Role)
+ }
+ if result.Messages[1].ToolCallID != "call_read" {
+ t.Fatalf("expected tool_call_id 'call_read', got %q", result.Messages[1].ToolCallID)
+ }
+ if result.Messages[2].Role != "user" {
+ t.Fatalf("expected third message to be user, got %q", result.Messages[2].Role)
+ }
+ if result.Messages[2].Content != "Please describe it." {
+ t.Fatalf("unexpected user content: %q", result.Messages[2].Content)
+ }
+}
+
+func TestFromMessagesRequest_WithOutputConfigEffort(t *testing.T) {
+ req := MessagesRequest{
+ Model: "gemma4",
+ MaxTokens: 32000,
+ Messages: []MessageParam{
+ {
+ Role: "user",
+ Content: textContent("Describe the image."),
+ },
+ },
+ OutputConfig: &OutputConfig{
+ Effort: "high",
+ },
+ }
+
+ result, err := FromMessagesRequest(req)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if result.Think == nil {
+ t.Fatal("expected think to be set from output_config.effort")
+ }
+
+ if got := result.Think.String(); got != "high" {
+ t.Fatalf("expected think level 'high', got %q", got)
+ }
+}
+
+func TestFromMessagesRequest_WithOutputConfigEffortXHighMapsToHigh(t *testing.T) {
+ req := MessagesRequest{
+ Model: "gemma4",
+ MaxTokens: 32000,
+ Messages: []MessageParam{
+ {
+ Role: "user",
+ Content: textContent("Describe the image."),
+ },
+ },
+ OutputConfig: &OutputConfig{
+ Effort: "xhigh",
+ },
+ }
+
+ result, err := FromMessagesRequest(req)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if result.Think == nil {
+ t.Fatal("expected think to be set from output_config.effort")
+ }
+
+ if got := result.Think.String(); got != "high" {
+ t.Fatalf("expected think level 'high' for xhigh effort, got %q", got)
+ }
+}
+
+func TestFromMessagesRequest_ThinkingDisabledOverridesOutputConfigEffort(t *testing.T) {
+ req := MessagesRequest{
+ Model: "gemma4",
+ MaxTokens: 32000,
+ Messages: []MessageParam{
+ {
+ Role: "user",
+ Content: textContent("Describe the image."),
+ },
+ },
+ Thinking: &ThinkingConfig{
+ Type: "disabled",
+ },
+ OutputConfig: &OutputConfig{
+ Effort: "high",
+ },
+ }
+
+ result, err := FromMessagesRequest(req)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if result.Think == nil {
+ t.Fatal("expected think to be set")
+ }
+
+ if got := result.Think.Value; got != false {
+ t.Fatalf("expected think=false when thinking is disabled, got %v", got)
+ }
+}
+
+func TestFromMessagesRequest_ThinkingAdaptiveUsesOutputConfigEffort(t *testing.T) {
+ req := MessagesRequest{
+ Model: "gemma4",
+ MaxTokens: 32000,
+ Messages: []MessageParam{
+ {
+ Role: "user",
+ Content: textContent("Describe the image."),
+ },
+ },
+ Thinking: &ThinkingConfig{
+ Type: "adaptive",
+ },
+ OutputConfig: &OutputConfig{
+ Effort: "high",
+ },
+ }
+
+ result, err := FromMessagesRequest(req)
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ if result.Think == nil {
+ t.Fatal("expected think to be set from output_config.effort")
+ }
+
+ if got := result.Think.String(); got != "high" {
+ t.Fatalf("expected think level 'high' for adaptive thinking, got %q", got)
+ }
+}
+
func TestFromMessagesRequest_WithTools(t *testing.T) {
req := MessagesRequest{
Model: "test-model",
diff --git a/model/renderers/gemma4.go b/model/renderers/gemma4.go
index bbf717f37..863c9ac36 100644
--- a/model/renderers/gemma4.go
+++ b/model/renderers/gemma4.go
@@ -98,7 +98,8 @@ func (r *Gemma4Renderer) Render(messages []api.Message, tools []api.Tool, thinkV
toolResponsesEmitted := false
if len(message.ToolCalls) > 0 {
for k := i + 1; k < len(loopMessages) && loopMessages[k].Role == "tool"; k++ {
- sb.WriteString(r.formatToolResponseBlock(r.toolResponseName(loopMessages[k], message.ToolCalls), loopMessages[k].Content))
+ response := r.renderToolResponseContent(loopMessages[k], &imageOffset)
+ sb.WriteString(r.formatToolResponseBlock(r.toolResponseName(loopMessages[k], message.ToolCalls), response))
toolResponsesEmitted = true
prevMessageType = "tool_response"
}
@@ -160,19 +161,22 @@ func stripThinking(text string) string {
// When trim is true, leading/trailing whitespace is stripped (matching the Jinja2
// template's | trim filter applied to non-model content).
func (r *Gemma4Renderer) renderContent(sb *strings.Builder, msg api.Message, imageOffset *int, trim bool) {
- if len(msg.Images) > 0 && r.useImgTags {
- for range msg.Images {
- sb.WriteString(fmt.Sprintf("[img-%d]", *imageOffset))
- *imageOffset++
- }
- }
content := msg.Content
if trim {
content = strings.TrimSpace(content)
}
+ if len(msg.Images) > 0 && r.useImgTags {
+ content, *imageOffset = renderContentWithImageTags(content, len(msg.Images), *imageOffset)
+ }
sb.WriteString(content)
}
+func (r *Gemma4Renderer) renderToolResponseContent(msg api.Message, imageOffset *int) string {
+ var sb strings.Builder
+ r.renderContent(&sb, msg, imageOffset, false)
+ return sb.String()
+}
+
func (r *Gemma4Renderer) previousNonToolRole(messages []api.Message, idx int) string {
for i := idx - 1; i >= 0; i-- {
if messages[i].Role != "tool" {
diff --git a/model/renderers/glmocr.go b/model/renderers/glmocr.go
index 05e7be08e..8e84bdbb3 100644
--- a/model/renderers/glmocr.go
+++ b/model/renderers/glmocr.go
@@ -13,15 +13,11 @@ type GlmOcrRenderer struct {
}
func (r *GlmOcrRenderer) renderContent(message api.Message, imageOffset int) (string, int) {
- var sb strings.Builder
- for range message.Images {
- if r.useImgTags {
- sb.WriteString(fmt.Sprintf("[img-%d]", imageOffset))
- imageOffset++
- }
+ if r.useImgTags {
+ return renderContentWithImageTags(message.Content, len(message.Images), imageOffset)
}
- sb.WriteString(message.Content)
- return sb.String(), imageOffset
+
+ return message.Content, imageOffset
}
func (r *GlmOcrRenderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
@@ -85,8 +81,10 @@ func (r *GlmOcrRenderer) Render(messages []api.Message, tools []api.Tool, thinkV
if i == 0 || messages[i-1].Role != "tool" {
sb.WriteString("<|observation|>")
}
+ content, nextOffset := r.renderContent(message, imageOffset)
+ imageOffset = nextOffset
sb.WriteString("\n\n")
- sb.WriteString(message.Content)
+ sb.WriteString(content)
sb.WriteString("\n\n")
case "system":
sb.WriteString("<|system|>\n")
diff --git a/model/renderers/glmocr_test.go b/model/renderers/glmocr_test.go
index dbc611ccb..1f8b9c038 100644
--- a/model/renderers/glmocr_test.go
+++ b/model/renderers/glmocr_test.go
@@ -25,7 +25,7 @@ func TestGlmOcrRenderer_Images(t *testing.T) {
Images: []api.ImageData{api.ImageData("img1")},
},
},
- expected: "[gMASK]<|user|>\n[img-0]Describe this image.<|assistant|>\n",
+ expected: "[gMASK]<|user|>\n[img-0] Describe this image.<|assistant|>\n",
},
{
name: "use_img_tags_multiple_images",
@@ -37,7 +37,7 @@ func TestGlmOcrRenderer_Images(t *testing.T) {
Images: []api.ImageData{api.ImageData("img1"), api.ImageData("img2")},
},
},
- expected: "[gMASK]<|user|>\n[img-0][img-1]Describe these images.<|assistant|>\n",
+ expected: "[gMASK]<|user|>\n[img-0][img-1] Describe these images.<|assistant|>\n",
},
{
name: "multi_turn_increments_image_offset",
@@ -58,7 +58,7 @@ func TestGlmOcrRenderer_Images(t *testing.T) {
Images: []api.ImageData{api.ImageData("img2")},
},
},
- expected: "[gMASK]<|user|>\n[img-0]First image<|assistant|>\n\nProcessed.\n<|user|>\n[img-1]Second image<|assistant|>\n",
+ expected: "[gMASK]<|user|>\n[img-0] First image<|assistant|>\n\nProcessed.\n<|user|>\n[img-1] Second image<|assistant|>\n",
},
{
name: "default_no_img_tags",
diff --git a/model/renderers/image_tags.go b/model/renderers/image_tags.go
new file mode 100644
index 000000000..205080162
--- /dev/null
+++ b/model/renderers/image_tags.go
@@ -0,0 +1,39 @@
+package renderers
+
+import (
+ "fmt"
+ "strings"
+ "unicode"
+ "unicode/utf8"
+)
+
+// renderContentWithImageTags preserves the legacy server-side placeholder
+// semantics: content already carrying numbered [img-N] tags is left untouched
+// (offset still advances), explicit [img] tokens are replaced in order, and
+func renderContentWithImageTags(content string, imageCount int, imageOffset int) (string, int) {
+ if imageCount == 0 {
+ return content, imageOffset
+ }
+
+ if strings.Contains(content, "[img-") {
+ return content, imageOffset + imageCount
+ }
+
+ var prefix strings.Builder
+ for i := range imageCount {
+ imgTag := fmt.Sprintf("[img-%d]", imageOffset+i)
+ if strings.Contains(content, "[img]") {
+ content = strings.Replace(content, "[img]", imgTag, 1)
+ } else {
+ prefix.WriteString(imgTag)
+ }
+ }
+
+ if prefix.Len() > 0 && content != "" {
+ if r, _ := utf8.DecodeRuneInString(content); r != utf8.RuneError && !unicode.IsSpace(r) {
+ prefix.WriteByte(' ')
+ }
+ }
+
+ return prefix.String() + content, imageOffset + imageCount
+}
diff --git a/model/renderers/image_tags_test.go b/model/renderers/image_tags_test.go
new file mode 100644
index 000000000..e833dde56
--- /dev/null
+++ b/model/renderers/image_tags_test.go
@@ -0,0 +1,67 @@
+package renderers
+
+import "testing"
+
+func TestRenderContentWithImageTags(t *testing.T) {
+ tests := []struct {
+ name string
+ content string
+ imageCount int
+ imageOffset int
+ want string
+ wantOffset int
+ }{
+ {
+ name: "prefixes when there are no placeholders",
+ content: "describe this image",
+ imageCount: 2,
+ imageOffset: 0,
+ want: "[img-0][img-1] describe this image",
+ wantOffset: 2,
+ },
+ {
+ name: "replaces explicit placeholders in order",
+ content: "compare [img] and [img]",
+ imageCount: 2,
+ imageOffset: 3,
+ want: "compare [img-3] and [img-4]",
+ wantOffset: 5,
+ },
+ {
+ name: "prefixes extra images after placeholders are exhausted",
+ content: "compare [img]",
+ imageCount: 2,
+ imageOffset: 0,
+ want: "[img-1] compare [img-0]",
+ wantOffset: 2,
+ },
+ {
+ name: "leaves leftover placeholders when there are fewer images",
+ content: "compare [img] and [img]",
+ imageCount: 1,
+ imageOffset: 0,
+ want: "compare [img-0] and [img]",
+ wantOffset: 1,
+ },
+ {
+ name: "preserves already-numbered placeholders",
+ content: "compare [img-0] and [img-1]",
+ imageCount: 2,
+ imageOffset: 0,
+ want: "compare [img-0] and [img-1]",
+ wantOffset: 2,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ got, gotOffset := renderContentWithImageTags(tt.content, tt.imageCount, tt.imageOffset)
+ if got != tt.want {
+ t.Fatalf("content = %q, want %q", got, tt.want)
+ }
+ if gotOffset != tt.wantOffset {
+ t.Fatalf("offset = %d, want %d", gotOffset, tt.wantOffset)
+ }
+ })
+ }
+}
diff --git a/model/renderers/lfm2.go b/model/renderers/lfm2.go
index 17dcc6b73..48ef87306 100644
--- a/model/renderers/lfm2.go
+++ b/model/renderers/lfm2.go
@@ -3,7 +3,6 @@ package renderers
import (
"bytes"
"encoding/json"
- "fmt"
"sort"
"strings"
@@ -199,19 +198,18 @@ func (r *LFM2Renderer) renderMessageContent(message api.Message, imageOffset int
return content
}
- var sb strings.Builder
if r.useImgTags {
- for i := range message.Images {
- sb.WriteString(fmt.Sprintf("[img-%d]", imageOffset+i))
- }
- } else {
- placeholder := lfm2ImagePlaceholder(false)
- if strings.Contains(content, placeholder) {
- return content
- }
- for range message.Images {
- sb.WriteString(placeholder)
- }
+ content, _ = renderContentWithImageTags(content, len(message.Images), imageOffset)
+ return content
+ }
+
+ var sb strings.Builder
+ placeholder := lfm2ImagePlaceholder(false)
+ if strings.Contains(content, placeholder) {
+ return content
+ }
+ for range message.Images {
+ sb.WriteString(placeholder)
}
sb.WriteString(content)
return sb.String()
diff --git a/model/renderers/lfm2_test.go b/model/renderers/lfm2_test.go
index 106f0f5cc..6e27af9bf 100644
--- a/model/renderers/lfm2_test.go
+++ b/model/renderers/lfm2_test.go
@@ -236,7 +236,7 @@ func TestLFM2Renderer_Images(t *testing.T) {
Content: "Describe this image.",
Images: []api.ImageData{api.ImageData("img1")},
},
- expected: "<|startoftext|><|im_start|>user\n[img-0]Describe this image.<|im_end|>\n<|im_start|>assistant\n",
+ expected: "<|startoftext|><|im_start|>user\n[img-0] Describe this image.<|im_end|>\n<|im_start|>assistant\n",
},
{
name: "existing_template_image_placeholder_not_duplicated",
diff --git a/model/renderers/nemotron3nano.go b/model/renderers/nemotron3nano.go
index 8c3cb58aa..744e212db 100644
--- a/model/renderers/nemotron3nano.go
+++ b/model/renderers/nemotron3nano.go
@@ -79,12 +79,14 @@ func (r *Nemotron3NanoRenderer) Render(messages []api.Message, tools []api.Tool,
// Check if previous message was also a tool message
prevWasTool := i > 0 && loopMessages[i-1].Role == "tool"
nextIsTool := i+1 < len(loopMessages) && loopMessages[i+1].Role == "tool"
+ content := r.renderMessageContent(message, imageOffset)
+ imageOffset += len(message.Images)
if !prevWasTool {
sb.WriteString("<|im_start|>user\n")
}
sb.WriteString("\n")
- sb.WriteString(message.Content)
+ sb.WriteString(content)
sb.WriteString("\n\n")
if !nextIsTool {
@@ -237,23 +239,8 @@ func (r *Nemotron3NanoRenderer) renderMessageContent(message api.Message, imageO
return content
}
- if strings.Contains(content, "[img-") {
- return content
- }
-
- if strings.Contains(content, "[img]") {
- for i := range message.Images {
- content = strings.Replace(content, "[img]", fmt.Sprintf("[img-%d]", imageOffset+i), 1)
- }
- return content
- }
-
- var sb strings.Builder
- for i := range message.Images {
- sb.WriteString(fmt.Sprintf("[img-%d]", imageOffset+i))
- }
- sb.WriteString(content)
- return sb.String()
+ content, _ = renderContentWithImageTags(content, len(message.Images), imageOffset)
+ return content
}
func nemotron3NanoRenderContent(content any) string {
diff --git a/model/renderers/nemotron3nano_test.go b/model/renderers/nemotron3nano_test.go
index 1c55ab3e7..79daf8c68 100644
--- a/model/renderers/nemotron3nano_test.go
+++ b/model/renderers/nemotron3nano_test.go
@@ -19,7 +19,7 @@ func TestNemotron3NanoRenderer_Images(t *testing.T) {
msgs: []api.Message{
{Role: "user", Content: "Describe this image.", Images: []api.ImageData{api.ImageData("img1")}},
},
- expected: "\n\n\n<|im_start|>system\n<|im_end|>\n\n<|im_start|>user\n[img-0]Describe this image.<|im_end|>\n\n<|im_start|>assistant\n\n",
+ expected: "\n\n\n<|im_start|>system\n<|im_end|>\n\n<|im_start|>user\n[img-0] Describe this image.<|im_end|>\n\n<|im_start|>assistant\n\n",
},
{
name: "generic image placeholder is rewritten",
@@ -35,7 +35,7 @@ func TestNemotron3NanoRenderer_Images(t *testing.T) {
{Role: "assistant", Content: "It shows something."},
{Role: "user", Content: "Compare these.", Images: []api.ImageData{api.ImageData("img2"), api.ImageData("img3")}},
},
- expected: "\n\n\n<|im_start|>system\n<|im_end|>\n\n<|im_start|>user\n[img-0]Describe the first image.<|im_end|>\n<|im_start|>assistant\nIt shows something.<|im_end|>\n<|im_start|>user\n[img-1][img-2]Compare these.<|im_end|>\n\n<|im_start|>assistant\n\n",
+ expected: "\n\n\n<|im_start|>system\n<|im_end|>\n\n<|im_start|>user\n[img-0] Describe the first image.<|im_end|>\n<|im_start|>assistant\nIt shows something.<|im_end|>\n<|im_start|>user\n[img-1][img-2] Compare these.<|im_end|>\n\n<|im_start|>assistant\n\n",
},
}
diff --git a/model/renderers/qwen35.go b/model/renderers/qwen35.go
index 1e6accbc3..9047db79c 100644
--- a/model/renderers/qwen35.go
+++ b/model/renderers/qwen35.go
@@ -1,7 +1,6 @@
package renderers
import (
- "fmt"
"strings"
"github.com/ollama/ollama/api"
@@ -45,15 +44,14 @@ type Qwen35Renderer struct {
}
func (r *Qwen35Renderer) renderContent(content api.Message, imageOffset int) (string, int) {
+ if r.useImgTags {
+ return renderContentWithImageTags(content.Content, len(content.Images), imageOffset)
+ }
+
// This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go
var subSb strings.Builder
for range content.Images {
- if r.useImgTags {
- subSb.WriteString(fmt.Sprintf("[img-%d]", imageOffset))
- imageOffset++
- } else {
- subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
- }
+ subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
}
// TODO: support videos
diff --git a/model/renderers/qwen3vl.go b/model/renderers/qwen3vl.go
index f623e8c9c..9a196e5c3 100644
--- a/model/renderers/qwen3vl.go
+++ b/model/renderers/qwen3vl.go
@@ -1,7 +1,6 @@
package renderers
import (
- "fmt"
"strings"
"github.com/ollama/ollama/api"
@@ -15,18 +14,17 @@ type Qwen3VLRenderer struct {
}
func (r *Qwen3VLRenderer) renderContent(content api.Message, imageOffset int) (string, int) {
+ if r.useImgTags {
+ return renderContentWithImageTags(content.Content, len(content.Images), imageOffset)
+ }
+
// This assumes all images are at the front of the message - same assumption as ollama/ollama/runner.go
var subSb strings.Builder
for range content.Images {
// TODO: (jmorganca): how to render this is different for different
// model backends, and so we should eventually parameterize this or
// only output a placeholder such as [img]
- if r.useImgTags {
- subSb.WriteString(fmt.Sprintf("[img-%d]", imageOffset))
- imageOffset++
- } else {
- subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
- }
+ subSb.WriteString("<|vision_start|><|image_pad|><|vision_end|>")
}
// TODO: support videos
@@ -126,7 +124,7 @@ func (r *Qwen3VLRenderer) Render(messages []api.Message, tools []api.Tool, think
if i == 0 || messages[i-1].Role != "tool" {
sb.WriteString("<|im_start|>user")
}
- sb.WriteString("\n\n" + message.Content + "\n")
+ sb.WriteString("\n\n" + content + "\n")
if i == len(messages)-1 || messages[i+1].Role != "tool" {
sb.WriteString("<|im_end|>\n")
}
diff --git a/model/renderers/qwen3vl_nonthinking_test.go b/model/renderers/qwen3vl_nonthinking_test.go
index d72c4ae96..35ae4cbec 100644
--- a/model/renderers/qwen3vl_nonthinking_test.go
+++ b/model/renderers/qwen3vl_nonthinking_test.go
@@ -101,7 +101,7 @@ Let me analyze this image.`,
},
useImgTags: true,
expected: `<|im_start|>user
-[img-0]Describe this image.<|im_end|>
+[img-0] Describe this image.<|im_end|>
<|im_start|>assistant
Let me analyze this image.`,
},
@@ -123,7 +123,7 @@ Let me analyze this image.`,
},
useImgTags: true,
expected: `<|im_start|>user
-[img-0][img-1]Describe these images.<|im_end|>
+[img-0][img-1] Describe these images.<|im_end|>
<|im_start|>assistant
Let me analyze this image.`,
},
diff --git a/server/prompt.go b/server/prompt.go
index 8fa164557..8ec5f23d5 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -75,7 +75,9 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
slog.Debug("truncating input messages which exceed context length", "truncated", len(msgs[currMsgIdx:]))
}
- for cnt, msg := range msgs[currMsgIdx:] {
+ renderMsgs := slices.Clone(msgs)
+
+ for cnt, msg := range renderMsgs[currMsgIdx:] {
if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 {
return "", nil, errors.New("this model only supports one image while more than one image requested")
}
@@ -101,11 +103,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
prompt = strings.Replace(prompt, "[img]", imgTag, 1)
}
}
- msgs[currMsgIdx+cnt].Content = prefix + prompt
+
+ if m.Config.Renderer != "" {
+ continue
+ }
+
+ renderMsgs[currMsgIdx+cnt].Content = prefix + prompt
}
// truncate any messages that do not fit into the context window
- p, err := renderPrompt(m, append(system, msgs[currMsgIdx:]...), tools, think)
+ p, err := renderPrompt(m, append(system, renderMsgs[currMsgIdx:]...), tools, think)
if err != nil {
return "", nil, err
}
diff --git a/server/prompt_test.go b/server/prompt_test.go
index e4cc27a5a..3939e06bc 100644
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -401,11 +401,170 @@ func TestChatPromptGLMOcrRendererAddsImageTags(t *testing.T) {
t.Fatalf("len(images) = %d, want %d", got, want)
}
- if !strings.Contains(prompt, "<|user|>\n[img-0][img-1]extract text") {
+ if !strings.Contains(prompt, "<|user|>\n[img-0][img-1] extract text") {
t.Fatalf("prompt missing glm-ocr image tags, got: %q", prompt)
}
}
+func TestChatPromptRendererAddsToolImageTags(t *testing.T) {
+ msgs := []api.Message{
+ {
+ Role: "user",
+ Content: "look at this file",
+ Images: []api.ImageData{[]byte("img-1")},
+ },
+ {
+ Role: "assistant",
+ ToolCalls: []api.ToolCall{
+ {
+ ID: "call_read",
+ Function: api.ToolCallFunction{
+ Name: "Read",
+ },
+ },
+ },
+ },
+ {
+ Role: "tool",
+ Content: "attached image",
+ Images: []api.ImageData{[]byte("img-2")},
+ ToolCallID: "call_read",
+ },
+ }
+
+ tests := []struct {
+ name string
+ renderer string
+ wantUserTag string
+ wantToolContent string
+ }{
+ {
+ name: "gemma4",
+ renderer: "gemma4",
+ wantUserTag: "<|turn>user\n[img-0] look at this file\n",
+ wantToolContent: "[img-1] attached image",
+ },
+ {
+ name: "qwen3-vl",
+ renderer: "qwen3-vl-instruct",
+ wantUserTag: "<|im_start|>user\n[img-0] look at this file<|im_end|>\n",
+ wantToolContent: "\n[img-1] attached image\n",
+ },
+ {
+ name: "qwen3.5",
+ renderer: "qwen3.5",
+ wantUserTag: "<|im_start|>user\n[img-0] look at this file<|im_end|>\n",
+ wantToolContent: "\n[img-1] attached image\n",
+ },
+ {
+ name: "glm-ocr",
+ renderer: "glm-ocr",
+ wantUserTag: "<|user|>\n[img-0] look at this file",
+ wantToolContent: "\n[img-1] attached image\n",
+ },
+ {
+ name: "nemotron-3-nano",
+ renderer: "nemotron-3-nano",
+ wantUserTag: "<|im_start|>user\n[img-0] look at this file<|im_end|>\n",
+ wantToolContent: "\n[img-1] attached image\n",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ m := Model{
+ Config: model.ConfigV2{Renderer: tt.renderer},
+ ProjectorPaths: []string{"vision"},
+ }
+ opts := api.Options{Runner: api.Runner{NumCtx: 8192}}
+ think := false
+
+ prompt, images, err := chatPrompt(t.Context(), &m, mockRunner{}.Tokenize, &opts, msgs, nil, &api.ThinkValue{Value: think}, true)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if got, want := len(images), 2; got != want {
+ t.Fatalf("len(images) = %d, want %d", got, want)
+ }
+
+ if !strings.Contains(prompt, tt.wantUserTag) {
+ t.Fatalf("prompt missing user image tag, got: %q", prompt)
+ }
+
+ if !strings.Contains(prompt, tt.wantToolContent) {
+ t.Fatalf("prompt missing tool image tag, got: %q", prompt)
+ }
+ })
+ }
+}
+
+func TestChatPromptRendererPreservesExplicitImagePlaceholders(t *testing.T) {
+ msgs := []api.Message{
+ {
+ Role: "user",
+ Content: "compare [img] and [img]",
+ Images: []api.ImageData{[]byte("img-1"), []byte("img-2")},
+ },
+ }
+
+ tests := []struct {
+ name string
+ renderer string
+ wantSnippet string
+ }{
+ {
+ name: "gemma4",
+ renderer: "gemma4",
+ wantSnippet: "<|turn>user\ncompare [img-0] and [img-1]\n",
+ },
+ {
+ name: "qwen3-vl",
+ renderer: "qwen3-vl-instruct",
+ wantSnippet: "<|im_start|>user\ncompare [img-0] and [img-1]<|im_end|>\n",
+ },
+ {
+ name: "qwen3.5",
+ renderer: "qwen3.5",
+ wantSnippet: "<|im_start|>user\ncompare [img-0] and [img-1]<|im_end|>\n",
+ },
+ {
+ name: "glm-ocr",
+ renderer: "glm-ocr",
+ wantSnippet: "<|user|>\ncompare [img-0] and [img-1]",
+ },
+ {
+ name: "nemotron-3-nano",
+ renderer: "nemotron-3-nano",
+ wantSnippet: "<|im_start|>user\ncompare [img-0] and [img-1]<|im_end|>\n",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ m := Model{
+ Config: model.ConfigV2{Renderer: tt.renderer},
+ ProjectorPaths: []string{"vision"},
+ }
+ opts := api.Options{Runner: api.Runner{NumCtx: 8192}}
+ think := false
+
+ prompt, images, err := chatPrompt(t.Context(), &m, mockRunner{}.Tokenize, &opts, msgs, nil, &api.ThinkValue{Value: think}, true)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if got, want := len(images), 2; got != want {
+ t.Fatalf("len(images) = %d, want %d", got, want)
+ }
+
+ if !strings.Contains(prompt, tt.wantSnippet) {
+ t.Fatalf("prompt missing replaced placeholders, got: %q", prompt)
+ }
+ })
+ }
+}
+
func TestRenderPromptResolvesDynamicGemma4Renderer(t *testing.T) {
msgs := []api.Message{{Role: "user", Content: "Hello"}}