This revamps how we discover GPUs in the system by leveraging the Ollama runner. This should eliminate inconsistency between our GPU discovery and the runner's capabilities at runtime, particularly for cases where we try to filter out unsupported GPUs. Now the runner does that implicitly based on the actual device list. In some cases free VRAM reporting can be unreliable, which can lead to scheduling mistakes, so this also includes a patch to leverage more reliable VRAM reporting libraries when available. Automatic workarounds have been removed, as only one GPU relied on them; that GPU is now documented and will soon fall off the support matrix with the next ROCm bump. Additional cleanup of the scheduler and discovery packages can be done in the future once we have switched on the new memory management code and removed support for the llama runner.
package server

import (
	"bytes"
	"encoding/json"
	"net/http"
	"testing"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/llm"
)

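// TestGenerateDebugRenderOnly verifies that GenerateRequest.DebugRenderOnly
// returns the rendered prompt template (and image count) in DebugInfo instead
// of running a completion.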
func TestGenerateDebugRenderOnly(t *testing.T) {
	gin.SetMode(gin.TestMode)

	mock := mockRunner{
		CompletionResponse: llm.CompletionResponse{
			Done:               true,
			DoneReason:         llm.DoneReasonStop,
			PromptEvalCount:    1,
			PromptEvalDuration: 1,
			EvalCount:          1,
			EvalDuration:       1,
		},
	}

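	// The scheduler is stubbed out: loadFn skips real model loading and hands
	// back the mock runner, so requests that reach the runner still succeed.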
	s := Server{
		sched: &Scheduler{
			pendingReqCh:  make(chan *LlmRequest, 1),
			finishedReqCh: make(chan *LlmRequest, 1),
			expiredCh:     make(chan *runnerRef, 1),
			unloadedCh:    make(chan any, 1),
			loaded:        make(map[string]*runnerRef),
			newServerFn:   newMockServer(&mock),
			getGpuFn:      getGpuFn,
			getCpuFn:      getCpuFn,
			reschedDelay:  250 * time.Millisecond,
			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
				// add small delay to simulate loading
				time.Sleep(time.Millisecond)
				req.successCh <- &runnerRef{
					llama: &mock,
				}
				return false
			},
		},
	}

	go s.sched.Run(t.Context())

	// Create a test model
	stream := false
	_, digest := createBinFile(t, ggml.KV{
		"general.architecture":          "llama",
		"llama.block_count":             uint32(1),
		"llama.context_length":          uint32(8192),
		"llama.embedding_length":        uint32(4096),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(8),
		"tokenizer.ggml.tokens":         []string{""},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []*ggml.Tensor{
		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
	})

	w := createRequest(t, s.CreateHandler, api.CreateRequest{
		Model:    "test-model",
		Files:    map[string]string{"file.gguf": digest},
		Template: "{{ .Prompt }}",
		Stream:   &stream,
	})

	if w.Code != http.StatusOK {
		t.Fatalf("expected status 200, got %d", w.Code)
	}

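	// Each case submits a GenerateRequest; when DebugRenderOnly is set, the
	// rendered template and image count reported in DebugInfo are checked.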
	tests := []struct {
		name            string
		request         api.GenerateRequest
		expectDebug     bool
		expectTemplate  string
		expectNumImages int
	}{
		{
			name: "debug render only enabled",
			request: api.GenerateRequest{
				Model:           "test-model",
				Prompt:          "Hello, world!",
				DebugRenderOnly: true,
			},
			expectDebug:    true,
			expectTemplate: "Hello, world!",
		},
		{
			name: "debug render only disabled",
			request: api.GenerateRequest{
				Model:           "test-model",
				Prompt:          "Hello, world!",
				DebugRenderOnly: false,
			},
			expectDebug: false,
		},
		{
			name: "debug render only with system prompt",
			request: api.GenerateRequest{
				Model:           "test-model",
				Prompt:          "User question",
				System:          "You are a helpful assistant",
				DebugRenderOnly: true,
			},
			expectDebug:    true,
			expectTemplate: "User question",
		},
		{
			name: "debug render only with template",
			request: api.GenerateRequest{
				Model:           "test-model",
				Prompt:          "Hello",
				Template:        "PROMPT: {{ .Prompt }}",
				DebugRenderOnly: true,
			},
			expectDebug:    true,
			expectTemplate: "PROMPT: Hello",
		},
		{
			name: "debug render only with images",
			request: api.GenerateRequest{
				Model:           "test-model",
				Prompt:          "Describe this image",
				Images:          []api.ImageData{[]byte("fake-image-data")},
				DebugRenderOnly: true,
			},
			expectDebug:     true,
			expectTemplate:  "[img-0]\n\nDescribe this image",
			expectNumImages: 1,
		},
		{
			name: "debug render only with raw mode",
			request: api.GenerateRequest{
				Model:           "test-model",
				Prompt:          "Raw prompt text",
				Raw:             true,
				DebugRenderOnly: true,
			},
			expectDebug:    true,
			expectTemplate: "Raw prompt text",
		},
	}

	for _, tt := range tests {
		// Test both with and without streaming
		streamValues := []bool{false, true}
		for _, stream := range streamValues {
			streamSuffix := ""
			if stream {
				streamSuffix = " (streaming)"
			}
			t.Run(tt.name+streamSuffix, func(t *testing.T) {
				req := tt.request
				req.Stream = &stream
				w := createRequest(t, s.GenerateHandler, req)

				if tt.expectDebug {
					if w.Code != http.StatusOK {
						t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
					}

					var response api.GenerateResponse
					if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
						t.Fatalf("failed to unmarshal response: %v", err)
					}

					if response.Model != tt.request.Model {
						t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
					}

					if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
						t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
					}

					if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
						t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
					}
				} else {
					// When debug is disabled, it should attempt normal processing
					if w.Code != http.StatusOK {
						t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
					}
				}
			})
		}
	}
}

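// TestChatDebugRenderOnly verifies that ChatRequest.DebugRenderOnly returns
// the rendered chat template (messages, tools, and image count) in DebugInfo
// instead of running a completion.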
func TestChatDebugRenderOnly(t *testing.T) {
	gin.SetMode(gin.TestMode)

	mock := mockRunner{
		CompletionResponse: llm.CompletionResponse{
			Done:               true,
			DoneReason:         llm.DoneReasonStop,
			PromptEvalCount:    1,
			PromptEvalDuration: 1,
			EvalCount:          1,
			EvalDuration:       1,
		},
	}

	s := Server{
		sched: &Scheduler{
			pendingReqCh:  make(chan *LlmRequest, 1),
			finishedReqCh: make(chan *LlmRequest, 1),
			expiredCh:     make(chan *runnerRef, 1),
			unloadedCh:    make(chan any, 1),
			loaded:        make(map[string]*runnerRef),
			newServerFn:   newMockServer(&mock),
			getGpuFn:      getGpuFn,
			getCpuFn:      getCpuFn,
			reschedDelay:  250 * time.Millisecond,
			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
				// add small delay to simulate loading
				time.Sleep(time.Millisecond)
				req.successCh <- &runnerRef{
					llama: &mock,
				}
				return false
			},
		},
	}

	go s.sched.Run(t.Context())

	// Create a test model
	stream := false
	_, digest := createBinFile(t, ggml.KV{
		"general.architecture":          "llama",
		"llama.block_count":             uint32(1),
		"llama.context_length":          uint32(8192),
		"llama.embedding_length":        uint32(4096),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(8),
		"tokenizer.ggml.tokens":         []string{""},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []*ggml.Tensor{
		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
	})

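	// The template renders the tools JSON (when present) followed by each
	// message as "role: content", which the expected strings below mirror.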
	w := createRequest(t, s.CreateHandler, api.CreateRequest{
		Model:    "test-model",
		Files:    map[string]string{"file.gguf": digest},
		Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
		Stream:   &stream,
	})

	if w.Code != http.StatusOK {
		t.Fatalf("expected status 200, got %d", w.Code)
	}

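	// The tools case below expects the serialized tool definition to appear
	// ahead of the messages, matching the {{ .Tools }} section of the template.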
	tests := []struct {
		name            string
		request         api.ChatRequest
		expectDebug     bool
		expectTemplate  string
		expectNumImages int
	}{
		{
			name: "chat debug render only enabled",
			request: api.ChatRequest{
				Model: "test-model",
				Messages: []api.Message{
					{Role: "system", Content: "You are a helpful assistant"},
					{Role: "user", Content: "Hello"},
				},
				DebugRenderOnly: true,
			},
			expectDebug:    true,
			expectTemplate: "system: You are a helpful assistant\nuser: Hello\n",
		},
		{
			name: "chat debug render only disabled",
			request: api.ChatRequest{
				Model: "test-model",
				Messages: []api.Message{
					{Role: "user", Content: "Hello"},
				},
				DebugRenderOnly: false,
			},
			expectDebug: false,
		},
		{
			name: "chat debug with assistant message",
			request: api.ChatRequest{
				Model: "test-model",
				Messages: []api.Message{
					{Role: "user", Content: "Hello"},
					{Role: "assistant", Content: "Hi there!"},
					{Role: "user", Content: "How are you?"},
				},
				DebugRenderOnly: true,
			},
			expectDebug:    true,
			expectTemplate: "user: Hello\nassistant: Hi there!\nuser: How are you?\n",
		},
		{
			name: "chat debug with images",
			request: api.ChatRequest{
				Model: "test-model",
				Messages: []api.Message{
					{
						Role:    "user",
						Content: "What's in this image?",
						Images:  []api.ImageData{[]byte("fake-image-data")},
					},
				},
				DebugRenderOnly: true,
			},
			expectDebug:     true,
			expectTemplate:  "user: [img-0]What's in this image?\n",
			expectNumImages: 1,
		},
		{
			name: "chat debug with tools",
			request: api.ChatRequest{
				Model: "test-model",
				Messages: []api.Message{
					{Role: "user", Content: "Get the weather"},
				},
				Tools: api.Tools{
					{
						Type: "function",
						Function: api.ToolFunction{
							Name:        "get_weather",
							Description: "Get weather information",
						},
					},
				},
				DebugRenderOnly: true,
			},
			expectDebug:    true,
			expectTemplate: "[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information\",\"parameters\":{\"type\":\"\",\"required\":null,\"properties\":null}}}]user: Get the weather\n",
		},
	}

	for _, tt := range tests {
		// Test both with and without streaming
		streamValues := []bool{false, true}
		for _, stream := range streamValues {
			streamSuffix := ""
			if stream {
				streamSuffix = " (streaming)"
			}
			t.Run(tt.name+streamSuffix, func(t *testing.T) {
				req := tt.request
				req.Stream = &stream
				w := createRequest(t, s.ChatHandler, req)

				if tt.expectDebug {
					if w.Code != http.StatusOK {
						t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
					}

					var response api.ChatResponse
					if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
						t.Fatalf("failed to unmarshal response: %v", err)
					}

					if response.Model != tt.request.Model {
						t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
					}

					if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
						t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
					}

					if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
						t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
					}
				} else {
					// When debug is disabled, it should attempt normal processing
					if w.Code != http.StatusOK {
						t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
					}
				}
			})
		}
	}
}