Mirror of https://github.com/jmorganca/ollama
llm: Support KV cache quantization with gpt-oss
With the new version of GGML in #12245, enabling KV cache quantization with gpt-oss no longer causes a fallback to CPU.
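For context, a minimal standalone sketch of the fallback pattern this check enables: honor a requested quantized cache type only when the model reports support for it, otherwise fall back to the unquantized default. The helper resolveKVCacheType and its flow are assumed for illustration and are not taken from this commit.

package main

import "fmt"

// resolveKVCacheType is a hypothetical helper: it returns the requested
// cache type when supported, and falls back to f16 otherwise.
func resolveKVCacheType(requested string, supports func(string) bool) string {
	if requested == "" {
		return "f16" // default: unquantized cache
	}
	if !supports(requested) {
		fmt.Printf("cache type %q not supported, falling back to f16\n", requested)
		return "f16"
	}
	return requested
}

func main() {
	supports := func(t string) bool { return t == "q8_0" || t == "q4_0" }
	fmt.Println(resolveKVCacheType("q8_0", supports)) // q8_0
	fmt.Println(resolveKVCacheType("q5_1", supports)) // falls back to f16
}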
@@ -870,11 +870,6 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
 		return true
 	}
 
-	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
-		// gpt-oss uses attention with sinks which does not support quantized cache types
-		slog.Warn("model only supports non-quantized cache types", "model", arch)
-		return false
-	}
 	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }
 
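With the gpt-oss guard removed, whether a quantized KV cache is supported no longer depends on the model architecture, only on the cache type itself. A standalone sketch of the resulting check follows; the real method hangs off GGML and includes context lines outside this hunk (such as handling of the default f16 type), so this is an illustration, not the full function.

package main

import (
	"fmt"
	"slices"
)

// supportsKVCacheType mirrors the post-change logic from the hunk above:
// any model may use a quantized KV cache as long as the type is one of
// the supported quantizations.
func supportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

func main() {
	for _, t := range []string{"q8_0", "q4_0", "q5_1"} {
		fmt.Printf("%s supported: %v\n", t, supportsKVCacheType(t))
	}
}

In practice the requested cache type is commonly supplied via Ollama's OLLAMA_KV_CACHE_TYPE environment variable; after this change, gpt-oss models can use q8_0 or q4_0 there without the previous fallback to CPU.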