Mirror of https://github.com/jmorganca/ollama, synced 2025-10-06 00:32:49 +02:00
llm: Support KV cache quantization with gpt-oss
With the new version of GGML in #12245, KV cache quantization no longer causes a fallback to CPU.
@@ -870,11 +870,6 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
|
|
||||||
// gpt-oss uses attention with sinks which does not support quantized cache types
|
|
||||||
slog.Warn("model only supports non-quantized cache types", "model", arch)
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
|
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
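For illustration, a minimal runnable sketch of the check that remains after this commit. supportsKVCacheType is a hypothetical free-function stand-in for the (f GGML) SupportsKVCacheType method named in the hunk header; the real method presumably handles non-quantized types such as f16 earlier in its body, which this hunk does not show.

package main

import (
	"fmt"
	"slices"
)

// supportsKVCacheType keeps only the whitelist visible in the diff:
// with the gpt-oss special case deleted, every architecture accepts
// the quantized cache types q8_0 and q4_0.
// (Hypothetical stand-in; non-quantized types like f16 are assumed to
// be accepted earlier in the real method, outside this hunk.)
func supportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

func main() {
	for _, ct := range []string{"q8_0", "q4_0", "q5_1"} {
		fmt.Printf("%s supported as quantized KV cache: %v\n", ct, supportsKVCacheType(ct))
	}
}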