llm: Support KV cache quantization with gpt-oss

With the new version of GGML in #12245, KV cache quantization
no longer causes a fallback to CPU.
Author:    Jesse Gross
Date:      2025-10-03 13:50:02 -07:00
Committer: Jesse Gross
Parent:    33801c1597
Commit:    19e6796eac


@@ -870,11 +870,6 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
 		return true
 	}
-	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
-		// gpt-oss uses attention with sinks which does not support quantized cache types
-		slog.Warn("model only supports non-quantized cache types", "model", arch)
-		return false
-	}
 	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }
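
With the architecture-specific branch removed, the validation reduces to a plain allow-list check that is the same for every model. The standalone Go sketch below illustrates that behavior; the helper name, the main function, and the f16/empty-string early return (inferred from the unchanged context above the hunk) are illustrative, not the repository's actual code.

package main

import (
	"fmt"
	"slices"
)

// supportsKVCacheType mirrors the check left after this commit: the f16
// default is always accepted, and anything else must be one of the
// supported quantized cache types, regardless of architecture.
func supportsKVCacheType(cacheType string) bool {
	if cacheType == "" || cacheType == "f16" {
		return true
	}
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

func main() {
	for _, ct := range []string{"", "f16", "q8_0", "q4_0", "q5_1"} {
		fmt.Printf("%q supported: %v\n", ct, supportsKVCacheType(ct))
	}
}

In practice, a quantized K/V cache can now be requested for gpt-oss the same way as for other models, e.g. by setting OLLAMA_KV_CACHE_TYPE to q8_0 or q4_0, without being rejected and warned about by the removed branch.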