Mirror of https://github.com/jmorganca/ollama, synced 2025-10-06 00:32:49 +02:00
llm: Support KV cache quantization with gpt-oss
With the new version of GGML in #12245, KV cache quantization no longer causes a fallback to CPU.
@@ -870,11 +870,6 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
|
|
||||||
// gpt-oss uses attention with sinks which does not support quantized cache types
|
|
||||||
slog.Warn("model only supports non-quantized cache types", "model", arch)
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
|
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
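For illustration, a minimal runnable sketch of the check that remains after this commit. supportsKVCacheType is a hypothetical free-function stand-in for the (f GGML) SupportsKVCacheType method named in the hunk header; the real method presumably handles non-quantized types such as f16 earlier in its body, which this hunk does not show.

package main

import (
	"fmt"
	"slices"
)

// supportsKVCacheType keeps only the whitelist visible in the diff:
// with the gpt-oss special case deleted, every architecture accepts
// the quantized cache types q8_0 and q4_0.
// (Hypothetical stand-in; non-quantized types like f16 are assumed to
// be accepted earlier in the real method, outside this hunk.)
func supportsKVCacheType(cacheType string) bool {
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

func main() {
	for _, ct := range []string{"q8_0", "q4_0", "q5_1"} {
		fmt.Printf("%s supported as quantized KV cache: %v\n", ct, supportsKVCacheType(ct))
	}
}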