feat(serve): upgrade daily driver to qwen3.6-35b-a3b q6_k_xl
Switch `make serve` default to Qwen3.6 UD Q6_K_XL (32 GB, hybrid DeltaNet, near-lossless) and register it in the model catalog. Add --jinja to the llama-server launcher so tool/function calling works — without it, llama-server does not honor the tool definitions that clients send with their requests.
This commit is contained in:
4
Makefile
4
Makefile
@@ -39,8 +39,8 @@ benchmark-compare: ## Compare two benchmark runs (usage: make benchmark-compare
|
|||||||
@bash bin/benchmark compare $(BEFORE) $(AFTER)
|
@bash bin/benchmark compare $(BEFORE) $(AFTER)
|
||||||
|
|
||||||
# --- Serve ---
|
# --- Serve ---
|
||||||
serve: ## Launch APEX I-Compact daily driver (2 slots, 256K ctx)
|
serve: ## Launch Qwen3.6-35B-A3B UD-Q6_K_XL daily driver (2 slots, 256K ctx)
|
||||||
@bash bin/serve -m Qwen3.5-35B-A3B-Claude-Distilled-APEX-I-Compact.gguf --parallel 2 --ctx 262144 $(ARGS)
|
@bash bin/serve -m Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf --parallel 2 --ctx 262144 $(ARGS)
|
||||||
|
|
||||||
serve-custom: ## Launch llama-server with custom model (ARGS="-m MODEL.gguf")
|
serve-custom: ## Launch llama-server with custom model (ARGS="-m MODEL.gguf")
|
||||||
@bash bin/serve $(ARGS)
|
@bash bin/serve $(ARGS)
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ qwen3.5-27b-opus-distill|Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distille
|
|||||||
qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf|21|moe|MoE 35B, 3B active, Unsloth dynamic XL
|
qwen3.5-35b-a3b-q4|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf|21|moe|MoE 35B, 3B active, Unsloth dynamic XL
|
||||||
qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|MoE 35B Q8, near-full precision
|
qwen3.5-35b-a3b-q8|unsloth/Qwen3.5-35B-A3B-GGUF|Qwen3.5-35B-A3B-Q8_0.gguf|37|moe|MoE 35B Q8, near-full precision
|
||||||
qwen3.5-35b-a3b-apex-compact|mudler/Qwen3.5-35B-A3B-Claude-Distilled-APEX-GGUF|Qwen3.5-35B-A3B-Claude-Distilled-APEX-I-Compact.gguf|17|moe|MoE 35B Claude-distilled APEX, I-Compact quant
|
qwen3.5-35b-a3b-apex-compact|mudler/Qwen3.5-35B-A3B-Claude-Distilled-APEX-GGUF|Qwen3.5-35B-A3B-Claude-Distilled-APEX-I-Compact.gguf|17|moe|MoE 35B Claude-distilled APEX, I-Compact quant
|
||||||
|
qwen3.6-35b-a3b-q6xl|unsloth/Qwen3.6-35B-A3B-GGUF|Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf|32|moe|Qwen3.6 MoE 35B/3B active, UD Q6 XL (near-lossless, hybrid DeltaNet)
|
||||||
nemotron-cascade2-q8|bartowski/nvidia_Nemotron-Cascade-2-30B-A3B-GGUF|nvidia_Nemotron-Cascade-2-30B-A3B-Q8_0.gguf|31|moe|Nemotron Cascade 2, Mamba-2 hybrid (replaces Nano)
|
nemotron-cascade2-q8|bartowski/nvidia_Nemotron-Cascade-2-30B-A3B-GGUF|nvidia_Nemotron-Cascade-2-30B-A3B-Q8_0.gguf|31|moe|Nemotron Cascade 2, Mamba-2 hybrid (replaces Nano)
|
||||||
|
|
||||||
# ── Coding models ─────────────────────────────────────────
|
# ── Coding models ─────────────────────────────────────────
|
||||||
|
|||||||
@@ -106,6 +106,7 @@ SERVER_ARGS=(
|
|||||||
-ngl 99 # Full GPU offload
|
-ngl 99 # Full GPU offload
|
||||||
--no-mmap # Direct load, no mmap overhead
|
--no-mmap # Direct load, no mmap overhead
|
||||||
-fa on # Flash attention
|
-fa on # Flash attention
|
||||||
|
--jinja # Required for tool calling (clients ignored without it)
|
||||||
-m "$TOOLBOX_MODEL_PATH"
|
-m "$TOOLBOX_MODEL_PATH"
|
||||||
-c "$CTX_SIZE" # Context size
|
-c "$CTX_SIZE" # Context size
|
||||||
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
--cache-type-k q4_0 # KV cache quantization (fastest on Vulkan)
|
||||||
|
|||||||
Reference in New Issue
Block a user