fix(benchmark): parse llama-bench output with variable column count

KV cache quantization adds type_k/type_v columns to llama-bench output,
shifting the test and t/s columns to different indices. Parse those two
columns from the end of the row instead of from hardcoded positions. Also
change the KV suffix separator from underscore to dash to avoid regex
ambiguity with type names like q8_0, which themselves contain underscores.

Add a 5-phase optimization guide, an optimization log for tracking results,
and research docs on llama.cpp and inference-landscape optimizations.
This commit is contained in:
Felipe Cardoso
2026-03-27 14:54:19 +01:00
parent 7531f6fa74
commit f92b710492
7 changed files with 2148 additions and 52 deletions

View File

@@ -217,7 +217,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
KV_SUFFIX=""
if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
KV_SUFFIX="__kv_${KV_K}_${KV_V}"
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
fi
# Standard test
@@ -292,8 +292,8 @@ for logfile in sorted(result_dir.glob("*.log")):
if "FAILED" in content:
continue
# Extract KV cache type from filename (__kv_q8_0_q8_0)
kv_match = re.search(r'__kv_([a-z0-9_]+)_([a-z0-9_]+)\.log$', logfile.name)
# Extract KV cache type from filename (__kv_q8_0-q8_0)
kv_match = re.search(r'__kv_(.+)-(.+)\.log$', logfile.name)
kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
for line in content.splitlines():
@@ -304,12 +304,15 @@ for logfile in sorted(result_dir.glob("*.log")):
continue
parts = [p.strip() for p in line.split("|")]
if len(parts) < 10:
# Drop empty cells (from leading/trailing pipes) and any cell containing "---" (separator rows)
data = [p for p in parts if p and "---" not in p]
if len(data) < 6:
continue
try:
test_type = parts[8].strip() if len(parts) > 8 else ""
ts_raw = parts[9].strip() if len(parts) > 9 else ""
# test and t/s are always the last two columns
test_type = data[-2]
ts_raw = data[-1]
if not test_type or not ts_raw:
continue
@@ -319,9 +322,9 @@ for logfile in sorted(result_dir.glob("*.log")):
results.append({
"file": logfile.name,
"model": parts[1].strip(),
"size": parts[2].strip(),
"backend": parts[4].strip(),
"model": data[0],
"size": data[1],
"backend": data[3],
"test": test_type,
"tokens_per_sec": float(ts_match.group(1)),
"kv_cache": kv_type,