fix(benchmark): parse llama-bench output with variable column count

KV cache quantization adds type_k/type_v columns to llama-bench output, which shifts the test and t/s columns to different indices. Parse them from the end of each row instead of from hardcoded positions. Also change the KV suffix separator in log filenames from underscore to dash, because an underscore separator is ambiguous with type names such as q8_0 when matched by regex. Add a 5-phase optimization guide, an optimization log for tracking results, and research docs on llama.cpp and inference-landscape optimizations.
2026-03-27 14:54:19 +01:00
parent 7531f6fa74
commit f92b710492
7 changed files with 2148 additions and 52 deletions
--- a/scripts/benchmark/run-baseline.sh
+++ b/scripts/benchmark/run-baseline.sh
@@ -217,7 +217,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
            KV_SUFFIX=""
            if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
                KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
-                KV_SUFFIX="__kv_${KV_K}_${KV_V}"
+                KV_SUFFIX="__kv_${KV_K}-${KV_V}"
            fi

            # Standard test
@@ -292,8 +292,8 @@ for logfile in sorted(result_dir.glob("*.log")):
    if "FAILED" in content:
        continue

-    # Extract KV cache type from filename (__kv_q8_0_q8_0)
-    kv_match = re.search(r'__kv_([a-z0-9_]+)_([a-z0-9_]+)\.log$', logfile.name)
+    # Extract KV cache type from filename (__kv_q8_0-q8_0)
+    kv_match = re.search(r'__kv_(.+)-(.+)\.log$', logfile.name)
    kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"

    for line in content.splitlines():
@@ -304,12 +304,15 @@ for logfile in sorted(result_dir.glob("*.log")):
            continue

        parts = [p.strip() for p in line.split("|")]
-        if len(parts) < 10:
+        # Drop empty cells (e.g. from leading/trailing pipes) and any "---" separator cells
+        data = [p for p in parts if p and "---" not in p]
+        if len(data) < 6:
            continue

        try:
-            test_type = parts[8].strip() if len(parts) > 8 else ""
-            ts_raw = parts[9].strip() if len(parts) > 9 else ""
+            # test and t/s are always the last two columns
+            test_type = data[-2]
+            ts_raw = data[-1]
            if not test_type or not ts_raw:
                continue

@@ -319,9 +322,9 @@ for logfile in sorted(result_dir.glob("*.log")):

            results.append({
                "file": logfile.name,
-                "model": parts[1].strip(),
-                "size": parts[2].strip(),
-                "backend": parts[4].strip(),
+                "model": data[0],
+                "size": data[1],
+                "backend": data[3],
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "kv_cache": kv_type,
--- a/scripts/benchmark/run-suite.sh
+++ b/scripts/benchmark/run-suite.sh
@@ -210,7 +210,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
            KV_SUFFIX=""
            if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
                KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
-                KV_SUFFIX="__kv_${KV_K}_${KV_V}"
+                KV_SUFFIX="__kv_${KV_K}-${KV_V}"
            fi

            # Standard test
@@ -267,8 +267,8 @@ for logfile in sorted(result_dir.glob("*.log")):
    if "FAILED" in content:
        continue

-    # Extract KV cache type from filename (__kv_q8_0_q8_0)
-    kv_match = re.search(r'__kv_([a-z0-9_]+)_([a-z0-9_]+)\.log$', logfile.name)
+    # Extract KV cache type from filename (__kv_q8_0-q8_0)
+    kv_match = re.search(r'__kv_(.+)-(.+)\.log$', logfile.name)
    kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"

    for line in content.splitlines():
@@ -278,19 +278,22 @@ for logfile in sorted(result_dir.glob("*.log")):
        if "---" in line:
            continue
        parts = [p.strip() for p in line.split("|")]
-        if len(parts) < 10:
+        # Drop empty cells (e.g. from leading/trailing pipes) and any "---" separator cells
+        data = [p for p in parts if p and "---" not in p]
+        if len(data) < 6:
            continue
        try:
-            test_type = parts[8].strip()
-            ts_raw = parts[9].strip()
+            # test and t/s are always the last two columns
+            test_type = data[-2]
+            ts_raw = data[-1]
            ts_match = re.match(r'([\d.]+)', ts_raw)
            if not ts_match:
                continue
            results.append({
                "file": logfile.name,
-                "model": parts[1].strip(),
-                "size": parts[2].strip(),
-                "backend": parts[4].strip(),
+                "model": data[0],
+                "size": data[1],
+                "backend": data[3],
                "test": test_type,
                "tokens_per_sec": float(ts_match.group(1)),
                "kv_cache": kv_type,