fix(benchmark): parse llama-bench output with variable column count
KV cache quantization adds type_k/type_v columns to llama-bench output, which shifts the test and t/s columns to different indices. Parse them from the end of each row instead of from hardcoded positions. Also change the KV suffix separator in log filenames from underscore to dash, to avoid regex ambiguity with type names that themselves contain underscores, such as q8_0. Add a 5-phase optimization guide, an optimization log for tracking results, and research docs on llama.cpp and inference-landscape optimizations.
This commit is contained in:
@@ -217,7 +217,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||
KV_SUFFIX=""
|
||||
if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
|
||||
KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
|
||||
KV_SUFFIX="__kv_${KV_K}_${KV_V}"
|
||||
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
|
||||
fi
|
||||
|
||||
# Standard test
|
||||
@@ -292,8 +292,8 @@ for logfile in sorted(result_dir.glob("*.log")):
|
||||
if "FAILED" in content:
|
||||
continue
|
||||
|
||||
# Extract KV cache type from filename (__kv_q8_0_q8_0)
|
||||
kv_match = re.search(r'__kv_([a-z0-9_]+)_([a-z0-9_]+)\.log$', logfile.name)
|
||||
# Extract KV cache type from filename (__kv_q8_0-q8_0)
|
||||
kv_match = re.search(r'__kv_(.+)-(.+)\.log$', logfile.name)
|
||||
kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
|
||||
|
||||
for line in content.splitlines():
|
||||
@@ -304,12 +304,15 @@ for logfile in sorted(result_dir.glob("*.log")):
|
||||
continue
|
||||
|
||||
parts = [p.strip() for p in line.split("|")]
|
||||
if len(parts) < 10:
|
||||
# Filter out empty parts from leading/trailing pipes
|
||||
data = [p for p in parts if p and "---" not in p]
|
||||
if len(data) < 6:
|
||||
continue
|
||||
|
||||
try:
|
||||
test_type = parts[8].strip() if len(parts) > 8 else ""
|
||||
ts_raw = parts[9].strip() if len(parts) > 9 else ""
|
||||
# test and t/s are always the last two columns
|
||||
test_type = data[-2]
|
||||
ts_raw = data[-1]
|
||||
if not test_type or not ts_raw:
|
||||
continue
|
||||
|
||||
@@ -319,9 +322,9 @@ for logfile in sorted(result_dir.glob("*.log")):
|
||||
|
||||
results.append({
|
||||
"file": logfile.name,
|
||||
"model": parts[1].strip(),
|
||||
"size": parts[2].strip(),
|
||||
"backend": parts[4].strip(),
|
||||
"model": data[0],
|
||||
"size": data[1],
|
||||
"backend": data[3],
|
||||
"test": test_type,
|
||||
"tokens_per_sec": float(ts_match.group(1)),
|
||||
"kv_cache": kv_type,
|
||||
|
||||
@@ -210,7 +210,7 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||
KV_SUFFIX=""
|
||||
if [[ "$KV_K" != "f16" || "$KV_V" != "f16" ]]; then
|
||||
KV_ARGS+=(-ctk "$KV_K" -ctv "$KV_V")
|
||||
KV_SUFFIX="__kv_${KV_K}_${KV_V}"
|
||||
KV_SUFFIX="__kv_${KV_K}-${KV_V}"
|
||||
fi
|
||||
|
||||
# Standard test
|
||||
@@ -267,8 +267,8 @@ for logfile in sorted(result_dir.glob("*.log")):
|
||||
if "FAILED" in content:
|
||||
continue
|
||||
|
||||
# Extract KV cache type from filename (__kv_q8_0_q8_0)
|
||||
kv_match = re.search(r'__kv_([a-z0-9_]+)_([a-z0-9_]+)\.log$', logfile.name)
|
||||
# Extract KV cache type from filename (__kv_q8_0-q8_0)
|
||||
kv_match = re.search(r'__kv_(.+)-(.+)\.log$', logfile.name)
|
||||
kv_type = f"{kv_match.group(1)}/{kv_match.group(2)}" if kv_match else "f16/f16"
|
||||
|
||||
for line in content.splitlines():
|
||||
@@ -278,19 +278,22 @@ for logfile in sorted(result_dir.glob("*.log")):
|
||||
if "---" in line:
|
||||
continue
|
||||
parts = [p.strip() for p in line.split("|")]
|
||||
if len(parts) < 10:
|
||||
# Filter out empty parts from leading/trailing pipes
|
||||
data = [p for p in parts if p and "---" not in p]
|
||||
if len(data) < 6:
|
||||
continue
|
||||
try:
|
||||
test_type = parts[8].strip()
|
||||
ts_raw = parts[9].strip()
|
||||
# test and t/s are always the last two columns
|
||||
test_type = data[-2]
|
||||
ts_raw = data[-1]
|
||||
ts_match = re.match(r'([\d.]+)', ts_raw)
|
||||
if not ts_match:
|
||||
continue
|
||||
results.append({
|
||||
"file": logfile.name,
|
||||
"model": parts[1].strip(),
|
||||
"size": parts[2].strip(),
|
||||
"backend": parts[4].strip(),
|
||||
"model": data[0],
|
||||
"size": data[1],
|
||||
"backend": data[3],
|
||||
"test": test_type,
|
||||
"tokens_per_sec": float(ts_match.group(1)),
|
||||
"kv_cache": kv_type,
|
||||
|
||||
Reference in New Issue
Block a user