
Merge pull request #2339 from ramalama-labs/metrics

Add benchmark metrics persistence
Authored by Ian Eaves on 2026-01-25 02:24:52 -06:00 and committed by GitHub.
21 changed files with 759 additions and 56 deletions

View File

@@ -51,6 +51,9 @@ process to be launched inside of the container. If an environment variable is
specified without a value, the container engine checks the host environment
for a value and sets the variable only if it is set on the host.
#### **--format**
Set the output format of the benchmark results. Options include json and table (default: table).
#### **--help**, **-h**
show this help message and exit

View File

@@ -0,0 +1,46 @@
% ramalama-benchmarks 1
## NAME
ramalama\-benchmarks - view and interact with historical benchmark results
## SYNOPSIS
**ramalama benchmarks** [*options*] *command* [*args*...]
## DESCRIPTION
View and interact with historical benchmark results.
Results are stored as newline-delimited JSON (JSONL) in a `benchmarks.jsonl` file.
The storage folder is shown in `ramalama benchmarks --help` and can be
overridden via `ramalama.benchmarks.storage_folder` in `ramalama.conf`.
## OPTIONS
#### **--help**, **-h**
show this help message and exit
## COMMANDS
#### **list**
list benchmark results
## LIST OPTIONS
#### **--limit**=LIMIT
limit number of results to display
#### **--offset**=OFFSET
offset for pagination (default: 0)
#### **--format**={table,json}
output format (table or json) (default: table)
## EXAMPLES
```
ramalama benchmarks list
```
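Because results are stored as plain JSONL, they can also be inspected outside of RamaLama. A minimal Python sketch (the path assumes the default storage folder for a rootless install; check `ramalama benchmarks --help` for the actual location if `storage_folder` is overridden):
```
import json
from pathlib import Path

# Assumed default location; adjust to the folder reported by `ramalama benchmarks --help`.
store = Path.home() / ".local/share/ramalama/benchmarks/benchmarks.jsonl"

for line in store.read_text(encoding="utf-8").splitlines():
    if line.strip():
        result = json.loads(line).get("result", {})
        print(result.get("model_filename"), result.get("avg_ts"))
```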
## SEE ALSO
**[ramalama(1)](ramalama.1.md)**, **[ramalama-bench(1)](ramalama-bench.1.md)**, **[ramalama.conf(5)](ramalama.conf.5.md)**
## HISTORY
Jan 2026, Originally compiled by Ian Eaves <ian@ramalama.com>

View File

@@ -137,6 +137,7 @@ The default can be overridden in the ramalama.conf file.
| Command | Description |
| ------------------------------------------------- | ---------------------------------------------------------- |
| [ramalama-bench(1)](ramalama-bench.1.md) |benchmark specified AI Model|
| [ramalama-benchmarks(1)](ramalama-benchmarks.1.md)|view and interact with historical benchmark results|
| [ramalama-chat(1)](ramalama-chat.1.md) |OpenAI chat with the specified REST API URL|
| [ramalama-containers(1)](ramalama-containers.1.md)|list all RamaLama containers|
| [ramalama-convert(1)](ramalama-convert.1.md) |convert AI Models from local storage to OCI Image|

View File

@@ -221,10 +221,22 @@
[ramalama.user]
#
# Suppress the interactive prompt when running on macOS with a Podman VM
# that doesn't support GPU acceleration (e.g., applehv provider).
# When set to true, RamaLama will automatically proceed without GPU support
# instead of asking for confirmation.
# Can also be set via the `RAMALAMA_USER__NO_MISSING_GPU_PROMPT` environment variable.
#
[ramalama.benchmarks]
#storage_folder = <default store>/benchmarks
#
# Manually specify where to save benchmark results.
# By default, results are stored under the default model store directory
# in benchmarks/benchmarks.jsonl.
# Changing `ramalama.store` does not update this; set storage_folder explicitly.
[ramalama.user]
#no_missing_gpu_prompt = false

View File

@@ -267,6 +267,16 @@ Configuration settings for the openai hosted provider
**api_key**=""
Provider-specific API key used when invoking OpenAI-hosted transports. Overrides `RAMALAMA_API_KEY` when set.
## RAMALAMA.BENCHMARKS TABLE
The ramalama.benchmarks table contains benchmark related settings.
`[[ramalama.benchmarks]]`
**storage_folder**="<default store>/benchmarks"
Manually specify where to save benchmark results.
By default, this will be stored in the default model store directory under `benchmarks/`.
Changing `ramalama.store` does not update this; set `ramalama.benchmarks.storage_folder` explicitly if needed.
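To confirm which folder is in effect after all configuration layers are applied, the value can be read back from the loaded configuration object added in this change; a brief Python sketch:
```
from ramalama.config import CONFIG

# Effective storage folder after config-file and environment overrides are layered.
print(CONFIG.benchmarks.storage_folder)
# Records are appended to <storage_folder>/benchmarks.jsonl.
```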
## RAMALAMA.USER TABLE
The ramalama.user table contains user preference settings.

View File

@@ -117,7 +117,22 @@ commands:
inference_engine:
name: "llama-bench"
binary: "llama-bench"
options: *bench_perplexity_options
options:
- name: "--model"
description: "The AI model to run"
value: "{{ model.model_path }}"
- name: "-ngl"
description: "Number of layers to offload to the GPU if available"
value: "{{ 999 if args.ngl < 0 else args.ngl }}"
- name: "-ngld"
description: "Number of draft-model layers to offload to the GPU if available"
value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
- name: "--threads"
description: "Number of Threads to use during generation"
value: "{{ args.threads }}"
- name: "-o"
description: "Output format printed to stdout"
value: "json"
- name: rag
inference_engine:
name: "rag"

View File

@@ -0,0 +1,12 @@
class MissingStorageFolderError(Exception):
    def __init__(self):
        message = """
        No valid benchmarks storage folder could be determined.
        Set an explicit path via:
        RAMALAMA__BENCHMARKS_STORAGE_FOLDER=/absolute/path/to/benchmarks
        If this seems wrong for your setup, report it at:
        https://www.github.com/containers/ramalama/issues
        """
        super().__init__(message)

View File

@@ -0,0 +1,50 @@
import json
import logging
from dataclasses import asdict
from functools import cached_property
from pathlib import Path
from ramalama.benchmarks.errors import MissingStorageFolderError
from ramalama.benchmarks.schemas import BenchmarkRecord, DeviceInfoV1, get_benchmark_record
from ramalama.benchmarks.utilities import parse_jsonl
from ramalama.config import CONFIG
from ramalama.log_levels import LogLevel
logger = logging.getLogger("ramalama.benchmarks")
logger.setLevel(CONFIG.log_level or LogLevel.WARNING)
SCHEMA_VERSION = 1
BENCHMARKS_FILENAME = "benchmarks.jsonl"
class BenchmarksManager:
    def __init__(self, storage_folder: str | Path | None):
        if storage_folder is None:
            raise MissingStorageFolderError

        self.storage_folder = Path(storage_folder)
        self.storage_file = self.storage_folder / BENCHMARKS_FILENAME
        self.storage_file.parent.mkdir(parents=True, exist_ok=True)

    @cached_property
    def device_info(self) -> DeviceInfoV1:
        return DeviceInfoV1.current_device_info()

    def save(self, results: list[BenchmarkRecord] | BenchmarkRecord):
        if not isinstance(results, list):
            results = [results]
        if len(results) == 0:
            return

        with self.storage_file.open("a", encoding="utf-8") as handle:
            for record in results:
                handle.write(json.dumps(asdict(record), ensure_ascii=True))
                handle.write("\n")

    def list(self) -> list[BenchmarkRecord]:
        """List benchmark results from JSONL storage."""
        if not self.storage_file.exists():
            return []

        content = self.storage_file.read_text(encoding="utf-8")
        return [get_benchmark_record(result) for result in parse_jsonl(content)]
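As a usage sketch (the storage path below is illustrative; in RamaLama it comes from `CONFIG.benchmarks.storage_folder`), the manager appends records to `benchmarks.jsonl` and reads them back in insertion order:
```
from ramalama.benchmarks.manager import BenchmarksManager
from ramalama.benchmarks.schemas import BenchmarkRecordV1, LlamaBenchResultV1, TestConfigurationV1

# Illustrative storage folder; benchmarks.jsonl is created inside it.
mgr = BenchmarksManager("/tmp/ramalama-benchmarks")

record = BenchmarkRecordV1(
    configuration=TestConfigurationV1(inference_engine="llama.cpp"),
    result=LlamaBenchResultV1(model_filename="model.gguf", avg_ts=42.0),
)
mgr.save(record)       # appends one JSON line
print(mgr.list())      # re-hydrated BenchmarkRecordV1 instances
```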

View File

@@ -0,0 +1,211 @@
import platform
import socket
from dataclasses import dataclass, field, fields
from datetime import datetime, timezone
from functools import lru_cache
from typing import Any, ClassVar, Literal, TypeVar, overload
from ramalama.common import get_accel
VersionerT = TypeVar("VersionerT")
@dataclass
class DeviceInfo:
    pass


@dataclass
class DeviceInfoV1(DeviceInfo):
    hostname: str
    operating_system: str
    cpu_info: str
    accel: str

    version: ClassVar[Literal["v1"]] = "v1"

    @classmethod
    @lru_cache(maxsize=1)
    def current_device_info(cls) -> "DeviceInfoV1":
        return cls(
            hostname=socket.gethostname(),
            operating_system=f"{platform.system()} {platform.release()}",
            cpu_info=platform.processor() or platform.machine(),
            accel=get_accel(),
        )


@dataclass
class TestConfiguration:
    pass


@dataclass
class TestConfigurationV1(TestConfiguration):
    """Container configuration metadata for a benchmark run."""

    container_image: str = ""
    container_runtime: str = ""
    inference_engine: str = ""
    version: Literal["v1"] = "v1"
    runtime_args: list[str] | None = None


@dataclass
class LlamaBenchResult:
    pass


@dataclass
class LlamaBenchResultV1(LlamaBenchResult):
    version: Literal["v1"] = "v1"
    build_commit: str | None = None
    build_number: int | None = None
    backends: str | None = None
    cpu_info: str | None = None
    gpu_info: str | None = None
    model_filename: str | None = None
    model_type: str | None = None
    model_size: int | None = None
    model_n_params: int | None = None
    n_batch: int | None = None
    n_ubatch: int | None = None
    n_threads: int | None = None
    cpu_mask: str | None = None
    cpu_strict: int | None = None
    poll: int | None = None
    type_k: str | None = None
    type_v: str | None = None
    n_gpu_layers: int | None = None
    n_cpu_moe: int | None = None
    split_mode: str | None = None
    main_gpu: int | None = None
    no_kv_offload: int | None = None
    flash_attn: int | None = None
    devices: str | None = None
    tensor_split: str | None = None
    tensor_buft_overrides: str | None = None
    use_mmap: int | None = None
    embeddings: int | None = None
    no_op_offload: int | None = None
    no_host: int | None = None
    use_direct_io: int | None = None
    n_prompt: int | None = None
    n_gen: int | None = None
    n_depth: int | None = None
    test_time: str | None = None
    avg_ns: int | None = None
    stddev_ns: int | None = None
    avg_ts: float | None = None
    stddev_ts: float | None = None
    samples_ns: str | None = None  # JSON array stored as string
    samples_ts: str | None = None  # JSON array stored as string

    @classmethod
    def from_payload(cls, payload: dict) -> "LlamaBenchResult":
        """Build a result from a llama-bench JSON/JSONL object."""
        return cls(**{f.name: payload[f.name] for f in fields(cls) if f.name in payload})


@dataclass
class BenchmarkRecord:
    pass


@dataclass
class BenchmarkRecordV1(BenchmarkRecord):
    configuration: TestConfigurationV1
    result: LlamaBenchResultV1
    version: Literal["v1"] = "v1"
    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    device: DeviceInfoV1 = field(default_factory=DeviceInfoV1.current_device_info)

    @classmethod
    def from_payload(cls, payload: dict) -> "BenchmarkRecordV1":
        payload = {**payload}
        if 'device' in payload:
            payload['device'] = DeviceInfoV1(**payload.pop("device"))
        configuration = TestConfigurationV1(**payload.pop('configuration', {}))
        result = LlamaBenchResultV1(**payload.pop('result', {}))
        return cls(configuration=configuration, result=result, **payload)


@overload
def get_device_info(payload: dict) -> DeviceInfoV1: ...
@overload
def get_device_info(payload: dict, version: Literal["v1"]) -> DeviceInfoV1: ...


def get_device_info(payload: dict, version: Any = None) -> DeviceInfo:
    if version is None:
        version = payload.get('version', "v1")
    if version == "v1":
        return DeviceInfoV1(**payload)
    raise NotImplementedError(f"No supported DeviceInfo schemas for version {version}")


@overload
def get_test_config(payload: dict) -> TestConfigurationV1: ...
@overload
def get_test_config(payload: dict, version: Literal["v1"]) -> TestConfigurationV1: ...


def get_test_config(payload: dict, version: Any = None) -> TestConfiguration:
    if version is None:
        version = payload.get('version', "v1")
    if version == "v1":
        return TestConfigurationV1(**payload)
    raise NotImplementedError(f"No supported TestConfiguration schemas for version {version}")


@overload
def get_llama_bench_result(payload: dict) -> LlamaBenchResultV1: ...
@overload
def get_llama_bench_result(payload: dict, version: Literal["v1"]) -> LlamaBenchResultV1: ...


def get_llama_bench_result(payload: dict, version: Any = None) -> LlamaBenchResult:
    if version is None:
        version = payload.get('version', "v1")
    if version == "v1":
        return LlamaBenchResultV1(**payload)
    raise NotImplementedError(f"No supported LlamaBench schemas for version {version}")


@overload
def get_benchmark_record(payload: dict) -> BenchmarkRecord: ...
@overload
def get_benchmark_record(payload: dict, version: Literal["v1"]) -> BenchmarkRecordV1: ...


def get_benchmark_record(payload: dict, version: Any = None) -> BenchmarkRecord:
    if version is None:
        version = payload.get('version', "v1")
    if version == "v1":
        return BenchmarkRecordV1.from_payload(payload)
    raise NotImplementedError(f"No supported benchmark schemas for version {version}")


def normalize_benchmark_record(benchmark: BenchmarkRecord) -> BenchmarkRecordV1:
    if isinstance(benchmark, BenchmarkRecordV1):
        return benchmark
    raise NotImplementedError(f"Received an unsupported benchmark record type {type(benchmark)}")

View File

@@ -0,0 +1,104 @@
import json
from ramalama.benchmarks.schemas import (
    BenchmarkRecord,
    BenchmarkRecordV1,
    normalize_benchmark_record,
)


def parse_jsonl(content: str) -> list[dict]:
    """Parse newline-delimited JSON benchmark results."""
    results = []
    for line in content.strip().split("\n"):
        if not line.strip():
            continue
        results.append(json.loads(line))
    return results


def parse_json(content: str) -> list[dict]:
    """Parse JSON array or single object benchmark results."""
    data = json.loads(content)
    if not isinstance(data, list):
        data = [data]
    return data


def print_bench_results(records: list[BenchmarkRecord]):
    """Format benchmark results as a table for display."""
    if not records:
        return

    normalized_records: list[BenchmarkRecordV1] = [normalize_benchmark_record(result) for result in records]

    rows: list[dict[str, object | None]] = []
    for i, item in enumerate(normalized_records):
        result = item.result
        model = result.model_filename or ""
        params = f"{result.model_n_params / 1e9:.2f} B" if result.model_n_params else "-"
        backend = result.gpu_info or result.cpu_info or "CPU"
        ngl = str(result.n_gpu_layers) if result.n_gpu_layers else "-"
        threads = str(result.n_threads) if result.n_threads else "-"

        # Format test type
        if result.n_prompt and result.n_gen:
            test = f"pp{result.n_prompt}+tg{result.n_gen}"
        elif result.n_prompt:
            test = f"pp{result.n_prompt}"
        elif result.n_gen:
            test = f"tg{result.n_gen}"
        else:
            test = "-"

        # Format tokens/sec with stddev
        if result.avg_ts and result.stddev_ts:
            t_s = f"{result.avg_ts:.2f} ± {result.stddev_ts:.2f}"
        elif result.avg_ts:
            t_s = f"{result.avg_ts:.2f}"
        else:
            t_s = "-"

        rows.append(
            {
                "id": i,
                "model": model,
                "params": params,
                "backend": backend,
                "ngl": ngl,
                "threads": threads,
                "test": test,
                "t/s": t_s,
                "engine": item.configuration.container_runtime,
                "date": item.created_at,
            }
        )

    optional_fields = ["id", "engine", "date"]
    for field in optional_fields:
        if all(not row.get(field) for row in rows):
            for row in rows:
                row.pop(field, None)

    column_order = ["id", "model", "params", "backend", "ngl", "threads", "test", "t/s", "engine", "date"]
    headers = [column for column in column_order if column in rows[0]]

    col_widths: dict[str, int] = {}
    for header in headers:
        max_len = len(header)
        for row in rows:
            value = row.get(header)
            text = "-" if value in (None, "") else str(value)
            max_len = max(max_len, len(text))
        col_widths[header] = max_len

    header_row = " | ".join(header.ljust(col_widths[header]) for header in headers)
    print(f"| {header_row} |")
    print(f"| {'-' * len(header_row)} |")

    for row in rows:
        cells = []
        for header in headers:
            value = row.get(header)
            text = "-" if value in (None, "") else str(value)
            cells.append(text.ljust(col_widths[header]))
        print(f"| {' | '.join(cells)} |")

View File

@@ -7,11 +7,14 @@ import shlex
import subprocess
import sys
import urllib.error
from dataclasses import asdict
from datetime import datetime, timezone
from textwrap import dedent
from typing import Any, get_args
from urllib.parse import urlparse
from ramalama.benchmarks.manager import BenchmarksManager
# if autocomplete doesn't exist, just do nothing, don't break
try:
import argcomplete
@@ -24,6 +27,7 @@ except Exception:
import ramalama.chat as chat
from ramalama import engine
from ramalama.arg_types import DefaultArgsType
from ramalama.benchmarks.utilities import print_bench_results
from ramalama.chat_utils import default_prefix
from ramalama.cli_arg_normalization import normalize_pull_arg
from ramalama.command.factory import assemble_command
@@ -301,6 +305,7 @@ def configure_subcommands(parser):
subparsers = parser.add_subparsers(dest="subcommand")
subparsers.required = False
bench_parser(subparsers)
benchmarks_parser(subparsers)
chat_parser(subparsers)
containers_parser(subparsers)
convert_parser(subparsers)
@@ -324,6 +329,8 @@ def configure_subcommands(parser):
def post_parse_setup(args):
"""Perform additional setup after parsing arguments."""
if getattr(args, "subcommand", None) == "benchmark":
args.subcommand = "bench"
def map_https_to_transport(input: str) -> str:
if input.startswith("https://") or input.startswith("http://"):
@@ -510,10 +517,67 @@ def add_network_argument(parser, dflt: str | None = "none"):
def bench_parser(subparsers):
parser = subparsers.add_parser("bench", aliases=["benchmark"], help="benchmark specified AI Model")
runtime_options(parser, "bench")
parser.add_argument("MODEL", completer=local_models) # positional argument
parser.add_argument("MODEL", completer=local_models)
parser.add_argument(
"--format",
choices=["table", "json"],
default="table",
help="output format (table or json)",
)
parser.set_defaults(func=bench_cli)
def benchmarks_parser(subparsers):
storage_folder = CONFIG.benchmarks.storage_folder
epilog = f"Storage folder: {storage_folder}" if storage_folder else "Storage folder: not configured"
parser = subparsers.add_parser(
"benchmarks",
help="manage and view benchmark results",
epilog=epilog,
)
parser.set_defaults(func=lambda _: parser.print_help())
benchmarks_subparsers = parser.add_subparsers(dest="benchmarks_command", metavar="[command]")
list_parser = benchmarks_subparsers.add_parser("list", help="list benchmark results")
list_parser.add_argument(
"--limit",
type=int,
default=None,
help="limit number of results to display",
)
list_parser.add_argument(
"--offset",
type=int,
default=0,
help="offset for pagination",
)
list_parser.add_argument(
"--format",
choices=["table", "json"],
default="table",
help="output format (table or json)",
)
list_parser.set_defaults(func=benchmarks_list_cli)
def benchmarks_list_cli(args):
"""Display a list of benchmark results from storage."""
bench_manager = BenchmarksManager(CONFIG.benchmarks.storage_folder)
results = bench_manager.list()
if not results:
print("No benchmark results found")
return
if args.format == "json":
output = [asdict(item) for item in results]
print(json.dumps(output, indent=2, sort_keys=True))
else:
print_bench_results(results)
def containers_parser(subparsers):
parser = subparsers.add_parser("containers", aliases=["ps"], help="list all RamaLama containers")
parser.add_argument(

View File

@@ -2,13 +2,12 @@ import argparse
import os
from typing import Optional
from ramalama.common import check_metal, check_nvidia
from ramalama.common import check_metal, check_nvidia, get_accel_env_vars
from ramalama.console import should_colorize
from ramalama.transports.transport_factory import CLASS_MODEL_TYPES, New
class RamalamaArgsContext:
def __init__(self) -> None:
self.cache_reuse: Optional[int] = None
self.container: Optional[bool] = None
@@ -52,7 +51,6 @@ class RamalamaArgsContext:
class RamalamaRagGenArgsContext:
def __init__(self) -> None:
self.debug: bool | None = None
self.format: str | None = None
@@ -74,7 +72,6 @@ class RamalamaRagGenArgsContext:
class RamalamaRagArgsContext:
def __init__(self) -> None:
self.debug: bool | None = None
self.port: str | None = None
@@ -92,7 +89,6 @@ class RamalamaRagArgsContext:
class RamalamaModelContext:
def __init__(self, model: CLASS_MODEL_TYPES, is_container: bool, should_generate: bool, dry_run: bool):
self.model = model
self.is_container = is_container
@@ -128,7 +124,6 @@ class RamalamaModelContext:
class RamalamaHostContext:
def __init__(
self, is_container: bool, uses_nvidia: bool, uses_metal: bool, should_colorize: bool, rpc_nodes: Optional[str]
):
@@ -140,7 +135,6 @@ class RamalamaHostContext:
class RamalamaCommandContext:
def __init__(
self,
args: RamalamaArgsContext | RamalamaRagGenArgsContext | RamalamaRagArgsContext,
@@ -169,9 +163,13 @@ class RamalamaCommandContext:
model = cli_args.model
else:
model = None
skip_gpu_probe = should_generate or bool(get_accel_env_vars())
uses_nvidia = True if skip_gpu_probe else (check_nvidia() is None)
host = RamalamaHostContext(
is_container,
check_nvidia() is None,
uses_nvidia,
check_metal(argparse.Namespace(**{"container": is_container})),
should_colorize(),
os.getenv("RAMALAMA_LLAMACPP_RPC_NODES", None),

View File

@@ -19,7 +19,6 @@ def is_truthy(resolved_stmt: str) -> bool:
class CommandFactory:
def __init__(self, spec_files: dict[str, Path], schema_files: dict[str, Path]):
self.spec_files = spec_files
self.schema_files = schema_files

View File

@@ -2,6 +2,7 @@ import json
import os
import sys
from dataclasses import dataclass, field, fields
from functools import lru_cache
from pathlib import Path
from typing import Any, Literal, Mapping, TypeAlias
@@ -79,6 +80,7 @@ def get_default_engine() -> SUPPORTED_ENGINES | None:
return "docker" if available("docker") else None
@lru_cache(maxsize=1)
def get_default_store() -> str:
# Check if running as root (Unix only)
if hasattr(os, 'geteuid') and os.geteuid() == 0:
@@ -136,6 +138,22 @@ def coerce_to_bool(value: Any) -> bool:
raise ValueError(f"Cannot coerce {value!r} to bool")
def get_storage_folder(base_path: str | None = None):
if base_path is None:
base_path = get_default_store()
return os.path.join(base_path, "benchmarks")
@dataclass
class Benchmarks:
storage_folder: str = field(default_factory=get_storage_folder)
disable: bool = False
def __post_init__(self):
os.makedirs(self.storage_folder, exist_ok=True)
@dataclass
class UserConfig:
no_missing_gpu_prompt: bool = False
@@ -225,6 +243,7 @@ class HTTPClientConfig:
class BaseConfig:
api: str = "none"
api_key: str | None = None
benchmarks: Benchmarks = field(default_factory=Benchmarks)
cache_reuse: int = 256
carimage: str = "registry.access.redhat.com/ubi10-micro:latest"
container: bool = None # type: ignore
@@ -235,12 +254,15 @@ class BaseConfig:
dryrun: bool = False
engine: SUPPORTED_ENGINES | None = field(default_factory=get_default_engine)
env: list[str] = field(default_factory=list)
gguf_quantization_mode: GGUF_QUANTIZATION_MODES = DEFAULT_GGUF_QUANTIZATION_MODE
host: str = "0.0.0.0"
http_client: HTTPClientConfig = field(default_factory=HTTPClientConfig)
image: str = None # type: ignore
images: RamalamaImages = field(default_factory=RamalamaImages)
rag_image: str | None = None
rag_images: RamalamaRagImages = field(default_factory=RamalamaRagImages)
keep_groups: bool = False
log_level: LogLevel | None = None
max_tokens: int = 0
ngl: int = -1
ocr: bool = False
@@ -260,9 +282,6 @@ class BaseConfig:
transport: str = "ollama"
user: UserConfig = field(default_factory=UserConfig)
verify: bool = True
gguf_quantization_mode: GGUF_QUANTIZATION_MODES = DEFAULT_GGUF_QUANTIZATION_MODE
http_client: HTTPClientConfig = field(default_factory=HTTPClientConfig)
log_level: LogLevel | None = None
provider: ProviderConfig = field(default_factory=ProviderConfig)
def __post_init__(self):
@@ -297,30 +316,26 @@ class Config(LayeredMixin, BaseConfig):
def load_file_config() -> dict[str, Any]:
parser = TOMLParser()
config_path = os.getenv("RAMALAMA_CONFIG")
config_paths: list[str] = []
if config_path and os.path.exists(config_path):
config = parser.parse_file(config_path)
config = config.get("ramalama", {})
config['settings'] = {'config_files': [config_path]}
if log_level := config.get("log_level"):
config["log_level"] = coerce_log_level(log_level)
return config
if (config_path := os.getenv("RAMALAMA_CONFIG", None)) and os.path.exists(config_path):
config_paths.append(config_path)
else:
default_config_paths = [os.path.join(conf_dir, "ramalama.conf") for conf_dir in DEFAULT_CONFIG_DIRS]
config = {}
default_config_paths = [os.path.join(conf_dir, "ramalama.conf") for conf_dir in DEFAULT_CONFIG_DIRS]
for path in default_config_paths:
if os.path.exists(path):
config_paths.append(str(path))
config_paths = []
for path in default_config_paths:
if os.path.exists(path):
config_paths.append(str(path))
parser.parse_file(path)
path_str = f"{path}.d"
if os.path.isdir(path_str):
for conf_file in sorted(Path(path_str).glob("*.conf")):
config_paths.append(str(conf_file))
parser.parse_file(conf_file)
config = parser.data
path_str = f"{path}.d"
if os.path.isdir(path_str):
for conf_file in sorted(Path(path_str).glob("*.conf")):
config_paths.append(str(conf_file))
for file in config_paths:
parser.parse_file(file)
config: dict[str, Any] = parser.data
if config:
config = config.get('ramalama', {})
config['settings'] = {'config_files': config_paths}

View File

@@ -1,4 +1,5 @@
import copy
import json
import os
import platform
import random
@@ -8,14 +9,19 @@ import sys
import time
from abc import ABC, abstractmethod
from functools import cached_property
from typing import TYPE_CHECKING, Any, Dict, Optional
from typing import TYPE_CHECKING, Any, Optional
from ramalama.common import ContainerEntryPoint
if TYPE_CHECKING:
from ramalama.chat import ChatOperationalArgs
from datetime import datetime, timezone
import ramalama.chat as chat
from ramalama.benchmarks.manager import BenchmarksManager
from ramalama.benchmarks.schemas import BenchmarkRecord, BenchmarkRecordV1, get_benchmark_record
from ramalama.benchmarks.utilities import parse_json, print_bench_results
from ramalama.common import (
MNT_DIR,
MNT_FILE_DRAFT,
@@ -25,6 +31,7 @@ from ramalama.common import (
is_split_file_model,
perror,
populate_volume_from_image,
run_cmd,
set_accel_env_vars,
)
from ramalama.compose import Compose
@@ -463,7 +470,52 @@ class Transport(TransportBase):
def bench(self, args, cmd: list[str]):
set_accel_env_vars()
self.execute_command(cmd, args)
output_format = getattr(args, "format", "table")
if args.dryrun:
if args.container:
self.engine.dryrun()
else:
dry_run(cmd)
return
elif args.container:
self.setup_container(args)
self.setup_mounts(args)
self.engine.add([args.image] + cmd)
result = self.engine.run_process()
else:
result = run_cmd(cmd, encoding="utf-8")
try:
bench_results = parse_json(result.stdout)
except (json.JSONDecodeError, ValueError):
message = f"Could not parse benchmark output. Expected JSON but got:\n{result.stdout}"
raise ValueError(message)
base_payload: dict = {
"created_at": datetime.now(timezone.utc).isoformat(),
"configuration": {
"container_image": args.image,
"container_runtime": args.engine,
"inference_engine": args.runtime,
"runtime_args": cmd,
},
}
results: list[BenchmarkRecord] = list()
for bench_result in bench_results:
result_record: BenchmarkRecordV1 = get_benchmark_record({"result": bench_result, **base_payload}, "v1")
results.append(result_record)
if output_format == "json":
print(result.stdout)
else:
print_bench_results(results)
if not CONFIG.benchmarks.disable:
bench_manager = BenchmarksManager(CONFIG.benchmarks.storage_folder)
bench_manager.save(results)
def run(self, args, cmd: list[str]):
# The Run command will first launch a daemonized service
@@ -766,7 +818,7 @@ class Transport(TransportBase):
compose = Compose(self.model_name, model_paths, chat_template_paths, mmproj_paths, args, exec_args)
compose.generate().write(output_dir)
def inspect_metadata(self) -> Dict[str, Any]:
def inspect_metadata(self) -> dict[str, Any]:
model_path = self._get_entry_model_path(False, False, False)
if GGUFInfoParser.is_model_gguf(model_path):
return GGUFInfoParser.parse_metadata(model_path).data

View File

@@ -7,7 +7,7 @@ import pytest
@pytest.mark.e2e
@skip_if_no_llama_bench
def test_model_and_size_columns(test_model):
def test_model_and_params_columns(test_model):
result = check_output(["ramalama", "bench", "-t", "2", test_model])
assert re.search(r"\|\s+model\s+\|\s+size", result)
assert re.search(r"\|\s+model\s+\|\s+params", result)

View File

@@ -17,7 +17,7 @@ function setup() {
@test "ramalama bench" {
skip_if_nocontainer
run_ramalama bench -t 2 $(test_model smollm:135m)
is "$output" ".*model.*size.*" "model and size in output"
is "$output" ".*model.*params.*" "model and params in output"
}
# vim: filetype=sh

View File

@@ -50,7 +50,7 @@ function check_help() {
# If usage lists no arguments (strings in ALL CAPS), confirm
# by running with 'invalid-arg' and expecting failure.
if ! expr "$usage" : '.*[A-Z]' >/dev/null; then
if [ "$cmd" != "help" ] && [ "$cmd" != "daemon" ]; then
if [ "$cmd" != "help" ] && [ "$cmd" != "daemon" ] && [ "$cmd" != "benchmarks" ]; then
dprint "$command_string invalid-arg"
run_ramalama '?' "$@" $cmd invalid-arg
is "$status" 2 \

View File

@@ -0,0 +1,109 @@
import json
import pytest
from ramalama.benchmarks import manager, schemas
def _make_config(engine: str) -> schemas.TestConfigurationV1:
    return schemas.TestConfigurationV1(
        container_image="quay.io/ramalama/ramalama:latest",
        container_runtime="docker",
        inference_engine=engine,
        runtime_args={"threads": 2},
    )


def _make_result(model_name: str, avg_ts: float) -> schemas.LlamaBenchResultV1:
    return schemas.LlamaBenchResultV1(
        build_commit="abc123",
        build_number=1,
        cpu_info="cpu",
        gpu_info="gpu",
        model_filename=model_name,
        n_threads=2,
        n_prompt=8,
        n_gen=16,
        avg_ts=avg_ts,
        stddev_ts=0.1,
    )


def _make_device() -> schemas.DeviceInfoV1:
    return schemas.DeviceInfoV1(
        hostname="host",
        operating_system="TestOS 1.0",
        cpu_info="cpu",
        accel="none",
    )


def test_save_benchmark_record_writes_jsonl(tmp_path):
    db = manager.BenchmarksManager(tmp_path)
    cfg = _make_config("llama.cpp")
    res = _make_result("model.gguf", 1.5)
    device = _make_device()
    record = schemas.BenchmarkRecordV1(
        configuration=cfg,
        result=res,
        created_at="2024-01-01 00:00:00",
        device=device,
    )

    db.save(record)

    assert db.storage_file.exists()
    payload = json.loads(db.storage_file.read_text().strip())
    assert payload["version"] == "v1"
    assert payload["created_at"] == "2024-01-01 00:00:00"
    assert payload["configuration"]["inference_engine"] == "llama.cpp"
    assert payload["result"]["model_filename"] == "model.gguf"
    assert payload["device"]["hostname"] == "host"


def test_list_empty_returns_empty_list(tmp_path):
    db = manager.BenchmarksManager(tmp_path)
    records = db.list()
    assert records == []


def test_manager_missing_storage_folder_raises():
    with pytest.raises(manager.MissingStorageFolderError):
        manager.BenchmarksManager(None)


def test_list_returns_saved_records_in_order(tmp_path):
    db = manager.BenchmarksManager(tmp_path)
    device = _make_device()
    cfg_a = _make_config("engine-a")
    cfg_b = _make_config("engine-b")
    res_a = _make_result("model-a.gguf", 1.0)
    res_b = _make_result("model-b.gguf", 2.0)
    record_a = schemas.BenchmarkRecordV1(
        configuration=cfg_a,
        result=res_a,
        created_at="2024-01-01 00:00:00",
        device=device,
    )
    record_b = schemas.BenchmarkRecordV1(
        configuration=cfg_b,
        result=res_b,
        created_at="2024-01-02 00:00:00",
        device=device,
    )

    db.save([record_a, record_b])
    stored = db.list()

    assert len(stored) == 2
    assert stored[0].configuration.inference_engine == "engine-a"
    assert stored[1].configuration.inference_engine == "engine-b"
    assert stored[0].result.avg_ts == 1.0
    assert stored[1].result.avg_ts == 2.0

View File

@@ -143,6 +143,7 @@ def test_env_overrides_file_and_default():
],
)
def test_get_default_store(uid, is_root, expected):
get_default_store.cache_clear()
with patch("os.geteuid", return_value=uid):
assert get_default_store() == expected

View File

@@ -25,7 +25,7 @@ def get_config_fields():
}
config_fields = [field.name for field in fields(BaseConfig) if field.name not in excluded_fields]
config_fields.extend(('http_client', 'images', 'rag_images', 'user'))
config_fields.extend(('benchmarks', 'http_client', 'images', 'rag_images', 'user'))
return sorted(set(config_fields))
@@ -47,7 +47,7 @@ def get_documented_fields_in_conf():
documented = set()
# Subsections that contain their own field documentation (these fields should not be extracted)
subsections_with_fields = {'http_client', 'user'}
subsections_with_fields = {'benchmarks', 'http_client', 'user'}
# Track which section we're in to exclude nested fields under commented subsections
in_commented_nested_section = False
@@ -199,8 +199,8 @@ class TestConfigDocumentation:
missing = set(config_fields) - set(documented_fields)
assert not missing, (
f"The following CONFIG fields are missing from docs/ramalama.conf:\n"
f"{', '.join(sorted(missing))}\n\n"
f"The following CONFIG fields are missing from docs/ramalama.conf: "
f"`{', '.join(sorted(missing))}`. "
f"Please add documentation for these fields in docs/ramalama.conf"
)
@@ -211,11 +211,12 @@ class TestConfigDocumentation:
missing = set(config_fields) - set(documented_fields)
assert not missing, (
f"The following CONFIG fields are missing from docs/ramalama.conf.5.md:\n"
f"{', '.join(sorted(missing))}\n\n"
warning_message = (
f"The following CONFIG fields are missing from docs/ramalama.conf.5.md:"
f"`{', '.join(sorted(missing))}`. "
f"Please add documentation for these fields in docs/ramalama.conf.5.md"
)
assert not missing, warning_message
def test_no_undocumented_fields_in_conf(self):
"""Verify ramalama.conf doesn't document non-existent fields."""
@@ -225,8 +226,8 @@ class TestConfigDocumentation:
extra = set(documented_fields) - set(config_fields) - self.KNOWN_ALIASES
assert not extra, (
f"The following fields are documented in docs/ramalama.conf but not in CONFIG:\n"
f"{', '.join(sorted(extra))}\n\n"
f"The following fields are documented in docs/ramalama.conf but not in CONFIG:"
f"`{', '.join(sorted(extra))}`. "
f"These might be typos or outdated documentation."
)
@@ -238,8 +239,8 @@ class TestConfigDocumentation:
extra = set(documented_fields) - set(config_fields) - self.KNOWN_ALIASES
assert not extra, (
f"The following fields are documented in docs/ramalama.conf.5.md but not in CONFIG:\n"
f"{', '.join(sorted(extra))}\n\n"
f"The following fields are documented in docs/ramalama.conf.5.md but not in CONFIG: "
f"`{', '.join(sorted(extra))}`. "
f"These might be typos or outdated documentation."
)
@@ -258,7 +259,7 @@ class TestConfigDocumentation:
error_msg.append(f"Fields documented only in ramalama.conf.5.md:\n{', '.join(sorted(only_in_manpage))}")
assert not error_msg, (
"Documentation inconsistency between ramalama.conf and ramalama.conf.5.md:\n\n"
+ "\n\n".join(error_msg)
+ "\n\nBoth files should document the same configuration options."
"Documentation inconsistency between ramalama.conf and ramalama.conf.5.md:"
+ " ".join(error_msg)
+ ". Both files should document the same configuration options."
)