Mirror of https://github.com/containers/ramalama.git

Merge pull request #1986 from engelmi/add-perplexity-and-bench

Add perplexity and bench
@@ -4,7 +4,7 @@ commands:
     inference_engine:
       name: "llama-server"
       binary: "llama-server"
-      options:
+      options: &serve_run_options
         - name: "--host"
          description: "IP address for the AI model server to listen on"
          value: "{{ '0.0.0.0' if args.container else args.host }}"
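The `&serve_run_options` anchor turns the serve command's option list into a named node that the run/chat command (and, in the mlx and vllm configs further down, their chat commands as well) can pull in with `*serve_run_options` instead of repeating every flag. As a minimal illustration (not code from the ramalama sources), PyYAML resolves such an alias back to the very same list:

# Minimal sketch (not from ramalama): PyYAML expands a *alias back into the
# list defined at the &anchor, so both commands share one option set and any
# edit only needs to happen in one place.
import yaml

doc = """
commands:
  - name: serve
    options: &serve_run_options
      - {name: "--host", value: "0.0.0.0"}
      - {name: "--port", value: "8080"}
  - name: run
    options: *serve_run_options
"""

cfg = yaml.safe_load(doc)
serve, run = cfg["commands"]
assert run["options"] == serve["options"]  # alias resolves to the same data
print(run["options"][0]["name"])           # -> --host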
@@ -91,86 +91,26 @@ commands:
  inference_engine:
    name: "llama-server with chat"
    binary: "llama-server"
    options:
      - name: "--host"
        description: "IP address for the AI model server to listen on"
        value: "{{ '0.0.0.0' if args.container else args.host }}"
      - name: "--port"
        description: "Port for the AI model server to listen on"
        value: "{{ args.port }}"
      - name: "--log-file"
        description: "File path for the llama-server writing its logs to"
        value: "{{ args.logfile }}"
    options: *serve_run_options
- name: perplexity
  inference_engine:
    name: "llama-perplexity"
    binary: "llama-perplexity"
    options: &bench_perplexity_options
      - name: "--model"
        description: "The AI model to run"
        value: "{{ model.model_path }}"
      - name: "--mmproj"
        description: "File path to the mmproj model"
        value: "{{ model.mmproj_path }}"
        required: false
        if: "{{ model.mmproj_path }}"
      - name: "--chat-template-file"
        description: "File path to the chat template used for the model"
        value: "{{ model.chat_template_path }}"
        required: false
        if: "{{ not model.mmproj_path }}"
      - name: "--jinja"
        description: "Flag indicating if the chat template uses Jinja"
        if: "{{ not model.mmproj_path }}"
      - name: "--no-warmup"
        description: "Flag to disable empty run for warm up"
      - name: "--reasoning-budget"
        description: "Controls the amount of thinking allowed"
        value: "0"
        if: "{{ not args.thinking }}"
      - name: "--alias"
        description: "The alias used when running the AI model"
        value: "{{ model.alias }}"
      - name: "--ctx-size"
        description: "Size of the prompt context"
        value: "{{ args.ctx_size }}"
        if: "{{ args.ctx_size > 0 }}"
      - name: "--temp"
        description: "Temperature"
        value: "{{ args.temp }}"
      - name: "--cache-reuse"
        description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
        value: "{{ args.cache_reuse }}"
      - name: "-v"
        description: "Enable debug logs"
        if: "{{ args.debug }}"
      - name: "--no-webui"
        description: "Disable the Web UI"
        if: "{{ args.webui == 'off' }}"
      - name: "--flash-attn"
        description: "Set Flash Attention use"
        value: "on"
        if: "{{ host.uses_nvidia or host.uses_metal() }}"
      - name: "-ngl"
        description: "Number of layers to offload to the GPU if available"
        value: "{{ 999 if args.ngl < 0 else args.ngl }}"
      - name: "--model-draft"
        description: "Draft model for speculative decoding"
        value: "{{ model.draft_model_path }}"
        if: "{{ args.model_draft }}"
      - name: "-ngld"
        description: "Number of layers to offload to the GPU if available"
        value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
      - name: "--threads"
        description: "Number of Threads to use during generation"
        value: "{{ args.threads }}"
      - name: "--seed"
        description: "Seed the global PRNG"
        value: "{{ args.seed }}"
      - name: "--log-colors"
        description: "Add color to the logs"
        value: "{{ 'on' }}"
        if: "{{ host.should_colorize }}"
      - name: "--rpc"
        description: "Comma separated list of RPC servers"
        value: "{{ host.rpc_nodes }}"
      # Special case:
      # Pass arbitrary runtime arguments to llama-server
      - name: ""
        description: "Arbitrary runtime arguments for llama-server"
        value: "{{ args.runtime_args }}"
- name: bench
  inference_engine:
    name: "llama-bench"
    binary: "llama-bench"
    options: *bench_perplexity_options

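The option values above are Jinja-style templates that get filled in from the model and CLI-argument context before the engine command is put together. A rough sketch of that rendering step (hypothetical helper and example values, not ramalama's actual command assembly):

# Hypothetical sketch: render templated option values such as
# "{{ model.model_path }}" into a concrete llama-perplexity command line.
# ramalama's real command assembly may differ in detail.
from jinja2 import Template

options = [
    {"name": "--model", "value": "{{ model.model_path }}"},
    {"name": "--threads", "value": "{{ args.threads }}"},
    {"name": "-ngl", "value": "{{ 999 if args.ngl < 0 else args.ngl }}"},
]
context = {
    "model": {"model_path": "/models/granite.gguf"},  # assumed example path
    "args": {"threads": 8, "ngl": -1},
}

cmd = ["llama-perplexity"]
for opt in options:
    cmd += [opt["name"], Template(opt["value"]).render(**context)]

print(" ".join(cmd))
# llama-perplexity --model /models/granite.gguf --threads 8 -ngl 999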
@@ -4,7 +4,7 @@ commands:
     inference_engine:
       name: "mlx server"
       binary: "mlx_lm.server"
-      options:
+      options: &serve_run_options
         - name: "--model"
          description: "The AI model to run"
          value: "{{ model.model_path }}"
@@ -32,27 +32,4 @@ commands:
     inference_engine:
       name: "mlx serve with chat"
       binary: "mlx_lm.server"
-      options:
-        - name: "--model"
-          description: "The AI model to run"
-          value: "{{ model.model_path }}"
-        - name: "--temp"
-          description: "Temperature"
-          value: "{{ args.temp }}"
-        - name: "--seed"
-          description: "Seed the global PRNG"
-          value: "{{ args.seed }}"
-        - name: "--max-tokens"
-          description: "Size of the prompt context"
-          value: "{{ args.ctx_size }}"
-        - name: "--host"
-          description: "IP address for the AI model server to listen on"
-          value: "{{ args.host }}"
-        - name: "--port"
-          description: "Port for the AI model server to listen on"
-          value: "{{ args.port }}"
-        # Special case:
-        # Pass arbitrary runtime arguments to mlx server
-        - name: ""
-          description: "Arbitrary runtime arguments for mlx server"
-          value: "{{ args.runtime_args }}"
+      options: *serve_run_options

@@ -4,7 +4,7 @@ commands:
     inference_engine:
       name: "vllm server"
       binary: "/opt/venv/bin/python3 -m vllm.entrypoints.openai.api_server"
-      options:
+      options: &serve_run_options
         - name: "--model"
          description: "The AI model to run"
          value: "{{ model.model_path }}"
@@ -29,24 +29,4 @@ commands:
     inference_engine:
       name: "vllm server with chat"
       binary: "vllm start"
-      options:
-        - name: "--model"
-          description: "The AI model to run"
-          value: "{{ model.model_path }}"
-        - name: "--served-model-name"
-          description: "The name assigned to the run AI model"
-          value: "{{ model.model_name }}"
-        - name: "--max_model_len"
-          description: "Size of the model context"
-          value: "{{ args.ctx_size if args.ctx_size else 2048 }}"
-        - name: "--port"
-          description: "Port for the AI model server to listen on"
-          value: "{{ args.port }}"
-        - name: "--seed"
-          description: "Seed the global PRNG"
-          value: "{{ args.seed }}"
-        # Special case:
-        # Pass arbitrary runtime arguments to llama-server
-        - name: ""
-          description: "Arbitrary runtime arguments for llama-server"
-          value: "{{ args.runtime_args }}"
+      options: *serve_run_options

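Note that the `binary` field can be more than a bare executable name (the vllm entry above is a full interpreter invocation), so the launcher has to split it before appending the rendered options. A small illustrative sketch, with assumed option values rather than ramalama's real launcher code:

# Illustrative only: split a "binary" string (which may embed interpreter
# flags, as in the vllm config) and append already-rendered option values.
import shlex

binary = "/opt/venv/bin/python3 -m vllm.entrypoints.openai.api_server"
rendered_options = ["--model", "/models/granite", "--port", "8080"]  # assumed

cmd = shlex.split(binary) + rendered_options
print(cmd)
# ['/opt/venv/bin/python3', '-m', 'vllm.entrypoints.openai.api_server',
#  '--model', '/models/granite', '--port', '8080']
# subprocess.run(cmd, check=True) would actually start the server.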
@@ -429,7 +429,8 @@ def list_files_by_modification(args):
 
 def bench_cli(args):
     model = New(args.MODEL, args)
-    model.bench(args)
+    model.ensure_model_exists(args)
+    model.bench(args, assemble_command(args))
 
 
 def add_network_argument(parser, dflt="none"):
@@ -1392,7 +1393,8 @@ def perplexity_parser(subparsers):
 
 def perplexity_cli(args):
     model = New(args.MODEL, args)
-    model.perplexity(args)
+    model.ensure_model_exists(args)
+    model.perplexity(args, assemble_command(args))
 
 
 def inspect_parser(subparsers):

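Both CLI entry points now follow the same pattern: make sure the model is present locally, build the engine command from the inference-engine config, and hand both to the transport. A condensed, hypothetical sketch of that flow (the names ensure_model_exists and assemble_command mirror the diff, but the bodies here are stand-ins):

# Condensed sketch of the new CLI flow; simplified types, stand-in bodies.
from dataclasses import dataclass, field


@dataclass
class Args:
    MODEL: str
    threads: int = 4
    extra: list[str] = field(default_factory=list)


def assemble_command(args: Args) -> list[str]:
    # Stand-in for rendering the YAML option templates into argv.
    return ["llama-bench", "-m", f"/models/{args.MODEL}", "-t", str(args.threads)]


class Transport:
    def ensure_model_exists(self, args: Args) -> None:
        print(f"pulling {args.MODEL} if missing")

    def bench(self, args: Args, cmd: list[str]) -> None:
        # The transport only executes the pre-assembled command now.
        print("exec:", " ".join(cmd))


def bench_cli(args: Args) -> None:
    model = Transport()
    model.ensure_model_exists(args)
    model.bench(args, assemble_command(args))


bench_cli(Args(MODEL="tinyllama"))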
@@ -19,8 +19,8 @@ SUPPORTED_RUNTIMES: TypeAlias = Literal["llama.cpp", "vllm", "mlx"]
COLOR_OPTIONS: TypeAlias = Literal["auto", "always", "never"]

DEFAULT_CONFIG_DIRS = [
    "/usr/share/ramalama",
    "/usr/local/share/ramalama",
    f"{sys.prefix}/share/ramalama",
    f"{sys.prefix}/local/share/ramalama",
    "/etc/ramalama",
    os.path.expanduser(os.path.join(os.getenv("XDG_CONFIG_HOME", "~/.config"), "ramalama")),
]

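DEFAULT_CONFIG_DIRS lists the places where the engine configuration can be looked up. A minimal sketch of checking those locations for a config file follows; the file name and the first-match precedence here are assumptions, not ramalama's actual loader:

# Hypothetical sketch: look for an engine config in the candidate directories.
import os
import sys

DEFAULT_CONFIG_DIRS = [
    "/usr/share/ramalama",
    "/usr/local/share/ramalama",
    f"{sys.prefix}/share/ramalama",
    f"{sys.prefix}/local/share/ramalama",
    "/etc/ramalama",
    os.path.expanduser(os.path.join(os.getenv("XDG_CONFIG_HOME", "~/.config"), "ramalama")),
]


def find_config(name: str = "llama.cpp.yaml") -> str | None:
    # Assumed file name and precedence, for illustration only.
    for directory in DEFAULT_CONFIG_DIRS:
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate):
            return candidate
    return None


print(find_config() or "no engine config found")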
@@ -104,7 +104,7 @@ class TransportBase(ABC):
         raise self.__not_implemented_error("rm")
 
     @abstractmethod
-    def bench(self, args):
+    def bench(self, args, cmd: list[str]):
         raise self.__not_implemented_error("bench")
 
     @abstractmethod
@@ -112,7 +112,7 @@ class TransportBase(ABC):
         raise self.__not_implemented_error("run")
 
     @abstractmethod
-    def perplexity(self, args):
+    def perplexity(self, args, cmd: list[str]):
         raise self.__not_implemented_error("perplexity")
 
     @abstractmethod
@@ -313,26 +313,6 @@ class Transport(TransportBase):
         name = self.get_container_name(args)
         self.base(args, name)
 
-    def gpu_args(self, args, runner=False):
-        gpu_args = []
-        if args.ngl < 0:
-            args.ngl = 999
-
-        if runner:
-            gpu_args += ["--ngl"]  # double dash
-        else:
-            gpu_args += ["-ngl"]  # single dash
-
-        gpu_args += [f'{args.ngl}']
-
-        if self.draft_model:
-            # Use the same arg as ngl to reduce configuration space
-            gpu_args += ["-ngld", f'{args.ngl}']
-
-        gpu_args += ["--threads", f"{args.threads}"]
-
-        return gpu_args
-
     def exec_model_in_container(self, cmd_args, args):
         if not args.container:
             return False
@@ -381,10 +361,9 @@
             [f"--mount=type=bind,src={draft_model},destination={MNT_FILE_DRAFT},ro{self.engine.relabel()}"]
         )
 
-    def bench(self, args):
-        self.ensure_model_exists(args)
-        exec_args = self.build_exec_args_bench(args)
-        self.execute_command(exec_args, args)
+    def bench(self, args, cmd: list[str]):
+        set_accel_env_vars()
+        self.execute_command(cmd, args)
 
     def run(self, args, server_cmd: list[str]):
         # The Run command will first launch a daemonized service
@@ -506,25 +485,9 @@ class Transport(TransportBase):
         except ProcessLookupError:
             pass
 
-    def perplexity(self, args):
-        self.ensure_model_exists(args)
-        exec_args = self.build_exec_args_perplexity(args)
-        self.execute_command(exec_args, args)
-
-    def build_exec_args_perplexity(self, args):
-        if getattr(args, "runtime", None) == "mlx":
-            raise NotImplementedError("Perplexity calculation is not supported by the MLX runtime.")
-
-        # Default llama.cpp perplexity calculation
-        exec_args = ["llama-perplexity"]
-        set_accel_env_vars()
-        gpu_args = self.gpu_args(args=args)
-        if gpu_args is not None:
-            exec_args.extend(gpu_args)
-
-        exec_args += ["-m", self._get_entry_model_path(args.container, False, args.dryrun)]
-
-        return exec_args
+    def perplexity(self, args, cmd: list[str]):
+        set_accel_env_vars()
+        self.execute_command(cmd, args)
 
     def exists(self) -> bool:
         _, _, all = self.model_store.get_cached_files(self.model_tag)
@@ -541,21 +504,6 @@ class Transport(TransportBase):
 
         self.pull(args)
 
-    def build_exec_args_bench(self, args):
-        if getattr(args, "runtime", None) == "mlx":
-            raise NotImplementedError("Benchmarking is not supported by the MLX runtime.")
-
-        # Default llama.cpp benchmarking
-        exec_args = ["llama-bench"]
-        set_accel_env_vars()
-        gpu_args = self.gpu_args(args=args)
-        if gpu_args is not None:
-            exec_args.extend(gpu_args)
-
-        exec_args += ["-m", self._get_entry_model_path(args.container, False, args.dryrun)]
-
-        return exec_args
-
     def validate_args(self, args):
         # MLX validation
         if getattr(args, "runtime", None) == "mlx":

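The removed gpu_args() helper is effectively superseded by the templated -ngl, -ngld and --threads entries in the engine configs shown earlier, which encode the same fallback: a negative --ngl request means "offload everything", i.e. 999 layers. A small illustrative Python rendering of that rule (not ramalama code):

# Illustrative only: the layer-offload fallback the removed gpu_args() helper
# applied, which the YAML templates now express as
# "{{ 999 if args.ngl < 0 else args.ngl }}".
def ngl_value(requested_ngl: int) -> int:
    return 999 if requested_ngl < 0 else requested_ngl


def gpu_cli_args(requested_ngl: int, threads: int, has_draft_model: bool) -> list[str]:
    ngl = ngl_value(requested_ngl)
    cli = ["-ngl", str(ngl)]
    if has_draft_model:
        # mirror -ngl for the draft model, as the -ngld template does
        cli += ["-ngld", str(ngl)]
    cli += ["--threads", str(threads)]
    return cli


print(gpu_cli_args(-1, 8, has_draft_model=True))
# ['-ngl', '999', '-ngld', '999', '--threads', '8']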
@@ -4,7 +4,7 @@ commands:
     inference_engine:
       name: "llama-server"
       binary: "llama-server"
-      options:
+      options: &serve_run_options
         - name: "--host"
          description: "IP address for the AI model server to listen on"
          value: "{{ '0.0.0.0' if args.container else args.host }}"
@@ -91,86 +91,4 @@ commands:
     inference_engine:
       name: "llama-server with chat"
       binary: "llama-server"
-      options:
-        - name: "--host"
-          description: "IP address for the AI model server to listen on"
-          value: "{{ '0.0.0.0' if args.container else args.host }}"
-        - name: "--port"
-          description: "Port for the AI model server to listen on"
-          value: "{{ args.port }}"
-        - name: "--log-file"
-          description: "File path for the llama-server writing its logs to"
-          value: "{{ args.logfile }}"
-        - name: "--model"
-          description: "The AI model to run"
-          value: "{{ model.model_path }}"
-        - name: "--mmproj"
-          description: "File path to the mmproj model"
-          value: "{{ model.mmproj_path }}"
-          required: false
-          if: "{{ model.mmproj_path }}"
-        - name: "--chat-template-file"
-          description: "File path to the chat template used for the model"
-          value: "{{ model.chat_template_path }}"
-          required: false
-          if: "{{ not model.mmproj_path }}"
-        - name: "--jinja"
-          description: "Flag indicating if the chat template uses Jinja"
-          if: "{{ not model.mmproj_path }}"
-        - name: "--no-warmup"
-          description: "Flag to disable empty run for warm up"
-        - name: "--reasoning-budget"
-          description: "Controls the amount of thinking allowed"
-          value: "0"
-          if: "{{ not args.thinking }}"
-        - name: "--alias"
-          description: "The alias used when running the AI model"
-          value: "{{ model.alias }}"
-        - name: "--ctx-size"
-          description: "Size of the prompt context"
-          value: "{{ args.ctx_size }}"
-          if: "{{ args.ctx_size > 0 }}"
-        - name: "--temp"
-          description: "Temperature"
-          value: "{{ args.temp }}"
-        - name: "--cache-reuse"
-          description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
-          value: "{{ args.cache_reuse }}"
-        - name: "-v"
-          description: "Enable debug logs"
-          if: "{{ args.debug }}"
-        - name: "--no-webui"
-          description: "Disable the Web UI"
-          if: "{{ args.webui == 'off' }}"
-        - name: "--flash-attn"
-          description: "Set Flash Attention use"
-          value: "on"
-          if: "{{ host.uses_nvidia or host.uses_metal() }}"
-        - name: "-ngl"
-          description: "Number of layers to offload to the GPU if available"
-          value: "{{ 999 if args.ngl < 0 else args.ngl }}"
-        - name: "--model-draft"
-          description: "Draft model for speculative decoding"
-          value: "{{ model.draft_model_path }}"
-          if: "{{ args.model_draft }}"
-        - name: "-ngld"
-          description: "Number of layers to offload to the GPU if available"
-          value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
-        - name: "--threads"
-          description: "Number of Threads to use during generation"
-          value: "{{ args.threads }}"
-        - name: "--seed"
-          description: "Seed the global PRNG"
-          value: "{{ args.seed }}"
-        - name: "--log-colors"
-          description: "Add color to the logs"
-          value: "{{ 'on' }}"
-          if: "{{ host.should_colorize }}"
-        - name: "--rpc"
-          description: "Comma separated list of RPC servers"
-          value: "{{ host.rpc_nodes }}"
-        # Special case:
-        # Pass arbitrary runtime arguments to llama-server
-        - name: ""
-          description: "Arbitrary runtime arguments for llama-server"
-          value: "{{ args.runtime_args }}"
+      options: *serve_run_options

@@ -231,34 +231,6 @@ class TestMLXRuntime:
         assert args.url == "http://127.0.0.1:8080/v1"
         assert args.pid2kill == 123
 
-    @patch('ramalama.transports.base.platform.system')
-    @patch('ramalama.transports.base.platform.machine')
-    def test_mlx_benchmarking_not_supported(self, mock_machine, mock_system):
-        """Test that MLX runtime raises NotImplementedError for benchmarking"""
-        mock_system.return_value = "Darwin"
-        mock_machine.return_value = "arm64"
-
-        args = Namespace(runtime="mlx", MODEL="test-model", container=False, generate=False, dryrun=True)
-
-        model = Transport("test-model", "/tmp/store")
-
-        with pytest.raises(NotImplementedError, match="Benchmarking is not supported by the MLX runtime"):
-            model.build_exec_args_bench(args)
-
-    @patch('ramalama.transports.base.platform.system')
-    @patch('ramalama.transports.base.platform.machine')
-    def test_mlx_perplexity_not_supported(self, mock_machine, mock_system):
-        """Test that MLX runtime raises NotImplementedError for perplexity"""
-        mock_system.return_value = "Darwin"
-        mock_machine.return_value = "arm64"
-
-        args = Namespace(runtime="mlx", MODEL="test-model", container=False, generate=False, dryrun=True)
-
-        model = Transport("test-model", "/tmp/store")
-
-        with pytest.raises(NotImplementedError, match="Perplexity calculation is not supported by the MLX runtime"):
-            model.build_exec_args_perplexity(args)
-
 
 class TestOCIModelSetupMounts:
     """Test the OCI model setup_mounts functionality that was refactored"""
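With build_exec_args_bench and build_exec_args_perplexity removed, the MLX guard these tests exercised no longer lives in the transport, so the tests are dropped. A hypothetical unit test for the new two-argument API (a sketch only, not part of this commit) might instead assert that the transport simply executes whatever command it is handed:

# Hypothetical sketch: check that a pre-built command is passed straight
# through to execute_command(). Import path and patch target are assumptions
# based on the patch targets used elsewhere in this test module.
from argparse import Namespace
from unittest.mock import patch

from ramalama.transports.base import Transport  # assumed import path


def test_bench_executes_prebuilt_command():
    args = Namespace(runtime="llama.cpp", MODEL="test-model", container=False, dryrun=True)
    model = Transport("test-model", "/tmp/store")
    cmd = ["llama-bench", "-m", "/models/test-model"]

    # set_accel_env_vars location is inferred from the diff, not verified
    with patch("ramalama.transports.base.set_accel_env_vars"), \
         patch.object(model, "execute_command") as mock_exec:
        model.bench(args, cmd)

    mock_exec.assert_called_once_with(cmd, args)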