Mirror of https://github.com/containers/ramalama.git

Merge pull request #1986 from engelmi/add-perplexity-and-bench

Add perplexity and bench
Authored by Daniel J Walsh on 2025-10-01 11:03:41 -04:00; committed by GitHub.
8 changed files with 31 additions and 294 deletions

Changed file: llama.cpp inference-engine command spec (YAML)

@@ -4,7 +4,7 @@ commands:
inference_engine:
name: "llama-server"
binary: "llama-server"
options:
options: &serve_run_options
- name: "--host"
description: "IP address for the AI model server to listen on"
value: "{{ '0.0.0.0' if args.container else args.host }}"
@@ -91,86 +91,26 @@ commands:
inference_engine:
name: "llama-server with chat"
binary: "llama-server"
options:
- name: "--host"
description: "IP address for the AI model server to listen on"
value: "{{ '0.0.0.0' if args.container else args.host }}"
- name: "--port"
description: "Port for the AI model server to listen on"
value: "{{ args.port }}"
- name: "--log-file"
description: "File path for the llama-server writing its logs to"
value: "{{ args.logfile }}"
options: *serve_run_options
- name: perplexity
inference_engine:
name: "llama-perplexity"
binary: "llama-perplexity"
options: &bench_perplexity_options
- name: "--model"
description: "The AI model to run"
value: "{{ model.model_path }}"
- name: "--mmproj"
description: "File path to the mmproj model"
value: "{{ model.mmproj_path }}"
required: false
if: "{{ model.mmproj_path }}"
- name: "--chat-template-file"
description: "File path to the chat template used for the model"
value: "{{ model.chat_template_path }}"
required: false
if: "{{ not model.mmproj_path }}"
- name: "--jinja"
description: "Flag indicating if the chat template uses Jinja"
if: "{{ not model.mmproj_path }}"
- name: "--no-warmup"
description: "Flag to disable empty run for warm up"
- name: "--reasoning-budget"
description: "Controls the amount of thinking allowed"
value: "0"
if: "{{ not args.thinking }}"
- name: "--alias"
description: "The alias used when running the AI model"
value: "{{ model.alias }}"
- name: "--ctx-size"
description: "Size of the prompt context"
value: "{{ args.ctx_size }}"
if: "{{ args.ctx_size > 0 }}"
- name: "--temp"
description: "Temperature"
value: "{{ args.temp }}"
- name: "--cache-reuse"
description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
value: "{{ args.cache_reuse }}"
- name: "-v"
description: "Enable debug logs"
if: "{{ args.debug }}"
- name: "--no-webui"
description: "Disable the Web UI"
if: "{{ args.webui == 'off' }}"
- name: "--flash-attn"
description: "Set Flash Attention use"
value: "on"
if: "{{ host.uses_nvidia or host.uses_metal() }}"
- name: "-ngl"
description: "Number of layers to offload to the GPU if available"
value: "{{ 999 if args.ngl < 0 else args.ngl }}"
- name: "--model-draft"
description: "Draft model for speculative decoding"
value: "{{ model.draft_model_path }}"
if: "{{ args.model_draft }}"
- name: "-ngld"
description: "Number of layers to offload to the GPU if available"
value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
- name: "--threads"
description: "Number of Threads to use during generation"
value: "{{ args.threads }}"
- name: "--seed"
description: "Seed the global PRNG"
value: "{{ args.seed }}"
- name: "--log-colors"
description: "Add color to the logs"
value: "{{ 'on' }}"
if: "{{ host.should_colorize }}"
- name: "--rpc"
description: "Comma separated list of RPC servers"
value: "{{ host.rpc_nodes }}"
# Special case:
# Pass arbitrary runtime arguments to llama-server
- name: ""
description: "Arbitrary runtime arguments for llama-server"
value: "{{ args.runtime_args }}"
- name: bench
inference_engine:
name: "llama-bench"
binary: "llama-bench"
options: *bench_perplexity_options
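The key change in this spec is deduplication via YAML anchors and aliases: the serve command's option list gains the &serve_run_options anchor, the run command collapses to options: *serve_run_options, and the new perplexity and bench commands share &bench_perplexity_options. A minimal PyYAML sketch (the spec below is a simplified stand-in, not the real ramalama file) showing that an alias loads to exactly the same data as the repeated list it replaces:

```python
# Minimal sketch: YAML anchors (&) and aliases (*) deduplicate option lists.
# The spec layout here is simplified for illustration, not the real ramalama file.
import yaml

SPEC = """
commands:
  - name: serve
    options: &serve_run_options
      - name: "--host"
        value: "0.0.0.0"
      - name: "--port"
        value: "8080"
  - name: run
    # The alias expands to the exact same list as the anchor above.
    options: *serve_run_options
"""

data = yaml.safe_load(SPEC)
serve_opts, run_opts = (cmd["options"] for cmd in data["commands"])

# Both commands see identical option entries after loading.
assert serve_opts == run_opts
print(run_opts[0])  # {'name': '--host', 'value': '0.0.0.0'}
```

One side effect worth knowing: with PyYAML the anchored and aliased nodes load as the same Python object, so mutating one list in code would show through the other; the spec is only read, so that does not matter here.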

Changed file: MLX inference-engine command spec (YAML)

@@ -4,7 +4,7 @@ commands:
inference_engine:
name: "mlx server"
binary: "mlx_lm.server"
options:
options: &serve_run_options
- name: "--model"
description: "The AI model to run"
value: "{{ model.model_path }}"
@@ -32,27 +32,4 @@ commands:
inference_engine:
name: "mlx serve with chat"
binary: "mlx_lm.server"
options:
- name: "--model"
description: "The AI model to run"
value: "{{ model.model_path }}"
- name: "--temp"
description: "Temperature"
value: "{{ args.temp }}"
- name: "--seed"
description: "Seed the global PRNG"
value: "{{ args.seed }}"
- name: "--max-tokens"
description: "Size of the prompt context"
value: "{{ args.ctx_size }}"
- name: "--host"
description: "IP address for the AI model server to listen on"
value: "{{ args.host }}"
- name: "--port"
description: "Port for the AI model server to listen on"
value: "{{ args.port }}"
# Special case:
# Pass arbitrary runtime arguments to mlx server
- name: ""
description: "Arbitrary runtime arguments for mlx server"
value: "{{ args.runtime_args }}"
options: *serve_run_options
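Specs like this one feed a generic assembly step (the CLI diff further down passes the result of assemble_command(args) straight into the transport). The sketch below is a hypothetical, much-simplified assembler, not ramalama's real assemble_command: it skips options whose "if" guard renders falsy, renders each "value" with Jinja2, and flattens the result into an argv list.

```python
# Hypothetical sketch of spec-driven command assembly (not ramalama's real
# assemble_command): evaluate each option's `if` guard, render its value
# template, and append "name value" pairs after the binary.
from jinja2 import Environment

env = Environment()

def render(template_str: str, ctx: dict) -> str:
    return env.from_string(template_str).render(**ctx)

def assemble(engine: dict, ctx: dict) -> list[str]:
    argv = engine["binary"].split()
    for opt in engine.get("options", []):
        guard = opt.get("if")
        if guard and render(guard, ctx).strip() in ("", "False", "None", "0"):
            continue  # guard rendered falsy -> drop this option
        if opt["name"]:
            argv.append(opt["name"])
        if opt.get("value"):
            argv.append(render(opt["value"], ctx))
    return argv

spec = {
    "binary": "mlx_lm.server",
    "options": [
        {"name": "--model", "value": "{{ model.model_path }}"},
        {"name": "--port", "value": "{{ args.port }}"},
        {"name": "-v", "if": "{{ args.debug }}"},
    ],
}
ctx = {"model": {"model_path": "/models/example"}, "args": {"port": 8080, "debug": False}}
print(assemble(spec, ctx))
# ['mlx_lm.server', '--model', '/models/example', '--port', '8080']
```

The real implementation will differ in how it evaluates guards and how it handles the empty-name runtime_args entry; the point is only that the YAML spec, not the transport, decides which flags appear.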

Changed file: vLLM inference-engine command spec (YAML)

@@ -4,7 +4,7 @@ commands:
inference_engine:
name: "vllm server"
binary: "/opt/venv/bin/python3 -m vllm.entrypoints.openai.api_server"
options:
options: &serve_run_options
- name: "--model"
description: "The AI model to run"
value: "{{ model.model_path }}"
@@ -29,24 +29,4 @@ commands:
inference_engine:
name: "vllm server with chat"
binary: "vllm start"
options:
- name: "--model"
description: "The AI model to run"
value: "{{ model.model_path }}"
- name: "--served-model-name"
description: "The name assigned to the run AI model"
value: "{{ model.model_name }}"
- name: "--max_model_len"
description: "Size of the model context"
value: "{{ args.ctx_size if args.ctx_size else 2048 }}"
- name: "--port"
description: "Port for the AI model server to listen on"
value: "{{ args.port }}"
- name: "--seed"
description: "Seed the global PRNG"
value: "{{ args.seed }}"
# Special case:
# Pass arbitrary runtime arguments to llama-server
- name: ""
description: "Arbitrary runtime arguments for llama-server"
value: "{{ args.runtime_args }}"
options: *serve_run_options
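The vLLM spec leans on Jinja expressions for defaulting, e.g. --max_model_len uses "{{ args.ctx_size if args.ctx_size else 2048 }}". A quick sketch of how that expression behaves (plain Jinja2; the args dict is a stand-in for the parsed CLI namespace):

```python
# Sketch: a Jinja conditional expression used as an option value falls back to
# a default when the CLI argument is unset/zero. The args dict is a stand-in.
from jinja2 import Environment

env = Environment()
expr = "{{ args.ctx_size if args.ctx_size else 2048 }}"

for ctx_size in (0, 8192):
    rendered = env.from_string(expr).render(args={"ctx_size": ctx_size})
    print(ctx_size, "->", rendered)
# 0 -> 2048
# 8192 -> 8192
```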

Changed file: command-line interface module (Python)

@@ -429,7 +429,8 @@ def list_files_by_modification(args):
def bench_cli(args):
model = New(args.MODEL, args)
model.bench(args)
model.ensure_model_exists(args)
model.bench(args, assemble_command(args))
def add_network_argument(parser, dflt="none"):
@@ -1392,7 +1393,8 @@ def perplexity_parser(subparsers):
def perplexity_cli(args):
model = New(args.MODEL, args)
model.perplexity(args)
model.ensure_model_exists(args)
model.perplexity(args, assemble_command(args))
def inspect_parser(subparsers):
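With this change the CLI owns the whole flow: it resolves the model, makes sure it exists locally, assembles the engine command from the spec, and only then hands the finished argv to the transport. A toy sketch of that split (ToyTransport and assemble are illustrative stand-ins, not ramalama's classes):

```python
# Toy sketch of the new split: the CLI builds the command, the transport only
# executes it. ToyTransport/assemble are stand-ins, not ramalama's real code.
from typing import List

def assemble(binary: str, model_path: str, ngl: int, threads: int) -> List[str]:
    # Stand-in for the spec-driven assemble_command step (see the YAML diffs).
    layers = 999 if ngl < 0 else ngl
    return [binary, "-m", model_path, "-ngl", str(layers), "--threads", str(threads)]

class ToyTransport:
    def ensure_model_exists(self) -> str:
        # Real code would pull the model if it is not cached; we just return a path.
        return "/models/example.gguf"

    def bench(self, cmd: List[str]) -> None:
        # The transport no longer builds llama-bench arguments itself;
        # it just runs whatever command it is handed (e.g. via subprocess).
        print("would run:", " ".join(cmd))

transport = ToyTransport()
model_path = transport.ensure_model_exists()
transport.bench(assemble("llama-bench", model_path, ngl=-1, threads=8))
```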

Changed file: configuration defaults module (Python)

@@ -19,8 +19,8 @@ SUPPORTED_RUNTIMES: TypeAlias = Literal["llama.cpp", "vllm", "mlx"]
COLOR_OPTIONS: TypeAlias = Literal["auto", "always", "never"]
DEFAULT_CONFIG_DIRS = [
"/usr/share/ramalama",
"/usr/local/share/ramalama",
f"{sys.prefix}/share/ramalama",
f"{sys.prefix}/local/share/ramalama",
"/etc/ramalama",
os.path.expanduser(os.path.join(os.getenv("XDG_CONFIG_HOME", "~/.config"), "ramalama")),
]
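Relatedly, the config search path switches from hardcoded /usr prefixes to sys.prefix, so a RamaLama installed into a virtualenv or a non-default prefix finds its spec files under its own share/ramalama; with a system Python whose prefix is /usr, the new entries still resolve to /usr/share/ramalama and /usr/local/share/ramalama. A quick illustration (printed paths depend on the interpreter in use):

```python
# Quick sketch: how sys.prefix changes the config search path per environment.
# The printed values depend on where Python is installed; these are examples.
import sys

dirs = [
    f"{sys.prefix}/share/ramalama",
    f"{sys.prefix}/local/share/ramalama",
]
print(sys.prefix)  # e.g. /usr on a system Python, /home/me/.venv inside a venv
print(dirs)        # e.g. ['/usr/share/ramalama', '/usr/local/share/ramalama']
```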

Changed file: transport base classes (Python)

@@ -104,7 +104,7 @@ class TransportBase(ABC):
raise self.__not_implemented_error("rm")
@abstractmethod
def bench(self, args):
def bench(self, args, cmd: list[str]):
raise self.__not_implemented_error("bench")
@abstractmethod
@@ -112,7 +112,7 @@ class TransportBase(ABC):
raise self.__not_implemented_error("run")
@abstractmethod
def perplexity(self, args):
def perplexity(self, args, cmd: list[str]):
raise self.__not_implemented_error("perplexity")
@abstractmethod
@@ -313,26 +313,6 @@ class Transport(TransportBase):
name = self.get_container_name(args)
self.base(args, name)
def gpu_args(self, args, runner=False):
gpu_args = []
if args.ngl < 0:
args.ngl = 999
if runner:
gpu_args += ["--ngl"] # double dash
else:
gpu_args += ["-ngl"] # single dash
gpu_args += [f'{args.ngl}']
if self.draft_model:
# Use the same arg as ngl to reduce configuration space
gpu_args += ["-ngld", f'{args.ngl}']
gpu_args += ["--threads", f"{args.threads}"]
return gpu_args
def exec_model_in_container(self, cmd_args, args):
if not args.container:
return False
@@ -381,10 +361,9 @@ class Transport(TransportBase):
[f"--mount=type=bind,src={draft_model},destination={MNT_FILE_DRAFT},ro{self.engine.relabel()}"]
)
def bench(self, args):
self.ensure_model_exists(args)
exec_args = self.build_exec_args_bench(args)
self.execute_command(exec_args, args)
def bench(self, args, cmd: list[str]):
set_accel_env_vars()
self.execute_command(cmd, args)
def run(self, args, server_cmd: list[str]):
# The Run command will first launch a daemonized service
@@ -506,25 +485,9 @@ class Transport(TransportBase):
except ProcessLookupError:
pass
def perplexity(self, args):
self.ensure_model_exists(args)
exec_args = self.build_exec_args_perplexity(args)
self.execute_command(exec_args, args)
def build_exec_args_perplexity(self, args):
if getattr(args, "runtime", None) == "mlx":
raise NotImplementedError("Perplexity calculation is not supported by the MLX runtime.")
# Default llama.cpp perplexity calculation
exec_args = ["llama-perplexity"]
def perplexity(self, args, cmd: list[str]):
set_accel_env_vars()
gpu_args = self.gpu_args(args=args)
if gpu_args is not None:
exec_args.extend(gpu_args)
exec_args += ["-m", self._get_entry_model_path(args.container, False, args.dryrun)]
return exec_args
self.execute_command(cmd, args)
def exists(self) -> bool:
_, _, all = self.model_store.get_cached_files(self.model_tag)
@@ -541,21 +504,6 @@ class Transport(TransportBase):
self.pull(args)
def build_exec_args_bench(self, args):
if getattr(args, "runtime", None) == "mlx":
raise NotImplementedError("Benchmarking is not supported by the MLX runtime.")
# Default llama.cpp benchmarking
exec_args = ["llama-bench"]
set_accel_env_vars()
gpu_args = self.gpu_args(args=args)
if gpu_args is not None:
exec_args.extend(gpu_args)
exec_args += ["-m", self._get_entry_model_path(args.container, False, args.dryrun)]
return exec_args
def validate_args(self, args):
# MLX validation
if getattr(args, "runtime", None) == "mlx":
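The net effect in the transport layer: bench and perplexity now receive the pre-assembled command as cmd: list[str], and the concrete implementations reduce to setting accelerator environment variables and executing. A minimal sketch of that contract (set_accel_env and the print-based executor are placeholders, not ramalama's set_accel_env_vars / execute_command):

```python
# Minimal sketch of the new transport contract: bench/perplexity receive a
# pre-assembled command. set_accel_env() is a placeholder, not ramalama's
# real set_accel_env_vars helper.
from abc import ABC, abstractmethod

def set_accel_env() -> None:
    """Placeholder for exporting GPU/accelerator environment variables."""

class TransportBase(ABC):
    @abstractmethod
    def bench(self, args, cmd: list[str]):
        raise NotImplementedError("bench")

    @abstractmethod
    def perplexity(self, args, cmd: list[str]):
        raise NotImplementedError("perplexity")

class Transport(TransportBase):
    def execute_command(self, cmd: list[str], args) -> None:
        print("exec:", " ".join(cmd))  # real code would run it, in or out of a container

    def bench(self, args, cmd: list[str]) -> None:
        set_accel_env()
        self.execute_command(cmd, args)

    def perplexity(self, args, cmd: list[str]) -> None:
        set_accel_env()
        self.execute_command(cmd, args)

Transport().bench(args=None, cmd=["llama-bench", "-m", "/models/example.gguf"])
```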

Changed file: second llama.cpp command spec (YAML)

@@ -4,7 +4,7 @@ commands:
inference_engine:
name: "llama-server"
binary: "llama-server"
options:
options: &serve_run_options
- name: "--host"
description: "IP address for the AI model server to listen on"
value: "{{ '0.0.0.0' if args.container else args.host }}"
@@ -91,86 +91,4 @@ commands:
inference_engine:
name: "llama-server with chat"
binary: "llama-server"
options:
- name: "--host"
description: "IP address for the AI model server to listen on"
value: "{{ '0.0.0.0' if args.container else args.host }}"
- name: "--port"
description: "Port for the AI model server to listen on"
value: "{{ args.port }}"
- name: "--log-file"
description: "File path for the llama-server writing its logs to"
value: "{{ args.logfile }}"
- name: "--model"
description: "The AI model to run"
value: "{{ model.model_path }}"
- name: "--mmproj"
description: "File path to the mmproj model"
value: "{{ model.mmproj_path }}"
required: false
if: "{{ model.mmproj_path }}"
- name: "--chat-template-file"
description: "File path to the chat template used for the model"
value: "{{ model.chat_template_path }}"
required: false
if: "{{ not model.mmproj_path }}"
- name: "--jinja"
description: "Flag indicating if the chat template uses Jinja"
if: "{{ not model.mmproj_path }}"
- name: "--no-warmup"
description: "Flag to disable empty run for warm up"
- name: "--reasoning-budget"
description: "Controls the amount of thinking allowed"
value: "0"
if: "{{ not args.thinking }}"
- name: "--alias"
description: "The alias used when running the AI model"
value: "{{ model.alias }}"
- name: "--ctx-size"
description: "Size of the prompt context"
value: "{{ args.ctx_size }}"
if: "{{ args.ctx_size > 0 }}"
- name: "--temp"
description: "Temperature"
value: "{{ args.temp }}"
- name: "--cache-reuse"
description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
value: "{{ args.cache_reuse }}"
- name: "-v"
description: "Enable debug logs"
if: "{{ args.debug }}"
- name: "--no-webui"
description: "Disable the Web UI"
if: "{{ args.webui == 'off' }}"
- name: "--flash-attn"
description: "Set Flash Attention use"
value: "on"
if: "{{ host.uses_nvidia or host.uses_metal() }}"
- name: "-ngl"
description: "Number of layers to offload to the GPU if available"
value: "{{ 999 if args.ngl < 0 else args.ngl }}"
- name: "--model-draft"
description: "Draft model for speculative decoding"
value: "{{ model.draft_model_path }}"
if: "{{ args.model_draft }}"
- name: "-ngld"
description: "Number of layers to offload to the GPU if available"
value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
- name: "--threads"
description: "Number of Threads to use during generation"
value: "{{ args.threads }}"
- name: "--seed"
description: "Seed the global PRNG"
value: "{{ args.seed }}"
- name: "--log-colors"
description: "Add color to the logs"
value: "{{ 'on' }}"
if: "{{ host.should_colorize }}"
- name: "--rpc"
description: "Comma separated list of RPC servers"
value: "{{ host.rpc_nodes }}"
# Special case:
# Pass arbitrary runtime arguments to llama-server
- name: ""
description: "Arbitrary runtime arguments for llama-server"
value: "{{ args.runtime_args }}"
options: *serve_run_options
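Across these specs, the GPU-offload handling that used to be computed in the removed Python gpu_args() helper (-ngl, -ngld, --threads) is expressed as declarative Jinja values, e.g. "{{ 999 if args.ngl < 0 else args.ngl }}" for -ngl. A small check (plain Jinja2, stand-in args) that the template reproduces the deleted helper's behaviour:

```python
# Sketch: the declarative -ngl value mirrors the removed gpu_args() branch
# ("if args.ngl < 0: args.ngl = 999"). The args dict is a stand-in.
from jinja2 import Environment

env = Environment()
ngl_expr = "{{ 999 if args.ngl < 0 else args.ngl }}"

def old_gpu_layers(ngl: int) -> int:
    # Behaviour of the deleted helper: negative means "offload everything".
    return 999 if ngl < 0 else ngl

for ngl in (-1, 0, 33):
    rendered = env.from_string(ngl_expr).render(args={"ngl": ngl})
    assert rendered == str(old_gpu_layers(ngl))
    print(ngl, "->", rendered)
# -1 -> 999, 0 -> 0, 33 -> 33
```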

Changed file: MLX runtime tests (Python)

@@ -231,34 +231,6 @@ class TestMLXRuntime:
assert args.url == "http://127.0.0.1:8080/v1"
assert args.pid2kill == 123
@patch('ramalama.transports.base.platform.system')
@patch('ramalama.transports.base.platform.machine')
def test_mlx_benchmarking_not_supported(self, mock_machine, mock_system):
"""Test that MLX runtime raises NotImplementedError for benchmarking"""
mock_system.return_value = "Darwin"
mock_machine.return_value = "arm64"
args = Namespace(runtime="mlx", MODEL="test-model", container=False, generate=False, dryrun=True)
model = Transport("test-model", "/tmp/store")
with pytest.raises(NotImplementedError, match="Benchmarking is not supported by the MLX runtime"):
model.build_exec_args_bench(args)
@patch('ramalama.transports.base.platform.system')
@patch('ramalama.transports.base.platform.machine')
def test_mlx_perplexity_not_supported(self, mock_machine, mock_system):
"""Test that MLX runtime raises NotImplementedError for perplexity"""
mock_system.return_value = "Darwin"
mock_machine.return_value = "arm64"
args = Namespace(runtime="mlx", MODEL="test-model", container=False, generate=False, dryrun=True)
model = Transport("test-model", "/tmp/store")
with pytest.raises(NotImplementedError, match="Perplexity calculation is not supported by the MLX runtime"):
model.build_exec_args_perplexity(args)
class TestOCIModelSetupMounts:
"""Test the OCI model setup_mounts functionality that was refactored"""