Mirror of https://github.com/containers/ramalama.git

Merge pull request #1986 from engelmi/add-perplexity-and-bench

Add perplexity and bench
@@ -4,7 +4,7 @@ commands:
     inference_engine:
       name: "llama-server"
       binary: "llama-server"
-      options:
+      options: &serve_run_options
         - name: "--host"
          description: "IP address for the AI model server to listen on"
          value: "{{ '0.0.0.0' if args.container else args.host }}"
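The `&serve_run_options` anchor turns the serve command's option list into a named node that the run/chat command (and, in the mlx and vllm configs further down, their chat commands as well) can pull in with `*serve_run_options` instead of repeating every flag. As a minimal illustration (not code from the ramalama sources), PyYAML resolves such an alias back to the very same list:

# Minimal sketch (not from ramalama): PyYAML expands a *alias back into the
# list defined at the &anchor, so both commands share one option set and any
# edit only needs to happen in one place.
import yaml

doc = """
commands:
  - name: serve
    options: &serve_run_options
      - {name: "--host", value: "0.0.0.0"}
      - {name: "--port", value: "8080"}
  - name: run
    options: *serve_run_options
"""

cfg = yaml.safe_load(doc)
serve, run = cfg["commands"]
assert run["options"] == serve["options"]  # alias resolves to the same data
print(run["options"][0]["name"])           # -> --host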
@@ -91,86 +91,26 @@ commands:
  inference_engine:
    name: "llama-server with chat"
    binary: "llama-server"
    options:
      - name: "--host"
        description: "IP address for the AI model server to listen on"
        value: "{{ '0.0.0.0' if args.container else args.host }}"
      - name: "--port"
        description: "Port for the AI model server to listen on"
        value: "{{ args.port }}"
      - name: "--log-file"
        description: "File path for the llama-server writing its logs to"
        value: "{{ args.logfile }}"
    options: *serve_run_options
- name: perplexity
  inference_engine:
    name: "llama-perplexity"
    binary: "llama-perplexity"
    options: &bench_perplexity_options
      - name: "--model"
        description: "The AI model to run"
        value: "{{ model.model_path }}"
      - name: "--mmproj"
        description: "File path to the mmproj model"
        value: "{{ model.mmproj_path }}"
        required: false
        if: "{{ model.mmproj_path }}"
      - name: "--chat-template-file"
        description: "File path to the chat template used for the model"
        value: "{{ model.chat_template_path }}"
        required: false
        if: "{{ not model.mmproj_path }}"
      - name: "--jinja"
        description: "Flag indicating if the chat template uses Jinja"
        if: "{{ not model.mmproj_path }}"
      - name: "--no-warmup"
        description: "Flag to disable empty run for warm up"
      - name: "--reasoning-budget"
        description: "Controls the amount of thinking allowed"
        value: "0"
        if: "{{ not args.thinking }}"
      - name: "--alias"
        description: "The alias used when running the AI model"
        value: "{{ model.alias }}"
      - name: "--ctx-size"
        description: "Size of the prompt context"
        value: "{{ args.ctx_size }}"
        if: "{{ args.ctx_size > 0 }}"
      - name: "--temp"
        description: "Temperature"
        value: "{{ args.temp }}"
      - name: "--cache-reuse"
        description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
        value: "{{ args.cache_reuse }}"
      - name: "-v"
        description: "Enable debug logs"
        if: "{{ args.debug }}"
      - name: "--no-webui"
        description: "Disable the Web UI"
        if: "{{ args.webui == 'off' }}"
      - name: "--flash-attn"
        description: "Set Flash Attention use"
        value: "on"
        if: "{{ host.uses_nvidia or host.uses_metal() }}"
      - name: "-ngl"
        description: "Number of layers to offload to the GPU if available"
        value: "{{ 999 if args.ngl < 0 else args.ngl }}"
      - name: "--model-draft"
        description: "Draft model for speculative decoding"
        value: "{{ model.draft_model_path }}"
        if: "{{ args.model_draft }}"
      - name: "-ngld"
        description: "Number of layers to offload to the GPU if available"
        value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
      - name: "--threads"
        description: "Number of Threads to use during generation"
        value: "{{ args.threads }}"
      - name: "--seed"
        description: "Seed the global PRNG"
        value: "{{ args.seed }}"
      - name: "--log-colors"
        description: "Add color to the logs"
        value: "{{ 'on' }}"
        if: "{{ host.should_colorize }}"
      - name: "--rpc"
        description: "Comma separated list of RPC servers"
        value: "{{ host.rpc_nodes }}"
      # Special case:
      # Pass arbitrary runtime arguments to llama-server
      - name: ""
        description: "Arbitrary runtime arguments for llama-server"
        value: "{{ args.runtime_args }}"
- name: bench
  inference_engine:
    name: "llama-bench"
    binary: "llama-bench"
    options: *bench_perplexity_options

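The option values above are Jinja-style templates that get filled in from the model and CLI-argument context before the engine command is put together. A rough sketch of that rendering step (hypothetical helper and example values, not ramalama's actual command assembly):

# Hypothetical sketch: render templated option values such as
# "{{ model.model_path }}" into a concrete llama-perplexity command line.
# ramalama's real command assembly may differ in detail.
from jinja2 import Template

options = [
    {"name": "--model", "value": "{{ model.model_path }}"},
    {"name": "--threads", "value": "{{ args.threads }}"},
    {"name": "-ngl", "value": "{{ 999 if args.ngl < 0 else args.ngl }}"},
]
context = {
    "model": {"model_path": "/models/granite.gguf"},  # assumed example path
    "args": {"threads": 8, "ngl": -1},
}

cmd = ["llama-perplexity"]
for opt in options:
    cmd += [opt["name"], Template(opt["value"]).render(**context)]

print(" ".join(cmd))
# llama-perplexity --model /models/granite.gguf --threads 8 -ngl 999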
@@ -4,7 +4,7 @@ commands:
     inference_engine:
       name: "mlx server"
       binary: "mlx_lm.server"
-      options:
+      options: &serve_run_options
         - name: "--model"
          description: "The AI model to run"
          value: "{{ model.model_path }}"
@@ -32,27 +32,4 @@ commands:
     inference_engine:
       name: "mlx serve with chat"
       binary: "mlx_lm.server"
-      options:
-        - name: "--model"
-          description: "The AI model to run"
-          value: "{{ model.model_path }}"
-        - name: "--temp"
-          description: "Temperature"
-          value: "{{ args.temp }}"
-        - name: "--seed"
-          description: "Seed the global PRNG"
-          value: "{{ args.seed }}"
-        - name: "--max-tokens"
-          description: "Size of the prompt context"
-          value: "{{ args.ctx_size }}"
-        - name: "--host"
-          description: "IP address for the AI model server to listen on"
-          value: "{{ args.host }}"
-        - name: "--port"
-          description: "Port for the AI model server to listen on"
-          value: "{{ args.port }}"
-        # Special case:
-        # Pass arbitrary runtime arguments to mlx server
-        - name: ""
-          description: "Arbitrary runtime arguments for mlx server"
-          value: "{{ args.runtime_args }}"
+      options: *serve_run_options

@@ -4,7 +4,7 @@ commands:
     inference_engine:
       name: "vllm server"
       binary: "/opt/venv/bin/python3 -m vllm.entrypoints.openai.api_server"
-      options:
+      options: &serve_run_options
         - name: "--model"
          description: "The AI model to run"
          value: "{{ model.model_path }}"
@@ -29,24 +29,4 @@ commands:
     inference_engine:
       name: "vllm server with chat"
       binary: "vllm start"
-      options:
-        - name: "--model"
-          description: "The AI model to run"
-          value: "{{ model.model_path }}"
-        - name: "--served-model-name"
-          description: "The name assigned to the run AI model"
-          value: "{{ model.model_name }}"
-        - name: "--max_model_len"
-          description: "Size of the model context"
-          value: "{{ args.ctx_size if args.ctx_size else 2048 }}"
-        - name: "--port"
-          description: "Port for the AI model server to listen on"
-          value: "{{ args.port }}"
-        - name: "--seed"
-          description: "Seed the global PRNG"
-          value: "{{ args.seed }}"
-        # Special case:
-        # Pass arbitrary runtime arguments to llama-server
-        - name: ""
-          description: "Arbitrary runtime arguments for llama-server"
-          value: "{{ args.runtime_args }}"
+      options: *serve_run_options

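Note that the `binary` field can be more than a bare executable name (the vllm entry above is a full interpreter invocation), so the launcher has to split it before appending the rendered options. A small illustrative sketch, with assumed option values rather than ramalama's real launcher code:

# Illustrative only: split a "binary" string (which may embed interpreter
# flags, as in the vllm config) and append already-rendered option values.
import shlex

binary = "/opt/venv/bin/python3 -m vllm.entrypoints.openai.api_server"
rendered_options = ["--model", "/models/granite", "--port", "8080"]  # assumed

cmd = shlex.split(binary) + rendered_options
print(cmd)
# ['/opt/venv/bin/python3', '-m', 'vllm.entrypoints.openai.api_server',
#  '--model', '/models/granite', '--port', '8080']
# subprocess.run(cmd, check=True) would actually start the server.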
@@ -429,7 +429,8 @@ def list_files_by_modification(args):
 
 def bench_cli(args):
     model = New(args.MODEL, args)
-    model.bench(args)
+    model.ensure_model_exists(args)
+    model.bench(args, assemble_command(args))
 
 
 def add_network_argument(parser, dflt="none"):
@@ -1392,7 +1393,8 @@ def perplexity_parser(subparsers):
 
 def perplexity_cli(args):
     model = New(args.MODEL, args)
-    model.perplexity(args)
+    model.ensure_model_exists(args)
+    model.perplexity(args, assemble_command(args))
 
 
 def inspect_parser(subparsers):

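Both CLI entry points now follow the same pattern: make sure the model is present locally, build the engine command from the inference-engine config, and hand both to the transport. A condensed, hypothetical sketch of that flow (the names ensure_model_exists and assemble_command mirror the diff, but the bodies here are stand-ins):

# Condensed sketch of the new CLI flow; simplified types, stand-in bodies.
from dataclasses import dataclass, field


@dataclass
class Args:
    MODEL: str
    threads: int = 4
    extra: list[str] = field(default_factory=list)


def assemble_command(args: Args) -> list[str]:
    # Stand-in for rendering the YAML option templates into argv.
    return ["llama-bench", "-m", f"/models/{args.MODEL}", "-t", str(args.threads)]


class Transport:
    def ensure_model_exists(self, args: Args) -> None:
        print(f"pulling {args.MODEL} if missing")

    def bench(self, args: Args, cmd: list[str]) -> None:
        # The transport only executes the pre-assembled command now.
        print("exec:", " ".join(cmd))


def bench_cli(args: Args) -> None:
    model = Transport()
    model.ensure_model_exists(args)
    model.bench(args, assemble_command(args))


bench_cli(Args(MODEL="tinyllama"))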
@@ -19,8 +19,8 @@ SUPPORTED_RUNTIMES: TypeAlias = Literal["llama.cpp", "vllm", "mlx"]
COLOR_OPTIONS: TypeAlias = Literal["auto", "always", "never"]

DEFAULT_CONFIG_DIRS = [
    "/usr/share/ramalama",
    "/usr/local/share/ramalama",
    f"{sys.prefix}/share/ramalama",
    f"{sys.prefix}/local/share/ramalama",
    "/etc/ramalama",
    os.path.expanduser(os.path.join(os.getenv("XDG_CONFIG_HOME", "~/.config"), "ramalama")),
]

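DEFAULT_CONFIG_DIRS lists the places where the engine configuration can be looked up. A minimal sketch of checking those locations for a config file follows; the file name and the first-match precedence here are assumptions, not ramalama's actual loader:

# Hypothetical sketch: look for an engine config in the candidate directories.
import os
import sys

DEFAULT_CONFIG_DIRS = [
    "/usr/share/ramalama",
    "/usr/local/share/ramalama",
    f"{sys.prefix}/share/ramalama",
    f"{sys.prefix}/local/share/ramalama",
    "/etc/ramalama",
    os.path.expanduser(os.path.join(os.getenv("XDG_CONFIG_HOME", "~/.config"), "ramalama")),
]


def find_config(name: str = "llama.cpp.yaml") -> str | None:
    # Assumed file name and precedence, for illustration only.
    for directory in DEFAULT_CONFIG_DIRS:
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate):
            return candidate
    return None


print(find_config() or "no engine config found")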
@@ -104,7 +104,7 @@ class TransportBase(ABC):
         raise self.__not_implemented_error("rm")
 
     @abstractmethod
-    def bench(self, args):
+    def bench(self, args, cmd: list[str]):
         raise self.__not_implemented_error("bench")
 
     @abstractmethod
@@ -112,7 +112,7 @@ class TransportBase(ABC):
         raise self.__not_implemented_error("run")
 
     @abstractmethod
-    def perplexity(self, args):
+    def perplexity(self, args, cmd: list[str]):
         raise self.__not_implemented_error("perplexity")
 
     @abstractmethod
@@ -313,26 +313,6 @@ class Transport(TransportBase):
         name = self.get_container_name(args)
         self.base(args, name)
 
-    def gpu_args(self, args, runner=False):
-        gpu_args = []
-        if args.ngl < 0:
-            args.ngl = 999
-
-        if runner:
-            gpu_args += ["--ngl"]  # double dash
-        else:
-            gpu_args += ["-ngl"]  # single dash
-
-        gpu_args += [f'{args.ngl}']
-
-        if self.draft_model:
-            # Use the same arg as ngl to reduce configuration space
-            gpu_args += ["-ngld", f'{args.ngl}']
-
-        gpu_args += ["--threads", f"{args.threads}"]
-
-        return gpu_args
-
     def exec_model_in_container(self, cmd_args, args):
         if not args.container:
             return False
@@ -381,10 +361,9 @@
             [f"--mount=type=bind,src={draft_model},destination={MNT_FILE_DRAFT},ro{self.engine.relabel()}"]
         )
 
-    def bench(self, args):
-        self.ensure_model_exists(args)
-        exec_args = self.build_exec_args_bench(args)
-        self.execute_command(exec_args, args)
+    def bench(self, args, cmd: list[str]):
+        set_accel_env_vars()
+        self.execute_command(cmd, args)
 
     def run(self, args, server_cmd: list[str]):
         # The Run command will first launch a daemonized service
@@ -506,25 +485,9 @@ class Transport(TransportBase):
         except ProcessLookupError:
             pass
 
-    def perplexity(self, args):
-        self.ensure_model_exists(args)
-        exec_args = self.build_exec_args_perplexity(args)
-        self.execute_command(exec_args, args)
-
-    def build_exec_args_perplexity(self, args):
-        if getattr(args, "runtime", None) == "mlx":
-            raise NotImplementedError("Perplexity calculation is not supported by the MLX runtime.")
-
-        # Default llama.cpp perplexity calculation
-        exec_args = ["llama-perplexity"]
-        set_accel_env_vars()
-        gpu_args = self.gpu_args(args=args)
-        if gpu_args is not None:
-            exec_args.extend(gpu_args)
-
-        exec_args += ["-m", self._get_entry_model_path(args.container, False, args.dryrun)]
-
-        return exec_args
+    def perplexity(self, args, cmd: list[str]):
+        set_accel_env_vars()
+        self.execute_command(cmd, args)
 
     def exists(self) -> bool:
         _, _, all = self.model_store.get_cached_files(self.model_tag)
@@ -541,21 +504,6 @@ class Transport(TransportBase):
 
         self.pull(args)
 
-    def build_exec_args_bench(self, args):
-        if getattr(args, "runtime", None) == "mlx":
-            raise NotImplementedError("Benchmarking is not supported by the MLX runtime.")
-
-        # Default llama.cpp benchmarking
-        exec_args = ["llama-bench"]
-        set_accel_env_vars()
-        gpu_args = self.gpu_args(args=args)
-        if gpu_args is not None:
-            exec_args.extend(gpu_args)
-
-        exec_args += ["-m", self._get_entry_model_path(args.container, False, args.dryrun)]
-
-        return exec_args
-
     def validate_args(self, args):
         # MLX validation
         if getattr(args, "runtime", None) == "mlx":

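The removed gpu_args() helper is effectively superseded by the templated -ngl, -ngld and --threads entries in the engine configs shown earlier, which encode the same fallback: a negative --ngl request means "offload everything", i.e. 999 layers. A small illustrative Python rendering of that rule (not ramalama code):

# Illustrative only: the layer-offload fallback the removed gpu_args() helper
# applied, which the YAML templates now express as
# "{{ 999 if args.ngl < 0 else args.ngl }}".
def ngl_value(requested_ngl: int) -> int:
    return 999 if requested_ngl < 0 else requested_ngl


def gpu_cli_args(requested_ngl: int, threads: int, has_draft_model: bool) -> list[str]:
    ngl = ngl_value(requested_ngl)
    cli = ["-ngl", str(ngl)]
    if has_draft_model:
        # mirror -ngl for the draft model, as the -ngld template does
        cli += ["-ngld", str(ngl)]
    cli += ["--threads", str(threads)]
    return cli


print(gpu_cli_args(-1, 8, has_draft_model=True))
# ['-ngl', '999', '-ngld', '999', '--threads', '8']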
@@ -4,7 +4,7 @@ commands:
     inference_engine:
       name: "llama-server"
       binary: "llama-server"
-      options:
+      options: &serve_run_options
         - name: "--host"
          description: "IP address for the AI model server to listen on"
          value: "{{ '0.0.0.0' if args.container else args.host }}"
@@ -91,86 +91,4 @@ commands:
     inference_engine:
       name: "llama-server with chat"
       binary: "llama-server"
-      options:
-        - name: "--host"
-          description: "IP address for the AI model server to listen on"
-          value: "{{ '0.0.0.0' if args.container else args.host }}"
-        - name: "--port"
-          description: "Port for the AI model server to listen on"
-          value: "{{ args.port }}"
-        - name: "--log-file"
-          description: "File path for the llama-server writing its logs to"
-          value: "{{ args.logfile }}"
-        - name: "--model"
-          description: "The AI model to run"
-          value: "{{ model.model_path }}"
-        - name: "--mmproj"
-          description: "File path to the mmproj model"
-          value: "{{ model.mmproj_path }}"
-          required: false
-          if: "{{ model.mmproj_path }}"
-        - name: "--chat-template-file"
-          description: "File path to the chat template used for the model"
-          value: "{{ model.chat_template_path }}"
-          required: false
-          if: "{{ not model.mmproj_path }}"
-        - name: "--jinja"
-          description: "Flag indicating if the chat template uses Jinja"
-          if: "{{ not model.mmproj_path }}"
-        - name: "--no-warmup"
-          description: "Flag to disable empty run for warm up"
-        - name: "--reasoning-budget"
-          description: "Controls the amount of thinking allowed"
-          value: "0"
-          if: "{{ not args.thinking }}"
-        - name: "--alias"
-          description: "The alias used when running the AI model"
-          value: "{{ model.alias }}"
-        - name: "--ctx-size"
-          description: "Size of the prompt context"
-          value: "{{ args.ctx_size }}"
-          if: "{{ args.ctx_size > 0 }}"
-        - name: "--temp"
-          description: "Temperature"
-          value: "{{ args.temp }}"
-        - name: "--cache-reuse"
-          description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
-          value: "{{ args.cache_reuse }}"
-        - name: "-v"
-          description: "Enable debug logs"
-          if: "{{ args.debug }}"
-        - name: "--no-webui"
-          description: "Disable the Web UI"
-          if: "{{ args.webui == 'off' }}"
-        - name: "--flash-attn"
-          description: "Set Flash Attention use"
-          value: "on"
-          if: "{{ host.uses_nvidia or host.uses_metal() }}"
-        - name: "-ngl"
-          description: "Number of layers to offload to the GPU if available"
-          value: "{{ 999 if args.ngl < 0 else args.ngl }}"
-        - name: "--model-draft"
-          description: "Draft model for speculative decoding"
-          value: "{{ model.draft_model_path }}"
-          if: "{{ args.model_draft }}"
-        - name: "-ngld"
-          description: "Number of layers to offload to the GPU if available"
-          value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
-        - name: "--threads"
-          description: "Number of Threads to use during generation"
-          value: "{{ args.threads }}"
-        - name: "--seed"
-          description: "Seed the global PRNG"
-          value: "{{ args.seed }}"
-        - name: "--log-colors"
-          description: "Add color to the logs"
-          value: "{{ 'on' }}"
-          if: "{{ host.should_colorize }}"
-        - name: "--rpc"
-          description: "Comma separated list of RPC servers"
-          value: "{{ host.rpc_nodes }}"
-        # Special case:
-        # Pass arbitrary runtime arguments to llama-server
-        - name: ""
-          description: "Arbitrary runtime arguments for llama-server"
-          value: "{{ args.runtime_args }}"
+      options: *serve_run_options

@@ -231,34 +231,6 @@ class TestMLXRuntime:
         assert args.url == "http://127.0.0.1:8080/v1"
         assert args.pid2kill == 123
 
-    @patch('ramalama.transports.base.platform.system')
-    @patch('ramalama.transports.base.platform.machine')
-    def test_mlx_benchmarking_not_supported(self, mock_machine, mock_system):
-        """Test that MLX runtime raises NotImplementedError for benchmarking"""
-        mock_system.return_value = "Darwin"
-        mock_machine.return_value = "arm64"
-
-        args = Namespace(runtime="mlx", MODEL="test-model", container=False, generate=False, dryrun=True)
-
-        model = Transport("test-model", "/tmp/store")
-
-        with pytest.raises(NotImplementedError, match="Benchmarking is not supported by the MLX runtime"):
-            model.build_exec_args_bench(args)
-
-    @patch('ramalama.transports.base.platform.system')
-    @patch('ramalama.transports.base.platform.machine')
-    def test_mlx_perplexity_not_supported(self, mock_machine, mock_system):
-        """Test that MLX runtime raises NotImplementedError for perplexity"""
-        mock_system.return_value = "Darwin"
-        mock_machine.return_value = "arm64"
-
-        args = Namespace(runtime="mlx", MODEL="test-model", container=False, generate=False, dryrun=True)
-
-        model = Transport("test-model", "/tmp/store")
-
-        with pytest.raises(NotImplementedError, match="Perplexity calculation is not supported by the MLX runtime"):
-            model.build_exec_args_perplexity(args)
-
 
 class TestOCIModelSetupMounts:
     """Test the OCI model setup_mounts functionality that was refactored"""
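With build_exec_args_bench and build_exec_args_perplexity removed, the MLX guard these tests exercised no longer lives in the transport, so the tests are dropped. A hypothetical unit test for the new two-argument API (a sketch only, not part of this commit) might instead assert that the transport simply executes whatever command it is handed:

# Hypothetical sketch: check that a pre-built command is passed straight
# through to execute_command(). Import path and patch target are assumptions
# based on the patch targets used elsewhere in this test module.
from argparse import Namespace
from unittest.mock import patch

from ramalama.transports.base import Transport  # assumed import path


def test_bench_executes_prebuilt_command():
    args = Namespace(runtime="llama.cpp", MODEL="test-model", container=False, dryrun=True)
    model = Transport("test-model", "/tmp/store")
    cmd = ["llama-bench", "-m", "/models/test-model"]

    # set_accel_env_vars location is inferred from the diff, not verified
    with patch("ramalama.transports.base.set_accel_env_vars"), \
         patch.object(model, "execute_command") as mock_exec:
        model.bench(args, cmd)

    mock_exec.assert_called_once_with(cmd, args)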