ramalama/test/unit/command/data/engines/llama.cpp.yaml
Oliver Walsh (e956d11d70): Use default (auto) value for llama.cpp flash-attn
Also fix the uses_nvidia logic which was inverted.

Signed-off-by: Oliver Walsh <owalsh@redhat.com>
2026-01-29 12:21:11 +00:00

schema_version: "1.0.0"
commands:
  - name: serve
    inference_engine:
      name: "llama-server"
      binary: "llama-server"
      options: &serve_run_options
        - name: "--host"
          description: "IP address for the AI model server to listen on"
          value: "{{ '0.0.0.0' if args.container else args.host }}"
        - name: "--port"
          description: "Port for the AI model server to listen on"
          value: "{{ args.port }}"
        - name: "--log-file"
          description: "File path where llama-server writes its logs"
          value: "{{ args.logfile }}"
        - name: "--model"
          description: "The AI model to run"
          value: "{{ model.model_path }}"
        - name: "--mmproj"
          description: "File path to the mmproj model"
          value: "{{ model.mmproj_path }}"
          required: false
          if: "{{ model.mmproj_path }}"
        - name: "--chat-template-file"
          description: "File path to the chat template used for the model"
          value: "{{ model.chat_template_path }}"
          required: false
          if: "{{ not model.mmproj_path }}"
        - name: "--jinja"
          description: "Flag indicating if the chat template uses Jinja"
          if: "{{ not model.mmproj_path }}"
        - name: "--no-warmup"
          description: "Flag to disable the empty warm-up run"
        - name: "--reasoning-budget"
          description: "Controls the amount of thinking allowed"
          value: "0"
          if: "{{ not args.thinking }}"
        - name: "--alias"
          description: "The alias used when running the AI model"
          value: "{{ model.alias }}"
        - name: "--ctx-size"
          description: "Size of the prompt context"
          value: "{{ args.ctx_size }}"
          if: "{{ args.ctx_size > 0 }}"
        - name: "--temp"
          description: "Temperature"
          value: "{{ args.temp }}"
        - name: "--cache-reuse"
          description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
          value: "{{ args.cache_reuse }}"
        - name: "-v"
          description: "Enable debug logs"
          if: "{{ args.debug }}"
        - name: "--no-webui"
          description: "Disable the Web UI"
          if: "{{ args.webui == 'off' }}"
        - name: "-ngl"
          description: "Number of layers to offload to the GPU if available"
          value: "{{ 999 if args.ngl < 0 else args.ngl }}"
        - name: "--model-draft"
          description: "Draft model for speculative decoding"
          value: "{{ model.draft_model_path }}"
          if: "{{ args.model_draft }}"
        - name: "-ngld"
          description: "Number of draft model layers to offload to the GPU if available"
          value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
        - name: "--threads"
          description: "Number of threads to use during generation"
          value: "{{ args.threads }}"
        - name: "--seed"
          description: "Seed the global PRNG"
          value: "{{ args.seed }}"
        - name: "--log-colors"
          description: "Add color to the logs"
          value: "{{ 'on' }}"
          if: "{{ host.should_colorize }}"
        - name: "--rpc"
          description: "Comma separated list of RPC servers"
          value: "{{ host.rpc_nodes }}"
        - name: "-n"
          description: "Maximum number of tokens to generate (0 = unlimited)"
          value: "{{ args.max_tokens }}"
          if: "{{ args.max_tokens > 0 }}"
        # Special case:
        # Pass arbitrary runtime arguments to llama-server
        - name: ""
          description: "Arbitrary runtime arguments for llama-server"
          value: "{{ args.runtime_args }}"
  - name: run
    inference_engine:
      name: "llama-server with chat"
      binary: "llama-server"
      options: *serve_run_options
  - name: perplexity
    inference_engine:
      name: "llama-perplexity"
      binary: "llama-perplexity"
      options: &bench_perplexity_options
        - name: "--model"
          description: "The AI model to run"
          value: "{{ model.model_path }}"
        - name: "-ngl"
          description: "Number of layers to offload to the GPU if available"
          value: "{{ 999 if args.ngl < 0 else args.ngl }}"
        - name: "-ngld"
          description: "Number of draft model layers to offload to the GPU if available"
          value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
        - name: "--threads"
          description: "Number of threads to use during generation"
          value: "{{ args.threads }}"
  - name: bench
    inference_engine:
      name: "llama-bench"
      binary: "llama-bench"
      options: *bench_perplexity_options
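
The "value" and "if" fields above are Jinja-style template expressions written against the parsed CLI arguments (args), the resolved model, and host facts (host). As a rough illustration only, the sketch below shows how such an option list could be expanded into a llama-server command line with Jinja2; it is not ramalama's actual renderer, and the build_argv helper plus every context attribute value are assumptions made for the example.

# Minimal sketch (assumed behaviour, not ramalama's real code): expand the
# templated option list above into an argv for llama-server using Jinja2.
from types import SimpleNamespace

import yaml
from jinja2 import Template


def build_argv(binary, options, context):
    argv = [binary]
    for opt in options:
        cond = opt.get("if")
        # An "if" expression that renders to a falsy string skips the option.
        if cond and Template(cond).render(**context) in ("", "False", "None", "0"):
            continue
        if "value" in opt:
            value = Template(opt["value"]).render(**context)
            if not value or value == "None":
                continue  # empty/None value: drop the option entirely
            if opt["name"]:
                argv.append(opt["name"])
            argv.extend(value.split())  # runtime_args may expand to several tokens
        elif opt["name"]:
            argv.append(opt["name"])  # plain flag without a value, e.g. --jinja
    return argv


# Example context; every attribute here is a made-up value for illustration.
context = {
    "args": SimpleNamespace(container=True, host="127.0.0.1", port=8080,
                            logfile="/tmp/llama.log", thinking=False, ctx_size=2048,
                            temp=0.8, cache_reuse=256, debug=False, webui="on",
                            ngl=-1, model_draft="", threads=8, seed=42,
                            max_tokens=0, runtime_args=""),
    "model": SimpleNamespace(model_path="/models/model.gguf", mmproj_path="",
                             chat_template_path="/models/chat.jinja",
                             alias="mymodel", draft_model_path=""),
    "host": SimpleNamespace(should_colorize=True, rpc_nodes=""),
}

with open("llama.cpp.yaml") as f:
    spec = yaml.safe_load(f)
serve = next(c for c in spec["commands"] if c["name"] == "serve")
engine = serve["inference_engine"]
print(build_argv(engine["binary"], engine["options"], context))

Note that the &serve_run_options and &bench_perplexity_options anchors let the run and bench commands reuse the serve and perplexity option lists; yaml.safe_load resolves those aliases automatically when the file is parsed.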