schema_version: "1.0.0"
|
|
commands:
|
|
- name: serve
|
|
inference_engine:
|
|
name: "llama-server"
|
|
binary: "llama-server"
|
|
options: &serve_run_options
|
|
- name: "--host"
|
|
description: "IP address for the AI model server to listen on"
|
|
value: "{{ '0.0.0.0' if args.container else args.host }}"
|
|
- name: "--port"
|
|
description: "Port for the AI model server to listen on"
|
|
value: "{{ args.port }}"
|
|
- name: "--log-file"
|
|
description: "File path for the llama-server writing its logs to"
|
|
value: "{{ args.logfile }}"
|
|
- name: "--model"
|
|
description: "The AI model to run"
|
|
value: "{{ model.model_path }}"
|
|
- name: "--mmproj"
|
|
description: "File path to the mmproj model"
|
|
value: "{{ model.mmproj_path }}"
|
|
required: false
|
|
if: "{{ model.mmproj_path }}"
|
|
- name: "--chat-template-file"
|
|
description: "File path to the chat template used for the model"
|
|
value: "{{ model.chat_template_path }}"
|
|
required: false
|
|
if: "{{ not model.mmproj_path }}"
|
|
- name: "--jinja"
|
|
description: "Flag indicating if the chat template uses Jinja"
|
|
if: "{{ not model.mmproj_path }}"
|
|
- name: "--no-warmup"
|
|
description: "Flag to disable empty run for warm up"
|
|
- name: "--reasoning-budget"
|
|
description: "Controls the amount of thinking allowed"
|
|
value: "0"
|
|
if: "{{ not args.thinking }}"
|
|
- name: "--alias"
|
|
description: "The alias used when running the AI model"
|
|
value: "{{ model.alias }}"
|
|
- name: "--ctx-size"
|
|
description: "Size of the prompt context"
|
|
value: "{{ args.ctx_size }}"
|
|
if: "{{ args.ctx_size > 0 }}"
|
|
- name: "--temp"
|
|
description: "Temperature"
|
|
value: "{{ args.temp }}"
|
|
- name: "--cache-reuse"
|
|
description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
|
|
value: "{{ args.cache_reuse }}"
|
|
- name: "-v"
|
|
description: "Enable debug logs"
|
|
if: "{{ args.debug }}"
|
|
- name: "--no-webui"
|
|
description: "Disable the Web UI"
|
|
if: "{{ args.webui == 'off' }}"
|
|
- name: "-ngl"
|
|
description: "Number of layers to offload to the GPU if available"
|
|
value: "{{ 999 if args.ngl < 0 else args.ngl }}"
|
|
- name: "--model-draft"
|
|
description: "Draft model for speculative decoding"
|
|
value: "{{ model.draft_model_path }}"
|
|
if: "{{ args.model_draft }}"
|
|
- name: "-ngld"
|
|
description: "Number of layers to offload to the GPU if available"
|
|
value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
|
|
- name: "--threads"
|
|
description: "Number of Threads to use during generation"
|
|
value: "{{ args.threads }}"
|
|
- name: "--seed"
|
|
description: "Seed the global PRNG"
|
|
value: "{{ args.seed }}"
|
|
- name: "--log-colors"
|
|
description: "Add color to the logs"
|
|
value: "{{ 'on' }}"
|
|
if: "{{ host.should_colorize }}"
|
|
- name: "--rpc"
|
|
description: "Comma separated list of RPC servers"
|
|
value: "{{ host.rpc_nodes }}"
|
|
- name: "-n"
|
|
description: "Maximum number of tokens to generate (0 = unlimited)"
|
|
value: "{{ args.max_tokens }}"
|
|
if: "{{ args.max_tokens > 0 }}"
|
|
# Special case:
|
|
# Pass arbitrary runtime arguments to llama-server
|
|
- name: ""
|
|
description: "Arbitrary runtime arguments for llama-server"
|
|
value: "{{ args.runtime_args }}"
|
|
- name: run
|
|
inference_engine:
|
|
name: "llama-server with chat"
|
|
binary: "llama-server"
|
|
options: *serve_run_options
|
|
- name: perplexity
|
|
inference_engine:
|
|
name: "llama-perplexity"
|
|
binary: "llama-perplexity"
|
|
options: &bench_perplexity_options
|
|
- name: "--model"
|
|
description: "The AI model to run"
|
|
value: "{{ model.model_path }}"
|
|
- name: "-ngl"
|
|
description: "Number of layers to offload to the GPU if available"
|
|
value: "{{ 999 if args.ngl < 0 else args.ngl }}"
|
|
- name: "-ngld"
|
|
description: "Number of layers to offload to the GPU if available"
|
|
value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
|
|
- name: "--threads"
|
|
description: "Number of Threads to use during generation"
|
|
value: "{{ args.threads }}"
|
|
- name: bench
|
|
inference_engine:
|
|
name: "llama-bench"
|
|
binary: "llama-bench"
|
|
options: *bench_perplexity_options
|
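# Illustrative only (not part of the spec): assuming hypothetical argument
# values such as args.container=true, args.port=8080, args.ctx_size=2048 and
# args.ngl=-1, the templated serve options above would render along the lines
# of:
#   --host 0.0.0.0 --port 8080 --ctx-size 2048 -ngl 999 ...
# How empty or conditional options are dropped is decided by the consumer of
# this spec, so the exact command line may differ.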