schema_version: "1.0.0"
|
|
commands:
|
|
- name: serve
|
|
inference_engine:
|
|
name: "llama-server"
|
|
binary: "llama-server"
|
|
options: &serve_run_options
|
|
- name: "--host"
|
|
description: "IP address for the AI model server to listen on"
|
|
value: "{{ '0.0.0.0' if args.container else args.host }}"
|
|
- name: "--port"
|
|
description: "Port for the AI model server to listen on"
|
|
value: "{{ args.port }}"
|
|
- name: "--log-file"
|
|
description: "File path for the llama-server writing its logs to"
|
|
value: "{{ args.logfile }}"
|
|
- name: "--model"
|
|
description: "The AI model to run"
|
|
value: "{{ model.model_path }}"
|
|
- name: "--mmproj"
|
|
description: "File path to the mmproj model"
|
|
value: "{{ model.mmproj_path }}"
|
|
required: false
|
|
if: "{{ model.mmproj_path }}"
|
|
- name: "--chat-template-file"
|
|
description: "File path to the chat template used for the model"
|
|
value: "{{ model.chat_template_path }}"
|
|
required: false
|
|
if: "{{ not model.mmproj_path }}"
|
|
- name: "--jinja"
|
|
description: "Flag indicating if the chat template uses Jinja"
|
|
if: "{{ not model.mmproj_path }}"
|
|
- name: "--no-warmup"
|
|
description: "Flag to disable empty run for warm up"
|
|
- name: "--reasoning-budget"
|
|
description: "Controls the amount of thinking allowed"
|
|
value: "0"
|
|
if: "{{ not args.thinking }}"
|
|
- name: "--alias"
|
|
description: "The alias used when running the AI model"
|
|
value: "{{ model.alias }}"
|
|
- name: "--ctx-size"
|
|
description: "Size of the prompt context"
|
|
value: "{{ args.ctx_size }}"
|
|
if: "{{ args.ctx_size > 0 }}"
|
|
- name: "--temp"
|
|
description: "Temperature"
|
|
value: "{{ args.temp }}"
|
|
- name: "--cache-reuse"
|
|
description: "Minimum chunk size to attempt reusing from the cache via KV shifting"
|
|
value: "{{ args.cache_reuse }}"
|
|
- name: "-v"
|
|
description: "Enable debug logs"
|
|
if: "{{ args.debug }}"
|
|
- name: "--no-webui"
|
|
description: "Disable the Web UI"
|
|
if: "{{ args.webui == 'off' }}"
|
|
- name: "-ngl"
|
|
description: "Number of layers to offload to the GPU if available"
|
|
value: "{{ 999 if args.ngl < 0 else args.ngl }}"
|
|
- name: "--model-draft"
|
|
description: "Draft model for speculative decoding"
|
|
value: "{{ model.draft_model_path }}"
|
|
if: "{{ args.model_draft }}"
|
|
- name: "-ngld"
|
|
description: "Number of layers to offload to the GPU if available"
|
|
value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
|
|
- name: "--threads"
|
|
description: "Number of Threads to use during generation"
|
|
value: "{{ args.threads }}"
|
|
- name: "--seed"
|
|
description: "Seed the global PRNG"
|
|
value: "{{ args.seed }}"
|
|
- name: "--log-colors"
|
|
description: "Add color to the logs"
|
|
value: "{{ 'on' }}"
|
|
if: "{{ host.should_colorize }}"
|
|
- name: "--rpc"
|
|
description: "Comma separated list of RPC servers"
|
|
value: "{{ host.rpc_nodes }}"
|
|
- name: "-n"
|
|
description: "Maximum number of tokens to generate (0 = unlimited)"
|
|
value: "{{ args.max_tokens }}"
|
|
if: "{{ args.max_tokens > 0 }}"
|
|
# Special case:
|
|
# Pass arbitrary runtime arguments to llama-server
|
|
- name: ""
|
|
description: "Arbitrary runtime arguments for llama-server"
|
|
value: "{{ args.runtime_args }}"
|
|
- name: run
|
|
inference_engine:
|
|
name: "llama-server with chat"
|
|
binary: "llama-server"
|
|
options: *serve_run_options
|
|
- name: perplexity
|
|
inference_engine:
|
|
name: "llama-perplexity"
|
|
binary: "llama-perplexity"
|
|
options: &bench_perplexity_options
|
|
- name: "--model"
|
|
description: "The AI model to run"
|
|
value: "{{ model.model_path }}"
|
|
- name: "-ngl"
|
|
description: "Number of layers to offload to the GPU if available"
|
|
value: "{{ 999 if args.ngl < 0 else args.ngl }}"
|
|
- name: "-ngld"
|
|
description: "Number of layers to offload to the GPU if available"
|
|
value: "{{ None if not args.model_draft else 999 if args.ngl < 0 else args.ngl }}"
|
|
- name: "--threads"
|
|
description: "Number of Threads to use during generation"
|
|
value: "{{ args.threads }}"
|
|
- name: bench
|
|
inference_engine:
|
|
name: "llama-bench"
|
|
binary: "llama-bench"
|
|
options: *bench_perplexity_options
|
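# Illustrative only (not part of the spec): assuming hypothetical argument
# values such as args.container=true, args.port=8080, args.ctx_size=2048 and
# args.ngl=-1, the templated serve options above would render along the lines
# of:
#   --host 0.0.0.0 --port 8080 --ctx-size 2048 -ngl 999 ...
# How empty or conditional options are dropped is decided by the consumer of
# this spec, so the exact command line may differ.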