mirror of https://github.com/containers/ramalama.git synced 2026-02-05 06:46:39 +01:00

Use default (auto) value for llama.cpp flash-attn

Also fix the uses_nvidia logic, which was inverted.

Signed-off-by: Oliver Walsh <owalsh@redhat.com>
Oliver Walsh
2026-01-28 16:41:28 +00:00
parent 2fdd68abd9
commit e956d11d70
7 changed files with 7 additions and 26 deletions
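
The uses_nvidia fix comes down to the truthiness of the detection helper: as the diff below shows, check_nvidia() is treated as returning a non-None value when an NVIDIA GPU is found and None otherwise, so the host flag must test `is not None`; the old `is None` check reported NVIDIA support on exactly the hosts that lack it. A minimal sketch of the corrected pattern (the stub check_nvidia and the simplified HostContext below are illustrative stand-ins, not RamaLama's real code):

    from dataclasses import dataclass
    from typing import Optional


    def check_nvidia() -> Optional[str]:
        # Illustrative stand-in: returns a non-None value when an NVIDIA GPU
        # is detected and None otherwise, mirroring how the diff uses the helper.
        return None  # pretend no GPU is present for this sketch


    @dataclass
    class HostContext:
        # Simplified stand-in for RamalamaHostContext, keeping only the flag
        # touched by this commit.
        uses_nvidia: bool


    # Before the fix the flag was inverted: `check_nvidia() is None` was True
    # on hosts *without* an NVIDIA GPU. The corrected check is:
    host = HostContext(uses_nvidia=check_nvidia() is not None)
    assert host.uses_nvidia is False  # no GPU detected in this sketch

With the flag pointing the right way, the `if: "{{ host.uses_nvidia or host.uses_metal }}"` guards on the removed spec entries would at least have fired on the right hosts; the commit drops those entries anyway, so llama-server is launched without --flash-attn and falls back to its own default, which the commit title describes as auto.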

View File

@@ -55,10 +55,6 @@ commands:
   - name: "--no-webui"
     description: "Disable the Web UI"
     if: "{{ args.webui == 'off' }}"
-  - name: "--flash-attn"
-    description: "Set Flash Attention use"
-    value: "on"
-    if: "{{ host.uses_nvidia or host.uses_metal }}"
   - name: "-ngl"
     description: "Number of layers to offload to the GPU if available"
     value: "{{ 999 if args.ngl < 0 else args.ngl }}"

View File

@@ -169,9 +169,10 @@ class RamalamaCommandContext:
             model = cli_args.model
         else:
             model = None
         host = RamalamaHostContext(
             is_container,
-            check_nvidia() is None,
+            check_nvidia() is not None,
             check_metal(argparse.Namespace(**{"container": is_container})),
             should_colorize(),
             os.getenv("RAMALAMA_LLAMACPP_RPC_NODES", None),

View File

@@ -16,7 +16,6 @@ from test.conftest import (
     skip_if_docker,
     skip_if_gh_actions_darwin,
     skip_if_no_container,
-    skip_if_not_darwin,
     skip_if_ppc64le,
     skip_if_s390x,
 )
@@ -182,10 +181,6 @@ def test_basic_dry_run():
         [], r".*--cache-reuse 256", None, None, True,
         id="check --cache-reuse default value", marks=skip_if_container
     ),
-    pytest.param(
-        [], r".*--flash-attn", None, None, True,
-        id="check --flash-attn", marks=[skip_if_container, skip_if_not_darwin]
-    ),
     pytest.param(
         ["--host", "127.0.0.1"],
         r".*--host 127.0.0.1", None, None, True,

View File

@@ -73,9 +73,6 @@ verify_begin=".*run --rm"
     run_ramalama -q --dryrun serve ${model}
     assert "$output" =~ ".*--host 0.0.0.0" "Outside container sets host to 0.0.0.0"
     is "$output" ".*--cache-reuse 256" "should use cache"
-    if is_darwin; then
-        is "$output" ".*--flash-attn on" "use flash-attn on Darwin metal"
-    fi
     run_ramalama -q --dryrun serve --seed abcd --host 127.0.0.1 ${model}
     assert "$output" =~ ".*--host 127.0.0.1" "Outside container overrides host to 127.0.0.1"

View File

@@ -53,10 +53,6 @@ commands:
   - name: "--no-webui"
     description: "Disable the Web UI"
     if: "{{ args.webui == 'off' }}"
-  - name: "--flash-attn"
-    description: "Set Flash Attention use"
-    value: "on"
-    if: "{{ host.uses_nvidia or host.uses_metal }}"
   - name: "-ngl"
     description: "Number of layers to offload to the GPU if available"
     value: "{{ 999 if args.ngl < 0 else args.ngl }}"

View File

@@ -55,10 +55,6 @@ commands:
   - name: "--no-webui"
     description: "Disable the Web UI"
     if: "{{ args.webui == 'off' }}"
-  - name: "--flash-attn"
-    description: "Set Flash Attention use"
-    value: "on"
-    if: "{{ host.uses_nvidia or host.uses_metal }}"
   - name: "-ngl"
     description: "Number of layers to offload to the GPU if available"
     value: "{{ 999 if args.ngl < 0 else args.ngl }}"

View File

@@ -83,23 +83,23 @@ class FactoryInput:
     [
         (
             FactoryInput(),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args", # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args", # noqa: E501
         ),
         (
             FactoryInput(has_mmproj=True),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --mmproj /path/to/mmproj --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args", # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --mmproj /path/to/mmproj --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args", # noqa: E501
         ),
         (
             FactoryInput(has_chat_template=False),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args", # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args", # noqa: E501
        ),
        (
            FactoryInput(cli_args=CLIArgs(runtime_args="")),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on", # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on", # noqa: E501
        ),
        (
            FactoryInput(cli_args=CLIArgs(max_tokens=99, runtime_args="")),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on -n 99", # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on -n 99", # noqa: E501
        ),
     ],
 )