diff --git a/inference-spec/engines/llama.cpp.yaml b/inference-spec/engines/llama.cpp.yaml
index 2d8ec4b7..d1823a99 100644
--- a/inference-spec/engines/llama.cpp.yaml
+++ b/inference-spec/engines/llama.cpp.yaml
@@ -55,10 +55,6 @@ commands:
       - name: "--no-webui"
         description: "Disable the Web UI"
         if: "{{ args.webui == 'off' }}"
-      - name: "--flash-attn"
-        description: "Set Flash Attention use"
-        value: "on"
-        if: "{{ host.uses_nvidia or host.uses_metal }}"
       - name: "-ngl"
         description: "Number of layers to offload to the GPU if available"
         value: "{{ 999 if args.ngl < 0 else args.ngl }}"
diff --git a/ramalama/command/context.py b/ramalama/command/context.py
index e92e9a66..6ebb4c40 100644
--- a/ramalama/command/context.py
+++ b/ramalama/command/context.py
@@ -169,9 +169,10 @@ class RamalamaCommandContext:
             model = cli_args.model
         else:
             model = None
+
         host = RamalamaHostContext(
             is_container,
-            check_nvidia() is None,
+            check_nvidia() is not None,
             check_metal(argparse.Namespace(**{"container": is_container})),
             should_colorize(),
             os.getenv("RAMALAMA_LLAMACPP_RPC_NODES", None),
diff --git a/test/e2e/test_serve.py b/test/e2e/test_serve.py
index 497d7964..ce316547 100644
--- a/test/e2e/test_serve.py
+++ b/test/e2e/test_serve.py
@@ -16,7 +16,6 @@ from test.conftest import (
     skip_if_docker,
     skip_if_gh_actions_darwin,
     skip_if_no_container,
-    skip_if_not_darwin,
     skip_if_ppc64le,
     skip_if_s390x,
 )
@@ -182,10 +181,6 @@ def test_basic_dry_run():
         pytest.param(
             [], r".*--cache-reuse 256", None, None, True,
             id="check --cache-reuse default value", marks=skip_if_container
         ),
-        pytest.param(
-            [], r".*--flash-attn", None, None, True,
-            id="check --flash-attn", marks=[skip_if_container, skip_if_not_darwin]
-        ),
         pytest.param(
             ["--host", "127.0.0.1"], r".*--host 127.0.0.1", None, None, True,
diff --git a/test/system/040-serve.bats b/test/system/040-serve.bats
index cac45ced..f9256e16 100755
--- a/test/system/040-serve.bats
+++ b/test/system/040-serve.bats
@@ -73,9 +73,6 @@ verify_begin=".*run --rm"
     run_ramalama -q --dryrun serve ${model}
     assert "$output" =~ ".*--host 0.0.0.0" "Outside container sets host to 0.0.0.0"
     is "$output" ".*--cache-reuse 256" "should use cache"
-    if is_darwin; then
-        is "$output" ".*--flash-attn on" "use flash-attn on Darwin metal"
-    fi
 
     run_ramalama -q --dryrun serve --seed abcd --host 127.0.0.1 ${model}
     assert "$output" =~ ".*--host 127.0.0.1" "Outside container overrides host to 127.0.0.1"
diff --git a/test/unit/command/data/engines/llama.cpp.missing.version.yaml b/test/unit/command/data/engines/llama.cpp.missing.version.yaml
index 8d3a655e..c21eeaba 100644
--- a/test/unit/command/data/engines/llama.cpp.missing.version.yaml
+++ b/test/unit/command/data/engines/llama.cpp.missing.version.yaml
@@ -53,10 +53,6 @@ commands:
       - name: "--no-webui"
         description: "Disable the Web UI"
         if: "{{ args.webui == 'off' }}"
-      - name: "--flash-attn"
-        description: "Set Flash Attention use"
-        value: "on"
-        if: "{{ host.uses_nvidia or host.uses_metal }}"
       - name: "-ngl"
         description: "Number of layers to offload to the GPU if available"
         value: "{{ 999 if args.ngl < 0 else args.ngl }}"
diff --git a/test/unit/command/data/engines/llama.cpp.yaml b/test/unit/command/data/engines/llama.cpp.yaml
index 6c49407a..79c8d328 100644
--- a/test/unit/command/data/engines/llama.cpp.yaml
+++ b/test/unit/command/data/engines/llama.cpp.yaml
@@ -55,10 +55,6 @@ commands:
       - name: "--no-webui"
         description: "Disable the Web UI"
         if: "{{ args.webui == 'off' }}"
-      - name: "--flash-attn"
-        description: "Set Flash Attention use"
-        value: "on"
-        if: "{{ host.uses_nvidia or host.uses_metal }}"
       - name: "-ngl"
         description: "Number of layers to offload to the GPU if available"
         value: "{{ 999 if args.ngl < 0 else args.ngl }}"
diff --git a/test/unit/command/test_factory.py b/test/unit/command/test_factory.py
index e503efd7..23f01400 100644
--- a/test/unit/command/test_factory.py
+++ b/test/unit/command/test_factory.py
@@ -83,23 +83,23 @@ class FactoryInput:
     [
         (
             FactoryInput(),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args",  # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args",  # noqa: E501
         ),
         (
             FactoryInput(has_mmproj=True),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --mmproj /path/to/mmproj --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args",  # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --mmproj /path/to/mmproj --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args",  # noqa: E501
         ),
         (
             FactoryInput(has_chat_template=False),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args",  # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on --another-arg 44 --more-args",  # noqa: E501
         ),
         (
             FactoryInput(cli_args=CLIArgs(runtime_args="")),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on",  # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on",  # noqa: E501
         ),
         (
             FactoryInput(cli_args=CLIArgs(max_tokens=99, runtime_args="")),
-            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v --flash-attn on -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on -n 99",  # noqa: E501
+            "llama-server --host 0.0.0.0 --port 1337 --log-file /var/tmp/ramalama.log --model /path/to/model --chat-template-file /path/to/chat-template --jinja --no-warmup --reasoning-budget 0 --alias library/smollm --ctx-size 512 --temp 11 --cache-reuse 1024 -v -ngl 44 --model-draft /path/to/draft-model -ngld 44 --threads 8 --seed 12345 --log-colors on -n 99",  # noqa: E501
         ),
     ],
 )