
Merge pull request #911 from leo-pony/main

Add support for llama.cpp engine to use ascend NPU device
Daniel J Walsh (committed by GitHub), 2025-03-06 12:11:30 -05:00
9 changed files with 212 additions and 5 deletions


@@ -129,7 +129,7 @@ format:
.PHONY: codespell
codespell:
codespell --dictionary=- -w --skip="*/venv*"
codespell --dictionary=- --ignore-words-list "cann" -w --skip="*/venv*"
.PHONY: test-run
test-run:


@@ -116,7 +116,7 @@ curl -fsSL https://raw.githubusercontent.com/containers/ramalama/s/install.sh |
| Apple Silicon GPU (podman-machine) | :white_check_mark: |
| Nvidia GPU (cuda) | :white_check_mark: |
| AMD GPU (rocm) | :white_check_mark: |
| Ascend NPU (Linux) | :white_check_mark: |
## COMMANDS
| Command | Description |


@@ -0,0 +1,19 @@
# Base image with CANN for compilation
ARG ASCEND_VERSION=cann:8.0.0-910b-openeuler22.03-py3.10
FROM quay.io/ascend/${ASCEND_VERSION} AS builder
ARG GOLANG_VERSION
COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
sh -x /scripts/build_llama_and_whisper.sh "cann"
FROM quay.io/ascend/${ASCEND_VERSION}
# Copy the entire installation directory from the builder
COPY --from=builder /tmp/install /usr
ENV MODEL_PATH=/mnt/models/model.file
COPY --chmod=755 ../scripts /usr/bin
ENTRYPOINT [ \
"/bin/bash", \
"-c", \
"export LD_LIBRARY_PATH=/usr/lib:${LD_LIBRARY_PATH} && source /usr/local/Ascend/ascend-toolkit/set_env.sh && exec \"$@\"", "--" \
]


@@ -63,13 +63,52 @@ dnf_install() {
. /opt/rh/gcc-toolset-12/enable
elif [ "$containerfile" = "intel-gpu" ]; then
dnf_install_intel_gpu
elif [ "$containerfile" = "cann" ]; then
# Only needed for the openEuler build environment; does not need to be pushed to the ollama GitHub
dnf install -y git \
gcc \
gcc-c++ \
make \
cmake \
findutils \
yum \
curl-devel \
pigz
fi
dnf -y clean all
}
cmake_check_warnings() {
awk -v rc=0 '/CMake Warning:/ { rc=1 } 1; END {exit rc}'
# There is a warning "CMake Warning: Manually-specified variables were not used by the project" when compiling the custom Ascend kernels of the ggml CANN backend.
# Remove the "cann" condition once this warning is fixed in llama.cpp/whisper.cpp.
if [ "$containerfile" != "cann" ]; then
awk -v rc=0 '/CMake Warning:/ { rc=1 } 1; END {exit rc}'
else
awk '/CMake Warning:/ {print $0}'
fi
}
setup_build_env() {
if [ "$containerfile" = "cann" ]; then
# source build env
cann_in_sys_path=/usr/local/Ascend/ascend-toolkit;
cann_in_user_path=$HOME/Ascend/ascend-toolkit;
if [ -f "${cann_in_sys_path}/set_env.sh" ]; then
# shellcheck disable=SC1091
source ${cann_in_sys_path}/set_env.sh;
export LD_LIBRARY_PATH=${cann_in_sys_path}/latest/lib64:${cann_in_sys_path}/latest/aarch64-linux/devlib:${LD_LIBRARY_PATH};
export LIBRARY_PATH=${cann_in_sys_path}/latest/lib64:${LIBRARY_PATH};
elif [ -f "${cann_in_user_path}/set_env.sh" ]; then
# shellcheck disable=SC1091
source "$HOME/Ascend/ascend-toolkit/set_env.sh";
export LD_LIBRARY_PATH=${cann_in_user_path}/latest/lib64:${cann_in_user_path}/latest/aarch64-linux/devlib:${LD_LIBRARY_PATH};
export LIBRARY_PATH=${cann_in_user_path}/latest/lib64:${LIBRARY_PATH};
else
echo "No Ascend Toolkit found";
exit 1;
fi
fi
}
cmake_steps() {
@@ -80,7 +119,7 @@ cmake_steps() {
}
set_install_prefix() {
if [ "$containerfile" = "cuda" ] || [ "$containerfile" = "intel-gpu" ]; then
if [ "$containerfile" = "cuda" ] || [ "$containerfile" = "intel-gpu" ] || [ "$containerfile" = "cann" ]; then
install_prefix="/tmp/install"
else
install_prefix="/usr"
@@ -105,6 +144,9 @@ configure_common_flags() {
intel-gpu)
common_flags+=("-DGGML_SYCL=ON" "-DCMAKE_C_COMPILER=icx" "-DCMAKE_CXX_COMPILER=icpx")
;;
cann)
common_flags+=("-DGGML_CANN=ON" "-DSOC_TYPE=Ascend910B3")
;;
esac
}
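
For reference, the same two flags can also be used in a standalone llama.cpp build outside the container; a minimal sketch, assuming the CANN toolkit environment has already been sourced and that the target SoC is the Ascend910B3 hard-coded above:

```bash
# Rough equivalent of the cann case above for a plain llama.cpp checkout
# (the -j value is host-dependent; flags are taken from this PR's cann case).
source /usr/local/Ascend/ascend-toolkit/set_env.sh
cmake -B build -DGGML_CANN=ON -DSOC_TYPE=Ascend910B3 -DLLAMA_CURL=ON
cmake --build build -j "$(nproc)"
```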
@@ -164,6 +206,7 @@ main() {
if [ -n "$containerfile" ]; then
clone_and_build_ramalama
fi
setup_build_env
clone_and_build_whisper_cpp
common_flags+=("-DLLAMA_CURL=ON")
case "$containerfile" in

docs/ramalama-cann.7.md Normal file

@@ -0,0 +1,63 @@
% ramalama 7
# Setting Up RamaLama with Ascend NPU Support on Linux systems
This guide walks through the steps required to set up RamaLama with Ascend NPU support.
- [Background](#background)
- [Hardware](#hardware)
- [Docker](#docker)
- [HISTORY](#history)
## Background
**Ascend NPU** is a range of AI processors built around a Neural Processing Unit. It efficiently handles matrix-matrix multiplication, dot products, and scalar operations.
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
## Hardware
### Ascend NPU
**Verified devices**
| Ascend NPU | Status |
| ----------------------------- | ------- |
| Atlas A2 Training series | Support |
| Atlas 800I A2 Inference series | Support |
*Notes:*
- If you have trouble with your Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
- If you run successfully on your Ascend NPU device, please help update the table above.
## Docker
### Install the Ascend driver
This provides NPU acceleration using the AI cores of your Ascend NPU. [CANN](https://www.hiascend.com/en/software/cann) is a set of hierarchical APIs that help you quickly build AI applications and services on Ascend NPUs.
For more information about Ascend NPUs, see the [Ascend Community](https://www.hiascend.com/en/).
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
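Before building, it helps to confirm that both the NPU driver and the toolkit are visible on the host; a quick check, assuming the default system-wide toolkit location also used by the build script in this PR:

```bash
# Driver check: should list the Ascend NPU(s) on this machine.
npu-smi info
# Toolkit check: the build script and container entrypoint source this file;
# adjust the path if the toolkit lives under $HOME/Ascend instead.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
```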
### Build Images
Go to the `ramalama` directory and build using make:
```bash
make build IMAGE=cann
make install
```
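If the build succeeds, the image should be available locally (tag assumed from the serve example below):

```bash
podman images quay.io/ramalama/cann
```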
You can test with:
```bash
ramalama --image quay.io/ramalama/cann:latest serve -d -p 8080 --device=/dev/davinci0 -name ollama://smollm:135m
```
In another window, check the running podman container:
```
$ podman ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
80fc31c131b0 quay.io/ramalama/cann:latest "/bin/bash -c 'expor…" About an hour ago Up About an hour ame
```
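Once the container is up, you can exercise the model over the published port; a minimal sketch, assuming the llama.cpp server exposes its OpenAI-compatible endpoint on port 8080 as in the serve command above:

```bash
curl -s http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Say hello"}]}'
```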
For other usage guides, see the RamaLama [README.md](https://github.com/containers/ramalama/blob/main/README.md).
## HISTORY
Mar 2025, Originally compiled


@@ -194,6 +194,14 @@ def show_gpus_available_cli(args):
errors.append({"Vendor": "Apple", "INFO": str(e)})
else: # Linux/Other OS GPU detection
try:
ascend_gpus = gpu_detector.get_ascend_npu()
# Since Ascend devices are not required, keep quiet if no Ascend devices are detected
if ascend_gpus:
gpu_info.extend(ascend_gpus)
except Exception as e:
errors.append({"Vendor": "Ascend", "INFO": str(e)})
try:
nvidia_gpus = gpu_detector.get_nvidia_gpu()
if nvidia_gpus:


@@ -329,6 +329,15 @@ def get_gpu():
except Exception:
pass
# Ascend CASE
try:
command = ['npu-smi']
run_cmd(command).stdout.decode("utf-8")
os.environ["CANN_VISIBLE_DEVICES"] = "0"
return
except Exception:
pass
# ROCm/AMD CASE
i = 0
gpu_num = 0
@@ -359,7 +368,7 @@ def get_gpu():
def get_env_vars():
prefixes = ("ASAHI_", "CUDA_", "HIP_", "HSA_", "INTEL_")
prefixes = ("ASAHI_", "CUDA_", "HIP_", "HSA_", "INTEL_", "CANN_")
env_vars = {k: v for k, v in os.environ.items() if k.startswith(prefixes)}
# gpu_type, gpu_num = get_gpu()


@@ -1,6 +1,8 @@
import glob
import logging
import platform
import re
import shutil
import subprocess
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -147,6 +149,54 @@ class GPUDetector:
logging.error(f"Unexpected error while detecting macOS GPU: {e}")
return [{"GPU": "Unknown", "Error": str(e)}]
def run_command_and_extract(self, cmd, pattern, error_msg):
"""Run a command and extract a value using regex. Raises ValueError if not found."""
try:
proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
match = re.search(pattern, proc.stdout)
if match:
return match.group(1)
else:
raise ValueError(error_msg)
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to run command:{cmd} on linux. Error: {e}")
def get_ascend_npu(self):
"""Detects Ascend NPUs using npu-smi (Linux only)."""
if platform.system() != "Linux":
return # Skip on macOS and other platforms
if shutil.which("npu-smi") is None:
logging.info("The 'npu-smi' command to detect ascend npu is NOT available.")
return
try:
gpus = []
# get total npu number
total_count = int(
self.run_command_and_extract(
["npu-smi", "info", "-l"], r"Total Count\s+:\s*(\d+)", "Could not determine total NPU count."
)
)
for npu_id in range(total_count):
gpu_info = {"GPU": npu_id, "Vendor": "Ascend", "Env": "CANN_VISIBLE_DEVICES"}
# get memory of each card
hbm_capacity = int(
self.run_command_and_extract(
["npu-smi", "info", "-t", "memory", "-i", str(npu_id)],
r"HBM Capacity\(MB\)\s+:\s*(\d+)",
f"Could not find HBM Capacity for NPU {npu_id}.",
)
)
self._update_best_gpu(hbm_capacity, npu_id, "CANN_VISIBLE_DEVICES")
gpu_info["VRAM"] = hbm_capacity
gpus.append(gpu_info)
return gpus
except Exception as e:
error_msg = getattr(e, 'stderr', "Error (check if Ascend drivers are loaded).")
raise RuntimeError(f"Unable to detect Ascend NPU(s). Error: {error_msg}")
def detect_best_gpu(self, gpu_template):
"""
Compares Nvidia, AMD, Apple, and Intel GPUs and appends the best GPU
@@ -192,6 +242,16 @@ class GPUDetector:
except RuntimeError as e:
logging.warning(f"Warning: Intel detection failed: {e}")
try:
ascend_gpus = self.get_ascend_npu()
for gpu in ascend_gpus:
vram = int(gpu.get("VRAM", 0))
if vram > best_vram:
best_gpu = gpu
best_vram = vram
best_env = "CANN"
except RuntimeError as e:
logging.warning(f"Warning: Ascend detection failed: {e}")
elif system == "Darwin": # macOS
try:
macos_gpus = self.get_macos_gpu()
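
The regexes in `get_ascend_npu()` above assume the plain-text layout printed by `npu-smi`; a rough shell equivalent of the same extraction, with the field names taken from those patterns rather than from real device output:

```bash
# Total number of NPUs (expects a line like "Total Count : 1").
npu-smi info -l | grep -oP 'Total Count\s+:\s*\K\d+'
# HBM size in MB for card 0 (expects a line like "HBM Capacity(MB) : 65536").
npu-smi info -t memory -i 0 | grep -oP 'HBM Capacity\(MB\)\s+:\s*\K\d+'
```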


@@ -208,6 +208,7 @@ class Model(ModelBase):
"CUDA_VISIBLE_DEVICES": "quay.io/ramalama/cuda",
"ASAHI_VISIBLE_DEVICES": "quay.io/ramalama/asahi",
"INTEL_VISIBLE_DEVICES": "quay.io/ramalama/intel-gpu",
"CANN_VISIBLE_DEVICES": "quay.io/ramalama/cann",
}
image = images.get(gpu_type, args.image)
@@ -321,6 +322,9 @@ class Model(ModelBase):
if os.path.exists("/dev/kfd"):
conman_args += ["--device", "/dev/kfd"]
if os.path.exists("/dev/davinci0"):
conman_args += ["--device", "/dev/davinci0"]
for k, v in get_env_vars().items():
# Special case for Cuda
if k == "CUDA_VISIBLE_DEVICES":
@@ -363,6 +367,7 @@ class Model(ModelBase):
or os.getenv("ASAHI_VISIBLE_DEVICES")
or os.getenv("CUDA_VISIBLE_DEVICES")
or os.getenv("INTEL_VISIBLE_DEVICES")
or os.getenv("CANN_VISIBLE_DEVICES")
or (
# linux and macOS report aarch64 (linux), arm64 (macOS)
podman_machine_accel
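
Taken together, the model.py hunks above mean the new environment variable can also be set by hand to steer RamaLama onto the CANN image when automatic detection is not wanted; a sketch, assuming the variable is honored the same way as the other `*_VISIBLE_DEVICES` variables listed above:

```bash
CANN_VISIBLE_DEVICES=0 ramalama serve -d -p 8080 ollama://smollm:135m
```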