Merge pull request #911 from leo-pony/main
Add support for llama.cpp engine to use ascend NPU device
Makefile (2 changed lines)
@@ -129,7 +129,7 @@ format:

.PHONY: codespell
codespell:
	codespell --dictionary=- -w --skip="*/venv*"
	codespell --dictionary=- --ignore-words-list "cann" -w --skip="*/venv*"

.PHONY: test-run
test-run:
@@ -116,7 +116,7 @@ curl -fsSL https://raw.githubusercontent.com/containers/ramalama/s/install.sh |
| Apple Silicon GPU (podman-machine) | :white_check_mark: |
| Nvidia GPU (cuda) | :white_check_mark: |
| AMD GPU (rocm) | :white_check_mark: |
| Ascend NPU (Linux) | :white_check_mark: |

## COMMANDS

| Command | Description |
container-images/cann/Containerfile (new file, 19 lines)
@@ -0,0 +1,19 @@
# Base image with CANN for compilation
ARG ASCEND_VERSION=cann:8.0.0-910b-openeuler22.03-py3.10

FROM quay.io/ascend/${ASCEND_VERSION} AS builder
ARG GOLANG_VERSION
COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
    sh -x /scripts/build_llama_and_whisper.sh "cann"

FROM quay.io/ascend/${ASCEND_VERSION}
# Copy the entire installation directory from the builder
COPY --from=builder /tmp/install /usr
ENV MODEL_PATH=/mnt/models/model.file
COPY --chmod=755 ../scripts /usr/bin
ENTRYPOINT [ \
    "/bin/bash", \
    "-c", \
    "export LD_LIBRARY_PATH=/usr/lib:${LD_LIBRARY_PATH} && source /usr/local/Ascend/ascend-toolkit/set_env.sh && exec \"$@\"", "--" \
]
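The ENTRYPOINT above relies on the `bash -c '… exec "$@"' --` idiom: the wrapper exports the library path, sources the CANN environment script, and then replaces itself with whatever command the container was started with. A minimal sketch of the same idiom outside a container (`EXAMPLE_VAR` and `printenv` are only illustrative stand-ins for the real environment setup and server command):

```bash
# Everything after "--" becomes "$@" inside the -c string, so the wrapper
# can prepare the environment and then exec the requested command with its
# original arguments intact.
bash -c 'export EXAMPLE_VAR=ready && exec "$@"' -- printenv EXAMPLE_VAR
# prints: ready
```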
@@ -63,13 +63,52 @@ dnf_install() {
    . /opt/rh/gcc-toolset-12/enable
  elif [ "$containerfile" = "intel-gpu" ]; then
    dnf_install_intel_gpu
  elif [ "$containerfile" = "cann" ]; then
    # only needed for the openEuler build environment; does not need to be pushed to the ollama GitHub
    dnf install -y git \
        gcc \
        gcc-c++ \
        make \
        cmake \
        findutils \
        yum \
        curl-devel \
        pigz
  fi

  dnf -y clean all
}

cmake_check_warnings() {
  awk -v rc=0 '/CMake Warning:/ { rc=1 } 1; END {exit rc}'
  # There is a "CMake Warning: Manually-specified variables were not used by the project" warning while compiling the custom Ascend kernels of the ggml CANN backend.
  # Remove the "cann" condition once this warning is fixed in llama.cpp/whisper.cpp.
  if [ "$containerfile" != "cann" ]; then
    awk -v rc=0 '/CMake Warning:/ { rc=1 } 1; END {exit rc}'
  else
    awk '/CMake Warning:/ {print $0}'
  fi
}

setup_build_env() {
  if [ "$containerfile" = "cann" ]; then
    # source build env
    cann_in_sys_path=/usr/local/Ascend/ascend-toolkit;
    cann_in_user_path=$HOME/Ascend/ascend-toolkit;
    if [ -f "${cann_in_sys_path}/set_env.sh" ]; then
      # shellcheck disable=SC1091
      source ${cann_in_sys_path}/set_env.sh;
      export LD_LIBRARY_PATH=${cann_in_sys_path}/latest/lib64:${cann_in_sys_path}/latest/aarch64-linux/devlib:${LD_LIBRARY_PATH};
      export LIBRARY_PATH=${cann_in_sys_path}/latest/lib64:${LIBRARY_PATH};
    elif [ -f "${cann_in_user_path}/set_env.sh" ]; then
      # shellcheck disable=SC1091
      source "$HOME/Ascend/ascend-toolkit/set_env.sh";
      export LD_LIBRARY_PATH=${cann_in_user_path}/latest/lib64:${cann_in_user_path}/latest/aarch64-linux/devlib:${LD_LIBRARY_PATH};
      export LIBRARY_PATH=${cann_in_user_path}/latest/lib64:${LIBRARY_PATH};
    else
      echo "No Ascend Toolkit found";
      exit 1;
    fi
  fi
}

cmake_steps() {
@@ -80,7 +119,7 @@ cmake_steps() {
}

set_install_prefix() {
  if [ "$containerfile" = "cuda" ] || [ "$containerfile" = "intel-gpu" ]; then
  if [ "$containerfile" = "cuda" ] || [ "$containerfile" = "intel-gpu" ] || [ "$containerfile" = "cann" ]; then
    install_prefix="/tmp/install"
  else
    install_prefix="/usr"
@@ -105,6 +144,9 @@ configure_common_flags() {
    intel-gpu)
      common_flags+=("-DGGML_SYCL=ON" "-DCMAKE_C_COMPILER=icx" "-DCMAKE_CXX_COMPILER=icpx")
      ;;
    cann)
      common_flags+=("-DGGML_CANN=ON" "-DSOC_TYPE=Ascend910B3")
      ;;
  esac
}
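For reference, the `cann` case above corresponds roughly to configuring llama.cpp by hand with the CANN backend enabled. A hedged sketch, assuming a llama.cpp checkout, a sourced CANN toolkit under the default system path, and an Ascend 910B3 part (adjust `-DSOC_TYPE` for other models):

```bash
# Load the CANN toolkit environment, then configure and build llama.cpp
# with the CANN backend and the same extra flags the script uses.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
cmake -B build \
  -DGGML_CANN=ON \
  -DSOC_TYPE=Ascend910B3 \
  -DLLAMA_CURL=ON \
  -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j "$(nproc)"
```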
@@ -164,6 +206,7 @@ main() {
  if [ -n "$containerfile" ]; then
    clone_and_build_ramalama
  fi
  setup_build_env
  clone_and_build_whisper_cpp
  common_flags+=("-DLLAMA_CURL=ON")
  case "$containerfile" in
docs/ramalama-cann.7.md (new file, 63 lines)
@@ -0,0 +1,63 @@
% ramalama 7

# Setting Up RamaLama with Ascend NPU Support on Linux systems

This guide walks through the steps required to set up RamaLama with Ascend NPU support.
- [Background](#background)
- [Hardware](#hardware)
- [Docker](#docker)
- [HISTORY](#history)

## Background

**Ascend NPU** is a range of AI processors built around a Neural Processing Unit. It efficiently handles matrix-matrix multiplication, dot products, and scalar operations.

**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on top and serving AI processors and programming at the bottom. It bridges the gap between the upper and lower layers and is a key platform for improving the computing efficiency of Ascend AI processors. It also offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.

## Hardware

### Ascend NPU

**Verified devices**

| Ascend NPU                     | Status  |
| ------------------------------ | ------- |
| Atlas A2 Training series       | Support |
| Atlas 800I A2 Inference series | Support |

*Notes:*

- If you run into trouble with an Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
- If you run successfully on your Ascend NPU device, please help update the table above.
## Docker

### Install the Ascend driver

This provides NPU acceleration using the AI cores of your Ascend NPU. [CANN](https://www.hiascend.com/en/software/cann) is a set of hierarchical APIs that help you quickly build AI applications and services on top of the Ascend NPU.

For more information about the Ascend NPU, see the [Ascend Community](https://www.hiascend.com/en/).

Make sure the CANN toolkit is installed. You can download it here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
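Once the driver and toolkit are installed, a quick sanity check is to source the toolkit environment and list the NPUs. This is a sketch assuming the default system-wide install path under `/usr/local/Ascend`; RamaLama's own detection relies on the same `npu-smi` tool:

```bash
# Load the CANN toolkit environment (system-wide install path assumed).
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# List the detected Ascend NPUs.
npu-smi info
```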
### Build Images

Go to the `ramalama` directory and build using make:
```bash
make build IMAGE=cann
make install
```

You can test with:
```bash
ramalama --image quay.io/ramalama/cann:latest serve -d -p 8080 --device=/dev/davinci0 -name ollama://smollm:135m
```
In another window, check the running podman container:
```
$ podman ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
80fc31c131b0 quay.io/ramalama/cann:latest "/bin/bash -c 'expor…" About an hour ago Up About an hour ame
```
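To confirm the served model responds, you can send a request to the published port. This is a sketch assuming the OpenAI-compatible chat endpoint exposed by the llama.cpp server that backs `ramalama serve`; the exact path and payload may vary between versions:

```bash
# Ask the model served on port 8080 for a short reply.
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Say hello from the Ascend NPU."}]}'
```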
For other usage guides, see the RamaLama [README.md](https://github.com/containers/ramalama/blob/main/README.md).

## HISTORY
Mar 2025, Originally compiled
@@ -194,6 +194,14 @@ def show_gpus_available_cli(args):
            errors.append({"Vendor": "Apple", "INFO": str(e)})

    else:  # Linux/Other OS GPU detection
        try:
            ascend_gpus = gpu_detector.get_ascend_npu()
            # Since Ascend devices are not required, keep quiet if no Ascend devices are detected
            if ascend_gpus:
                gpu_info.extend(ascend_gpus)
        except Exception as e:
            errors.append({"Vendor": "Ascend", "INFO": str(e)})

        try:
            nvidia_gpus = gpu_detector.get_nvidia_gpu()
            if nvidia_gpus:
@@ -329,6 +329,15 @@ def get_gpu():
    except Exception:
        pass

    # Ascend CASE
    try:
        command = ['npu-smi']
        run_cmd(command).stdout.decode("utf-8")
        os.environ["CANN_VISIBLE_DEVICES"] = "0"
        return
    except Exception:
        pass

    # ROCm/AMD CASE
    i = 0
    gpu_num = 0
@@ -359,7 +368,7 @@ def get_gpu():


def get_env_vars():
    prefixes = ("ASAHI_", "CUDA_", "HIP_", "HSA_", "INTEL_")
    prefixes = ("ASAHI_", "CUDA_", "HIP_", "HSA_", "INTEL_", "CANN_")
    env_vars = {k: v for k, v in os.environ.items() if k.startswith(prefixes)}

    # gpu_type, gpu_num = get_gpu()
@@ -1,6 +1,8 @@
import glob
import logging
import platform
import re
import shutil
import subprocess

logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -147,6 +149,54 @@ class GPUDetector:
            logging.error(f"Unexpected error while detecting macOS GPU: {e}")
            return [{"GPU": "Unknown", "Error": str(e)}]

    def run_command_and_extract(self, cmd, pattern, error_msg):
        """Run a command and extract a value using regex. Raises ValueError if not found."""
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
            match = re.search(pattern, proc.stdout)
            if match:
                return match.group(1)
            else:
                raise ValueError(error_msg)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Failed to run command:{cmd} on linux. Error: {e}")

    def get_ascend_npu(self):
        """Detects Ascend NPUs using npu-smi (Linux only)."""
        if platform.system() != "Linux":
            return  # Skip on macOS and other platforms
        if shutil.which("npu-smi") is None:
            logging.info("The 'npu-smi' command to detect ascend npu is NOT available.")
            return

        try:
            gpus = []
            # get total npu number
            total_count = int(
                self.run_command_and_extract(
                    ["npu-smi", "info", "-l"], r"Total Count\s+:\s*(\d+)", "Could not determine total NPU count."
                )
            )
            for npu_id in range(total_count):
                gpu_info = {"GPU": npu_id, "Vendor": "Ascend", "Env": "CANN_VISIBLE_DEVICES"}
                # get memory of each card
                hbm_capacity = int(
                    self.run_command_and_extract(
                        ["npu-smi", "info", "-t", "memory", "-i", str(npu_id)],
                        r"HBM Capacity\(MB\)\s+:\s*(\d+)",
                        f"Could not find HBM Capacity for NPU {npu_id}.",
                    )
                )

                self._update_best_gpu(hbm_capacity, npu_id, "CANN_VISIBLE_DEVICES")
                gpu_info["VRAM"] = hbm_capacity
                gpus.append(gpu_info)

            return gpus
        except Exception as e:
            error_msg = getattr(e, 'stderr', "Error (check if Ascend drivers are loaded).")
            raise RuntimeError(f"Unable to detect Ascend NPU(s). Error: {error_msg}")

    def detect_best_gpu(self, gpu_template):
        """
        Compares Nvidia, AMD, Apple, and Intel GPUs and appends the best GPU
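The detector above shells out to `npu-smi` twice per run. The same queries can be issued by hand to see what the regexes parse; the exact output layout depends on the driver version, so the field names are not guaranteed:

```bash
# Total number of NPUs (parsed for "Total Count : N").
npu-smi info -l

# Memory report for NPU 0 (parsed for "HBM Capacity(MB) : N").
npu-smi info -t memory -i 0
```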
@@ -192,6 +242,16 @@ class GPUDetector:
            except RuntimeError as e:
                logging.warning(f"Warning: Intel detection failed: {e}")

            try:
                ascend_gpus = self.get_ascend_npu()
                for gpu in ascend_gpus:
                    vram = int(gpu.get("VRAM", 0))
                    if vram > best_vram:
                        best_gpu = gpu
                        best_vram = vram
                        best_env = "CANN"
            except RuntimeError as e:
                logging.warning(f"Warning: Ascend detection failed: {e}")

        elif system == "Darwin":  # macOS
            try:
                macos_gpus = self.get_macos_gpu()
@@ -208,6 +208,7 @@ class Model(ModelBase):
            "CUDA_VISIBLE_DEVICES": "quay.io/ramalama/cuda",
            "ASAHI_VISIBLE_DEVICES": "quay.io/ramalama/asahi",
            "INTEL_VISIBLE_DEVICES": "quay.io/ramalama/intel-gpu",
            "CANN_VISIBLE_DEVICES": "quay.io/ramalama/cann",
        }

        image = images.get(gpu_type, args.image)
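Tying the pieces together: when `npu-smi` is present, `get_gpu()` exports `CANN_VISIBLE_DEVICES=0`, the image map above associates that key with `quay.io/ramalama/cann`, and `get_env_vars()` forwards the variable into the container. A hedged sketch of setting it by hand (device path and model reference are illustrative, taken from the documentation above):

```bash
# Pin the first NPU; CANN_-prefixed variables are forwarded into the container.
export CANN_VISIBLE_DEVICES=0
ramalama serve -d -p 8080 --device=/dev/davinci0 ollama://smollm:135m
```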
@@ -321,6 +322,9 @@ class Model(ModelBase):
        if os.path.exists("/dev/kfd"):
            conman_args += ["--device", "/dev/kfd"]

        if os.path.exists("/dev/davinci0"):
            conman_args += ["--device", "/dev/davinci0"]

        for k, v in get_env_vars().items():
            # Special case for Cuda
            if k == "CUDA_VISIBLE_DEVICES":
@@ -363,6 +367,7 @@ class Model(ModelBase):
            or os.getenv("ASAHI_VISIBLE_DEVICES")
            or os.getenv("CUDA_VISIBLE_DEVICES")
            or os.getenv("INTEL_VISIBLE_DEVICES")
            or os.getenv("CANN_VISIBLE_DEVICES")
            or (
                # linux and macOS report aarch64 (linux), arm64 (macOS)
                podman_machine_accel