
Merge pull request #911 from leo-pony/main

Add support for llama.cpp engine to use ascend NPU device
Daniel J Walsh (committed by GitHub), 2025-03-06 12:11:30 -05:00
9 changed files with 212 additions and 5 deletions


@@ -129,7 +129,7 @@ format:
.PHONY: codespell
codespell:
codespell --dictionary=- -w --skip="*/venv*"
codespell --dictionary=- --ignore-words-list "cann" -w --skip="*/venv*"
.PHONY: test-run
test-run:


@@ -116,7 +116,7 @@ curl -fsSL https://raw.githubusercontent.com/containers/ramalama/s/install.sh |
| Apple Silicon GPU (podman-machine) | :white_check_mark: |
| Nvidia GPU (cuda) | :white_check_mark: |
| AMD GPU (rocm) | :white_check_mark: |
| Ascend NPU (Linux) | :white_check_mark: |
## COMMANDS
| Command | Description |


@@ -0,0 +1,19 @@
# Base image with CANN for compilation
ARG ASCEND_VERSION=cann:8.0.0-910b-openeuler22.03-py3.10
FROM quay.io/ascend/${ASCEND_VERSION} AS builder
ARG GOLANG_VERSION
COPY ../scripts /scripts
RUN chmod +x /scripts/*.sh && \
sh -x /scripts/build_llama_and_whisper.sh "cann"
FROM quay.io/ascend/${ASCEND_VERSION}
# Copy the entire installation directory from the builder
COPY --from=builder /tmp/install /usr
ENV MODEL_PATH=/mnt/models/model.file
COPY --chmod=755 ../scripts /usr/bin
ENTRYPOINT [ \
"/bin/bash", \
"-c", \
"export LD_LIBRARY_PATH=/usr/lib:${LD_LIBRARY_PATH} && source /usr/local/Ascend/ascend-toolkit/set_env.sh && exec \"$@\"", "--" \
]


@@ -63,13 +63,52 @@ dnf_install() {
. /opt/rh/gcc-toolset-12/enable
elif [ "$containerfile" = "intel-gpu" ]; then
dnf_install_intel_gpu
elif [ "$containerfile" = "cann" ]; then
# Only needed for the openEuler build environment; does not need to be pushed to the ollama GitHub
dnf install -y git \
gcc \
gcc-c++ \
make \
cmake \
findutils \
yum \
curl-devel \
pigz
fi
dnf -y clean all
}
cmake_check_warnings() {
awk -v rc=0 '/CMake Warning:/ { rc=1 } 1; END {exit rc}'
# There is a warning "CMake Warning: Manually-specified variables were not used by the project" when compiling the custom Ascend kernels of the ggml CANN backend.
# Remove the "cann" condition once this warning is fixed in llama.cpp/whisper.cpp.
if [ "$containerfile" != "cann" ]; then
awk -v rc=0 '/CMake Warning:/ { rc=1 } 1; END {exit rc}'
else
awk '/CMake Warning:/ {print $0}'
fi
}
setup_build_env() {
if [ "$containerfile" = "cann" ]; then
# source build env
cann_in_sys_path=/usr/local/Ascend/ascend-toolkit;
cann_in_user_path=$HOME/Ascend/ascend-toolkit;
if [ -f "${cann_in_sys_path}/set_env.sh" ]; then
# shellcheck disable=SC1091
source ${cann_in_sys_path}/set_env.sh;
export LD_LIBRARY_PATH=${cann_in_sys_path}/latest/lib64:${cann_in_sys_path}/latest/aarch64-linux/devlib:${LD_LIBRARY_PATH};
export LIBRARY_PATH=${cann_in_sys_path}/latest/lib64:${LIBRARY_PATH};
elif [ -f "${cann_in_user_path}/set_env.sh" ]; then
# shellcheck disable=SC1091
source "$HOME/Ascend/ascend-toolkit/set_env.sh";
export LD_LIBRARY_PATH=${cann_in_user_path}/latest/lib64:${cann_in_user_path}/latest/aarch64-linux/devlib:${LD_LIBRARY_PATH};
export LIBRARY_PATH=${cann_in_user_path}/latest/lib64:${LIBRARY_PATH};
else
echo "No Ascend Toolkit found";
exit 1;
fi
fi
}
cmake_steps() {
@@ -80,7 +119,7 @@ cmake_steps() {
}
set_install_prefix() {
if [ "$containerfile" = "cuda" ] || [ "$containerfile" = "intel-gpu" ]; then
if [ "$containerfile" = "cuda" ] || [ "$containerfile" = "intel-gpu" ] || [ "$containerfile" = "cann" ]; then
install_prefix="/tmp/install"
else
install_prefix="/usr"
@@ -105,6 +144,9 @@ configure_common_flags() {
intel-gpu)
common_flags+=("-DGGML_SYCL=ON" "-DCMAKE_C_COMPILER=icx" "-DCMAKE_CXX_COMPILER=icpx")
;;
cann)
common_flags+=("-DGGML_CANN=ON" "-DSOC_TYPE=Ascend910B3")
;;
esac
}
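
For reference, the same two flags can also be used in a standalone llama.cpp build outside the container; a minimal sketch, assuming the CANN toolkit environment has already been sourced and that the target SoC is the Ascend910B3 hard-coded above:

```bash
# Rough equivalent of the cann case above for a plain llama.cpp checkout
# (the -j value is host-dependent; flags are taken from this PR's cann case).
source /usr/local/Ascend/ascend-toolkit/set_env.sh
cmake -B build -DGGML_CANN=ON -DSOC_TYPE=Ascend910B3 -DLLAMA_CURL=ON
cmake --build build -j "$(nproc)"
```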
@@ -164,6 +206,7 @@ main() {
if [ -n "$containerfile" ]; then
clone_and_build_ramalama
fi
setup_build_env
clone_and_build_whisper_cpp
common_flags+=("-DLLAMA_CURL=ON")
case "$containerfile" in

docs/ramalama-cann.7.md Normal file

@@ -0,0 +1,63 @@
% ramalama 7
# Setting Up RamaLama with Ascend NPU Support on Linux systems
This guide walks through the steps required to set up RamaLama with Ascend NPU support.
- [Background](#background)
- [Hardware](#hardware)
- [Docker](#docker)
- [HISTORY](#history)
## Background
**Ascend NPU** is a range of AI processors built around a Neural Processing Unit. It efficiently handles matrix-matrix multiplication, dot products, and scalar operations.
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
## Hardware
### Ascend NPU
**Verified devices**
| Ascend NPU | Status |
| ----------------------------- | ------- |
| Atlas A2 Training series | Support |
| Atlas 800I A2 Inference series | Support |
*Notes:*
- If you have trouble with your Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
- If you run successfully on your Ascend NPU device, please help update the table above.
## Docker
### Install the Ascend driver
This provides NPU acceleration using the AI cores of your Ascend NPU. [CANN](https://www.hiascend.com/en/software/cann) is a set of hierarchical APIs that help you quickly build AI applications and services on Ascend NPUs.
For more information about Ascend NPUs, see the [Ascend Community](https://www.hiascend.com/en/).
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
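Before building, it helps to confirm that both the NPU driver and the toolkit are visible on the host; a quick check, assuming the default system-wide toolkit location also used by the build script in this PR:

```bash
# Driver check: should list the Ascend NPU(s) on this machine.
npu-smi info
# Toolkit check: the build script and container entrypoint source this file;
# adjust the path if the toolkit lives under $HOME/Ascend instead.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
```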
### Build Images
Go to the `ramalama` directory and build using make:
```bash
make build IMAGE=cann
make install
```
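If the build succeeds, the image should be available locally (tag assumed from the serve example below):

```bash
podman images quay.io/ramalama/cann
```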
You can test with:
```bash
ramalama --image quay.io/ramalama/cann:latest serve -d -p 8080 --device=/dev/davinci0 -name ollama://smollm:135m
```
In another window, check the running podman container:
```
$ podman ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
80fc31c131b0 quay.io/ramalama/cann:latest "/bin/bash -c 'expor…" About an hour ago Up About an hour ame
```
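Once the container is up, you can exercise the model over the published port; a minimal sketch, assuming the llama.cpp server exposes its OpenAI-compatible endpoint on port 8080 as in the serve command above:

```bash
curl -s http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Say hello"}]}'
```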
For other usage guides, see the RamaLama [README.md](https://github.com/containers/ramalama/blob/main/README.md).
## HISTORY
Mar 2025, Originally compiled


@@ -194,6 +194,14 @@ def show_gpus_available_cli(args):
errors.append({"Vendor": "Apple", "INFO": str(e)})
else: # Linux/Other OS GPU detection
try:
ascend_gpus = gpu_detector.get_ascend_npu()
# Since Ascend devices are not required, keep quiet if no Ascend devices are detected
if ascend_gpus:
gpu_info.extend(ascend_gpus)
except Exception as e:
errors.append({"Vendor": "Ascend", "INFO": str(e)})
try:
nvidia_gpus = gpu_detector.get_nvidia_gpu()
if nvidia_gpus:


@@ -329,6 +329,15 @@ def get_gpu():
except Exception:
pass
# Ascend CASE
try:
command = ['npu-smi']
run_cmd(command).stdout.decode("utf-8")
os.environ["CANN_VISIBLE_DEVICES"] = "0"
return
except Exception:
pass
# ROCm/AMD CASE
i = 0
gpu_num = 0
@@ -359,7 +368,7 @@ def get_gpu():
def get_env_vars():
prefixes = ("ASAHI_", "CUDA_", "HIP_", "HSA_", "INTEL_")
prefixes = ("ASAHI_", "CUDA_", "HIP_", "HSA_", "INTEL_", "CANN_")
env_vars = {k: v for k, v in os.environ.items() if k.startswith(prefixes)}
# gpu_type, gpu_num = get_gpu()


@@ -1,6 +1,8 @@
import glob
import logging
import platform
import re
import shutil
import subprocess
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -147,6 +149,54 @@ class GPUDetector:
logging.error(f"Unexpected error while detecting macOS GPU: {e}")
return [{"GPU": "Unknown", "Error": str(e)}]
def run_command_and_extract(self, cmd, pattern, error_msg):
"""Run a command and extract a value using regex. Raises ValueError if not found."""
try:
proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
match = re.search(pattern, proc.stdout)
if match:
return match.group(1)
else:
raise ValueError(error_msg)
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to run command:{cmd} on linux. Error: {e}")
def get_ascend_npu(self):
"""Detects Ascend NPUs using npu-smi (Linux only)."""
if platform.system() != "Linux":
return # Skip on macOS and other platforms
if shutil.which("npu-smi") is None:
logging.info("The 'npu-smi' command to detect ascend npu is NOT available.")
return
try:
gpus = []
# get total npu number
total_count = int(
self.run_command_and_extract(
["npu-smi", "info", "-l"], r"Total Count\s+:\s*(\d+)", "Could not determine total NPU count."
)
)
for npu_id in range(total_count):
gpu_info = {"GPU": npu_id, "Vendor": "Ascend", "Env": "CANN_VISIBLE_DEVICES"}
# get memory of each card
hbm_capacity = int(
self.run_command_and_extract(
["npu-smi", "info", "-t", "memory", "-i", str(npu_id)],
r"HBM Capacity\(MB\)\s+:\s*(\d+)",
f"Could not find HBM Capacity for NPU {npu_id}.",
)
)
self._update_best_gpu(hbm_capacity, npu_id, "CANN_VISIBLE_DEVICES")
gpu_info["VRAM"] = hbm_capacity
gpus.append(gpu_info)
return gpus
except Exception as e:
error_msg = getattr(e, 'stderr', "Error (check if Ascend drivers are loaded).")
raise RuntimeError(f"Unable to detect Ascend NPU(s). Error: {error_msg}")
def detect_best_gpu(self, gpu_template):
"""
Compares Nvidia, AMD, Apple, and Intel GPUs and appends the best GPU
@@ -192,6 +242,16 @@ class GPUDetector:
except RuntimeError as e:
logging.warning(f"Warning: Intel detection failed: {e}")
try:
ascend_gpus = self.get_ascend_npu()
for gpu in ascend_gpus:
vram = int(gpu.get("VRAM", 0))
if vram > best_vram:
best_gpu = gpu
best_vram = vram
best_env = "CANN"
except RuntimeError as e:
logging.warning(f"Warning: Ascend detection failed: {e}")
elif system == "Darwin": # macOS
try:
macos_gpus = self.get_macos_gpu()
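
The regexes in `get_ascend_npu()` above assume the plain-text layout printed by `npu-smi`; a rough shell equivalent of the same extraction, with the field names taken from those patterns rather than from real device output:

```bash
# Total number of NPUs (expects a line like "Total Count : 1").
npu-smi info -l | grep -oP 'Total Count\s+:\s*\K\d+'
# HBM size in MB for card 0 (expects a line like "HBM Capacity(MB) : 65536").
npu-smi info -t memory -i 0 | grep -oP 'HBM Capacity\(MB\)\s+:\s*\K\d+'
```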


@@ -208,6 +208,7 @@ class Model(ModelBase):
"CUDA_VISIBLE_DEVICES": "quay.io/ramalama/cuda",
"ASAHI_VISIBLE_DEVICES": "quay.io/ramalama/asahi",
"INTEL_VISIBLE_DEVICES": "quay.io/ramalama/intel-gpu",
"CANN_VISIBLE_DEVICES": "quay.io/ramalama/cann",
}
image = images.get(gpu_type, args.image)
@@ -321,6 +322,9 @@ class Model(ModelBase):
if os.path.exists("/dev/kfd"):
conman_args += ["--device", "/dev/kfd"]
if os.path.exists("/dev/davinci0"):
conman_args += ["--device", "/dev/davinci0"]
for k, v in get_env_vars().items():
# Special case for Cuda
if k == "CUDA_VISIBLE_DEVICES":
@@ -363,6 +367,7 @@ class Model(ModelBase):
or os.getenv("ASAHI_VISIBLE_DEVICES")
or os.getenv("CUDA_VISIBLE_DEVICES")
or os.getenv("INTEL_VISIBLE_DEVICES")
or os.getenv("CANN_VISIBLE_DEVICES")
or (
# linux and macOS report aarch64 (linux), arm64 (macOS)
podman_machine_accel
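
Taken together, the model.py hunks above mean the new environment variable can also be set by hand to steer RamaLama onto the CANN image when automatic detection is not wanted; a sketch, assuming the variable is honored the same way as the other `*_VISIBLE_DEVICES` variables listed above:

```bash
CANN_VISIBLE_DEVICES=0 ramalama serve -d -p 8080 ollama://smollm:135m
```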