
【AI】aarch64 Kylin: adapting llama.cpp with CUDA for Chinese-LLaMA-Alpaca-2, enabling Tesla P4 8G GPU support [Failed]

GPU

GPU architecture information

Pascal architecture:

compute_60: Tesla P100, Quadro GP100
compute_61: GeForce GTX 10xx series, TITAN Xp, Tesla P40, Tesla P4
compute_62: Jetson TX2 (GP10B)

The Tesla P4 used here is compute_61.
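
To confirm which value applies, the card's compute capability can be queried directly (a sketch; assumes a driver recent enough to support the `compute_cap` query field):

```shell
# Hypothetical query (needs an NVIDIA driver new enough for compute_cap):
#   nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader
# A Tesla P4 reports 6.1. Turning that into the nvcc -arch value is mechanical:
cap="6.1"                                  # example value as nvidia-smi reports it
arch="compute_$(echo "$cap" | tr -d .)"    # drop the dot -> compute_61
echo "$arch"
```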

Modify the Makefile to specify compute_61:

*** Makefile	2024-01-25 16:28:19.316971200 +0800
--- Makefile.bak	2024-01-25 16:28:10.006971200 +0800
*************** endif #LLAMA_CUDA_NVCC
*** 380,386 ****
  ifdef CUDA_DOCKER_ARCH
  	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
  else ifndef CUDA_POWER_ARCH
! 	MK_NVCCFLAGS += -arch=compute_61
  endif # CUDA_DOCKER_ARCH
  ifdef LLAMA_CUDA_FORCE_DMMV
  	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
--- 380,386 ----
  ifdef CUDA_DOCKER_ARCH
  	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
  else ifndef CUDA_POWER_ARCH
! 	MK_NVCCFLAGS += -arch=native
  endif # CUDA_DOCKER_ARCH
  ifdef LLAMA_CUDA_FORCE_DMMV
  	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
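
An alternative that avoids patching the file: the branch visible in the diff above uses `CUDA_DOCKER_ARCH` verbatim as the `-arch` value whenever it is defined, so passing it on the make command line should have the same effect (not tried in this post):

```shell
# Equivalent to the patch above, per the CUDA_DOCKER_ARCH branch in the diff:
#   make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=compute_61 ...
# The substitution the Makefile performs with that variable set:
CUDA_DOCKER_ARCH=compute_61
echo "-Wno-deprecated-gpu-targets -arch=${CUDA_DOCKER_ARCH}"
```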

Configure the build environment

echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf
ldconfig

Build llama.cpp with GPU support enabled

export PATH=/usr/local/cuda/bin:$PATH

make LLAMA_CUBLAS=1 -j32 CC=/root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang CXX=/root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang++ 

Error: gcc: error: unrecognized command-line option ‘-Wunreachable-code-break’; did you mean ‘-Wunreachable-code’?

Odd: the build uses clang, yet the error comes from gcc...

Searching the Makefile turned up the hard-coded compiler assignments shown below; after modifying them, the full diff is:

*** Makefile	2024-01-25 16:47:46.126971200 +0800
--- Makefile.bak	2024-01-25 16:28:10.006971200 +0800
*************** ifneq ($(filter aarch64%,$(UNAME_M)),)
*** 285,292 ****
  	ifdef JETSON_RELEASE_INFO
  		ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
  			JETSON_EOL_MODULE_DETECT = 1
! 			CC = /root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang
! 			cxx = /root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang++
  		endif
  	endif
  endif
--- 285,292 ----
  	ifdef JETSON_RELEASE_INFO
  		ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
  			JETSON_EOL_MODULE_DETECT = 1
! 			CC = aarch64-unknown-linux-gnu-gcc
! 			cxx = aarch64-unknown-linux-gnu-g++
  		endif
  	endif
  endif
*************** endif #LLAMA_CUDA_NVCC
*** 380,386 ****
  ifdef CUDA_DOCKER_ARCH
  	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
  else ifndef CUDA_POWER_ARCH
! 	MK_NVCCFLAGS += -arch=compute_61
  endif # CUDA_DOCKER_ARCH
  ifdef LLAMA_CUDA_FORCE_DMMV
  	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
--- 380,386 ----
  ifdef CUDA_DOCKER_ARCH
  	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
  else ifndef CUDA_POWER_ARCH
! 	MK_NVCCFLAGS += -arch=native
  endif # CUDA_DOCKER_ARCH
  ifdef LLAMA_CUDA_FORCE_DMMV
  	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV

No effect.

Manual test:

 nvcc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -mcpu=native  -use_fast_math --forward-unknown-to-host-compiler -arch=compute_61 -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -Wno-pedantic -Xcompiler "-Wunreachable-code -Wextra-semi" -c ggml-cuda.cu -o ggml-cuda.o

Error: ggml.h(319): error: identifier "half" is undefined

Fix: modify ggml.h so that `half` is defined; /usr/local/cuda/include/cuda_fp16.h provides the type.
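
The inserted snippet itself did not survive in this post; a minimal reconstruction, assuming the fix was simply to pull in CUDA's definition of `half` when nvcc processes the header:

```cpp
// Hypothetical patch near the top of ggml.h (assumption; the original code
// is not shown): cuda_fp16.h defines the `half` type that line 319 needs,
// and __CUDACC__ restricts the include to nvcc compilations.
#if defined(__CUDACC__)
#include <cuda_fp16.h>
#endif
```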

Manual test again: success.

Modify scripts/get-flags.mk.

The build still fails:

/root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang++ -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -mcpu=native   -Wunreachable-code -Wextra-semi examples/export-lora/export-lora.cpp ggml.o ggml-cuda.o ggml-alloc.o ggml-backend.o ggml-quants.o -o export-lora -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib 
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_free(int, void*, unsigned long)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a670): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a6ec): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_malloc(int, unsigned long, unsigned long*)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a8b0): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a954): undefined reference to `__aarch64_swp1_acq'
clang-10: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [Makefile:626: gguf] Error 1
make: *** Waiting for unfinished jobs....

(The same `__aarch64_swp1_acq` undefined-reference block repeats while linking the q8dot, vdot, and benchmark-matmult targets, with make reporting Error 1 at Makefile lines 704, 701, and 693 respectively.)

This looks like a gcc toolchain problem; the current gcc is 10. On aarch64, gcc 10 enables outline atomics (`-moutline-atomics`) by default, and the helpers it emits calls to, such as `__aarch64_swp1_acq`, live in gcc 10's libgcc, so mixing toolchains at link time leaves them unresolved.
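
If downgrading gcc were not an option, a possible alternative (an assumption, not tried in this post) would be to switch outline atomics off for the host-side compilation that nvcc performs:

```makefile
# Hypothetical Makefile tweak: -mno-outline-atomics stops gcc 10 from emitting
# calls to libgcc helpers such as __aarch64_swp1_acq.
MK_NVCCFLAGS += -Xcompiler -mno-outline-atomics
```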

After switching to gcc 7, the build succeeds:

export PATH=/usr/local/cuda/bin:/root/bisheng-compiler-1.3.3-aarch64-linux/bin:$PATH

make LLAMA_CUBLAS=1 -j8 CC=clang CXX=clang++ CFLAGS="-L/usr/local/cuda/targets/sbsa-linux/lib/stubs" CXXFLAGS="-L/usr/local/cuda/targets/sbsa-linux/lib/stubs"

The build completes.

Likely root cause: when compiling, nvcc invokes gcc (here gcc 10) as the host compiler by default.
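
If that inference is right, forcing nvcc's host compiler explicitly would be cleaner than relying on PATH ordering; nvcc's `-ccbin` option selects the host compiler (a sketch only; the clang path is the one used earlier in this post):

```shell
# nvcc hands host-side code to a host compiler (gcc by default). -ccbin picks
# it explicitly; here we only assemble the command line to show its shape:
CCBIN=/root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang++
echo "nvcc -ccbin ${CCBIN} --forward-unknown-to-host-compiler -arch=compute_61 -c ggml-cuda.cu -o ggml-cuda.o"
```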

Launch test (crash): CUDA error: CUDA-capable device(s) is/are busy or unavailable

Caused by linking against the sbsa-linux stub libraries? Should it be aarch64-linux instead?

To be investigated later...

References:

【AI】RTX2060 6G Ubuntu 22.04.1 LTS (Jammy Jellyfish): deploying Chinese-LLaMA-Alpaca-2 [2], enabling GPU support - CSDN blog

【AI】S2500 64C*2 arm64 aarch64 Kylin server: building llama.cpp with the chinese-alpaca-2-7b model, CPU version; more cores did not bring better performance - CSDN blog

https://github.com/RfidResearchGroup/proxmark3/issues/1368

Updated: 2024-02-02