GPU
GPU架构信息
Pascal架构:
compute_60
: Tesla P100, Quadro GP100等
compute_61
: GeForce GTX 10xx系列(含GTX 1050/1050 Ti)、TITAN Xp, Tesla P40等
compute_62
: Jetson TX2等Tegra集成GPU
修改Makefile,将-arch=native改为-arch=compute_61(注意:下面的diff是用修改后的Makefile对比备份文件Makefile.bak,因此上半部分"!"行是修改后的内容,下半部分是原始内容)
*** Makefile 2024-01-25 16:28:19.316971200 +0800
--- Makefile.bak 2024-01-25 16:28:10.006971200 +0800
*************** endif #LLAMA_CUDA_NVCC
*** 380,386 ****
ifdef CUDA_DOCKER_ARCH
MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else ifndef CUDA_POWER_ARCH
! MK_NVCCFLAGS += -arch=compute_61
endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
--- 380,386 ----
ifdef CUDA_DOCKER_ARCH
MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else ifndef CUDA_POWER_ARCH
! MK_NVCCFLAGS += -arch=native
endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
配置编译环境
echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf
ldconfig
编译llama.cpp,启用GPU支持
export PATH=/usr/local/cuda/bin:$PATH
make LLAMA_CUBLAS=1 -j32 CC=/root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang CXX=/root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang++
报错:gcc: 错误:unrecognized command-line option ‘-Wunreachable-code-break’; did you mean ‘-Wunreachable-code’?
奇怪:明明指定的是clang,报出来的却是gcc的错误……
搜索Makefile发现以下代码
修改其中的CC/cxx为bisheng clang,差异如下(diff是用修改后的Makefile对比备份文件Makefile.bak,上半部分为修改后的内容)
*** Makefile 2024-01-25 16:47:46.126971200 +0800
--- Makefile.bak 2024-01-25 16:28:10.006971200 +0800
*************** ifneq ($(filter aarch64%,$(UNAME_M)),)
*** 285,292 ****
ifdef JETSON_RELEASE_INFO
ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
JETSON_EOL_MODULE_DETECT = 1
! CC = /root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang
! cxx = /root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang++
endif
endif
endif
--- 285,292 ----
ifdef JETSON_RELEASE_INFO
ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
JETSON_EOL_MODULE_DETECT = 1
! CC = aarch64-unknown-linux-gnu-gcc
! cxx = aarch64-unknown-linux-gnu-g++
endif
endif
endif
*************** endif #LLAMA_CUDA_NVCC
*** 380,386 ****
ifdef CUDA_DOCKER_ARCH
MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else ifndef CUDA_POWER_ARCH
! MK_NVCCFLAGS += -arch=compute_61
endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
--- 380,386 ----
ifdef CUDA_DOCKER_ARCH
MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else ifndef CUDA_POWER_ARCH
! MK_NVCCFLAGS += -arch=native
endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
无效
手动测试
nvcc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -mcpu=native -use_fast_math --forward-unknown-to-host-compiler -arch=compute_61 -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -Wno-pedantic -Xcompiler "-Wunreachable-code -Wextra-semi" -c ggml-cuda.cu -o ggml-cuda.o
报错:ggml.h(319): error: identifier "half" is undefined
修改ggml.h,插入代码如下
/usr/local/cuda/include/cuda_fp16.h定义了half
再次手动测试,成功
修改scripts/get-flags.mk
继续报错
/root/bisheng-compiler-1.3.3-aarch64-linux/bin/clang++ -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -mcpu=native -Wunreachable-code -Wextra-semi examples/export-lora/export-lora.cpp ggml.o ggml-cuda.o ggml-alloc.o ggml-backend.o ggml-quants.o -o export-lora -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_free(int, void*, unsigned long)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a670): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a6ec): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_malloc(int, unsigned long, unsigned long*)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a8b0): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a954): undefined reference to `__aarch64_swp1_acq'
clang-10: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [Makefile:626:gguf] 错误 1
make: *** 正在等待未完成的任务....
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_free(int, void*, unsigned long)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a670): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a6ec): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_malloc(int, unsigned long, unsigned long*)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a8b0): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a954): undefined reference to `__aarch64_swp1_acq'
clang-10: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [Makefile:704:q8dot] 错误 1
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_free(int, void*, unsigned long)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a670): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a6ec): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_malloc(int, unsigned long, unsigned long*)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a8b0): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a954): undefined reference to `__aarch64_swp1_acq'
clang-10: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [Makefile:701:vdot] 错误 1
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_free(int, void*, unsigned long)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a670): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a6ec): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_malloc(int, unsigned long, unsigned long*)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a8b0): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a954): undefined reference to `__aarch64_swp1_acq'
clang-10: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [Makefile:693:benchmark-matmult] 错误 1
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_free(int, void*, unsigned long)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a670): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a6ec): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: ggml-cuda.o: in function `ggml_cuda_pool_malloc(int, unsigned long, unsigned long*)':
tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a8b0): undefined reference to `__aarch64_swp1_acq'
/usr/bin/ld: tmpxft_00041b59_00000000-6_ggml-cuda.cudafe1.cpp:(.text+0x1a954): undefined reference to `__aarch64_swp1_acq'
clang-10: error: linker command failed with exit code 1 (use -v to see invocation)
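似乎是gcc环境问题,当前是gcc 10(__aarch64_swp1_acq是gcc 10在aarch64上默认启用outline-atomics后生成的辅助函数,由gcc 10的libgcc提供,链接时使用的旧版libgcc中没有该符号)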
切换到gcc 7,成功编译
export PATH=/usr/local/cuda/bin:/root/bisheng-compiler-1.3.3-aarch64-linux/bin:$PATH
make LLAMA_CUBLAS=1 -j8 CC=clang CXX=clang++ CFLAGS="-L/usr/local/cuda/targets/sbsa-linux/lib/stubs" CXXFLAGS="-L/usr/local/cuda/targets/sbsa-linux/lib/stubs"
编译出来了
可以推测问题原因:nvcc在执行编译时,默认调用了gcc 10作为主机编译器
启动测试(崩溃)CUDA error: CUDA-capable device(s) is/are busy or unavailable
是sbsa-linux版本问题导致的?应该使用aarch64-linux?
后面再研究……
参考:
【AI】RTX2060 6G Ubuntu 22.04.1 LTS (Jammy Jellyfish) 部署Chinese-LLaMA-Alpaca-2 【2】启用GPU支持-CSDN博客
【AI】S2500 64C*2 arm64 aarch64 kylin server 编译llama.cpp 使用chinese-alpaca-2-7b模型 CPU版本 更多的核心没有带来更好的性能-CSDN博客
https://github.com/RfidResearchGroup/proxmark3/issues/1368