run docker container
docker run -itd -v /home/adlong/llm_models:/root/models -p 8000:8000 -p 8001:8001 -p 8002:22 --name llama_server --gpus all ubuntu
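the remaining steps run inside the container; attach a shell and confirm the GPU is visible (nvidia-smi is injected by the NVIDIA container runtime when --gpus all is used)
docker exec -it llama_server bash
nvidia-smi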
build llama.cpp
apt update && apt install -y nano git cmake wget build-essential nvidia-cuda-toolkit
cd ~
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
mkdir build
cd build
cmake .. -DGGML_CUDA=ON
(older llama.cpp checkouts spell this flag -DLLAMA_CUDA=ON)
cmake --build . --config Release
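the binaries end up in build/bin; a quick check that the server binary was produced:
ls ~/llama.cpp/build/bin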
download gguf model from Hugging Face
go to: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main
download llama-2-7b-chat.Q5_K_M.gguf
cp llama-2-7b-chat.Q5_K_M.gguf ~/models
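alternatively, fetch the file from inside the container with the wget installed earlier; Hugging Face serves repository files under resolve/main (same file name as above):
cd ~/models
wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf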
run llama.cpp server
cd ~
./llama.cpp/build/bin/llama-server -m models/llama-2-7b-chat.Q5_K_M.gguf -c 2048 --port 8001 --host 0.0.0.0
(older llama.cpp checkouts name this binary server instead of llama-server)
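as written the model runs entirely on the CPU even though the build has CUDA support; GPU offload is opt-in via -ngl / --n-gpu-layers. a variant that offloads all layers (a large value like 99 is fine, it is capped at the model's layer count):
./llama.cpp/build/bin/llama-server -m models/llama-2-7b-chat.Q5_K_M.gguf -c 2048 -ngl 99 --port 8001 --host 0.0.0.0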
open: http://127.0.0.1:8001/
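a quick smoke test from the host against the server's /completion endpoint (part of llama.cpp's HTTP API):
curl http://127.0.0.1:8001/completion -H "Content-Type: application/json" -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'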