start a GPU docker container
docker run -itd -v /home/adlong/llama:/root/llama -p 8000:8000 -p 8001:8001 -p 8002:22 --name llama2 --gpus all ubuntu
clone the facebookresearch/llama repo from GitHub
apt update && apt install -y nano git python3-pip wget
ln -s /usr/bin/python3 /usr/bin/python
cd ~
git clone https://github.com/facebookresearch/llama
download the 7B-chat model
Only the 7B-chat model fits on a single RTX 3090: its 24 GB of VRAM holds the ~14 GB of fp16 weights with room to spare, while the 13B and 70B variants do not.
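As a quick back-of-the-envelope check (weights only, ignoring activations and the KV cache), fp16 stores 2 bytes per parameter:

# fp16 weight footprint per model size; only 7B leaves headroom
# under the RTX 3090's 24 GB for activations and the KV cache
for params_billion in (7, 13, 70):
    print(f"{params_billion}B params -> ~{params_billion * 2} GB of fp16 weights")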
get a pre-signed URL
Go to https://llama.meta.com/llama-downloads and accept the license; a pre-signed download URL will arrive by email.
download the model with the pre-signed URL
cd ~/llama
pip install transformers accelerate sentencepiece
sh download.sh
download.sh prompts for the pre-signed URL and which model sizes to fetch. Once it finishes, hard-link the tokenizer into the model directory, where the conversion script expects to find it:
ln ./tokenizer.model ./llama-2-7b-chat/tokenizer.model
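transformers and accelerate pull in torch as a dependency, so this is also a good point to confirm the container actually sees the GPU, e.g. from a python shell (a minimal sanity check):

import torch

# --gpus all plus the NVIDIA container toolkit should expose the card here
assert torch.cuda.is_available(), "no CUDA device visible inside the container"
print(torch.cuda.get_device_name(0))  # expect an RTX 3090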
convert the model weights to Hugging Face format
The conversion script ships inside the transformers package; the TRANSFORM line below just locates it.
cd ~/llama
TRANSFORM=`python -c "import transformers;print('/'.join(transformers.__file__.split('/')[:-1])+'/models/llama/convert_llama_weights_to_hf.py')"`
pip install protobuf && python $TRANSFORM --input_dir ./llama-2-7b-chat --model_size 7B --output_dir ./llama-2-7b-chat-hf
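To sanity-check the conversion before running generation, the output directory should load as a regular Hugging Face checkpoint (a minimal sketch against the output_dir above; for 7B this should report model_type llama, 32 layers, and a vocab of 32000):

from transformers import AutoConfig, AutoTokenizer

# loading config + tokenizer is cheap and catches a broken conversion early
cfg = AutoConfig.from_pretrained("./llama-2-7b-chat-hf")
tok = AutoTokenizer.from_pretrained("./llama-2-7b-chat-hf")
print(cfg.model_type, cfg.num_hidden_layers, tok.vocab_size)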
run Llama 2
nano test.py
import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer

model_dir = "./llama-2-7b-chat-hf"

# load the weights in fp16 and let accelerate place them on the GPU;
# the default fp32 load would need ~28 GB and not fit on a 24 GB 3090
model = LlamaForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

sequences = pipeline(
    'I have tomatoes, basil and cheese at home. What can I cook for dinner?\n',
    do_sample=True,          # sample rather than greedy-decode
    top_k=10,                # consider only the 10 most likely next tokens
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=400,          # prompt + generation, in tokens
)
for seq in sequences:
    print(f"{seq['generated_text']}")
python test.py
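test.py sends a plain string, which works, but the chat models were fine-tuned on a specific instruction template, and answers are usually better when the prompt is wrapped in Llama 2's [INST] format (documented in Meta's model card). A sketch that replaces the sequences = pipeline(...) call in test.py, with a made-up system message:

# [INST] ... [/INST] wraps the user turn; an optional <<SYS>> block carries
# the system prompt; the tokenizer prepends the <s> token on its own
prompt = (
    "[INST] <<SYS>>\nYou are a helpful cooking assistant.\n<</SYS>>\n\n"
    "I have tomatoes, basil and cheese at home. What can I cook for dinner? [/INST]"
)
sequences = pipeline(
    prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=400,
)
print(sequences[0]["generated_text"])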