start a GPU docker container
docker run -itd -v /home/adlong/llama:/root/llama -p 8000:8000 -p 8001:8001 -p 8002:22 --name llama2 --gpus all ubuntu
clone the facebookresearch/llama repo from GitHub
apt update && apt install -y nano git python3-pip wget
ln -s /usr/bin/python3 /usr/bin/python
cd ~
git clone https://github.com/facebookresearch/llama
download the 7B-chat model
Only the 7B-chat model fits on a single RTX 3090: its 24 GB of VRAM holds the ~14 GB of fp16 weights with room to spare, while the 13B and 70B variants do not.
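As a quick back-of-the-envelope check (weights only, ignoring activations and the KV cache), fp16 stores 2 bytes per parameter:

# fp16 weight footprint per model size; only 7B leaves headroom
# under the RTX 3090's 24 GB for activations and the KV cache
for params_billion in (7, 13, 70):
    print(f"{params_billion}B params -> ~{params_billion * 2} GB of fp16 weights")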
get a pre-signed URL
Go to https://llama.meta.com/llama-downloads and accept the license; a pre-signed download URL will arrive by email.
download the model with the pre-signed URL
cd ~/llama
pip install transformers accelerate sentencepiece
sh download.sh
download.sh prompts for the pre-signed URL and which model sizes to fetch. Once it finishes, hard-link the tokenizer into the model directory, where the conversion script expects to find it:
ln ./tokenizer.model ./llama-2-7b-chat/tokenizer.model
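transformers and accelerate pull in torch as a dependency, so this is also a good point to confirm the container actually sees the GPU, e.g. from a python shell (a minimal sanity check):

import torch

# --gpus all plus the NVIDIA container toolkit should expose the card here
assert torch.cuda.is_available(), "no CUDA device visible inside the container"
print(torch.cuda.get_device_name(0))  # expect an RTX 3090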
convert the model weights to Hugging Face format
The conversion script ships inside the transformers package; the TRANSFORM line below just locates it.
cd ~/llama
TRANSFORM=`python -c "import transformers;print('/'.join(transformers.__file__.split('/')[:-1])+'/models/llama/convert_llama_weights_to_hf.py')"`
pip install protobuf && python $TRANSFORM --input_dir ./llama-2-7b-chat --model_size 7B --output_dir ./llama-2-7b-chat-hf
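To sanity-check the conversion before running generation, the output directory should load as a regular Hugging Face checkpoint (a minimal sketch against the output_dir above; for 7B this should report model_type llama, 32 layers, and a vocab of 32000):

from transformers import AutoConfig, AutoTokenizer

# loading config + tokenizer is cheap and catches a broken conversion early
cfg = AutoConfig.from_pretrained("./llama-2-7b-chat-hf")
tok = AutoTokenizer.from_pretrained("./llama-2-7b-chat-hf")
print(cfg.model_type, cfg.num_hidden_layers, tok.vocab_size)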
run Llama 2
nano test.py
import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer

model_dir = "./llama-2-7b-chat-hf"

# load the weights in fp16 and let accelerate place them on the GPU;
# the default fp32 load would need ~28 GB and not fit on a 24 GB 3090
model = LlamaForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

sequences = pipeline(
    'I have tomatoes, basil and cheese at home. What can I cook for dinner?\n',
    do_sample=True,          # sample rather than greedy-decode
    top_k=10,                # consider only the 10 most likely next tokens
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=400,          # prompt + generation, in tokens
)
for seq in sequences:
    print(f"{seq['generated_text']}")
python test.py
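test.py sends a plain string, which works, but the chat models were fine-tuned on a specific instruction template, and answers are usually better when the prompt is wrapped in Llama 2's [INST] format (documented in Meta's model card). A sketch that replaces the sequences = pipeline(...) call in test.py, with a made-up system message:

# [INST] ... [/INST] wraps the user turn; an optional <<SYS>> block carries
# the system prompt; the tokenizer prepends the <s> token on its own
prompt = (
    "[INST] <<SYS>>\nYou are a helpful cooking assistant.\n<</SYS>>\n\n"
    "I have tomatoes, basil and cheese at home. What can I cook for dinner? [/INST]"
)
sequences = pipeline(
    prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=400,
)
print(sequences[0]["generated_text"])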