After following these instructions to run Llama 2 on the CPU:
https://dev.to/nithinibhandari1999/how-to-run-llama-2-on-your-local-computer-42g1
here is the Python code, modified to run inference on a GPU.
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

timeStart = time.time()

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf"
)

# device_map="auto" places the weights on the available GPU(s);
# bfloat16 halves the memory footprint compared to float32.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

print("Load model time: ", time.time() - timeStart)

while True:
    input_str = input('Enter: ')
    if input_str == 'exit':
        break
    input_token_length = input('Enter length: ')

    timeStart = time.time()

    # Tokenize the prompt and move it to the GPU so it is on the same
    # device as the model.
    inputs = tokenizer.encode(
        input_str,
        return_tensors="pt",
    )
    inputs = inputs.to('cuda')

    outputs = model.generate(
        inputs,
        max_new_tokens=int(input_token_length),
    )

    output_str = tokenizer.decode(outputs[0])
    print(output_str)
    print("Time taken: ", time.time() - timeStart)