from ctransformers import AutoModelForCausalLM
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GGUF", model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf", model_type="mistral", gpu_layers=50)
print(llm("AI is going to"))
Quantized versions are here:
https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF
Q4_K_M
https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/blob/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf
see https://github.com/mrseanryan/gpt-workflow/tree/master/local-llm-q