llama.cpp/examples/hf_pull/main.py
2024-02-25 21:09:41 -05:00

39 lines
No EOL
937 B
Python

import llama_cpp
import llama_cpp.llama_tokenizer

# Pull the quantized GGUF chat model straight from the Hugging Face Hub;
# the glob pattern picks the q8_0 quantization file from the repo. A
# separate HF tokenizer is passed in so tokenization uses the original
# Qwen tokenizer rather than the vocabulary embedded in the GGUF file.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
    verbose=False,
)

# Request a structured answer: constrain generation to a JSON object
# matching the given schema, and stream the completion chunk by chunk.
response = llama.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
    response_format={
        "type": "json_object",
        "schema": {
            "type": "object",
            "properties": {
                "country": {"type": "string"},
                "capital": {"type": "string"}
            },
            "required": ["country", "capital"],
        }
    },
    stream=True
)

# Print the streamed completion as it arrives. Some deltas carry no
# "content" key (presumably role-only chunks — matches the OpenAI-style
# streaming format), so skip those.
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" not in delta:
        continue
    print(delta["content"], end="", flush=True)

# Terminate the streamed output with a newline.
print()