import llama_cpp
import llama_cpp.llama_tokenizer

# Load a GGUF model directly from the Hugging Face Hub, pairing it with the
# matching HF tokenizer so the chat template is applied correctly.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
    verbose=False
)

# Request a structured answer: the JSON schema constrains generation so the
# model must emit an object with "country" and "capital" string fields.
response = llama.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
    response_format={
        "type": "json_object",
        "schema": {
            "type": "object",
            "properties": {
                "country": {"type": "string"},
                "capital": {"type": "string"}
            },
            "required": ["country", "capital"],
        }
    },
    stream=True
)

# Stream the completion chunk by chunk; chunks without a "content" delta
# (e.g. the initial role chunk) are skipped.
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" not in delta:
        continue
    print(delta["content"], end="", flush=True)

print()
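
# --- Follow-up sketch (not part of the original snippet) ---
# A common next step is to collect the streamed deltas into a buffer and parse
# the finished JSON object. This is a minimal illustration assuming the
# schema-constrained output is valid JSON; the names "buffer" and "data" are
# hypothetical, introduced here for the example.
import json

buffer = ""
for chunk in llama.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format={"type": "json_object"},
    stream=True,
):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        buffer += delta["content"]

data = json.loads(buffer)
print(data)  # e.g. {'country': 'France', 'capital': 'Paris'} (model-dependent)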