import llama_cpp
import llama_cpp.llama_tokenizer

# Load a GGUF model directly from the Hugging Face Hub, pairing it with the
# matching HF tokenizer so the chat template is applied correctly.
llama = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
    verbose=False
)

# Request a structured answer: the JSON schema constrains generation so the
# model must emit an object with "country" and "capital" string fields.
response = llama.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
    response_format={
        "type": "json_object",
        "schema": {
            "type": "object",
            "properties": {
                "country": {"type": "string"},
                "capital": {"type": "string"}
            },
            "required": ["country", "capital"],
        }
    },
    stream=True
)

# Stream the completion chunk by chunk; chunks without a "content" delta
# (e.g. the initial role chunk) are skipped.
for chunk in response:
    delta = chunk["choices"][0]["delta"]
    if "content" not in delta:
        continue
    print(delta["content"], end="", flush=True)

print()
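
# --- Follow-up sketch (not part of the original snippet) ---
# A common next step is to collect the streamed deltas into a buffer and parse
# the finished JSON object. This is a minimal illustration assuming the
# schema-constrained output is valid JSON; the names "buffer" and "data" are
# hypothetical, introduced here for the example.
import json

buffer = ""
for chunk in llama.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format={"type": "json_object"},
    stream=True,
):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        buffer += delta["content"]

data = json.loads(buffer)
print(data)  # e.g. {'country': 'France', 'capital': 'Paris'} (model-dependent)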