"""llama-cpp-python server from scratch in a single file.
|
||
|
"""
|
||
|
|
||
|

# Low-level llama.cpp bindings, kept commented out until a model file is
# wired in.
# import llama_cpp

# The path is bytes because the low-level bindings hand it straight to the
# C API, which expects a C string.
# path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf"

# model_params = llama_cpp.llama_model_default_params()
# model = llama_cpp.llama_load_model_from_file(path, model_params)

# if model is None:
#     raise RuntimeError(f"Failed to load model from file: {path}")
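
# Note: with the low-level bindings the backend usually has to be
# initialized once per process before any model is loaded. Depending on
# the installed llama-cpp-python version the call takes no arguments or a
# NUMA flag; kept commented out like the scaffold above:
# llama_cpp.llama_backend_init()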

# A context holds the KV cache and evaluation state for one session.
# ctx_params = llama_cpp.llama_context_default_params()
# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# if ctx is None:
#     raise RuntimeError("Failed to create context")
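
# For comparison, the high-level API wraps the load-plus-context steps
# above in a single object and already speaks the chat-completion format.
# A sketch, assuming the same GGUF path as above:
# llm = llama_cpp.Llama(model_path="../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf")
# completion = llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Hello!"}],
# )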

from fastapi import FastAPI

# OpenAI's chat-completions schema types, imported for shaping the
# request/response bodies later.
import openai.types.chat as types

app = FastAPI()
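
# Serve with uvicorn, assuming this file is saved as main.py:
#   uvicorn main:app --reload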

@app.post("/v1/chat/completions")
def create_chat_completions():
    return {"message": "Hello World"}
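
# Where the stub above is headed: accept an OpenAI-style request body and
# answer with a chat.completion-shaped payload. A sketch under assumptions:
# ChatRequest is a hand-rolled subset of the real request schema and the
# /echo route is hypothetical; real generation would call into the
# llama_cpp scaffold above instead of echoing.
import time

from pydantic import BaseModel


class ChatRequest(BaseModel):
    """Hypothetical, minimal subset of the OpenAI chat request body."""

    model: str = "local"
    messages: list[dict]


@app.post("/v1/chat/completions/echo")  # hypothetical sibling route
def create_chat_completions_echo(request: ChatRequest) -> dict:
    # Echo a fixed reply in the response shape OpenAI clients expect.
    return {
        "id": "chatcmpl-0",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": request.model,
        "choices": [
            {
                "index": 0,
                "finish_reason": "stop",
                "message": {"role": "assistant", "content": "Hello World"},
            }
        ],
    }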