diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index fa39047..db9705f 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -14,11 +14,12 @@ import llama_cpp
 import anyio
 from anyio.streams.memory import MemoryObjectSendStream
 from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
-from fastapi import Depends, FastAPI, APIRouter, Request, Response
+from fastapi import Depends, FastAPI, APIRouter, Request, Response, HTTPException, status
 from fastapi.middleware import Middleware
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from fastapi.routing import APIRoute
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from pydantic import BaseModel, Field
 from pydantic_settings import BaseSettings
 from sse_starlette.sse import EventSourceResponse
@@ -163,6 +164,10 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
+    api_key: Optional[str] = Field(
+        default=None,
+        description="API key for authentication. If set, all requests need to be authenticated.",
+    )
 
 
 class ErrorResponse(TypedDict):
@@ -314,6 +319,9 @@ class RouteErrorHandler(APIRoute):
             elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000)
             response.headers["openai-processing-ms"] = f"{elapsed_time_ms}"
             return response
+        except HTTPException as unauthorized:
+            # Let HTTPExceptions (e.g. a failed API key check) propagate unchanged
+            raise unauthorized
         except Exception as exc:
             json_body = await request.json()
             try:
@@ -658,6 +666,27 @@ def _logit_bias_tokens_to_input_ids(
     return to_bias
 
 
+# Set up the HTTP Bearer authentication scheme
+bearer_scheme = HTTPBearer(auto_error=False)
+
+
+async def authenticate(settings: Settings = Depends(get_settings), authorization: Optional[HTTPAuthorizationCredentials] = Depends(bearer_scheme)):
+    # Skip the API key check if no key is configured in settings
+    if settings.api_key is None:
+        return True
+
+    # Check the bearer credentials against the configured api_key
+    if authorization and authorization.credentials == settings.api_key:
+        # The API key is valid
+        return authorization.credentials
+
+    # Otherwise reject the request with HTTP 401
+    raise HTTPException(
+        status_code=status.HTTP_401_UNAUTHORIZED,
+        detail="Invalid API key",
+    )
+
+
 @router.post(
     "/v1/completions",
     summary="Completion"
@@ -667,6 +696,7 @@ async def create_completion(
     request: Request,
     body: CreateCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
+    authenticated: str = Depends(authenticate),
 ) -> llama_cpp.Completion:
     if isinstance(body.prompt, list):
         assert len(body.prompt) <= 1
@@ -740,7 +770,9 @@ class CreateEmbeddingRequest(BaseModel):
     summary="Embedding"
 )
 async def create_embedding(
-    request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
+    request: CreateEmbeddingRequest,
+    llama: llama_cpp.Llama = Depends(get_llama),
+    authenticated: str = Depends(authenticate),
 ):
     return await run_in_threadpool(
         llama.create_embedding, **request.model_dump(exclude={"user"})
@@ -834,6 +866,7 @@ async def create_chat_completion(
     body: CreateChatCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
     settings: Settings = Depends(get_settings),
+    authenticated: str = Depends(authenticate),
 ) -> llama_cpp.ChatCompletion:
     exclude = {
         "n",
@@ -895,6 +928,7 @@ class ModelList(TypedDict):
 @router.get("/v1/models", summary="Models")
 async def get_models(
     settings: Settings = Depends(get_settings),
+    authenticated: str = Depends(authenticate),
 ) -> ModelList:
     assert llama is not None
     return {
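
For reference, a minimal client-side sketch against the patched server. The base URL and key below are placeholder assumptions, not values from this patch; `HTTPBearer` simply expects a standard `Authorization: Bearer <key>` header:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed default host/port
API_KEY = "my-secret-key"           # whatever api_key the server was started with

# HTTPBearer parses the standard "Authorization: Bearer <token>" header.
response = requests.post(
    f"{BASE_URL}/v1/completions",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={"prompt": "Hello", "max_tokens": 16},
)
response.raise_for_status()
print(response.json())
```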
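
Both branches of `authenticate` can be exercised with FastAPI's `TestClient`. This is a rough sketch: it assumes the module's `create_app(settings=...)` factory (not shown in these hunks) and a placeholder model path, so treat it as an outline rather than a ready-made test:

```python
from fastapi.testclient import TestClient

from llama_cpp.server.app import Settings, create_app  # create_app is assumed

settings = Settings(model="./models/model.gguf", api_key="my-secret-key")
app = create_app(settings=settings)
client = TestClient(app)

# No credentials: authenticate raises, and RouteErrorHandler re-raises
# the HTTPException, so the client sees a plain 401.
assert client.get("/v1/models").status_code == 401

# Matching bearer token: the request reaches the handler.
ok = client.get("/v1/models", headers={"Authorization": "Bearer my-secret-key"})
assert ok.status_code == 200
```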
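
A note on the wiring: the patch repeats `authenticated: str = Depends(authenticate)` in every endpoint signature. A hypothetical alternative, assuming `authenticate` stays importable from this module, would attach the dependency once at the router level, since FastAPI runs router-level dependencies for every route:

```python
from fastapi import APIRouter, Depends

from llama_cpp.server.app import authenticate  # assumed import path

# Hypothetical alternative: guard every route on this router at once,
# instead of adding the dependency to each endpoint signature.
router = APIRouter(dependencies=[Depends(authenticate)])
```

The per-endpoint form used in the patch keeps the returned credentials available inside each handler, which the router-level form does not.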