feat: Add endpoints for tokenize, detokenize and count tokens

* Add endpoint to count tokens

* Add tokenize and detokenize endpoints

* Change response key to tokens for tokenize endpoint

* Fix dependency bug

* Cleanup

* Remove example added by mistake

* Move tokenize, detokenize, and count to Extras namespace. Tag existing endpoints

---------

Co-authored-by: Andrei Betlen <abetlen@gmail.com>
This commit is contained in:
Felipe Lorenz 2024-03-08 21:09:00 -05:00 committed by GitHub
parent 1f3156d4f2
commit c139f8b5d5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 105 additions and 2 deletions
llama_cpp/server

View file

@ -41,6 +41,11 @@ from llama_cpp.server.types import (
CreateEmbeddingRequest, CreateEmbeddingRequest,
CreateChatCompletionRequest, CreateChatCompletionRequest,
ModelList, ModelList,
TokenizeInputRequest,
TokenizeInputResponse,
TokenizeInputCountResponse,
DetokenizeInputRequest,
DetokenizeInputResponse,
) )
from llama_cpp.server.errors import RouteErrorHandler from llama_cpp.server.errors import RouteErrorHandler
@ -196,6 +201,9 @@ async def authenticate(
) )
# Tag name passed as `tags=[openai_v1_tag]` to the /v1/* route decorators.
openai_v1_tag = "OpenAI V1"
@router.post( @router.post(
"/v1/completions", "/v1/completions",
summary="Completion", summary="Completion",
@ -227,11 +235,13 @@ async def authenticate(
}, },
} }
}, },
tags=[openai_v1_tag],
) )
@router.post( @router.post(
"/v1/engines/copilot-codex/completions", "/v1/engines/copilot-codex/completions",
include_in_schema=False, include_in_schema=False,
dependencies=[Depends(authenticate)], dependencies=[Depends(authenticate)],
tags=[openai_v1_tag],
) )
async def create_completion( async def create_completion(
request: Request, request: Request,
@ -297,7 +307,10 @@ async def create_completion(
@router.post( @router.post(
"/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)] "/v1/embeddings",
summary="Embedding",
dependencies=[Depends(authenticate)],
tags=[openai_v1_tag],
) )
async def create_embedding( async def create_embedding(
request: CreateEmbeddingRequest, request: CreateEmbeddingRequest,
@ -339,6 +352,7 @@ async def create_embedding(
}, },
} }
}, },
tags=[openai_v1_tag],
) )
async def create_chat_completion( async def create_chat_completion(
request: Request, request: Request,
@ -391,7 +405,12 @@ async def create_chat_completion(
return iterator_or_completion return iterator_or_completion
@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]) @router.get(
"/v1/models",
summary="Models",
dependencies=[Depends(authenticate)],
tags=[openai_v1_tag],
)
async def get_models( async def get_models(
llama_proxy: LlamaProxy = Depends(get_llama_proxy), llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> ModelList: ) -> ModelList:
@ -407,3 +426,51 @@ async def get_models(
for model_alias in llama_proxy for model_alias in llama_proxy
], ],
} }
# Tag name passed as `tags=[extras_tag]` to the /extras/* route decorators.
extras_tag = "Extras"
@router.post(
    "/extras/tokenize",
    summary="Tokenize",
    dependencies=[Depends(authenticate)],
    tags=[extras_tag],
)
async def tokenize(
    body: TokenizeInputRequest,
    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> TokenizeInputResponse:
    """Tokenize the request's input text with the requested model.

    The text is UTF-8 encoded before it is handed to the model's tokenizer.
    A null input is treated as empty text.
    """
    # `input` may be null in the request model; calling `.encode` on None
    # would raise AttributeError and surface as a server error.
    text = body.input or ""
    tokens = llama_proxy(body.model).tokenize(text.encode("utf-8"), special=True)
    return {"tokens": tokens}
@router.post(
    "/extras/tokenize/count",
    summary="Tokenize Count",
    dependencies=[Depends(authenticate)],
    tags=[extras_tag],
)
async def count_query_tokens(
    body: TokenizeInputRequest,
    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> TokenizeInputCountResponse:
    """Count the tokens the requested model produces for the input text.

    The text is UTF-8 encoded and tokenized exactly as in the tokenize
    endpoint; only the number of resulting tokens is returned. A null input
    is treated as empty text.
    """
    # `input` may be null in the request model; calling `.encode` on None
    # would raise AttributeError and surface as a server error.
    text = body.input or ""
    tokens = llama_proxy(body.model).tokenize(text.encode("utf-8"), special=True)
    return {"count": len(tokens)}
@router.post(
    "/extras/detokenize",
    summary="Detokenize",
    dependencies=[Depends(authenticate)],
    tags=[extras_tag],
)
async def detokenize(
    body: DetokenizeInputRequest,
    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> DetokenizeInputResponse:
    """Convert a list of token ids back into text with the requested model.

    The bytes returned by the model's detokenizer are decoded as UTF-8.
    Byte sequences that are not valid UTF-8 are replaced with U+FFFD instead
    of failing the request.
    """
    raw = llama_proxy(body.model).detokenize(body.tokens)
    # An arbitrary token list may end in the middle of a multi-byte character;
    # a strict decode would raise UnicodeDecodeError for such input.
    text = raw.decode("utf-8", errors="replace")
    return {"text": text}

View file

@ -264,3 +264,39 @@ class ModelData(TypedDict):
class ModelList(TypedDict): class ModelList(TypedDict):
object: Literal["list"] object: Literal["list"]
data: List[ModelData] data: List[ModelData]
class TokenizeInputRequest(BaseModel):
    # Request body accepted by the tokenize and token-count endpoints.
    # (Documented with comments rather than a docstring so the generated
    # schema description is left unchanged.)

    # Optional model selector; shares the module-level `model_field` definition.
    model: Optional[str] = model_field
    # Text to tokenize. Declared as a plain `str`: the field has no default, so
    # it was already required, and a null value cannot be tokenized (the
    # handlers call `.encode` on it).
    input: str = Field(description="The input to tokenize.")

    # Example payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
    }
class TokenizeInputResponse(BaseModel):
    # Response body of the tokenize endpoint: the token ids for the input.
    tokens: List[int] = Field(description="A list of tokens.")

    # Sample payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "example": {"tokens": [123, 321, 222]},
        },
    }
class TokenizeInputCountResponse(BaseModel):
    # Response body of the token-count endpoint: how many tokens were produced.
    count: int = Field(description="The number of tokens in the input.")

    # Sample payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "example": {"count": 5},
        },
    }
class DetokenizeInputRequest(BaseModel):
    # Request body accepted by the detokenize endpoint.
    # (Documented with comments rather than a docstring so the generated
    # schema description is left unchanged.)

    # Optional model selector; shares the module-level `model_field` definition.
    model: Optional[str] = model_field
    # Token ids to convert back into text.
    tokens: List[int] = Field(description="A list of tokens to detokenize.")

    # A list of example payloads belongs under the plural "examples" key,
    # matching TokenizeInputRequest; the singular "example" key is meant to
    # hold one payload, not a list.
    model_config = {"json_schema_extra": {"examples": [{"tokens": [123, 321, 222]}]}}
class DetokenizeInputResponse(BaseModel):
    # Response body of the detokenize endpoint: the reconstructed text.
    text: str = Field(description="The detokenized text.")

    # Sample payload attached to the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "example": {"text": "How many tokens in this query?"},
        },
    }