326 lines
10 KiB
Go
326 lines
10 KiB
Go
|
package llm
|
||
|
|
||
|
/*
|
||
|
#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common
|
||
|
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
|
||
|
#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
|
||
|
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
|
||
|
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
|
||
|
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
|
||
|
#cgo darwin,arm64 CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
||
|
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
|
||
|
#cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||
|
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a
|
||
|
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a
|
||
|
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a
|
||
|
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a
|
||
|
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
|
||
|
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
|
||
|
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
|
||
|
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
|
||
|
#cgo linux CFLAGS: -D_GNU_SOURCE
|
||
|
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
|
||
|
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
|
||
|
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/examples/server/libext_server.a
|
||
|
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
|
||
|
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
|
||
|
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
|
||
|
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
|
||
|
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
|
||
|
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
|
||
|
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudadevrt.a
|
||
|
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libculibos.a
|
||
|
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
|
||
|
#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
|
||
|
#cgo windows LDFLAGS: -lext_server_shared -lpthread
|
||
|
|
||
|
#include <stdlib.h>
|
||
|
#include "examples/server/server.h"
|
||
|
|
||
|
*/
|
||
|
import "C"
|
||
|
import (
|
||
|
"bytes"
|
||
|
"context"
|
||
|
"encoding/json"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"log"
|
||
|
"os"
|
||
|
"runtime"
|
||
|
"sync"
|
||
|
"time"
|
||
|
"unsafe"
|
||
|
|
||
|
"github.com/jmorganca/ollama/api"
|
||
|
)
|
||
|
|
||
|
func errWrap(resp C.ext_server_err) error {
|
||
|
if resp.code == 0 {
|
||
|
return nil
|
||
|
}
|
||
|
err := fmt.Errorf(C.GoString(resp.err))
|
||
|
C.free(unsafe.Pointer(resp.err))
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
type llamaExtServer struct {
|
||
|
api.Options
|
||
|
}
|
||
|
|
||
|
// Note: current implementation does not support concurrent instantiations
|
||
|
var mutex sync.Mutex
|
||
|
|
||
|
func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) {
|
||
|
if !mutex.TryLock() {
|
||
|
log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
|
||
|
mutex.Lock()
|
||
|
}
|
||
|
server := &llamaExtServer{opts}
|
||
|
fileInfo, err := os.Stat(model)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
var sparams C.ext_server_params
|
||
|
sparams.model = C.CString(model)
|
||
|
defer C.free(unsafe.Pointer(sparams.model))
|
||
|
|
||
|
numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
|
||
|
|
||
|
sparams.embedding = true
|
||
|
sparams.n_ctx = C.uint(opts.NumCtx)
|
||
|
sparams.n_batch = C.uint(opts.NumBatch)
|
||
|
sparams.n_gpu_layers = C.int(numGPU)
|
||
|
sparams.main_gpu = C.int(opts.MainGPU)
|
||
|
sparams.n_parallel = 2 // TODO - wire up concurrency
|
||
|
|
||
|
// Always use the value encoded in the model
|
||
|
sparams.rope_freq_base = 0.0
|
||
|
sparams.rope_freq_scale = 0.0
|
||
|
|
||
|
sparams.lora_adapters = nil
|
||
|
for i := 0; i < len(adapters); i++ {
|
||
|
la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter))
|
||
|
defer C.free(unsafe.Pointer(la))
|
||
|
la.adapter = C.CString(adapters[i])
|
||
|
defer C.free(unsafe.Pointer(la.adapter))
|
||
|
la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
|
||
|
la.next = nil
|
||
|
if i == 0 {
|
||
|
sparams.lora_adapters = la
|
||
|
} else {
|
||
|
tmp := sparams.lora_adapters
|
||
|
for ; tmp.next != nil; tmp = tmp.next {
|
||
|
}
|
||
|
tmp.next = la
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// TODO - implement ME
|
||
|
// if len(projectors) > 0 {
|
||
|
// // TODO: applying multiple projectors is not supported by the llama.cpp server yet
|
||
|
// params = append(params, "--mmproj", projectors[0])
|
||
|
// }
|
||
|
|
||
|
if opts.NumThread > 0 {
|
||
|
sparams.n_threads = C.uint(opts.NumThread)
|
||
|
} else {
|
||
|
sparams.n_threads = C.uint(runtime.NumCPU())
|
||
|
}
|
||
|
|
||
|
sparams.memory_f16 = false
|
||
|
if opts.F16KV {
|
||
|
sparams.memory_f16 = true
|
||
|
}
|
||
|
sparams.use_mlock = false
|
||
|
if opts.UseMLock {
|
||
|
sparams.use_mlock = true
|
||
|
}
|
||
|
sparams.use_mmap = true
|
||
|
if !opts.UseMMap {
|
||
|
sparams.use_mmap = false
|
||
|
}
|
||
|
sparams.numa = false
|
||
|
if opts.UseNUMA {
|
||
|
sparams.numa = true
|
||
|
}
|
||
|
|
||
|
log.Printf("Initializing internal llama server")
|
||
|
err = errWrap(C.llama_server_init(&sparams))
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
log.Printf("Starting internal llama main loop")
|
||
|
C.llama_server_start()
|
||
|
return server, nil
|
||
|
}
|
||
|
|
||
|
func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
|
||
|
|
||
|
request := map[string]any{
|
||
|
"prompt": predict.Prompt,
|
||
|
"stream": true,
|
||
|
"n_predict": llm.NumPredict,
|
||
|
"n_keep": llm.NumKeep,
|
||
|
"temperature": llm.Temperature,
|
||
|
"top_k": llm.TopK,
|
||
|
"top_p": llm.TopP,
|
||
|
"tfs_z": llm.TFSZ,
|
||
|
"typical_p": llm.TypicalP,
|
||
|
"repeat_last_n": llm.RepeatLastN,
|
||
|
"repeat_penalty": llm.RepeatPenalty,
|
||
|
"presence_penalty": llm.PresencePenalty,
|
||
|
"frequency_penalty": llm.FrequencyPenalty,
|
||
|
"mirostat": llm.Mirostat,
|
||
|
"mirostat_tau": llm.MirostatTau,
|
||
|
"mirostat_eta": llm.MirostatEta,
|
||
|
"penalize_nl": llm.PenalizeNewline,
|
||
|
"seed": llm.Seed,
|
||
|
"stop": llm.Stop,
|
||
|
}
|
||
|
|
||
|
if predict.Format == "json" {
|
||
|
request["grammar"] = jsonGrammar
|
||
|
}
|
||
|
|
||
|
// Handling JSON marshaling with special characters unescaped.
|
||
|
buffer := &bytes.Buffer{}
|
||
|
enc := json.NewEncoder(buffer)
|
||
|
enc.SetEscapeHTML(false)
|
||
|
|
||
|
if err := enc.Encode(request); err != nil {
|
||
|
return fmt.Errorf("failed to marshal data: %w", err)
|
||
|
}
|
||
|
|
||
|
req := C.CString(buffer.String())
|
||
|
defer C.free(unsafe.Pointer(req))
|
||
|
|
||
|
cmpCtx := C.llama_server_completion(req)
|
||
|
if cmpCtx.task_id < 0 {
|
||
|
defer C.free(unsafe.Pointer(cmpCtx.err))
|
||
|
return fmt.Errorf(C.GoString(cmpCtx.err))
|
||
|
}
|
||
|
|
||
|
for {
|
||
|
select {
|
||
|
case <-ctx.Done():
|
||
|
// This handles the request cancellation
|
||
|
return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
|
||
|
default:
|
||
|
result := C.llama_server_completion_next_result(cmpCtx.task_id)
|
||
|
if result.result_json != nil {
|
||
|
defer C.free(unsafe.Pointer(result.result_json))
|
||
|
}
|
||
|
var p prediction
|
||
|
if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil {
|
||
|
err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
|
||
|
return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2)
|
||
|
}
|
||
|
|
||
|
if p.Content != "" {
|
||
|
fn(PredictResult{
|
||
|
// Model: predict.Model, // XXX remove or replace?
|
||
|
CreatedAt: time.Now().UTC(),
|
||
|
Content: p.Content,
|
||
|
})
|
||
|
}
|
||
|
|
||
|
if p.Stop {
|
||
|
fn(PredictResult{
|
||
|
// Model: predict.Model, // XXX remove or replace?
|
||
|
CreatedAt: time.Now().UTC(),
|
||
|
TotalDuration: time.Since(predict.CheckpointStart),
|
||
|
Done: true,
|
||
|
PromptEvalCount: p.Timings.PromptN,
|
||
|
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
|
||
|
EvalCount: p.Timings.PredictedN,
|
||
|
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
|
||
|
})
|
||
|
return nil
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||
|
data, err := json.Marshal(TokenizeRequest{Content: prompt})
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("marshaling encode data: %w", err)
|
||
|
}
|
||
|
req := C.CString(string(data))
|
||
|
defer C.free(unsafe.Pointer(req))
|
||
|
var resp C.ext_server_resp
|
||
|
err = errWrap(C.llama_server_tokenize(req, &resp))
|
||
|
if resp.json_resp != nil {
|
||
|
defer C.free(unsafe.Pointer(resp.json_resp))
|
||
|
}
|
||
|
|
||
|
var encoded TokenizeResponse
|
||
|
if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil {
|
||
|
return nil, fmt.Errorf("unmarshal encode response: %w", err2)
|
||
|
}
|
||
|
|
||
|
return encoded.Tokens, err
|
||
|
}
|
||
|
|
||
|
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
|
||
|
if len(tokens) == 0 {
|
||
|
return "", nil
|
||
|
}
|
||
|
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
|
||
|
if err != nil {
|
||
|
return "", fmt.Errorf("marshaling decode data: %w", err)
|
||
|
}
|
||
|
|
||
|
req := C.CString(string(data))
|
||
|
defer C.free(unsafe.Pointer(req))
|
||
|
var resp C.ext_server_resp
|
||
|
err = errWrap(C.llama_server_detokenize(req, &resp))
|
||
|
if resp.json_resp != nil {
|
||
|
defer C.free(unsafe.Pointer(resp.json_resp))
|
||
|
}
|
||
|
|
||
|
var decoded DetokenizeResponse
|
||
|
if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil {
|
||
|
return "", fmt.Errorf("unmarshal encode response: %w", err2)
|
||
|
}
|
||
|
|
||
|
return decoded.Content, err
|
||
|
}
|
||
|
|
||
|
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
|
||
|
data, err := json.Marshal(TokenizeRequest{Content: input})
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("error marshaling embed data: %w", err)
|
||
|
}
|
||
|
|
||
|
req := C.CString(string(data))
|
||
|
defer C.free(unsafe.Pointer(req))
|
||
|
var resp C.ext_server_resp
|
||
|
err = errWrap(C.llama_server_embedding(req, &resp))
|
||
|
if resp.json_resp != nil {
|
||
|
defer C.free(unsafe.Pointer(resp.json_resp))
|
||
|
}
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
var embedding EmbeddingResponse
|
||
|
if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil {
|
||
|
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
|
||
|
}
|
||
|
|
||
|
return embedding.Embedding, nil
|
||
|
}
|
||
|
|
||
|
func (llm *llamaExtServer) Ping(ctx context.Context) error {
|
||
|
// TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (llm *llamaExtServer) Close() {
|
||
|
C.llama_server_stop()
|
||
|
mutex.Unlock()
|
||
|
}
|