ollama/llm/shim_ext_server.go

//go:build !darwin

package llm

/*

#include <stdlib.h>
#include "dynamic_shim.h"

*/
import "C"
import (
	"context"
	"embed"
	"errors"
	"fmt"
	"io/fs"
	"log"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"unsafe"

	"github.com/jmorganca/ollama/api"
)

// libEmbed is the embedded file system populated at build time from the
// go:embed pattern below.
// NOTE(review): libEmbed is not read anywhere in this section of the file;
// presumably extractDynamicLibs consumes it — confirm.
//
//go:embed llama.cpp/gguf/build/lib/*
var libEmbed embed.FS

// RocmShimMissing is the error value describing a build of ollama that does
// not include the ROCm shim library.
// NOTE(review): it is not referenced in this section of the file; presumably
// it is returned by the code that selects a shim — confirm.
var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")

// shimExtServer pairs the C-side handle of a dynamically loaded llama server
// with the options it was created with. Its llama_server_* methods forward to
// the matching C.dynamic_shim_llama_server_* functions.
type shimExtServer struct {
	// s is the C struct filled in by C.dynamic_shim_init; it is passed by
	// value as the first argument of every dynamic_shim_llama_server_* call.
	s       C.struct_dynamic_llama_server
	// options are the api.Options supplied at construction; Predict hands
	// them to the predict helper.
	options api.Options
}

// Note: current implementation does not support concurrent instantiations
//
// shimMutex is held by newDynamicShimExtServer for the whole duration of
// loading a library and creating the server.
var shimMutex sync.Mutex

// llm is the package-level reference to the shim server most recently created
// by newDynamicShimExtServer. The methods below name their receiver llm as
// well, which shadows this variable inside those method bodies.
var llm *shimExtServer

// llama_server_init forwards to C.dynamic_shim_llama_server_init, passing the
// loaded server handle llm.s followed by sparams and err unchanged.
// NOTE(review): presumably failures are reported through err — confirm against
// dynamic_shim.h.
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
}
// llama_server_start forwards to C.dynamic_shim_llama_server_start with the
// loaded server handle llm.s.
func (llm *shimExtServer) llama_server_start() {
	C.dynamic_shim_llama_server_start(llm.s)
}
// llama_server_stop forwards to C.dynamic_shim_llama_server_stop with the
// loaded server handle llm.s.
func (llm *shimExtServer) llama_server_stop() {
	C.dynamic_shim_llama_server_stop(llm.s)
}

// llama_server_completion forwards to C.dynamic_shim_llama_server_completion,
// passing the loaded server handle llm.s followed by json_req and resp
// unchanged.
// NOTE(review): json_req is presumably a NUL-terminated JSON request string and
// resp the place where the outcome is written — confirm against dynamic_shim.h.
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
	C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
}
// llama_server_completion_next_result forwards to
// C.dynamic_shim_llama_server_completion_next_result, passing the loaded
// server handle llm.s followed by task_id and resp unchanged.
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
	C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
}
// llama_server_completion_cancel forwards to
// C.dynamic_shim_llama_server_completion_cancel, passing the loaded server
// handle llm.s followed by task_id and err unchanged.
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
	C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
}
// llama_server_release_task_result forwards to
// C.dynamic_shim_llama_server_release_task_result, passing the loaded server
// handle llm.s followed by result unchanged.
// NOTE(review): presumably this frees memory owned by the C side for a result
// obtained from llama_server_completion_next_result — confirm.
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
	C.dynamic_shim_llama_server_release_task_result(llm.s, result)
}

// llama_server_tokenize forwards to C.dynamic_shim_llama_server_tokenize,
// passing the loaded server handle llm.s followed by json_req, json_resp and
// err unchanged.
// NOTE(review): json_resp is presumably an out-parameter that must later be
// handed to llama_server_release_json_resp — confirm.
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
	C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
}
// llama_server_detokenize forwards to C.dynamic_shim_llama_server_detokenize,
// passing the loaded server handle llm.s followed by json_req, json_resp and
// err unchanged.
// NOTE(review): json_resp is presumably an out-parameter that must later be
// handed to llama_server_release_json_resp — confirm.
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
	C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
}
// llama_server_embedding forwards to C.dynamic_shim_llama_server_embedding,
// passing the loaded server handle llm.s followed by json_req, json_resp and
// err unchanged.
// NOTE(review): json_resp is presumably an out-parameter that must later be
// handed to llama_server_release_json_resp — confirm.
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
	C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
}
// llama_server_release_json_resp forwards to
// C.dynamic_shim_llama_server_release_json_resp, passing the loaded server
// handle llm.s followed by json_resp unchanged.
// NOTE(review): presumably this frees a response string produced by the
// tokenize, detokenize or embedding calls — confirm.
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
}

// newDynamicShimExtServer loads the shared library at path library through
// C.dynamic_shim_init, wraps the resulting handle together with opts in a
// shimExtServer, stores that server in the package-level llm variable, and
// hands it to newExtServer along with the remaining arguments.
//
// The package mutex is held for the entire call. If dynamic_shim_init reports
// a negative response id, an error carrying the C-side message is returned
// and the package-level llm variable is left untouched.
func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	shimMutex.Lock()
	defer shimMutex.Unlock()

	// C copy of the library path; released when this function returns.
	cLibrary := C.CString(library)
	defer C.free(unsafe.Pointer(cLibrary))

	// Response buffer the C side uses to report the outcome of the load.
	initResp := newExtServerResp(128)
	defer freeExtServerResp(initResp)

	var handle C.struct_dynamic_llama_server
	C.dynamic_shim_init(cLibrary, &handle, &initResp)
	if initResp.id < 0 {
		// Copy the message into Go memory before the deferred free runs.
		reason := C.GoString(initResp.msg)
		return nil, fmt.Errorf("Unable to load dynamic library: %s", reason)
	}

	server := &shimExtServer{
		options: opts,
		s:       handle,
	}
	llm = server

	log.Printf("Loading Dynamic Shim llm server: %s", library)
	return newExtServer(server, model, adapters, projectors, numLayers, opts)
}

// Predict delegates to the package-level predict helper, passing this server,
// the options stored at construction, and the caller's ctx, pred and fn. It
// returns whatever predict returns.
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
	return predict(llm, llm.options, ctx, pred, fn)
}

// Encode delegates to the package-level encode helper with this server, ctx
// and prompt, and returns its results unchanged.
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
	return encode(llm, ctx, prompt)
}

// Decode delegates to the package-level decode helper with this server, ctx
// and tokens, and returns its results unchanged.
func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
	return decode(llm, ctx, tokens)
}

// Embedding delegates to the package-level embedding helper with this server,
// ctx and input, and returns its results unchanged.
func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
	return embedding(llm, ctx, input)
}

// Close passes this server to close.
// NOTE(review): llm is not a channel, so close here is presumably a
// package-level function that shadows the builtin — confirm where it is
// defined and what it releases.
func (llm *shimExtServer) Close() {
	close(llm)
}

// nativeInit extracts the server libraries matching
// "llama.cpp/gguf/build/lib/*server*" into workdir via extractDynamicLibs and
// records each one in AvailableShims. The key is the library's base file name
// with a leading "lib" removed and everything from the first "." onward
// dropped (for example "libcuda_server.so" is stored under "cuda_server"); the
// value is the path returned by extractDynamicLibs.
//
// A missing payload is not treated as an error: it is logged and nil is
// returned with no shims registered. Any other extraction error is returned
// unchanged.
//
// When a "rocm_server" shim has been registered, access to /dev/kfd is probed:
//   - if the device does not exist, nil is returned (no Radeon card);
//   - if access is denied, remediation advice is logged and the error is
//     returned to the caller instead of terminating the process;
//   - any other failure is returned wrapped with context.
func nativeInit(workdir string) error {
	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*")
	if err != nil {
		// errors.Is also recognises the sentinel when it has been wrapped.
		if errors.Is(err, payloadMissing) {
			log.Printf("%s", payloadMissing)
			return nil
		}
		return err
	}
	for _, lib := range libs {
		libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
		AvailableShims[libName] = lib
	}

	// Only check ROCm access if we have the dynamic lib loaded
	if _, rocmPresent := AvailableShims["rocm_server"]; !rocmPresent {
		return nil
	}

	// Verify we have permissions - either running as root, or we have group access to the driver
	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
	switch {
	case err == nil:
		// The file was opened only as a permission probe; nothing was written.
		fd.Close()
		return nil
	case errors.Is(err, fs.ErrPermission):
		// Report the problem and let the caller decide how to proceed; exiting
		// here would make the return below unreachable.
		log.Print("Radeon card detected, but permissions not set up properly.  Either run ollama as root, or add your user account to the render group.")
		return err
	case errors.Is(err, fs.ErrNotExist):
		// expected behavior without a radeon card
		return nil
	default:
		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
	}
}
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`//go:build !darwin`

			`package llm`

			`/*`

			`#include <stdlib.h>`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`#include "dynamic_shim.h"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00
			`*/`
			`import "C"`
			`import (`
			`"context"`
			`"embed"`
			`"errors"`
			`"fmt"`
			`"io/fs"`
			`"log"`
			`"os"`
			`"path/filepath"`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`"strings"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`"sync"`
			`"unsafe"`

			`"github.com/jmorganca/ollama/api"`
			`)`

Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`//go:embed llama.cpp/gguf/build/lib/*`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`var libEmbed embed.FS`

			`var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")`

			`type shimExtServer struct {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`s C.struct_dynamic_llama_server`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`options api.Options`
			`}`

			`// Note: current implementation does not support concurrent instantiations`
			`var shimMutex sync.Mutex`
			`var llm *shimExtServer`

			`func (llm shimExtServer) llama_server_init(sparams C.ext_server_params_t, err *C.ext_server_resp_t) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_init(llm.s, sparams, err)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`func (llm *shimExtServer) llama_server_start() {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_start(llm.s)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`func (llm *shimExtServer) llama_server_stop() {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_stop(llm.s)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`

			`func (llm shimExtServer) llama_server_completion(json_req C.char, resp *C.ext_server_resp_t) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`func (llm shimExtServer) llama_server_completion_next_result(task_id C.int, resp C.ext_server_task_result_t) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`func (llm shimExtServer) llama_server_completion_cancel(task_id C.int, err C.ext_server_resp_t) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`func (llm shimExtServer) llama_server_release_task_result(result C.ext_server_task_result_t) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_release_task_result(llm.s, result)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`

			`func (llm shimExtServer) llama_server_tokenize(json_req C.char, json_resp *C.char, err C.ext_server_resp_t) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`func (llm shimExtServer) llama_server_detokenize(json_req C.char, json_resp *C.char, err C.ext_server_resp_t) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`func (llm shimExtServer) llama_server_embedding(json_req C.char, json_resp *C.char, err C.ext_server_resp_t) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`func (llm shimExtServer) llama_server_release_json_resp(json_resp *C.char) {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`

Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {`
			`shimMutex.Lock()`
			`defer shimMutex.Unlock()`
			`libPath := C.CString(library)`
			`defer C.free(unsafe.Pointer(libPath))`
			`resp := newExtServerResp(128)`
			`defer freeExtServerResp(resp)`
			`var srv C.struct_dynamic_llama_server`
			`C.dynamic_shim_init(libPath, &srv, &resp)`
			`if resp.id < 0 {`
			`return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`llm = &shimExtServer{`
			`s: srv,`
			`options: opts,`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`log.Printf("Loading Dynamic Shim llm server: %s", library)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`return newExtServer(llm, model, adapters, projectors, numLayers, opts)`
			`}`

			`func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {`
			`return predict(llm, llm.options, ctx, pred, fn)`
			`}`

			`func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {`
			`return encode(llm, ctx, prompt)`
			`}`

			`func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {`
			`return decode(llm, ctx, tokens)`
			`}`

			`func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {`
			`return embedding(llm, ctx, input)`
			`}`

			`func (llm *shimExtServer) Close() {`
			`close(llm)`
			`}`

			`func nativeInit(workdir string) error {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/server")`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`if err != nil {`
Carry ggml-metal.metal as payload 2023-12-19 02:32:04 +00:00			`if err == payloadMissing {`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`log.Printf("%s", payloadMissing)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`return nil`
			`}`
			`return err`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`}`
			`for _, lib := range libs {`
			`libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]`
			`AvailableShims[libName] = lib`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`

Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`// Only check ROCm access if we have the dynamic lib loaded`
			`if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent {`
			`// Verify we have permissions - either running as root, or we have group access to the driver`
			`fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)`
			`if err != nil {`
			`if errors.Is(err, fs.ErrPermission) {`
			`log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")`
			`return err`
			`} else if errors.Is(err, fs.ErrNotExist) {`
			`// expected behavior without a radeon card`
			`return nil`
			`}`

			`return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`fd.Close()`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00
			`}`

			`return nil`
			`}`