add llama.cpp go bindings

2023-07-03 16:32:48 -04:00 · 2023-07-03 16:32:48 -04:00 · 6093a88c1a
commit 6093a88c1a
parent 76cb60d496
18 changed files with 841 additions and 79 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,6 @@
 # Ollama
-A fast runtime for large language models, powered by [llama.cpp](https://github.com/ggerganov/llama.cpp).
+An easy, fast runtime for large language models, powered by `llama.cpp`.
 > _Note: this project is a work in progress. Certain models that can be run with `ollama` are intended for research and/or non-commercial use only._
@ -38,6 +38,13 @@ Or directly via downloaded model files:
 ollama run ~/Downloads/orca-mini-13b.ggmlv3.q4_0.bin
 ```
 ## Building
 ```
 go generate ./...
 go build .
 ```
 ## Documentation
 - [Development](docs/development.md)
--- a/api/client.go
+++ b/api/client.go
@ -8,7 +8,7 @@ import (
 	"io"
 	"net/http"
-	"github.com/ollama/ollama/signature"
+	"github.com/jmorganca/ollama/signature"
 )
 type Client struct {
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@ -3,7 +3,6 @@ package cmd
 import (
 	"context"
 	"fmt"
 	"io/ioutil"
 	"log"
 	"net"
 	"net/http"
@ -13,8 +12,8 @@ import (
 	"github.com/spf13/cobra"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
-	"github.com/ollama/ollama/server"
+	"github.com/jmorganca/ollama/server"
 )
 func NewAPIClient(cmd *cobra.Command) (*api.Client, error) {
@ -36,7 +35,7 @@ func NewAPIClient(cmd *cobra.Command) (*api.Client, error) {
 	if k != "" {
 		fn := path.Join(home, ".ollama/keys/", k)
-		rawKey, err = ioutil.ReadFile(fn)
+		rawKey, err = os.ReadFile(fn)
 		if err != nil {
 			return nil, err
 		}
@ -59,7 +58,7 @@ func NewCLI() *cobra.Command {
 	log.SetFlags(log.LstdFlags | log.Lshortfile)
 	rootCmd := &cobra.Command{
-		Use:   "gollama",
+		Use:   "ollama",
 		Short: "Run any large language model on any machine.",
 		CompletionOptions: cobra.CompletionOptions{
 			DisableDefaultCmd: true,
--- a/go.mod
+++ b/go.mod
@ -1,11 +1,9 @@
-module github.com/ollama/ollama
+module github.com/jmorganca/ollama
 go 1.20
 require (
 	github.com/gin-gonic/gin v1.9.1
 	github.com/go-skynet/go-llama.cpp v0.0.0-20230630201504-ecd358d2f144
 	github.com/r3labs/sse v0.0.0-20210224172625-26fe804710bc
 	github.com/spf13/cobra v1.7.0
 	golang.org/x/crypto v0.10.0
 )
@ -19,6 +17,7 @@ require (
 	github.com/go-playground/universal-translator v0.18.1 // indirect
 	github.com/go-playground/validator/v10 v10.14.0 // indirect
 	github.com/goccy/go-json v0.10.2 // indirect
 	github.com/google/go-cmp v0.5.9 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.4 // indirect
@ -35,6 +34,5 @@ require (
 	golang.org/x/sys v0.9.0 // indirect
 	golang.org/x/text v0.10.0 // indirect
 	google.golang.org/protobuf v1.30.0 // indirect
 	gopkg.in/cenkalti/backoff.v1 v1.1.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
--- a/go.sum
+++ b/go.sum
@ -6,6 +6,7 @@ github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhD
 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
 github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
@ -13,18 +14,19 @@ github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE
 github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
 github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
 github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
 github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
 github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
 github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
 github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
 github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
 github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
 github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
 github.com/go-skynet/go-llama.cpp v0.0.0-20230630201504-ecd358d2f144 h1:fszkmZG3pW9/bqhuWB6sfJMArJPx1RPzjZSqNdhuSQ0=
 github.com/go-skynet/go-llama.cpp v0.0.0-20230630201504-ecd358d2f144/go.mod h1:tzi97YvT1bVQ+iTG39LvpDkKG1WbizgtljC+orSoM40=
 github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
 github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
 github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
 github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
@ -44,8 +46,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
 github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
 github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/r3labs/sse v0.0.0-20210224172625-26fe804710bc/go.mod h1:S8xSOnV3CgpNrWd0GQ/OoQfMtlg2uPRSuTzcSGrzwK8=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
 github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
@ -55,12 +57,12 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
 github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
 github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
@ -69,27 +71,23 @@ github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZ
 golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
 golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
 golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
 golang.org/x/net v0.0.0-20191116160921-f9c825593386/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
 golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
 golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/term v0.9.0 h1:GRRCnKYhdQrD8kfRAdQ6Zcw1P0OcELxGLKJvtjVMZ28=
 golang.org/x/text v0.10.0 h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58=
 golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
 google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
 google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
-gopkg.in/cenkalti/backoff.v1 v1.1.0/go.mod h1:J6Vskwqd+OMVJl8C33mmtxTBs2gyzfv7UDAkHu8BrjI=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/lib/.gitignore
+++ b/lib/.gitignore
@ -1 +0,0 @@
 build
--- a/lib/README.md
+++ b/lib/README.md
@ -1,10 +0,0 @@
 # Bindings
 These are Llama.cpp bindings
 ## Build
 ```
 cmake -S . -B build
 cmake --build build
 ```
--- a/lib/binding.h
+++ b/lib/binding.h
@ -1,41 +0,0 @@
 #ifdef __cplusplus
 #include <vector>
 #include <string>
 extern "C" {
 #endif
 #include <stdbool.h>
 extern unsigned char tokenCallback(void *, char *);
 int load_state(void *ctx, char *statefile, char*modes);
 int eval(void* params_ptr, void *ctx, char*text);
 void save_state(void *ctx, char *dst, char*modes);
 void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, bool vocab_only, int n_gpu, int n_batch, const char *maingpu, const char *tensorsplit, bool numa);
 int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);
 int get_token_embeddings(void* params_ptr, void* state_pr,  int *tokens, int tokenSize, float * res_embeddings);
 void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
                            int top_k, float top_p, float temp, float repeat_penalty, 
                            int repeat_last_n, bool ignore_eos, bool memory_f16, 
                            int n_batch, int n_keep, const char** antiprompt, int antiprompt_count,
                            float tfs_z, float typical_p, float frequency_penalty, float presence_penalty, int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl, const char *logit_bias, const char *session_file, bool prompt_cache_all, bool mlock, bool mmap, const char *maingpu, const char *tensorsplit , bool prompt_cache_ro);
 void llama_free_params(void* params_ptr);
 void llama_binding_free_model(void* state);
 int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug);
 #ifdef __cplusplus
 }
 std::vector<std::string> create_vector(const char** strings, int count);
 void delete_vector(std::vector<std::string>* vec);
 #endif
--- a/llama/.gitignore
+++ b/llama/.gitignore
@ -0,0 +1 @@
 build
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@ -9,13 +9,19 @@ FetchContent_Declare(
 FetchContent_MakeAvailable(llama_cpp)
 if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
    set(LLAMA_METAL ON)
    add_compile_definitions(GGML_USE_METAL)
 endif()
 project(binding)
-set(LLAMA_METAL ON CACHE BOOL "Enable Llama Metal by default on macOS")
+add_library(binding ${CMAKE_CURRENT_SOURCE_DIR}/binding/binding.cpp ${llama_cpp_SOURCE_DIR}/examples/common.cpp)
 add_library(binding binding.cpp ${llama_cpp_SOURCE_DIR}/examples/common.cpp)
 target_compile_features(binding PRIVATE cxx_std_11)
 target_include_directories(binding PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
 target_include_directories(binding PRIVATE ${llama_cpp_SOURCE_DIR})
 target_include_directories(binding PRIVATE ${llama_cpp_SOURCE_DIR}/examples)
 target_link_libraries(binding llama ggml_static)
 configure_file(${llama_cpp_BINARY_DIR}/libllama.a ${CMAKE_CURRENT_BINARY_DIR}/libllama.a COPYONLY)
 configure_file(${llama_cpp_BINARY_DIR}/libggml_static.a ${CMAKE_CURRENT_BINARY_DIR}/libggml_static.a COPYONLY)
--- a/llama/binding/binding.cpp
+++ b/llama/binding/binding.cpp
@ -1,3 +1,25 @@
 // MIT License
 // Copyright (c) 2023 go-skynet authors
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 #include "common.h"
 #include "llama.h"
--- a/llama/binding/binding.h
+++ b/llama/binding/binding.h
@ -0,0 +1,71 @@
 // MIT License
 // Copyright (c) 2023 go-skynet authors
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 // C interface shared by the C++ binding and the Go package (included from
 // llama.go through cgo).
 #ifdef __cplusplus
 #include <string>
 #include <vector>
 extern "C" {
 #endif
 #include <stdbool.h>
 // tokenCallback is implemented on the Go side (exported from llama.go); it
 // receives the model state pointer and the token text.
 extern unsigned char tokenCallback(void *, char *);
 // The Go wrapper passes the model state as ctx, a file path, and the mode
 // string "rb"; it treats a non-zero return as failure.
 int load_state(void *ctx, char *statefile, char *modes);
 // params_ptr comes from llama_allocate_params; the Go wrapper treats a
 // non-zero return as failure.
 int eval(void *params_ptr, void *ctx, char *text);
 // The Go wrapper passes the model state as ctx, a destination path, and the
 // mode string "wb".
 void save_state(void *ctx, char *dst, char *modes);
 // Returns the opaque model state; the Go wrapper treats NULL as a load
 // failure and later hands the pointer to llama_binding_free_model.
 void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16,
                  bool mlock, bool embeddings, bool mmap, bool low_vram,
                  bool vocab_only, int n_gpu, int n_batch, const char *maingpu,
                  const char *tensorsplit, bool numa);
 // Writes into the caller-provided res_embeddings buffer; the Go wrapper
 // treats a non-zero return as failure.
 int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings);
 // Same as get_embeddings but takes tokenSize token ids instead of text.
 int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens,
                          int tokenSize, float *res_embeddings);
 // Builds the parameter object passed as params_ptr to the calls above;
 // release it with llama_free_params.
 // NOTE(review): whether the string arguments are copied or retained is not
 // visible from this header - confirm in binding.cpp.
 void *llama_allocate_params(
     const char *prompt, int seed, int threads, int tokens, int top_k,
     float top_p, float temp, float repeat_penalty, int repeat_last_n,
     bool ignore_eos, bool memory_f16, int n_batch, int n_keep,
     const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
     float frequency_penalty, float presence_penalty, int mirostat,
     float mirostat_eta, float mirostat_tau, bool penalize_nl,
     const char *logit_bias, const char *session_file, bool prompt_cache_all,
     bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
     bool prompt_cache_ro);
 void llama_free_params(void *params_ptr);
 // Releases the state returned by load_model.
 void llama_binding_free_model(void *state);
 // Writes the generated text into the caller-provided result buffer; the Go
 // wrapper treats a non-zero return as failure.
 int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug);
 #ifdef __cplusplus
 }
 // C++-only helpers (not visible to cgo).
 std::vector<std::string> create_vector(const char **strings, int count);
 void delete_vector(std::vector<std::string> *vec);
 #endif
--- a/llama/llama.go
+++ b/llama/llama.go
@ -0,0 +1,302 @@
 // MIT License
 // Copyright (c) 2023 go-skynet authors
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all
 // copies or substantial portions of the Software.
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 //go:generate cmake -S . -B build
 //go:generate cmake --build build
 package llama
 // #cgo LDFLAGS: -Lbuild -lbinding -lllama -lggml_static -lstdc++
 // #cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 // #cgo darwin CXXFLAGS: -std=c++11
 // #include <stdlib.h>
 // #include "binding/binding.h"
 import "C"
 import (
 	"fmt"
 	"os"
 	"strings"
 	"sync"
 	"unsafe"
 )
 // LLama is a handle to a model loaded through the C binding.
 type LLama struct {
 	// state is the opaque pointer returned by C.load_model; it is passed
 	// back to every binding call and also keys the token-callback registry.
 	state       unsafe.Pointer
 	// embeddings records ModelOptions.Embeddings; Embeddings and
 	// TokenEmbeddings refuse to run when it is false.
 	embeddings  bool
 	// contextSize records ModelOptions.ContextSize as given at load time.
 	contextSize int
 }
 // New loads the model file at path model, configured by opts, and returns a
 // handle to it. It returns an error when the binding reports a nil state.
 //
 // The caller should release the model with Free when done.
 func New(model string, opts ...ModelOption) (*LLama, error) {
 	mo := NewModelOptions(opts...)
 	// C.CString allocates with malloc and the Go GC never reclaims it, so
 	// every string handed to C is freed once load_model has returned.
 	// NOTE(review): assumes load_model copies/parses its string arguments
 	// rather than retaining the pointers - confirm against binding.cpp.
 	modelPath := C.CString(model)
 	defer C.free(unsafe.Pointer(modelPath))
 	mainGPU := C.CString(mo.MainGPU)
 	defer C.free(unsafe.Pointer(mainGPU))
 	tensorSplit := C.CString(mo.TensorSplit)
 	defer C.free(unsafe.Pointer(tensorSplit))
 	result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), C.bool(mo.VocabOnly), C.int(mo.NGPULayers), C.int(mo.NBatch), mainGPU, tensorSplit, C.bool(mo.NUMA))
 	if result == nil {
 		return nil, fmt.Errorf("failed loading model")
 	}
 	return &LLama{state: result, contextSize: mo.ContextSize, embeddings: mo.Embeddings}, nil
 }
 // Free releases the model held by l. It is safe to call more than once;
 // calls after the first are no-ops.
 func (l *LLama) Free() {
 	if l.state == nil {
 		return
 	}
 	// Drop any registered token callback first: the registry is keyed by the
 	// state address, which may be reused by a later allocation once freed.
 	setCallback(l.state, nil)
 	C.llama_binding_free_model(l.state)
 	// Clear the pointer so a second Free (or later use) cannot touch freed memory.
 	l.state = nil
 }
 // LoadState restores model state from the file at path state (opened by the
 // binding with mode "rb"). It returns an error when the binding reports a
 // non-zero status.
 func (l *LLama) LoadState(state string) error {
 	// C strings are malloc'd by cgo and must be freed explicitly.
 	path := C.CString(state)
 	defer C.free(unsafe.Pointer(path))
 	mode := C.CString("rb")
 	defer C.free(unsafe.Pointer(mode))
 	if result := C.load_state(l.state, path, mode); result != 0 {
 		return fmt.Errorf("error while loading state")
 	}
 	return nil
 }
 // SaveState asks the binding to write the model state to the file at dst
 // (mode "wb"). The binding call returns no status, so success is inferred by
 // checking that dst exists afterwards; the os.Stat error is returned as-is.
 func (l *LLama) SaveState(dst string) error {
 	// C strings are malloc'd by cgo and must be freed explicitly.
 	path := C.CString(dst)
 	defer C.free(unsafe.Pointer(path))
 	mode := C.CString("wb")
 	defer C.free(unsafe.Pointer(mode))
 	C.save_state(l.state, path, mode)
 	_, err := os.Stat(dst)
 	return err
 }
 // TokenEmbeddings computes embeddings for an already-tokenized input.
 //
 // It fails immediately when the model was loaded without embeddings. The
 // result slice has po.Tokens elements, or 9999999 when po.Tokens is zero.
 // When the binding reports a non-zero status the slice is returned together
 // with an error.
 func (l *LLama) TokenEmbeddings(tokens []int, opts ...PredictOption) ([]float32, error) {
 	if !l.embeddings {
 		return []float32{}, fmt.Errorf("model loaded without embeddings")
 	}
 	po := NewPredictOptions(opts...)
 	outSize := po.Tokens
 	if po.Tokens == 0 {
 		outSize = 9999999
 	}
 	floats := make([]float32, outSize)
 	// owned collects every malloc'd C allocation made below so that all of
 	// them are released on every return path. This defer is registered
 	// first and therefore runs last, after the params object is freed.
 	var owned []unsafe.Pointer
 	defer func() {
 		for _, p := range owned {
 			C.free(p)
 		}
 	}()
 	cstr := func(s string) *C.char {
 		p := C.CString(s)
 		owned = append(owned, unsafe.Pointer(p))
 		return p
 	}
 	// Copy the token ids into a C int array (values are narrowed to C.int).
 	cTokens := (*C.int)(C.malloc(C.size_t(len(tokens)) * C.sizeof_int))
 	owned = append(owned, unsafe.Pointer(cTokens))
 	dst := unsafe.Slice(cTokens, len(tokens))
 	for i, v := range tokens {
 		dst[i] = C.int(v)
 	}
 	params := C.llama_allocate_params(cstr(""), C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), nil, C.int(0),
 		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
 		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
 		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
 		cstr(po.MainGPU), cstr(po.TensorSplit),
 		C.bool(po.PromptCacheRO),
 	)
 	// Release the params object the same way Eval and Predict do.
 	defer C.llama_free_params(params)
 	ret := C.get_token_embeddings(params, l.state, cTokens, C.int(len(tokens)), (*C.float)(&floats[0]))
 	if ret != 0 {
 		return floats, fmt.Errorf("embedding inference failed")
 	}
 	return floats, nil
 }
 // Embeddings computes embeddings for text.
 //
 // It fails immediately when the model was loaded without embeddings. When
 // po.Tokens is zero it defaults to 99999999, which is also the length of the
 // returned slice. When the binding reports a non-zero status the slice is
 // returned together with an error.
 func (l *LLama) Embeddings(text string, opts ...PredictOption) ([]float32, error) {
 	if !l.embeddings {
 		return []float32{}, fmt.Errorf("model loaded without embeddings")
 	}
 	po := NewPredictOptions(opts...)
 	if po.Tokens == 0 {
 		po.Tokens = 99999999
 	}
 	floats := make([]float32, po.Tokens)
 	// owned collects every C string allocated below so that all of them are
 	// released on every return path. This defer is registered first and
 	// therefore runs last, after the params object is freed.
 	var owned []unsafe.Pointer
 	defer func() {
 		for _, p := range owned {
 			C.free(p)
 		}
 	}()
 	cstr := func(s string) *C.char {
 		p := C.CString(s)
 		owned = append(owned, unsafe.Pointer(p))
 		return p
 	}
 	input := cstr(text)
 	reverseCount := len(po.StopPrompts)
 	reversePrompt := make([]*C.char, reverseCount)
 	for i, s := range po.StopPrompts {
 		reversePrompt[i] = cstr(s)
 	}
 	// pass stays nil when there are no stop prompts.
 	var pass **C.char
 	if reverseCount > 0 {
 		pass = &reversePrompt[0]
 	}
 	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
 		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
 		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
 		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
 		cstr(po.MainGPU), cstr(po.TensorSplit),
 		C.bool(po.PromptCacheRO),
 	)
 	// Release the params object the same way Eval and Predict do.
 	defer C.llama_free_params(params)
 	ret := C.get_embeddings(params, l.state, (*C.float)(&floats[0]))
 	if ret != 0 {
 		return floats, fmt.Errorf("embedding inference failed")
 	}
 	return floats, nil
 }
 // Eval runs the binding's eval call on text with the given prediction
 // options and reports only success or failure; no generated text is
 // returned. When po.Tokens is zero it defaults to 99999999.
 func (l *LLama) Eval(text string, opts ...PredictOption) error {
 	po := NewPredictOptions(opts...)
 	if po.Tokens == 0 {
 		po.Tokens = 99999999
 	}
 	// owned collects every C string allocated below so that all of them are
 	// released on every return path. This defer is registered first and
 	// therefore runs last, after the params object is freed.
 	var owned []unsafe.Pointer
 	defer func() {
 		for _, p := range owned {
 			C.free(p)
 		}
 	}()
 	cstr := func(s string) *C.char {
 		p := C.CString(s)
 		owned = append(owned, unsafe.Pointer(p))
 		return p
 	}
 	input := cstr(text)
 	reverseCount := len(po.StopPrompts)
 	reversePrompt := make([]*C.char, reverseCount)
 	for i, s := range po.StopPrompts {
 		reversePrompt[i] = cstr(s)
 	}
 	// pass stays nil when there are no stop prompts.
 	var pass **C.char
 	if reverseCount > 0 {
 		pass = &reversePrompt[0]
 	}
 	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
 		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
 		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
 		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
 		cstr(po.MainGPU), cstr(po.TensorSplit),
 		C.bool(po.PromptCacheRO),
 	)
 	// Deferred so the params object is also released when eval fails.
 	defer C.llama_free_params(params)
 	if ret := C.eval(params, l.state, input); ret != 0 {
 		return fmt.Errorf("inference failed")
 	}
 	return nil
 }
 // Predict generates text continuing the prompt text and returns it.
 //
 // When the options carry a TokenCallback it is registered for the duration
 // of the call and removed afterwards (this also removes a callback installed
 // earlier through SetTokenCallback). When po.Tokens is zero it defaults to
 // 99999999, which is also the size in bytes of the output buffer handed to
 // the binding.
 //
 // The raw output is post-processed: one leading space, the echoed prompt,
 // one leading newline, and a trailing occurrence of each stop prompt are
 // removed. An error is returned when the binding reports a non-zero status.
 func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 	po := NewPredictOptions(opts...)
 	if po.TokenCallback != nil {
 		setCallback(l.state, po.TokenCallback)
 		// Deferred so the callback is also unregistered when inference fails.
 		defer setCallback(l.state, nil)
 	}
 	if po.Tokens == 0 {
 		po.Tokens = 99999999
 	}
 	out := make([]byte, po.Tokens)
 	// owned collects every C string allocated below so that all of them are
 	// released on every return path; it runs after the params object is freed.
 	var owned []unsafe.Pointer
 	defer func() {
 		for _, p := range owned {
 			C.free(p)
 		}
 	}()
 	cstr := func(s string) *C.char {
 		p := C.CString(s)
 		owned = append(owned, unsafe.Pointer(p))
 		return p
 	}
 	input := cstr(text)
 	reverseCount := len(po.StopPrompts)
 	reversePrompt := make([]*C.char, reverseCount)
 	for i, s := range po.StopPrompts {
 		reversePrompt[i] = cstr(s)
 	}
 	// pass stays nil when there are no stop prompts.
 	var pass **C.char
 	if reverseCount > 0 {
 		pass = &reversePrompt[0]
 	}
 	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
 		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
 		C.bool(po.IgnoreEOS), C.bool(po.F16KV),
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
 		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
 		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cstr(po.LogitBias),
 		cstr(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
 		cstr(po.MainGPU), cstr(po.TensorSplit),
 		C.bool(po.PromptCacheRO),
 	)
 	// Deferred so the params object is also released when inference fails.
 	defer C.llama_free_params(params)
 	ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode))
 	if ret != 0 {
 		return "", fmt.Errorf("inference failed")
 	}
 	res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
 	res = strings.TrimPrefix(res, " ")
 	res = strings.TrimPrefix(res, text)
 	res = strings.TrimPrefix(res, "\n")
 	for _, s := range po.StopPrompts {
 		// TrimSuffix removes the stop prompt as a whole string. TrimRight
 		// would treat it as a set of characters and could eat unrelated
 		// trailing letters of the answer.
 		res = strings.TrimSuffix(res, s)
 	}
 	return res, nil
 }
 // SetTokenCallback registers a callback for the individual tokens created when running Predict. It
 // will be called once for each token. The callback shall return true as long as the model should
 // continue predicting the next token. When the callback returns false the predictor will return.
 // The tokens are just converted into Go strings, they are not trimmed or otherwise changed. Also
 // the tokens may not be valid UTF-8.
 // Pass in nil to remove a callback.
 //
 // It is safe to call this method while a prediction is running; the callback registry is guarded
 // by a mutex.
 func (l *LLama) SetTokenCallback(callback func(token string) bool) {
 	// CGo only allows static calls from C to Go; funcs cannot be passed in dynamically. As the
 	// next best thing, callbacks are registered in a map keyed by the model state pointer and
 	// the C code calls the exported tokenCallback, which looks them up.
 	// NOTE(review): the original comment said a finalizer on LLama unregisters the callback
 	// when the garbage collector frees it; no finalizer is registered in the code visible
 	// here - confirm, or unregister explicitly.
 	setCallback(l.state, callback)
 }
 // Registry of per-model token callbacks used by tokenCallback and setCallback.
 var (
 	// m guards callbacks.
 	m         sync.Mutex
 	// callbacks maps a model state pointer (converted to uintptr) to the
 	// token callback registered for that model.
 	callbacks = map[uintptr]func(string) bool{}
 )
 // tokenCallback is the single static entry point the C code calls for each
 // token. It looks up the callback registered for statePtr and forwards the
 // token to it as a Go string. It returns the callback's result, or true
 // when no callback is registered for that state.
 //
 //export tokenCallback
 func tokenCallback(statePtr unsafe.Pointer, token *C.char) bool {
 	// Hold the lock only for the lookup. Running user code under the lock
 	// would deadlock a callback that calls SetTokenCallback (sync.Mutex is
 	// not reentrant) and would serialize callbacks across all models.
 	m.Lock()
 	callback, ok := callbacks[uintptr(statePtr)]
 	m.Unlock()
 	if !ok {
 		return true
 	}
 	return callback(C.GoString(token))
 }
 // setCallback installs callback as the token callback for the model
 // identified by statePtr. A nil callback removes the entry for that model.
 func setCallback(statePtr unsafe.Pointer, callback func(string) bool) {
 	key := uintptr(statePtr)
 	m.Lock()
 	defer m.Unlock()
 	if callback != nil {
 		callbacks[key] = callback
 		return
 	}
 	delete(callbacks, key)
 }
--- a/llama/llama_cublas.go
+++ b/llama/llama_cublas.go
@ -0,0 +1,9 @@
 //go:build cublas
 // +build cublas
 package llama
 /*
 #cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/
 */
 import "C"
--- a/llama/llama_openblas.go
+++ b/llama/llama_openblas.go
@ -0,0 +1,9 @@
 //go:build openblas
 // +build openblas
 package llama
 /*
 #cgo LDFLAGS: -lopenblas
 */
 import "C"
--- a/llama/options.go
+++ b/llama/options.go
@ -0,0 +1,392 @@
 // MIT License
 // Copyright (c) 2023 go-skynet authors
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 // The above copyright notice and this permission notice shall be included in all
 // copies or substantial portions of the Software.
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 package llama
 // ModelOptions collects the settings that are configured through ModelOption values; see
 // NewModelOptions and DefaultModelOptions.
 type ModelOptions struct {
 	// ContextSize is the context size (SetContext), Seed is set by SetModelSeed and NBatch is the
 	// n_batch value (SetNBatch).
 	ContextSize, Seed, NBatch int
 	// Boolean switches. MMap is set by SetMMap; the others are turned on by the EnableF16Memory,
 	// EnableMLock, VocabOnly, EnabelLowVRAM, EnableEmbeddings and EnableNUMA options.
 	F16Memory, MLock, MMap, VocabOnly, LowVRAM, Embeddings, NUMA bool
 	// NGPULayers is the number of GPU layers to use to offload computation (SetGPULayers).
 	NGPULayers int
 	// MainGPU is the main_gpu value (SetMainGPU) and TensorSplit the tensor split for the GPU
 	// (SetTensorSplit); both are kept as strings.
 	MainGPU, TensorSplit string
 }
 // PredictOptions collects the per-prediction settings that are configured through PredictOption
 // values; see NewPredictOptions and DefaultOptions.
 type PredictOptions struct {
 	// Seed is the random seed for sampling text generation (SetSeed).
 	Seed int
 	// Threads is the number of threads to use for text generation (SetThreads).
 	Threads int
 	// Tokens is the number of tokens to generate (SetTokens).
 	Tokens int
 	// TopK is the value for top-K sampling (SetTopK).
 	TopK int
 	// Repeat is set by SetRepeat.
 	Repeat int
 	// Batch is the batch size (SetBatch).
 	Batch int
 	// NKeep is the number of tokens from the initial prompt to keep (SetNKeep).
 	NKeep int
 	// TopP is the value for nucleus sampling (SetTopP).
 	TopP float64
 	// Temperature is the temperature value for text generation (SetTemperature).
 	Temperature float64
 	// Penalty is the repetition penalty (SetPenalty).
 	Penalty float64
 	// F16KV is turned on by the EnableF16KV option.
 	F16KV bool
 	// DebugMode is turned on by the Debug option.
 	DebugMode bool
 	// StopPrompts are the prompts that will stop predictions (SetStopWords).
 	StopPrompts []string
 	// IgnoreEOS is turned on by the IgnoreEOS option.
 	IgnoreEOS bool
 	// TailFreeSamplingZ is the tail free sampling parameter z (SetTailFreeSamplingZ).
 	TailFreeSamplingZ float64
 	// TypicalP is the typicality parameter p_typical (SetTypicalP).
 	TypicalP float64
 	// FrequencyPenalty is the frequency penalty parameter (SetFrequencyPenalty).
 	FrequencyPenalty float64
 	// PresencePenalty is the presence penalty parameter (SetPresencePenalty).
 	PresencePenalty float64
 	// Mirostat is the mirostat parameter (SetMirostat).
 	Mirostat int
 	// MirostatETA is the mirostat ETA parameter (SetMirostatETA).
 	MirostatETA float64
 	// MirostatTAU is the mirostat TAU parameter (SetMirostatTAU).
 	MirostatTAU float64
 	// PenalizeNL controls whether newlines are penalized (SetPenalizeNL).
 	PenalizeNL bool
 	// LogitBias is the logit bias parameter, kept as a string (SetLogitBias).
 	LogitBias string
 	// TokenCallback is the per-prediction token callback (SetTokenCallback).
 	TokenCallback func(string) bool
 	// PathPromptCache is the session file to store the prompt cache (SetPathPromptCache).
 	PathPromptCache string
 	// MLock is the memory lock flag (SetMlock).
 	MLock bool
 	// MMap is the memory mapping flag (SetMemoryMap).
 	MMap bool
 	// PromptCacheAll is turned on by the EnablePromptCacheAll option.
 	PromptCacheAll bool
 	// PromptCacheRO is turned on by the EnablePromptCacheRO option.
 	PromptCacheRO bool
 	// MainGPU is the main_gpu value (SetPredictionMainGPU).
 	MainGPU string
 	// TensorSplit is the tensor split for the GPU (SetPredictionTensorSplit).
 	TensorSplit string
 }
 type (
 	// PredictOption modifies a PredictOptions value in place; NewPredictOptions applies a list
 	// of them to a copy of DefaultOptions.
 	PredictOption func(*PredictOptions)
 	// ModelOption modifies a ModelOptions value in place; NewModelOptions applies a list of
 	// them to a copy of DefaultModelOptions.
 	ModelOption func(*ModelOptions)
 )
 // DefaultModelOptions is the starting point that NewModelOptions copies before applying any
 // ModelOption: a context size of 512 with memory mapping enabled. Every other field keeps its
 // zero value (Seed 0; F16Memory, MLock, Embeddings and LowVRAM false).
 var DefaultModelOptions = ModelOptions{
 	ContextSize: 512,
 	MMap:        true,
 }
 // DefaultOptions is the starting point that NewPredictOptions copies before applying any
 // PredictOption. Fields not listed here keep their zero value (among them FrequencyPenalty,
 // PresencePenalty and Mirostat).
 var DefaultOptions = PredictOptions{
 	Seed:              -1,
 	Threads:           4,
 	Tokens:            128,
 	TopK:              40,
 	Repeat:            64,
 	Batch:             512,
 	NKeep:             64,
 	TopP:              0.95,
 	Temperature:       0.8,
 	Penalty:           1.1,
 	TailFreeSamplingZ: 1.0,
 	TypicalP:          1.0,
 	MirostatETA:       0.1,
 	MirostatTAU:       5.0,
 	MMap:              true,
 }
 // SetContext returns a ModelOption that sets the context size (ModelOptions.ContextSize) to c.
 func SetContext(c int) ModelOption {
 	return func(mo *ModelOptions) { mo.ContextSize = c }
 }
 // SetModelSeed returns a ModelOption that sets ModelOptions.Seed to c.
 func SetModelSeed(c int) ModelOption {
 	return func(mo *ModelOptions) { mo.Seed = c }
 }
 // SetMMap returns a ModelOption that sets the memory mapping flag (ModelOptions.MMap) to b.
 func SetMMap(b bool) ModelOption {
 	return func(mo *ModelOptions) { mo.MMap = b }
 }
 // SetNBatch returns a ModelOption that sets the n_batch value (ModelOptions.NBatch) to n_batch.
 func SetNBatch(n_batch int) ModelOption {
 	return func(mo *ModelOptions) { mo.NBatch = n_batch }
 }
 // SetTensorSplit returns a ModelOption that sets the tensor split for the GPU
 // (ModelOptions.TensorSplit). Despite its name, the maingpu argument is the tensor split value.
 func SetTensorSplit(maingpu string) ModelOption {
 	return func(mo *ModelOptions) { mo.TensorSplit = maingpu }
 }
 // SetMainGPU returns a ModelOption that sets the main_gpu value (ModelOptions.MainGPU) to maingpu.
 func SetMainGPU(maingpu string) ModelOption {
 	return func(mo *ModelOptions) { mo.MainGPU = maingpu }
 }
 // SetPredictionTensorSplit returns a PredictOption that sets the tensor split for the GPU
 // (PredictOptions.TensorSplit). Despite its name, the maingpu argument is the tensor split value.
 func SetPredictionTensorSplit(maingpu string) PredictOption {
 	return func(po *PredictOptions) { po.TensorSplit = maingpu }
 }
 // SetPredictionMainGPU returns a PredictOption that sets the main_gpu value
 // (PredictOptions.MainGPU) to maingpu.
 func SetPredictionMainGPU(maingpu string) PredictOption {
 	return func(po *PredictOptions) { po.MainGPU = maingpu }
 }
 // VocabOnly is a ModelOption that sets ModelOptions.VocabOnly to true.
 var VocabOnly ModelOption = func(mo *ModelOptions) { mo.VocabOnly = true }
 // EnableLowVRAM is a ModelOption that sets ModelOptions.LowVRAM to true.
 var EnableLowVRAM ModelOption = func(p *ModelOptions) {
 	p.LowVRAM = true
 }
 // EnabelLowVRAM is the original, misspelled name of EnableLowVRAM. It is kept so that existing
 // callers keep compiling.
 //
 // Deprecated: use EnableLowVRAM instead.
 var EnabelLowVRAM = EnableLowVRAM
 // EnableNUMA is a ModelOption that sets ModelOptions.NUMA to true.
 var EnableNUMA ModelOption = func(mo *ModelOptions) { mo.NUMA = true }
 // EnableEmbeddings is a ModelOption that sets ModelOptions.Embeddings to true.
 var EnableEmbeddings ModelOption = func(mo *ModelOptions) { mo.Embeddings = true }
 // EnableF16Memory is a ModelOption that sets ModelOptions.F16Memory to true.
 var EnableF16Memory ModelOption = func(mo *ModelOptions) { mo.F16Memory = true }
 // EnableF16KV is a PredictOption that sets PredictOptions.F16KV to true.
 var EnableF16KV PredictOption = func(po *PredictOptions) { po.F16KV = true }
 // Debug is a PredictOption that sets PredictOptions.DebugMode to true.
 var Debug PredictOption = func(po *PredictOptions) { po.DebugMode = true }
 // EnablePromptCacheAll is a PredictOption that sets PredictOptions.PromptCacheAll to true.
 var EnablePromptCacheAll PredictOption = func(po *PredictOptions) { po.PromptCacheAll = true }
 // EnablePromptCacheRO is a PredictOption that sets PredictOptions.PromptCacheRO to true.
 var EnablePromptCacheRO PredictOption = func(po *PredictOptions) { po.PromptCacheRO = true }
 // EnableMLock is a ModelOption that sets ModelOptions.MLock to true.
 var EnableMLock ModelOption = func(mo *ModelOptions) { mo.MLock = true }
 // NewModelOptions returns a copy of DefaultModelOptions with each of opts applied to it, in the
 // order given. DefaultModelOptions itself is not modified.
 func NewModelOptions(opts ...ModelOption) ModelOptions {
 	result := DefaultModelOptions
 	for i := range opts {
 		opts[i](&result)
 	}
 	return result
 }
 // IgnoreEOS is a PredictOption that sets PredictOptions.IgnoreEOS to true.
 var IgnoreEOS PredictOption = func(po *PredictOptions) { po.IgnoreEOS = true }
 // SetMlock returns a PredictOption that sets the memory lock flag (PredictOptions.MLock) to b.
 func SetMlock(b bool) PredictOption {
 	return func(po *PredictOptions) { po.MLock = b }
 }
 // SetMemoryMap returns a PredictOption that sets the memory mapping flag (PredictOptions.MMap)
 // to b.
 func SetMemoryMap(b bool) PredictOption {
 	return func(po *PredictOptions) { po.MMap = b }
 }
 // SetGPULayers returns a ModelOption that sets the number of GPU layers to use to offload
 // computation (ModelOptions.NGPULayers) to n.
 func SetGPULayers(n int) ModelOption {
 	return func(mo *ModelOptions) { mo.NGPULayers = n }
 }
 // SetTokenCallback returns a PredictOption that stores fn in PredictOptions.TokenCallback.
 // NOTE(review): presumably fn is invoked for each generated token and a false return stops the
 // prediction, as with LLama.SetTokenCallback; confirm against Predict.
 func SetTokenCallback(fn func(string) bool) PredictOption {
 	return func(po *PredictOptions) { po.TokenCallback = fn }
 }
 // SetStopWords returns a PredictOption that sets the prompts that will stop predictions
 // (PredictOptions.StopPrompts). The stop slice is stored as is, not copied.
 func SetStopWords(stop ...string) PredictOption {
 	return func(po *PredictOptions) { po.StopPrompts = stop }
 }
 // SetSeed returns a PredictOption that sets the random seed for sampling text generation
 // (PredictOptions.Seed) to seed.
 func SetSeed(seed int) PredictOption {
 	return func(po *PredictOptions) { po.Seed = seed }
 }
 // SetThreads returns a PredictOption that sets the number of threads to use for text generation
 // (PredictOptions.Threads) to threads.
 func SetThreads(threads int) PredictOption {
 	return func(po *PredictOptions) { po.Threads = threads }
 }
 // SetTokens returns a PredictOption that sets the number of tokens to generate
 // (PredictOptions.Tokens) to tokens.
 func SetTokens(tokens int) PredictOption {
 	return func(po *PredictOptions) { po.Tokens = tokens }
 }
 // SetTopK returns a PredictOption that sets the value for top-K sampling (PredictOptions.TopK)
 // to topk.
 func SetTopK(topk int) PredictOption {
 	return func(po *PredictOptions) { po.TopK = topk }
 }
 // SetTopP returns a PredictOption that sets the value for nucleus sampling (PredictOptions.TopP)
 // to topp.
 func SetTopP(topp float64) PredictOption {
 	return func(po *PredictOptions) { po.TopP = topp }
 }
 // SetTemperature returns a PredictOption that sets the temperature value for text generation
 // (PredictOptions.Temperature) to temp.
 func SetTemperature(temp float64) PredictOption {
 	return func(po *PredictOptions) { po.Temperature = temp }
 }
 // SetPathPromptCache returns a PredictOption that sets the session file used to store the prompt
 // cache (PredictOptions.PathPromptCache) to f.
 func SetPathPromptCache(f string) PredictOption {
 	return func(po *PredictOptions) { po.PathPromptCache = f }
 }
 // SetPenalty returns a PredictOption that sets the repetition penalty for text generation
 // (PredictOptions.Penalty) to penalty.
 func SetPenalty(penalty float64) PredictOption {
 	return func(po *PredictOptions) { po.Penalty = penalty }
 }
 // SetRepeat returns a PredictOption that sets PredictOptions.Repeat to repeat.
 // NOTE(review): presumably this is the number of most recent tokens considered by the repetition
 // penalty (llama.cpp's repeat_last_n) rather than a repeat count; confirm against Predict.
 func SetRepeat(repeat int) PredictOption {
 	return func(po *PredictOptions) { po.Repeat = repeat }
 }
 // SetBatch returns a PredictOption that sets the batch size (PredictOptions.Batch) to size.
 func SetBatch(size int) PredictOption {
 	return func(po *PredictOptions) { po.Batch = size }
 }
 // SetNKeep returns a PredictOption that sets the number of tokens from the initial prompt to keep
 // (PredictOptions.NKeep) to n.
 func SetNKeep(n int) PredictOption {
 	return func(po *PredictOptions) { po.NKeep = n }
 }
 // NewPredictOptions returns a copy of DefaultOptions with each of opts applied to it, in the
 // order given. DefaultOptions itself is not modified.
 func NewPredictOptions(opts ...PredictOption) PredictOptions {
 	result := DefaultOptions
 	for i := range opts {
 		opts[i](&result)
 	}
 	return result
 }
 // SetTailFreeSamplingZ returns a PredictOption that sets the tail free sampling parameter z
 // (PredictOptions.TailFreeSamplingZ) to tfz.
 func SetTailFreeSamplingZ(tfz float64) PredictOption {
 	return func(po *PredictOptions) { po.TailFreeSamplingZ = tfz }
 }
 // SetTypicalP returns a PredictOption that sets the typicality parameter p_typical
 // (PredictOptions.TypicalP) to tp.
 func SetTypicalP(tp float64) PredictOption {
 	return func(po *PredictOptions) { po.TypicalP = tp }
 }
 // SetFrequencyPenalty returns a PredictOption that sets the frequency penalty parameter
 // freq_penalty (PredictOptions.FrequencyPenalty) to fp.
 func SetFrequencyPenalty(fp float64) PredictOption {
 	return func(po *PredictOptions) { po.FrequencyPenalty = fp }
 }
 // SetPresencePenalty returns a PredictOption that sets the presence penalty parameter
 // presence_penalty (PredictOptions.PresencePenalty) to pp.
 func SetPresencePenalty(pp float64) PredictOption {
 	return func(po *PredictOptions) { po.PresencePenalty = pp }
 }
 // SetMirostat returns a PredictOption that sets the mirostat parameter (PredictOptions.Mirostat)
 // to m.
 func SetMirostat(m int) PredictOption {
 	return func(po *PredictOptions) { po.Mirostat = m }
 }
 // SetMirostatETA returns a PredictOption that sets the mirostat ETA parameter
 // (PredictOptions.MirostatETA) to me.
 func SetMirostatETA(me float64) PredictOption {
 	return func(po *PredictOptions) { po.MirostatETA = me }
 }
 // SetMirostatTAU returns a PredictOption that sets the mirostat TAU parameter
 // (PredictOptions.MirostatTAU) to mt.
 func SetMirostatTAU(mt float64) PredictOption {
 	return func(po *PredictOptions) { po.MirostatTAU = mt }
 }
 // SetPenalizeNL returns a PredictOption that sets whether newlines are penalized
 // (PredictOptions.PenalizeNL) to pnl.
 func SetPenalizeNL(pnl bool) PredictOption {
 	return func(po *PredictOptions) { po.PenalizeNL = pnl }
 }
 // SetLogitBias returns a PredictOption that sets the logit bias parameter
 // (PredictOptions.LogitBias) to lb.
 func SetLogitBias(lb string) PredictOption {
 	return func(po *PredictOptions) { po.LogitBias = lb }
 }
--- a/main.go
+++ b/main.go
@ -1,7 +1,7 @@
 package main
 import (
-	"github.com/ollama/ollama/cmd"
+	"github.com/jmorganca/ollama/cmd"
 )
 func main() {
--- a/server/routes.go
+++ b/server/routes.go
@ -9,9 +9,9 @@ import (
 	"runtime"
 	"github.com/gin-gonic/gin"
-	llama "github.com/go-skynet/go-llama.cpp"
+	llama "github.com/jmorganca/ollama/llama"
-	"github.com/ollama/ollama/api"
+	"github.com/jmorganca/ollama/api"
 )
 func Serve(ln net.Listener) error {