ollama/llama/llama.go

283 lines
7.2 KiB
Go
Raw Normal View History

2023-07-03 16:32:48 -04:00
package llama
2023-07-07 15:29:17 -07:00
/*
2023-07-11 09:50:02 -07:00
#cgo CPPFLAGS: -O3 -DNDEBUG=1
#cgo CXXFLAGS: -std=c++11
#cgo darwin CPPFLAGS: -DGGML_USE_METAL=1 -DGGML_METAL_NDEBUG=1
2023-07-07 15:29:17 -07:00
#cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#include <stdlib.h>
#include "llama.h"
// llama_sample_options bundles the sampling parameters that llama_sample
// reads when choosing the next token. Only line comments are used here
// because this C code lives inside a Go block comment.
struct llama_sample_options
{
// Forwarded to llama_sample_repetition_penalty.
float repeat_penalty;
// Forwarded to llama_sample_frequency_and_presence_penalties.
float frequency_penalty;
// Forwarded to llama_sample_frequency_and_presence_penalties.
float presence_penalty;
// A value <= 0 makes llama_sample pick greedily; otherwise it is
// forwarded to llama_sample_temperature.
float temperature;
// Forwarded to llama_sample_top_k (non-mirostat path only).
int32_t top_k;
// Forwarded to llama_sample_top_p (non-mirostat path only).
float top_p;
// Forwarded to llama_sample_tail_free (non-mirostat path only).
float tfs_z;
// Forwarded to llama_sample_typical (non-mirostat path only).
float typical_p;
// Sampler selection: 1 uses llama_sample_token_mirostat, 2 uses
// llama_sample_token_mirostat_v2, any other value uses the
// top-k / tail-free / typical / top-p pipeline.
int mirostat;
// Passed to the mirostat samplers; llama_sample also initialises mu to
// twice this value.
float mirostat_tau;
// Passed to the mirostat samplers.
float mirostat_eta;
};
// llama_sample chooses the next token from `candidates`.
//
// The repetition and frequency/presence penalties are always applied first,
// using the `n_last_tokens` entries of `last_tokens`. After that:
//   - temperature <= 0 selects the greedy sampler;
//   - mirostat == 1 or 2 applies temperature and then the matching mirostat
//     sampler, with mu initialised to 2 * mirostat_tau on every call;
//   - any other mirostat value runs top-k, tail-free, typical and top-p
//     filtering, applies temperature, and samples from what remains.
//
// Only line comments are used because this C code lives inside a Go block
// comment.
llama_token llama_sample(
    struct llama_context *ctx,
    struct llama_token_data *candidates,
    size_t n_candidates,
    const llama_token *last_tokens,
    size_t n_last_tokens,
    struct llama_sample_options *opts)
{
    llama_token_data_array pool = {candidates, n_candidates, false};

    llama_sample_repetition_penalty(
        ctx, &pool, last_tokens, n_last_tokens, opts->repeat_penalty);
    llama_sample_frequency_and_presence_penalties(
        ctx, &pool, last_tokens, n_last_tokens,
        opts->frequency_penalty, opts->presence_penalty);

    if (opts->temperature <= 0) {
        return llama_sample_token_greedy(ctx, &pool);
    }

    switch (opts->mirostat) {
    case 1: {
        int m = 100;
        float mu = 2.0f * opts->mirostat_tau;
        llama_sample_temperature(ctx, &pool, opts->temperature);
        return llama_sample_token_mirostat(
            ctx, &pool, opts->mirostat_tau, opts->mirostat_eta, m, &mu);
    }
    case 2: {
        float mu = 2.0f * opts->mirostat_tau;
        llama_sample_temperature(ctx, &pool, opts->temperature);
        return llama_sample_token_mirostat_v2(
            ctx, &pool, opts->mirostat_tau, opts->mirostat_eta, &mu);
    }
    default:
        break;
    }

    // Default pipeline: narrow the candidate set, then scale by temperature.
    llama_sample_top_k(ctx, &pool, opts->top_k, 1);
    llama_sample_tail_free(ctx, &pool, opts->tfs_z, 1);
    llama_sample_typical(ctx, &pool, opts->typical_p, 1);
    llama_sample_top_p(ctx, &pool, opts->top_p, 1);
    llama_sample_temperature(ctx, &pool, opts->temperature);
    return llama_sample_token(ctx, &pool);
}
*/
import "C"
2023-07-03 16:32:48 -04:00
import (
2023-07-14 18:30:32 -07:00
"bytes"
2023-07-07 15:29:17 -07:00
"errors"
2023-07-12 18:18:06 -07:00
"fmt"
2023-07-07 15:29:17 -07:00
"io"
"os"
2023-07-03 16:32:48 -04:00
"strings"
2023-07-12 18:18:06 -07:00
"time"
2023-07-14 18:30:32 -07:00
"unicode/utf8"
2023-07-03 16:32:48 -04:00
"unsafe"
2023-07-07 15:29:17 -07:00
"github.com/jmorganca/ollama/api"
2023-07-03 16:32:48 -04:00
)
2023-07-07 15:29:17 -07:00
type llama struct {
params *C.struct_llama_context_params
model *C.struct_llama_model
ctx *C.struct_llama_context
2023-07-03 16:32:48 -04:00
2023-07-07 15:29:17 -07:00
api.Options
}
2023-07-04 00:47:00 -04:00
2023-07-07 15:29:17 -07:00
func New(model string, opts api.Options) (*llama, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
2023-07-03 16:32:48 -04:00
}
2023-07-07 15:29:17 -07:00
llm := llama{Options: opts}
2023-07-11 09:50:02 -07:00
C.llama_backend_init(C.bool(llm.UseNUMA))
2023-07-07 15:29:17 -07:00
params := C.llama_context_default_params()
params.seed = C.uint(llm.Seed)
params.n_ctx = C.int(llm.NumCtx)
params.n_batch = C.int(llm.NumBatch)
params.n_gpu_layers = C.int(llm.NumGPU)
params.main_gpu = C.int(llm.MainGPU)
params.low_vram = C.bool(llm.LowVRAM)
params.f16_kv = C.bool(llm.F16KV)
params.logits_all = C.bool(llm.LogitsAll)
params.vocab_only = C.bool(llm.VocabOnly)
params.use_mmap = C.bool(llm.UseMMap)
params.use_mlock = C.bool(llm.UseMLock)
params.embedding = C.bool(llm.EmbeddingOnly)
llm.params = &params
cModel := C.CString(model)
defer C.free(unsafe.Pointer(cModel))
llm.model = C.llama_load_model_from_file(cModel, params)
2023-07-11 20:32:26 -07:00
if llm.model == nil {
return nil, errors.New("failed to load model")
}
2023-07-07 15:29:17 -07:00
llm.ctx = C.llama_new_context_with_model(llm.model, params)
2023-07-11 20:32:26 -07:00
if llm.ctx == nil {
return nil, errors.New("failed to create context")
}
2023-07-07 15:29:17 -07:00
// warm up the model
bos := []C.llama_token{C.llama_token_bos()}
C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread))
C.llama_reset_timings(llm.ctx)
return &llm, nil
2023-07-03 16:32:48 -04:00
}
2023-07-07 15:29:17 -07:00
func (llm *llama) Close() {
defer C.llama_free_model(llm.model)
defer C.llama_free(llm.ctx)
C.llama_print_timings(llm.ctx)
2023-07-03 16:32:48 -04:00
}
func (llm *llama) Predict(ctx []int, prompt string, fn func(api.GenerateResponse)) error {
if input := llm.tokenize(prompt); input != nil {
embd := make([]C.llama_token, len(ctx))
for i := range ctx {
embd[i] = C.llama_token(ctx[i])
}
return llm.generate(append(embd, input...), fn)
2023-07-03 16:32:48 -04:00
}
2023-07-07 15:29:17 -07:00
return errors.New("llama: tokenize")
}
2023-07-06 16:37:11 -07:00
2023-07-07 15:29:17 -07:00
func (llm *llama) tokenize(prompt string) []C.llama_token {
cPrompt := C.CString(prompt)
defer C.free(unsafe.Pointer(cPrompt))
2023-07-06 11:24:54 -07:00
2023-07-07 15:29:17 -07:00
tokens := make([]C.llama_token, llm.NumCtx)
if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 {
return tokens[:n]
2023-07-03 16:32:48 -04:00
}
return nil
}
2023-07-07 15:29:17 -07:00
func (llm *llama) detokenize(tokens ...C.llama_token) string {
var sb strings.Builder
for _, token := range tokens {
sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token)))
2023-07-03 16:32:48 -04:00
}
2023-07-07 15:29:17 -07:00
return sb.String()
2023-07-03 16:32:48 -04:00
}
2023-07-12 18:18:06 -07:00
func (llm *llama) generate(input []C.llama_token, fn func(api.GenerateResponse)) error {
2023-07-07 15:29:17 -07:00
var opts C.struct_llama_sample_options
opts.repeat_penalty = C.float(llm.RepeatPenalty)
opts.frequency_penalty = C.float(llm.FrequencyPenalty)
opts.presence_penalty = C.float(llm.PresencePenalty)
opts.temperature = C.float(llm.Temperature)
opts.top_k = C.int(llm.TopK)
opts.top_p = C.float(llm.TopP)
opts.tfs_z = C.float(llm.TFSZ)
opts.typical_p = C.float(llm.TypicalP)
opts.mirostat = C.int(llm.Mirostat)
opts.mirostat_tau = C.float(llm.MirostatTau)
opts.mirostat_eta = C.float(llm.MirostatEta)
2023-07-12 18:18:06 -07:00
output := deque[C.llama_token]{capacity: llm.NumCtx}
2023-07-07 15:29:17 -07:00
context := deque[int]{capacity: llm.NumCtx / 2}
for _, in := range input {
context.PushLeft(int(in))
}
2023-07-14 18:30:32 -07:00
var b bytes.Buffer
2023-07-07 15:29:17 -07:00
for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) {
2023-07-12 18:18:06 -07:00
if retval := C.llama_eval(llm.ctx, unsafe.SliceData(input), C.int(len(input)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 {
2023-07-07 15:29:17 -07:00
return errors.New("llama: eval")
}
2023-07-12 18:18:06 -07:00
token, err := llm.sample(output, &opts)
if errors.Is(err, io.EOF) {
break
} else if err != nil {
2023-07-12 09:35:56 -07:00
return err
2023-07-07 15:29:17 -07:00
}
2023-07-14 18:30:32 -07:00
b.WriteString(llm.detokenize(token))
if utf8.Valid(b.Bytes()) || b.Len() >= utf8.UTFMax {
// call the callback
fn(api.GenerateResponse{
Response: b.String(),
})
2023-07-12 18:18:06 -07:00
2023-07-14 18:30:32 -07:00
output.PushLeft(token)
context.PushLeft(int(token))
b.Reset()
}
2023-07-12 18:18:06 -07:00
input = []C.llama_token{token}
}
2023-07-07 15:29:17 -07:00
2023-07-12 18:18:06 -07:00
dur := func(ms float64) time.Duration {
d, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil {
panic(err)
}
2023-07-07 15:29:17 -07:00
2023-07-12 18:18:06 -07:00
return d
2023-07-03 16:32:48 -04:00
}
2023-07-12 18:18:06 -07:00
timings := C.llama_get_timings(llm.ctx)
fn(api.GenerateResponse{
Done: true,
Context: context.Data(),
2023-07-12 18:18:06 -07:00
PromptEvalCount: int(timings.n_p_eval),
PromptEvalDuration: dur(float64(timings.t_p_eval_ms)),
EvalCount: int(timings.n_eval),
EvalDuration: dur(float64(timings.t_eval_ms)),
})
2023-07-07 15:29:17 -07:00
return nil
2023-07-03 16:32:48 -04:00
}
2023-07-12 18:18:06 -07:00
func (llm *llama) sample(output deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) {
2023-07-07 15:29:17 -07:00
numVocab := int(C.llama_n_vocab(llm.ctx))
logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab)
2023-07-12 18:18:06 -07:00
candidates := deque[C.struct_llama_token_data]{capacity: numVocab}
for i := 0; i < candidates.Cap(); i++ {
candidates.PushLeft(C.struct_llama_token_data{
2023-07-07 15:29:17 -07:00
id: C.int(i),
logit: logits[i],
p: 0,
})
}
2023-07-03 16:32:48 -04:00
2023-07-07 15:29:17 -07:00
token := C.llama_sample(
llm.ctx,
2023-07-14 17:29:16 -07:00
unsafe.SliceData(candidates.Data()), C.size_t(candidates.Len()),
unsafe.SliceData(output.Data()), C.size_t(output.Len()),
2023-07-07 15:29:17 -07:00
opts)
if token != C.llama_token_eos() {
return token, nil
2023-07-03 16:32:48 -04:00
}
2023-07-07 15:29:17 -07:00
return 0, io.EOF
2023-07-03 16:32:48 -04:00
}