package llama /* #cgo CPPFLAGS: -O3 -DNDEBUG=1 -DGGML_USE_K_QUANTS #cgo CXXFLAGS: -std=c++11 #cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG #cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders #include #include "llama.h" struct llama_sample_options { float repeat_penalty; float frequency_penalty; float presence_penalty; float temperature; int32_t top_k; float top_p; float tfs_z; float typical_p; int mirostat; float mirostat_tau; float mirostat_eta; }; llama_token llama_sample( struct llama_context *ctx, struct llama_token_data *candidates, size_t n_candidates, const llama_token *last_tokens, size_t n_last_tokens, struct llama_sample_options *opts) { llama_token_data_array candidates_p = { candidates, n_candidates, false, }; llama_sample_repetition_penalty( ctx, &candidates_p, last_tokens, n_last_tokens, opts->repeat_penalty); llama_sample_frequency_and_presence_penalties( ctx, &candidates_p, last_tokens, n_last_tokens, opts->frequency_penalty, opts->presence_penalty); if (opts->temperature <= 0) { return llama_sample_token_greedy(ctx, &candidates_p); } if (opts->mirostat == 1) { int mirostat_m = 100; float mirostat_mu = 2.0f * opts->mirostat_tau; llama_sample_temperature(ctx, &candidates_p, opts->temperature); return llama_sample_token_mirostat( ctx, &candidates_p, opts->mirostat_tau, opts->mirostat_eta, mirostat_m, &mirostat_mu); } else if (opts->mirostat == 2) { float mirostat_mu = 2.0f * opts->mirostat_tau; llama_sample_temperature(ctx, &candidates_p, opts->temperature); return llama_sample_token_mirostat_v2( ctx, &candidates_p, opts->mirostat_tau, opts->mirostat_eta, &mirostat_mu); } else { llama_sample_top_k(ctx, &candidates_p, opts->top_k, 1); llama_sample_tail_free(ctx, &candidates_p, opts->tfs_z, 1); llama_sample_typical(ctx, &candidates_p, opts->typical_p, 1); llama_sample_top_p(ctx, &candidates_p, opts->top_p, 1); llama_sample_temperature(ctx, &candidates_p, opts->temperature); return llama_sample_token(ctx, &candidates_p); } } */ import "C" import ( "bytes" "errors" "fmt" "io" "os" "strings" "time" "unicode/utf8" "unsafe" "github.com/jmorganca/ollama/api" ) type LLM struct { params *C.struct_llama_context_params model *C.struct_llama_model ctx *C.struct_llama_context api.Options } func New(model string, opts api.Options) (*LLM, error) { if _, err := os.Stat(model); err != nil { return nil, err } llm := LLM{Options: opts} C.llama_backend_init(C.bool(llm.UseNUMA)) params := C.llama_context_default_params() params.seed = C.uint(llm.Seed) params.n_ctx = C.int(llm.NumCtx) params.n_batch = C.int(llm.NumBatch) params.n_gpu_layers = C.int(llm.NumGPU) params.main_gpu = C.int(llm.MainGPU) params.low_vram = C.bool(llm.LowVRAM) params.f16_kv = C.bool(llm.F16KV) params.logits_all = C.bool(llm.LogitsAll) params.vocab_only = C.bool(llm.VocabOnly) params.use_mmap = C.bool(llm.UseMMap) params.use_mlock = C.bool(llm.UseMLock) params.embedding = C.bool(llm.EmbeddingOnly) llm.params = ¶ms cModel := C.CString(model) defer C.free(unsafe.Pointer(cModel)) llm.model = C.llama_load_model_from_file(cModel, params) if llm.model == nil { return nil, errors.New("failed to load model") } llm.ctx = C.llama_new_context_with_model(llm.model, params) if llm.ctx == nil { return nil, errors.New("failed to create context") } // warm up the model bos := []C.llama_token{C.llama_token_bos()} C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread)) C.llama_reset_timings(llm.ctx) return &llm, nil } func (llm *LLM) Close() { defer C.llama_free_model(llm.model) defer C.llama_free(llm.ctx) C.llama_print_timings(llm.ctx) } func (llm *LLM) Predict(ctx []int, prompt string, fn func(api.GenerateResponse)) error { if input := llm.tokenize(prompt); input != nil { embd := make([]C.llama_token, len(ctx)) for i := range ctx { embd[i] = C.llama_token(ctx[i]) } return llm.generate(append(embd, input...), fn) } return errors.New("llama: tokenize") } func (llm *LLM) tokenize(prompt string) []C.llama_token { cPrompt := C.CString(prompt) defer C.free(unsafe.Pointer(cPrompt)) tokens := make([]C.llama_token, len(prompt)+1) if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 { return tokens[:n] } return nil } func (llm *LLM) detokenize(tokens ...C.llama_token) string { var sb strings.Builder for _, token := range tokens { sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token))) } return sb.String() } func (llm *LLM) generate(input []C.llama_token, fn func(api.GenerateResponse)) error { var opts C.struct_llama_sample_options opts.repeat_penalty = C.float(llm.RepeatPenalty) opts.frequency_penalty = C.float(llm.FrequencyPenalty) opts.presence_penalty = C.float(llm.PresencePenalty) opts.temperature = C.float(llm.Temperature) opts.top_k = C.int(llm.TopK) opts.top_p = C.float(llm.TopP) opts.tfs_z = C.float(llm.TFSZ) opts.typical_p = C.float(llm.TypicalP) opts.mirostat = C.int(llm.Mirostat) opts.mirostat_tau = C.float(llm.MirostatTau) opts.mirostat_eta = C.float(llm.MirostatEta) output := deque[C.llama_token]{capacity: llm.NumCtx} context := deque[int]{capacity: llm.NumCtx / 2} for _, in := range input { context.PushLeft(int(in)) } var b bytes.Buffer for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) { if retval := C.llama_eval(llm.ctx, unsafe.SliceData(input), C.int(len(input)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 { return errors.New("llama: eval") } token, err := llm.sample(output, &opts) if errors.Is(err, io.EOF) { break } else if err != nil { return err } b.WriteString(llm.detokenize(token)) if utf8.Valid(b.Bytes()) || b.Len() >= utf8.UTFMax { // call the callback fn(api.GenerateResponse{ Response: b.String(), }) output.PushLeft(token) context.PushLeft(int(token)) b.Reset() } input = []C.llama_token{token} } dur := func(ms float64) time.Duration { d, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) if err != nil { panic(err) } return d } timings := C.llama_get_timings(llm.ctx) fn(api.GenerateResponse{ Done: true, Context: context.Data(), PromptEvalCount: int(timings.n_p_eval), PromptEvalDuration: dur(float64(timings.t_p_eval_ms)), EvalCount: int(timings.n_eval), EvalDuration: dur(float64(timings.t_eval_ms)), }) return nil } func (llm *LLM) sample(output deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) { numVocab := int(C.llama_n_vocab(llm.ctx)) logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab) candidates := deque[C.struct_llama_token_data]{capacity: numVocab} for i := 0; i < candidates.Cap(); i++ { candidates.PushLeft(C.struct_llama_token_data{ id: C.int(i), logit: logits[i], p: 0, }) } token := C.llama_sample( llm.ctx, unsafe.SliceData(candidates.Data()), C.size_t(candidates.Len()), unsafe.SliceData(output.Data()), C.size_t(output.Len()), opts) if token != C.llama_token_eos() { return token, nil } return 0, io.EOF }