ollama/llm/llm.go

package llm

// #cgo CFLAGS: -Illama.cpp
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
// #include <stdlib.h>
// #include "llama.h"
import "C"
import (
	"fmt"
	"strings"
	"unsafe"
)

// SystemInfo returns the text reported by llama.cpp's llama_print_system_info,
// copied into a Go string. It is kept as an (unused) example of calling
// llama.cpp functions through cgo.
func SystemInfo() string {
	info := C.llama_print_system_info()
	return C.GoString(info)
}

// Quantize runs llama.cpp's llama_model_quantize on infile and writes the
// result to outfile, using the target file type given by ftype.Value().
//
// All other quantization settings are llama.cpp's defaults, except nthread,
// which is set to -1 (presumably "let llama.cpp pick the thread count" —
// confirm against llama.h).
//
// It returns nil when llama_model_quantize returns 0, and otherwise an error
// carrying the non-zero return code.
func Quantize(infile, outfile string, ftype fileType) error {
	// C.CString allocates with the C allocator, so each copy is freed explicitly.
	src := C.CString(infile)
	defer C.free(unsafe.Pointer(src))

	dst := C.CString(outfile)
	defer C.free(unsafe.Pointer(dst))

	opts := C.llama_model_quantize_default_params()
	opts.nthread = -1
	opts.ftype = ftype.Value()

	rc := C.llama_model_quantize(src, dst, &opts)
	if rc == 0 {
		return nil
	}

	return fmt.Errorf("llama_model_quantize: %d", rc)
}

// llamaModel wraps a llama.cpp model handle so it can be used from Go for
// tokenizing and detokenizing.
type llamaModel struct {
	// m is the C model pointer returned by llama_load_model_from_file; it is
	// released by Close.
	m *C.struct_llama_model
}

// newLlamaModel loads the model file at path p through llama.cpp, using
// llama_model_default_params, and wraps the resulting handle.
//
// NOTE(review): the pointer returned by llama_load_model_from_file is stored
// without being checked, so a failed load is not reported here — confirm how
// callers detect it.
func newLlamaModel(p string) *llamaModel {
	// The C copy of the path is only needed for the duration of the load call.
	path := C.CString(p)
	defer C.free(unsafe.Pointer(path))

	params := C.llama_model_default_params()
	model := C.llama_load_model_from_file(path, params)

	return &llamaModel{m: model}
}

// Close frees the underlying llama.cpp model.
//
// It is safe to call on a nil receiver, on a model that was never loaded, and
// more than once: the handle is cleared after it is freed so that a repeated
// Close is a no-op instead of a double free.
func (llm *llamaModel) Close() {
	if llm == nil || llm.m == nil {
		return
	}

	C.llama_free_model(llm.m)
	// Drop the dangling pointer so it cannot be freed or used again.
	llm.m = nil
}

// Tokenize converts s into token ids using the loaded model's vocabulary.
//
// It returns nil when llama_tokenize produces no tokens or reports an error
// (a non-positive return value).
func (llm *llamaModel) Tokenize(s string) []int {
	cs := C.CString(s)
	defer C.free(unsafe.Pointer(cs))

	// llama_tokenize writes llama_token values, which are narrower than Go's
	// int on 64-bit platforms. The scratch buffer must therefore be a slice of
	// C.llama_token; reinterpreting a []int as *C.llama_token would pack
	// several tokens into each Go int and corrupt the result.
	//
	// len(s)+2 matches the capacity the call is told about: at most one token
	// per input byte plus room for special tokens.
	buf := make([]C.llama_token, len(s)+2)
	n := C.llama_tokenize(llm.m, cs, C.int(len(s)), &buf[0], C.int(len(buf)), false, true)
	if n <= 0 {
		return nil
	}

	// Widen each token to int for the Go-facing API.
	tokens := make([]int, n)
	for i, tok := range buf[:n] {
		tokens[i] = int(tok)
	}

	return tokens
}

// Detokenize converts the token ids in i32s back into text by concatenating
// the piece llama.cpp reports for each token, in order.
//
// Tokens for which llama_token_to_piece yields no bytes contribute nothing to
// the result.
func (llm *llamaModel) Detokenize(i32s []int) string {
	var sb strings.Builder

	// One scratch buffer is reused for every token; its contents are copied
	// into sb before the next call overwrites them.
	buf := make([]byte, 512)
	for _, i32 := range i32s {
		n := C.llama_token_to_piece(llm.m, C.llama_token(i32), (*C.char)(unsafe.Pointer(&buf[0])), C.int(len(buf)), false)
		if n < 0 {
			// A negative result means the buffer was too small and -n bytes
			// are required: grow the buffer and retry instead of silently
			// dropping the piece.
			buf = make([]byte, -int(n))
			n = C.llama_token_to_piece(llm.m, C.llama_token(i32), (*C.char)(unsafe.Pointer(&buf[0])), C.int(len(buf)), false)
		}

		if n > 0 {
			// strings.Builder.Write always returns a nil error.
			sb.Write(buf[:n])
		}
	}

	return sb.String()
}
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`package llm`

Switch back to subprocessing for llama.cpp This should resolve a number of memory leak and stability defects by allowing us to isolate llama.cpp in a separate process and shutdown when idle, and gracefully restart if it has problems. This also serves as a first step to be able to run multiple copies to support multiple models concurrently. 2024-03-14 17:24:13 +00:00			`// #cgo CFLAGS: -Illama.cpp`
			`// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++`
			`// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++`
			`// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++`
Add import declaration for windows,arm64 to llm.go 2024-04-27 04:24:53 +00:00			`// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++`
Switch back to subprocessing for llama.cpp This should resolve a number of memory leak and stability defects by allowing us to isolate llama.cpp in a separate process and shutdown when idle, and gracefully restart if it has problems. This also serves as a first step to be able to run multiple copies to support multiple models concurrently. 2024-03-14 17:24:13 +00:00			`// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++`
			`// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++`
cgo quantize 2024-04-05 15:49:04 +00:00			`// #include <stdlib.h>`
Switch back to subprocessing for llama.cpp This should resolve a number of memory leak and stability defects by allowing us to isolate llama.cpp in a separate process and shutdown when idle, and gracefully restart if it has problems. This also serves as a first step to be able to run multiple copies to support multiple models concurrently. 2024-03-14 17:24:13 +00:00			`// #include "llama.h"`
			`import "C"`
cgo quantize 2024-04-05 15:49:04 +00:00			`import (`
			`"fmt"`
use ffi for tokenizing/detokenizing 2024-05-11 19:49:24 +00:00			`"strings"`
cgo quantize 2024-04-05 15:49:04 +00:00			`"unsafe"`
			`)`
Switch back to subprocessing for llama.cpp This should resolve a number of memory leak and stability defects by allowing us to isolate llama.cpp in a separate process and shutdown when idle, and gracefully restart if it has problems. This also serves as a first step to be able to run multiple copies to support multiple models concurrently. 2024-03-14 17:24:13 +00:00
			`// SystemInfo is an unused example of calling llama.cpp functions using CGo`
			`func SystemInfo() string {`
			`return C.GoString(C.llama_print_system_info())`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`}`
cgo quantize 2024-04-05 15:49:04 +00:00
comments 2024-04-23 22:18:45 +00:00			`func Quantize(infile, outfile string, ftype fileType) error {`
cgo quantize 2024-04-05 15:49:04 +00:00			`cinfile := C.CString(infile)`
			`defer C.free(unsafe.Pointer(cinfile))`

			`coutfile := C.CString(outfile)`
			`defer C.free(unsafe.Pointer(coutfile))`

			`params := C.llama_model_quantize_default_params()`
			`params.nthread = -1`
quantize any fp16/fp32 model - FROM /path/to/{safetensors,pytorch} - FROM /path/to/fp{16,32}.bin - FROM model:fp{16,32} 2024-04-12 20:55:12 +00:00			`params.ftype = ftype.Value()`
cgo quantize 2024-04-05 15:49:04 +00:00
quantize any fp16/fp32 model - FROM /path/to/{safetensors,pytorch} - FROM /path/to/fp{16,32}.bin - FROM model:fp{16,32} 2024-04-12 20:55:12 +00:00			`if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {`
			`return fmt.Errorf("llama_model_quantize: %d", rc)`
cgo quantize 2024-04-05 15:49:04 +00:00			`}`

			`return nil`
			`}`
use ffi for tokenizing/detokenizing 2024-05-11 19:49:24 +00:00
			`type llamaModel struct {`
			`m *C.struct_llama_model`
			`}`

			`func newLlamaModel(p string) *llamaModel {`
			`cs := C.CString(p)`
			`defer C.free(unsafe.Pointer(cs))`

			`return &llamaModel{`
			`C.llama_load_model_from_file(`
			`cs,`
			`C.llama_model_default_params(),`
			`),`
			`}`
			`}`

			`func (llm *llamaModel) Close() {`
			`C.llama_free_model(llm.m)`
			`}`

			`func (llm *llamaModel) Tokenize(s string) []int {`
			`cs := C.CString(s)`
			`defer C.free(unsafe.Pointer(cs))`

			`tokens := make([]int, len(s)+2)`
			`if n := C.llama_tokenize(llm.m, cs, C.int(len(s)), (*C.llama_token)(unsafe.Pointer(&tokens[0])), C.int(len(s)+2), false, true); n > 0 {`
			`return tokens[:n]`
			`}`

			`return nil`
			`}`

			`func (llm *llamaModel) Detokenize(i32s []int) string {`
			`var sb strings.Builder`
			`for _, i32 := range i32s {`
			`c := make([]byte, 512)`
			`if n := C.llama_token_to_piece(llm.m, C.llama_token(i32), (*C.char)(unsafe.Pointer(&c[0])), C.int(len(c)), false); n > 0 {`
			`sb.WriteString(unsafe.String(&c[0], n))`
			`}`
			`}`

			`return sb.String()`
			`}`