42998d797d
* remove c code
* pack llama.cpp
* use request context for llama_cpp (see the sketch after this list)
* let llama_cpp decide the number of threads to use
* stop llama runner when app stops
* remove sample count and duration metrics
* use go generate to get libraries
* tmp dir for running llm
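Two of these changes, "use request context for llama_cpp" and "stop llama runner when app stops", hinge on tying the lifetime of the llama.cpp server subprocess to a context.Context. A minimal sketch of that pattern follows; the startRunner function, the binary path, and the flags are illustrative assumptions, not the actual ollama code:

//go:build darwin

package llm

import (
	"context"
	"os/exec"
)

// startRunner launches the llama.cpp server subprocess and ties its
// lifetime to ctx: when the request is done or the app shuts down and
// cancels ctx, the process is killed automatically.
// The binary path and flags below are hypothetical placeholders.
func startRunner(ctx context.Context, model string) (*exec.Cmd, error) {
	cmd := exec.CommandContext(ctx, "ggml/build/gpu/bin/server", "--model", model)
	if err := cmd.Start(); err != nil {
		return nil, err
	}
	// Reap the process once it exits or is killed via ctx cancellation.
	go func() { _ = cmd.Wait() }()
	return cmd, nil
}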
//go:build darwin
// +build darwin

package llm

//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/gpu --target server --config Release
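With these directives, the native llama.cpp/ggml server is fetched, patched, and built during code generation instead of being vendored as C sources in the repository: on macOS, a standard `go generate` run over this package (for example, something like `go generate ./...` from the repository root; the exact invocation is an assumption) initializes and checks out the ggml submodule, applies the two patches, configures CMake with Metal, Accelerate, and k-quants enabled, and builds a Release `server` binary under ggml/build/gpu.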