bool

This commit is contained in:
parent 66fe77f084
commit 55cd3ddcca

8 changed files with 82 additions and 83 deletions
@@ -157,7 +157,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			return err
 		}
 
-		if envconfig.NoHistory {
+		if envconfig.NoHistory() {
 			scanner.HistoryDisable()
 		}
 
@@ -17,21 +17,6 @@ import (
 
 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")
 
-// Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value.
-func Debug() bool {
-	if s := clean("OLLAMA_DEBUG"); s != "" {
-		b, err := strconv.ParseBool(s)
-		if err != nil {
-			// non-empty value is truthy
-			return true
-		}
-
-		return b
-	}
-
-	return false
-}
-
 // Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
 // Default is scheme "http" and host "127.0.0.1:11434"
 func Host() *url.URL {
@@ -77,7 +62,7 @@ func Host() *url.URL {
 
 // Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
 func Origins() (origins []string) {
-	if s := clean("OLLAMA_ORIGINS"); s != "" {
+	if s := getenv("OLLAMA_ORIGINS"); s != "" {
 		origins = strings.Split(s, ",")
 	}
 
@@ -114,9 +99,37 @@ func Models() string {
 	return filepath.Join(home, ".ollama", "models")
 }
 
+func Bool(k string) func() bool {
+	return func() bool {
+		if s := getenv(k); s != "" {
+			b, err := strconv.ParseBool(s)
+			if err != nil {
+				return true
+			}
+
+			return b
+		}
+
+		return false
+	}
+}
+
+var (
+	// Debug enabled additional debug information.
+	Debug = Bool("OLLAMA_DEBUG")
+	// FlashAttention enables the experimental flash attention feature.
+	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
+	// NoHistory disables readline history.
+	NoHistory = Bool("OLLAMA_NOHISTORY")
+	// NoPrune disables pruning of model blobs on startup.
+	NoPrune = Bool("OLLAMA_NOPRUNE")
+	// SchedSpread allows scheduling models across all GPUs.
+	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
+	// IntelGPU enables experimental Intel GPU detection.
+	IntelGPU = Bool("OLLAMA_INTEL_GPU")
+)
+
 var (
-	// Experimental flash attention
-	FlashAttention bool
 	// Set via OLLAMA_KEEP_ALIVE in the environment
 	KeepAlive time.Duration
 	// Set via OLLAMA_LLM_LIBRARY in the environment
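For readers skimming the diff, here is a minimal standalone sketch (not part of the commit; the main wrapper and example values are illustrative assumptions) of how the new Bool accessor behaves. The key design point is that Bool returns a closure rather than a snapshotted package variable, so the environment is re-read on every call and LoadConfig is no longer needed for boolean settings:

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// getenv mirrors the helper renamed in this commit: it strips leading and
// trailing quotes and spaces from the raw environment value.
func getenv(key string) string {
	return strings.Trim(os.Getenv(key), "\"' ")
}

// Bool returns a lazy accessor so the variable reflects the current
// environment each time it is called.
func Bool(k string) func() bool {
	return func() bool {
		if s := getenv(k); s != "" {
			b, err := strconv.ParseBool(s)
			if err != nil {
				// any non-empty, non-parseable value counts as enabled
				return true
			}
			return b
		}
		return false
	}
}

func main() {
	debug := Bool("OLLAMA_DEBUG")
	os.Setenv("OLLAMA_DEBUG", "yes") // ParseBool rejects "yes", so it is truthy
	fmt.Println(debug())             // true
	os.Setenv("OLLAMA_DEBUG", "0")
	fmt.Println(debug()) // false: the accessor sees the new value immediately
}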
@@ -125,22 +138,12 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MODELS in the environment
-	ModelsDir string
-	// Set via OLLAMA_NOHISTORY in the environment
-	NoHistory bool
-	// Set via OLLAMA_NOPRUNE in the environment
-	NoPrune bool
 	// Set via OLLAMA_NUM_PARALLEL in the environment
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
-	// Set via OLLAMA_SCHED_SPREAD in the environment
-	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
-	// Set via OLLAMA_INTEL_GPU in the environment
-	IntelGpu bool
-
 	// Set via CUDA_VISIBLE_DEVICES in the environment
 	CudaVisibleDevices string
@@ -163,19 +166,19 @@ type EnvVar struct {
 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
-		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
+		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
-		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
-		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
+		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
-		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
+		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 	if runtime.GOOS != "darwin" {
@@ -184,7 +187,7 @@ func AsMap() map[string]EnvVar {
 		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
 		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
-		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
 	return ret
 }
@@ -197,8 +200,8 @@ func Values() map[string]string {
 	return vals
 }
 
-// Clean quotes and spaces from the value
-func clean(key string) string {
+// getenv returns an environment variable stripped of leading and trailing quotes or spaces
+func getenv(key string) string {
 	return strings.Trim(os.Getenv(key), "\"' ")
 }
 
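The clean-to-getenv rename is behavior-preserving. A tiny self-contained check of what the trimming does (the example value is an assumption for illustration):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// strings.Trim with the cutset "\"' " removes any mix of double quotes,
	// single quotes, and spaces from both ends, but never from the middle.
	raw := ` "/tmp/ollama" ` // as it might appear in a shell profile with stray quotes
	fmt.Printf("%q\n", strings.Trim(raw, "\"' ")) // "/tmp/ollama"
}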
@@ -213,14 +216,7 @@ func init() {
 }
 
 func LoadConfig() {
-	if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
-		d, err := strconv.ParseBool(fa)
-		if err == nil {
-			FlashAttention = d
-		}
-	}
-
-	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
+	RunnersDir = getenv("OLLAMA_RUNNERS_DIR")
 	if runtime.GOOS == "windows" && RunnersDir == "" {
 		// On Windows we do not carry the payloads inside the main executable
 		appExe, err := os.Executable()
@@ -256,11 +252,11 @@ func LoadConfig() {
 		}
 	}
 
-	TmpDir = clean("OLLAMA_TMPDIR")
+	TmpDir = getenv("OLLAMA_TMPDIR")
 
-	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
+	LLMLibrary = getenv("OLLAMA_LLM_LIBRARY")
 
-	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
+	if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
 		val, err := strconv.Atoi(onp)
 		if err != nil {
 			slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
@@ -269,24 +265,7 @@ func LoadConfig() {
 		}
 	}
 
-	if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
-		NoHistory = true
-	}
-
-	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
-		s, err := strconv.ParseBool(spread)
-		if err == nil {
-			SchedSpread = s
-		} else {
-			SchedSpread = true
-		}
-	}
-
-	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
-		NoPrune = true
-	}
-
-	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
+	maxRunners := getenv("OLLAMA_MAX_LOADED_MODELS")
 	if maxRunners != "" {
 		m, err := strconv.Atoi(maxRunners)
 		if err != nil {
@@ -305,20 +284,16 @@ func LoadConfig() {
 		}
 	}
 
-	ka := clean("OLLAMA_KEEP_ALIVE")
+	ka := getenv("OLLAMA_KEEP_ALIVE")
 	if ka != "" {
 		loadKeepAlive(ka)
 	}
 
-	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
-		IntelGpu = set
-	}
-
-	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
-	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
-	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
-	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
-	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
+	CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES")
+	HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES")
+	RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES")
+	GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL")
+	HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION")
 }
 
 func loadKeepAlive(ka string) {
@@ -20,8 +20,8 @@ func TestSmoke(t *testing.T) {
 	require.True(t, Debug())
 
 	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
-	LoadConfig()
-	require.True(t, FlashAttention)
+	require.True(t, FlashAttention())
 
 	t.Setenv("OLLAMA_KEEP_ALIVE", "")
 	LoadConfig()
 	require.Equal(t, 5*time.Minute, KeepAlive)
@@ -162,3 +162,27 @@ func TestOrigins(t *testing.T) {
 		})
 	}
 }
+
+func TestBool(t *testing.T) {
+	cases := map[string]struct {
+		value  string
+		expect bool
+	}{
+		"empty":     {"", false},
+		"true":      {"true", true},
+		"false":     {"false", false},
+		"1":         {"1", true},
+		"0":         {"0", false},
+		"random":    {"random", true},
+		"something": {"something", true},
+	}
+
+	for name, tt := range cases {
+		t.Run(name, func(t *testing.T) {
+			t.Setenv("OLLAMA_BOOL", tt.value)
+			if b := Bool("OLLAMA_BOOL"); b() != tt.expect {
+				t.Errorf("%s: expected %t, got %t", name, tt.expect, b())
+			}
+		})
+	}
+}
@@ -302,7 +302,7 @@ func GetGPUInfo() GpuInfoList {
 	}
 
 	// Intel
-	if envconfig.IntelGpu {
+	if envconfig.IntelGPU() {
 		oHandles = initOneAPIHandles()
 		// On windows we bundle the oneapi library one level above the runner dir
 		depPath = ""
@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}
 
-	flashAttnEnabled := envconfig.FlashAttention
+	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {
 		// only cuda (compute capability 7+) and metal support flash attention
@@ -644,7 +644,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}
 
-	if !envconfig.NoPrune && old != nil {
+	if !envconfig.NoPrune() && old != nil {
 		if err := old.RemoveLayers(); err != nil {
 			return err
 		}
@@ -883,7 +883,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
 
-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
@@ -1121,7 +1121,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}
 
-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
@@ -695,7 +695,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	// First attempt to fit the model into a single GPU
 	for _, p := range numParallelToTry {
 		req.opts.NumCtx = req.origNumCtx * p
-		if !envconfig.SchedSpread {
+		if !envconfig.SchedSpread() {
 			for _, g := range sgl {
 				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 					slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))