From d1692fd3e0b4a80ff55ba052b430207134df4714 Mon Sep 17 00:00:00 2001
From: Patrick Devine
Date: Wed, 15 May 2024 15:43:16 -0700
Subject: [PATCH] fix the cpu estimatedTotal memory + get the expiry time for
 loading models (#4461)

---
 llm/server.go    | 1 +
 server/routes.go | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/llm/server.go b/llm/server.go
index 11969997..ccb1e419 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -89,6 +89,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 		cpuRunner = serverForCpu()
 		gpuCount = 0
+		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
diff --git a/server/routes.go b/server/routes.go
index 42b5c910..e991e774 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1161,6 +1161,14 @@ func (s *Server) ProcessHandler(c *gin.Context) {
 			Details:   modelDetails,
 			ExpiresAt: v.expiresAt,
 		}
+		// The scheduler waits to set expiresAt, so if a model is loading it's
+		// possible that it will be set to the unix epoch. For those cases, just
+		// calculate the time w/ the sessionDuration instead.
+		var epoch time.Time
+		if v.expiresAt == epoch {
+			mr.ExpiresAt = time.Now().Add(v.sessionDuration)
+		}
+
 		models = append(models, mr)
 	}
 