From d1692fd3e0b4a80ff55ba052b430207134df4714 Mon Sep 17 00:00:00 2001
From: Patrick Devine
Date: Wed, 15 May 2024 15:43:16 -0700
Subject: [PATCH] fix the cpu estimatedTotal memory + get the expiry time for
 loading models (#4461)

---
 llm/server.go    | 1 +
 server/routes.go | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/llm/server.go b/llm/server.go
index 11969997..ccb1e419 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -89,6 +89,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 		cpuRunner = serverForCpu()
 		gpuCount = 0
+		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
diff --git a/server/routes.go b/server/routes.go
index 42b5c910..e991e774 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1161,6 +1161,14 @@ func (s *Server) ProcessHandler(c *gin.Context) {
 			Details:   modelDetails,
 			ExpiresAt: v.expiresAt,
 		}
+		// The scheduler waits to set expiresAt, so if a model is loading it's
+		// possible that it will be set to the unix epoch. For those cases, just
+		// calculate the time w/ the sessionDuration instead.
+		var epoch time.Time
+		if v.expiresAt == epoch {
+			mr.ExpiresAt = time.Now().Add(v.sessionDuration)
+		}
+
 		models = append(models, mr)
 	}
 