Better tmpdir cleanup

If expanding the runners fails, don't leave a corrupt/incomplete payloads dir
We now write a pid file out to the tmpdir, which allows us to scan for stale tmpdirs
and remove this as long as there isn't still a process running.
This commit is contained in:
Daniel Hiltgen 2024-03-13 11:43:45 -07:00
parent 7ed3e94105
commit 74788b487c
2 changed files with 52 additions and 1 deletions

View file

@ -1,13 +1,16 @@
package gpu
import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
)
var (
@ -19,10 +22,22 @@ func PayloadsDir() (string, error) {
lock.Lock()
defer lock.Unlock()
if payloadsDir == "" {
cleanupTmpDirs()
tmpDir, err := os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
// Track our pid so we can clean up orphaned tmpdirs
pidFilePath := filepath.Join(tmpDir, "ollama.pid")
pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
if err != nil {
return "", err
}
if _, err := pidFile.Write([]byte(fmt.Sprint(os.Getpid()))); err != nil {
return "", err
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
payloadsDir = filepath.Join(tmpDir, "runners")
@ -30,6 +45,36 @@ func PayloadsDir() (string, error) {
return payloadsDir, nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*"))
if err != nil {
return
}
for _, d := range dirs {
info, err := os.Stat(d)
if err != nil || !info.IsDir() {
continue
}
raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
if err == nil {
pid, err := strconv.Atoi(string(raw))
if err == nil {
if proc, err := os.FindProcess(int(pid)); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
// Another running ollama, ignore this tmpdir
continue
}
}
} else {
slog.Debug("failed to open ollama.pid", "path", d, "error", err)
}
err = os.RemoveAll(d)
if err != nil {
slog.Debug(fmt.Sprintf("unable to cleanup stale tmpdir %s: %s", d, err))
}
}
}
func Cleanup() {
lock.Lock()
defer lock.Unlock()

View file

@ -196,7 +196,13 @@ func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
return nil
})
}
return libs, g.Wait()
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return nil, err
}
return libs, nil
}
func verifyDriverAccess() error {