Better tmpdir cleanup
If expanding the runners fails, don't leave a corrupt/incomplete payloads dir. We now write a pid file out to the tmpdir, which allows us to scan for stale tmpdirs and remove them as long as the owning process is no longer running.
This commit is contained in:
parent
7ed3e94105
commit
74788b487c
2 changed files with 52 additions and 1 deletions
|
@ -1,13 +1,16 @@
|
||||||
package gpu
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"syscall"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -19,10 +22,22 @@ func PayloadsDir() (string, error) {
|
||||||
lock.Lock()
|
lock.Lock()
|
||||||
defer lock.Unlock()
|
defer lock.Unlock()
|
||||||
if payloadsDir == "" {
|
if payloadsDir == "" {
|
||||||
|
cleanupTmpDirs()
|
||||||
tmpDir, err := os.MkdirTemp("", "ollama")
|
tmpDir, err := os.MkdirTemp("", "ollama")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
|
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Track our pid so we can clean up orphaned tmpdirs
|
||||||
|
pidFilePath := filepath.Join(tmpDir, "ollama.pid")
|
||||||
|
pidFile, err := os.OpenFile(pidFilePath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if _, err := pidFile.Write([]byte(fmt.Sprint(os.Getpid()))); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
// We create a distinct subdirectory for payloads within the tmpdir
|
// We create a distinct subdirectory for payloads within the tmpdir
|
||||||
// This will typically look like /tmp/ollama3208993108/runners on linux
|
// This will typically look like /tmp/ollama3208993108/runners on linux
|
||||||
payloadsDir = filepath.Join(tmpDir, "runners")
|
payloadsDir = filepath.Join(tmpDir, "runners")
|
||||||
|
@ -30,6 +45,36 @@ func PayloadsDir() (string, error) {
|
||||||
return payloadsDir, nil
|
return payloadsDir, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cleanupTmpDirs makes a best-effort pass over the system temp directory and
// removes "ollama*" directories left behind by processes that have exited.
//
// A directory is removed only when its ollama.pid file can be read, parses to
// a positive pid, and that pid no longer refers to a running process. Anything
// we cannot prove stale is left alone: a directory with a missing, empty, or
// malformed pid file may belong to another instance that is still starting up
// (it has created the tmpdir but not yet written its pid), or may be an
// unrelated directory that merely matches the glob.
//
// All failures are logged at debug level and otherwise ignored; this function
// never returns an error.
func cleanupTmpDirs() {
	dirs, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*"))
	if err != nil {
		return
	}
	for _, d := range dirs {
		info, err := os.Stat(d)
		if err != nil || !info.IsDir() {
			continue
		}

		raw, err := os.ReadFile(filepath.Join(d, "ollama.pid"))
		if err != nil {
			// No pid file: ownership is unknown, so do not delete.
			slog.Debug("failed to open ollama.pid", "path", d, "error", err)
			continue
		}

		// Tolerate surrounding whitespace (e.g. a trailing newline) so a live
		// process is not misclassified as stale because of formatting.
		pid, err := strconv.Atoi(strings.TrimSpace(string(raw)))
		if err != nil || pid <= 0 {
			slog.Debug("failed to parse ollama.pid", "path", d, "error", err)
			continue
		}

		// Signal 0 performs the existence/permission check without delivering
		// a signal. Any result other than ErrProcessDone (including a
		// permission error) is treated as "still running".
		if proc, err := os.FindProcess(pid); err == nil && !errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
			// Another running ollama, ignore this tmpdir
			continue
		}

		if err := os.RemoveAll(d); err != nil {
			slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
		}
	}
}
|
||||||
|
|
||||||
func Cleanup() {
|
func Cleanup() {
|
||||||
lock.Lock()
|
lock.Lock()
|
||||||
defer lock.Unlock()
|
defer lock.Unlock()
|
||||||
|
|
|
@ -196,7 +196,13 @@ func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return libs, g.Wait()
|
err = g.Wait()
|
||||||
|
if err != nil {
|
||||||
|
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
|
||||||
|
gpu.Cleanup()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return libs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func verifyDriverAccess() error {
|
func verifyDriverAccess() error {
|
||||||
|
|
Loading…
Reference in a new issue