// ollama/llama/runner/image.go

package main

import (
	"errors"
	"fmt"
	"hash/maphash"
	"log/slog"
	"slices"
	"sync"
	"time"

	"github.com/ollama/ollama/llama"
)

// imageCacheSize is the number of slots allocated for the image embedding
// cache when an ImageContext is created.
const imageCacheSize = 4

// ImageContext wraps the vision model used to turn images into embeddings and
// keeps a small cache of previously computed embeddings. NewImageContext sets
// exactly one of clip or mllama, depending on the model architecture. Its
// methods tolerate a nil receiver.
type ImageContext struct {
	// mu is required to be held when generating embeddings or accessing the cache
	mu sync.Mutex

	// vision model backends; NewImageContext populates the one that matches
	// the detected architecture and leaves the other nil
	clip   *llama.ClipContext
	mllama *llama.MllamaContext

	// cache of images to embeddings; imageHash is the reusable hash state that
	// hashImage resets and rewrites to derive cache keys from image bytes
	images    []imageCache
	imageHash maphash.Hash
}

// NewImageContext loads the vision model stored at modelPath and returns an
// ImageContext backed by it, with an embedding cache of imageCacheSize empty
// slots. The architecture reported by llama.GetModelArch selects the backend:
// "clip" or "mllama". An error is returned when the architecture cannot be
// determined, is not one of those two, or the backend fails to load.
func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageContext, error) {
	arch, err := llama.GetModelArch(modelPath)
	if err != nil {
		return nil, fmt.Errorf("unable to determine vision architecture: %w (%s)", err, modelPath)
	}

	ic := &ImageContext{}

	switch arch {
	case "clip":
		ic.clip, err = llama.NewClipContext(llamaContext, modelPath)
	case "mllama":
		ic.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
	default:
		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
	}

	// Backend construction failures are passed through unchanged.
	if err != nil {
		return nil, err
	}

	ic.images = make([]imageCache, imageCacheSize)
	return ic, nil
}

// Free calls Free on whichever vision backend is loaded. It is a no-op on a
// nil receiver. modelPath is accepted but not used by this method.
func (c *ImageContext) Free(modelPath string) {
	if c == nil {
		return
	}

	if clip := c.clip; clip != nil {
		clip.Free()
	}

	if mllama := c.mllama; mllama != nil {
		mllama.Free()
	}
}

// NewEmbed returns the embeddings for the image bytes in data. On a cache hit
// the stored embeddings are returned; on a miss they are generated by the
// loaded backend and stored in the cache. aspectRatioId is forwarded only to
// the mllama backend. It returns nil when c is nil or when no backend is
// loaded.
//
// The whole operation, including hashing, runs under c.mu. hashImage resets
// and rewrites the shared c.imageHash state, so hashing outside the lock
// would be a data race between concurrent callers and could produce corrupted
// cache keys.
func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) [][]float32 {
	if c == nil {
		return nil
	}

	// Take the lock before hashing: c.imageHash is shared mutable state.
	c.mu.Lock()
	defer c.mu.Unlock()

	hash := c.hashImage(data)

	if embed, err := c.findImage(hash); err == nil {
		return embed
	}

	// Cache miss: generate the embeddings with whichever backend is loaded.
	var embed [][]float32
	switch {
	case c.mllama != nil:
		embed = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
	case c.clip != nil:
		embed = c.clip.NewEmbed(llamaContext, data)
	default:
		return nil
	}

	c.addImage(hash, embed)

	return embed
}

// BatchSize returns the number of elements to allocate for embedding batches,
// given the batch size configured for the runner.
func (c *ImageContext) BatchSize(configuredBatchSize int) int {
	switch {
	case c == nil:
		// Images are not supported, so no embedding batches are needed.
		return 0
	case c.mllama != nil:
		// Mllama maps an image to 1 embedding token (llava creates many
		// tokens) and doesn't support more than a single image per request.
		// The embeddings are large (100 MB), so allocating a big batch can
		// fail on some systems.
		return 1
	default:
		return configuredBatchSize
	}
}

// EmbedSize returns the size of a single embedding. The mllama backend
// reports its own size; in every other case, including a nil receiver, the
// embedding dimension of the model behind llamaContext is used.
func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
	if c == nil || c.mllama == nil {
		return llamaContext.Model().NEmbd()
	}

	return c.mllama.EmbedSize(llamaContext)
}

// NeedCrossAttention reports whether any of the given inputs carries an
// embedding while the mllama backend is loaded. It is always false for a nil
// receiver or when mllama is not in use.
func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
	usesMllama := c != nil && c.mllama != nil
	if !usesMllama {
		return false
	}

	hasEmbedding := func(in input) bool {
		return in.embed != nil
	}

	return slices.ContainsFunc(inputs, hasEmbedding)
}

// imageCache is a single slot in the ImageContext embedding cache.
type imageCache struct {
	// key is the image hash the slot is stored under
	key      uint64
	// val holds the embeddings cached for that image
	val      [][]float32
	// lastUsed is set when the slot is stored or hit; addImage replaces the
	// slot with the oldest lastUsed when the key is not already present
	lastUsed time.Time
}

// hashImage computes the cache key for the raw image bytes using the
// context's reusable maphash state. It overwrites c.imageHash, so it must not
// run concurrently on the same ImageContext.
func (c *ImageContext) hashImage(image []byte) uint64 {
	h := &c.imageHash

	h.Reset()
	// maphash's Write always consumes all input and never fails, so its
	// results are intentionally discarded.
	_, _ = h.Write(image)

	return h.Sum64()
}

// errImageNotFound is returned by findImage when no cache slot holds the
// requested hash.
var errImageNotFound = errors.New("image not found in cache")

// findImage looks up the embeddings cached under hash. On a hit it refreshes
// the slot's lastUsed time and returns the stored embeddings; otherwise it
// returns errImageNotFound.
//
// Slots that have never been written still carry the zero key. They are
// skipped so that an image whose hash happens to be 0 cannot match an empty
// slot and get nil embeddings back as if they had been cached.
func (c *ImageContext) findImage(hash uint64) ([][]float32, error) {
	for i := range c.images {
		entry := &c.images[i]

		// addImage always stamps lastUsed when it stores an entry, so a zero
		// time marks a slot that has never been populated.
		if entry.lastUsed.IsZero() || entry.key != hash {
			continue
		}

		slog.Debug("loading image embeddings from cache", "entry", i)
		entry.lastUsed = time.Now()
		return entry.val, nil
	}

	return nil, errImageNotFound
}

// addImage stores embed in the cache under hash. A slot already holding the
// same key is overwritten in place; otherwise the slot with the oldest
// lastUsed time is replaced. The chosen slot's lastUsed is set to the current
// time.
func (c *ImageContext) addImage(hash uint64, embed [][]float32) {
	slot := 0
	oldest := time.Now()

	for i := range c.images {
		entry := &c.images[i]

		// An existing entry for this image wins outright.
		if entry.key == hash {
			slot = i
			break
		}

		// Otherwise track the least recently used slot seen so far.
		if entry.lastUsed.Before(oldest) {
			slot, oldest = i, entry.lastUsed
		}
	}

	target := &c.images[slot]
	slog.Debug("storing image embeddings in cache", "entry", slot, "used", target.lastUsed)
	target.key = hash
	target.val = embed
	target.lastUsed = time.Now()
}
runner.go: Better abstract vision model integration -Update mllama to take the cross attention state as embeddings in a batch, more similar to how Llava handles it. This improves integration with the input cache. -Pass locations in a prompt for embeddings using tags similar to Llava. -Abstract interface to vision models so the main runner accesses Clip and Mllama similarly Co-authored-by: Michael Yang <mxyng@pm.me> 2024-10-11 22:34:01 +00:00			`package main`

			`import (`
			`"errors"`
			`"fmt"`
			`"hash/maphash"`
			`"log/slog"`
runner.go: Don't set cross attention before sending embeddings Currently if an input has embeddings at any point then we will set cross attention to true from the beginning. This means that any tokens before the embeddings are sent will incorrectly have cross attention layers applied. This only sets cross attention when we have an embedding, either previously in this sequence or in the cache. It also makes cross attention capable of supporting parallelism at the runner level, though the mllama implementation doesn't support that yet. 2024-10-31 17:55:31 +00:00			`"slices"`
runner.go: Better abstract vision model integration -Update mllama to take the cross attention state as embeddings in a batch, more similar to how Llava handles it. This improves integration with the input cache. -Pass locations in a prompt for embeddings using tags similar to Llava. -Abstract interface to vision models so the main runner accesses Clip and Mllama similarly Co-authored-by: Michael Yang <mxyng@pm.me> 2024-10-11 22:34:01 +00:00			`"sync"`
			`"time"`

			`"github.com/ollama/ollama/llama"`
			`)`

			`const imageCacheSize = 4`

			`type ImageContext struct {`
			`// mu is required to be held when generating embeddings or accessing the cache`
			`mu sync.Mutex`

			`clip *llama.ClipContext`
			`mllama *llama.MllamaContext`

			`// cache of images to embeddings`
			`images []imageCache`
			`imageHash maphash.Hash`
			`}`

			`func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageContext, error) {`
			`arch, err := llama.GetModelArch(modelPath)`
			`if err != nil {`
			`return nil, fmt.Errorf("unable to determine vision architecture: %w (%s)", err, modelPath)`
			`}`

			`var c ImageContext`
			`if arch == "clip" {`
			`c.clip, err = llama.NewClipContext(llamaContext, modelPath)`
			`} else if arch == "mllama" {`
			`c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)`
			`} else {`
			`return nil, fmt.Errorf("unknown vision model architecture: %s", arch)`
			`}`

			`if err != nil {`
			`return nil, err`
			`}`

			`c.images = make([]imageCache, imageCacheSize)`

			`return &c, nil`
			`}`

			`func (c *ImageContext) Free(modelPath string) {`
			`if c == nil {`
			`return`
			`}`

			`if c.clip != nil {`
			`c.clip.Free()`
			`}`
			`if c.mllama != nil {`
			`c.mllama.Free()`
			`}`
			`}`

			`func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) [][]float32 {`
			`if c == nil {`
			`return nil`
			`}`

			`hash := c.hashImage(data)`

			`c.mu.Lock()`
			`defer c.mu.Unlock()`

			`embed, err := c.findImage(hash)`
			`if err != nil {`
			`if c.mllama != nil {`
			`embed = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)`
			`} else if c.clip != nil {`
			`embed = c.clip.NewEmbed(llamaContext, data)`
			`} else {`
			`return nil`
			`}`

			`c.addImage(hash, embed)`
			`}`

			`return embed`
			`}`

runner.go: Only allocate 1 element embedding batches for mllama Mllama has large embeddings (100 MB per image) and each embedding is represented as 1 token when passed to llama.cpp. Batches are pre- allocated for the size of the tokens times the batch size, so this results in allocations of over 50 GB at the default batch size. On some systems, these mallocs will fail. Since an image is represented as a single token and mllama doesn't support more than 1 image per request, we only need to allocate a batch size of 1, which is much more reasonable. In addition, for non-multimodal models, we don't need to allocate the embedding batches at all. Fixes #7464 2024-11-01 21:29:57 +00:00			`func (c *ImageContext) BatchSize(configuredBatchSize int) int {`
			`// If images are not supported, we don't need to allocate embedding batches`
			`if c == nil {`
			`return 0`
			`}`

			`// Mllama maps an image to 1 embedding token (llava creates many tokens)`
			`// and doesn't support more than a single image per request.`
			`// The embeddings are large (100 MB), so allocating a big batch can fail`
			`// on some systems`
			`if c.mllama != nil {`
			`return 1`
			`}`

			`return configuredBatchSize`
			`}`

runner.go: Better abstract vision model integration -Update mllama to take the cross attention state as embeddings in a batch, more similar to how Llava handles it. This improves integration with the input cache. -Pass locations in a prompt for embeddings using tags similar to Llava. -Abstract interface to vision models so the main runner accesses Clip and Mllama similarly Co-authored-by: Michael Yang <mxyng@pm.me> 2024-10-11 22:34:01 +00:00			`func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {`
			`if c != nil && c.mllama != nil {`
			`return c.mllama.EmbedSize(llamaContext)`
			`} else {`
			`return llamaContext.Model().NEmbd()`
			`}`
			`}`

runner.go: Don't set cross attention before sending embeddings Currently if an input has embeddings at any point then we will set cross attention to true from the beginning. This means that any tokens before the embeddings are sent will incorrectly have cross attention layers applied. This only sets cross attention when we have an embedding, either previously in this sequence or in the cache. It also makes cross attention capable of supporting parallelism at the runner level, though the mllama implementation doesn't support that yet. 2024-10-31 17:55:31 +00:00			`func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {`
			`if c == nil || c.mllama == nil {`
			`return false`
			`}`

			`return slices.ContainsFunc(inputs, func(input input) bool {`
			`return input.embed != nil`
			`})`
			`}`

runner.go: Better abstract vision model integration -Update mllama to take the cross attention state as embeddings in a batch, more similar to how Llava handles it. This improves integration with the input cache. -Pass locations in a prompt for embeddings using tags similar to Llava. -Abstract interface to vision models so the main runner accesses Clip and Mllama similarly Co-authored-by: Michael Yang <mxyng@pm.me> 2024-10-11 22:34:01 +00:00			`type imageCache struct {`
			`key uint64`
			`val [][]float32`
			`lastUsed time.Time`
			`}`

			`func (c *ImageContext) hashImage(image []byte) uint64 {`
			`c.imageHash.Reset()`
			`_, _ = c.imageHash.Write(image)`
			`return c.imageHash.Sum64()`
			`}`

			`var errImageNotFound = errors.New("image not found in cache")`

			`func (c *ImageContext) findImage(hash uint64) ([][]float32, error) {`
			`for i := range c.images {`
			`if c.images[i].key == hash {`
			`slog.Debug("loading image embeddings from cache", "entry", i)`
			`c.images[i].lastUsed = time.Now()`
			`return c.images[i].val, nil`
			`}`
			`}`

			`return nil, errImageNotFound`
			`}`

			`func (c *ImageContext) addImage(hash uint64, embed [][]float32) {`
			`best := time.Now()`
			`var bestImage int`

			`for i := range c.images {`
			`if c.images[i].key == hash {`
			`bestImage = i`
			`break`
			`}`

			`if c.images[i].lastUsed.Compare(best) < 0 {`
			`best = c.images[i].lastUsed`
			`bestImage = i`
			`}`
			`}`

			`slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)`
			`c.images[bestImage].key = hash`
			`c.images[bestImage].val = embed`
			`c.images[bestImage].lastUsed = time.Now()`
			`}`