ollama/llm/ggla.go

package llm

import (
	"encoding/binary"
	"errors"
	"io"
	"slices"
)

// containerGGLA is the container for the "ggla" file format. It records the
// format version that Decode reads from the start of the stream.
type containerGGLA struct {
	// version is the format version read by Decode; only 1 is accepted.
	version uint32
}

// Name returns the identifier of this container format, "ggla".
func (c *containerGGLA) Name() string {
	return "ggla"
}

// Decode reads the little-endian uint32 format version from rs, stores it on
// the container, and then decodes the rest of the stream as a ggla model.
// Only version 1 is accepted; any other value fails with "invalid version".
//
// When decoding the body fails, the partially decoded model is still
// returned together with the error.
func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
	err := binary.Read(rs, binary.LittleEndian, &c.version)
	if err != nil {
		return nil, err
	}

	if c.version != 1 {
		return nil, errors.New("invalid version")
	}

	m := newGGLA(c)
	if err := m.decode(rs); err != nil {
		// Deliberately hand back what was decoded so far along with the error.
		return m, err
	}

	return m, nil
}

// ggla is a model decoded from a ggla stream: the metadata values and tensor
// descriptors collected by decode.
type ggla struct {
	// containerGGLA is the container this model was created from.
	*containerGGLA

	// kv holds the metadata read by decode, stored under "r" and "alpha".
	kv      KV
	// tensors lists the decoded tensor descriptors in stream order.
	tensors []*Tensor
}

// newGGLA builds an empty ggla model bound to container. The metadata map is
// allocated up front so decode can write into it; the tensor list starts nil.
func newGGLA(container *containerGGLA) *ggla {
	m := new(ggla)
	m.containerGGLA = container
	m.kv = make(KV)
	return m
}

// KV returns the model's metadata map. The map itself is returned, not a copy.
func (llm *ggla) KV() KV {
	return llm.kv
}

// Tensors returns the tensor descriptors collected by decode, in the order
// they were appended.
func (llm *ggla) Tensors() Tensors {
	return llm.tensors
}

// decode reads the body of a ggla stream from rs, which Decode has already
// advanced past the version field.
//
// The body starts with two little-endian uint32 values that are stored in
// the metadata map under "r" and "alpha", followed by zero or more tensor
// records running to the end of the stream. A clean EOF at a record boundary
// ends decoding successfully; running out of input inside a record is
// reported as io.ErrUnexpectedEOF. Errors while reading "r" or "alpha" are
// returned unchanged. Tensors decoded before a failure stay in llm.tensors.
func (llm *ggla) decode(rs io.ReadSeeker) error {
	var r uint32
	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
		return err
	}
	llm.kv["r"] = r

	var alpha uint32
	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
		return err
	}
	llm.kv["alpha"] = alpha

	for {
		// Each record begins with its dimension count. EOF exactly here means
		// the previous record was the last one.
		var dims uint32
		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
			if errors.Is(err, io.EOF) {
				return nil
			}
			return err
		}

		t, err := readGGLATensor(rs, dims)
		if err != nil {
			return err
		}

		llm.tensors = append(llm.tensors, t)
	}
}

// readGGLATensor decodes the remainder of one tensor record whose dimension
// count dims has already been read from rs, then seeks past t.Size() bytes
// starting at the record's 32-byte-aligned data offset. The returned Tensor's
// Offset is that aligned position.
//
// Any error matching io.EOF is converted to io.ErrUnexpectedEOF, because the
// record header has already been started. Doing this in a helper keeps the
// deferred conversion scoped to a single record instead of stacking one
// deferred call per loop iteration in decode.
func readGGLATensor(rs io.ReadSeeker, dims uint32) (_ *Tensor, retErr error) {
	defer func() {
		if errors.Is(retErr, io.EOF) {
			retErr = io.ErrUnexpectedEOF
		}
	}()

	var namesize uint32
	if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
		return nil, err
	}

	var t Tensor
	if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {
		return nil, err
	}

	// dims comes straight from the file and is not validated, so it must not
	// drive an up-front allocation (a bogus value could request gigabytes).
	// Preallocate a small amount and let the slice grow only as dimension
	// values are actually read from the stream.
	const maxShapePrealloc = 4
	prealloc := dims
	if prealloc > maxShapePrealloc {
		prealloc = maxShapePrealloc
	}

	t.Shape = make([]uint64, 0, prealloc)
	for i := uint32(0); i < dims; i++ {
		var dim uint32
		if err := binary.Read(rs, binary.LittleEndian, &dim); err != nil {
			return nil, err
		}

		t.Shape = append(t.Shape, uint64(dim))
	}

	// ggla tensor shape is reversed
	// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
	slices.Reverse(t.Shape)

	// namesize is untrusted as well: read through a LimitReader so the buffer
	// grows with the bytes actually available rather than with the claimed
	// length. A short read means the stream ended inside the name.
	name, err := io.ReadAll(io.LimitReader(rs, int64(namesize)))
	if err != nil {
		return nil, err
	}
	if uint64(len(name)) != uint64(namesize) {
		return nil, io.ErrUnexpectedEOF
	}

	t.Name = string(name)

	offset, err := rs.Seek(0, io.SeekCurrent)
	if err != nil {
		return nil, err
	}

	// Advance to the next multiple of 32 bytes. Seek reports the resulting
	// absolute position, which is recorded as the tensor's offset.
	offset, err = rs.Seek((offset+31)&-32-offset, io.SeekCurrent)
	if err != nil {
		return nil, err
	}

	t.Offset = uint64(offset)

	// Skip t.Size() bytes to land on the next record header.
	if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
		return nil, err
	}

	return &t, nil
}
decode ggla 2024-03-08 23:38:53 +00:00			`package llm`

			`import (`
			`"encoding/binary"`
			`"errors"`
			`"io"`
			`"slices"`
			`)`

Add gemma safetensors conversion (#3250) Co-authored-by: Michael Yang <mxyng@pm.me> 2024-03-29 01:54:01 +00:00			`type containerGGLA struct {`
decode ggla 2024-03-08 23:38:53 +00:00			`version uint32`
			`}`

Add gemma safetensors conversion (#3250) Co-authored-by: Michael Yang <mxyng@pm.me> 2024-03-29 01:54:01 +00:00			`func (c *containerGGLA) Name() string {`
decode ggla 2024-03-08 23:38:53 +00:00			`return "ggla"`
			`}`

Add gemma safetensors conversion (#3250) Co-authored-by: Michael Yang <mxyng@pm.me> 2024-03-29 01:54:01 +00:00			`func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {`
			`if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {`
			`return nil, err`
			`}`
decode ggla 2024-03-08 23:38:53 +00:00
			`switch c.version {`
			`case 1:`
			`default:`
			`return nil, errors.New("invalid version")`
			`}`

Add gemma safetensors conversion (#3250) Co-authored-by: Michael Yang <mxyng@pm.me> 2024-03-29 01:54:01 +00:00			`model := newGGLA(c)`
refactor readseeker 2024-03-09 20:28:36 +00:00			`err := model.decode(rs)`
decode ggla 2024-03-08 23:38:53 +00:00			`return model, err`
			`}`

Add gemma safetensors conversion (#3250) Co-authored-by: Michael Yang <mxyng@pm.me> 2024-03-29 01:54:01 +00:00			`type ggla struct {`
			`*containerGGLA`
decode ggla 2024-03-08 23:38:53 +00:00
			`kv KV`
refactor model parsing 2024-03-13 18:03:56 +00:00			`tensors []*Tensor`
decode ggla 2024-03-08 23:38:53 +00:00			`}`

Add gemma safetensors conversion (#3250) Co-authored-by: Michael Yang <mxyng@pm.me> 2024-03-29 01:54:01 +00:00			`func newGGLA(container *containerGGLA) *ggla {`
			`return &ggla{`
			`containerGGLA: container,`
decode ggla 2024-03-08 23:38:53 +00:00			`kv: make(KV),`
			`}`
			`}`

refactor model parsing 2024-03-13 18:03:56 +00:00			`func (llm *ggla) KV() KV {`
			`return llm.kv`
			`}`

refactor tensor query 2024-04-03 22:00:31 +00:00			`func (llm *ggla) Tensors() Tensors {`
refactor model parsing 2024-03-13 18:03:56 +00:00			`return llm.tensors`
			`}`

llm: speed up gguf decoding by a lot (#5246) Previously, some costly things were causing the loading of GGUF files and their metadata and tensor information to be VERY slow: * Too many allocations when decoding strings * Hitting disk for each read of each key and value, resulting in a not-okay amount of syscalls/disk I/O. The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro m3. This commit also prevents collecting large arrays of values when decoding GGUFs (if desired). When such keys are encountered, their values are null, and are encoded as such in JSON. Also, this fixes a broken test that was not encoding valid GGUF. 2024-06-25 04:47:52 +00:00			`func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {`
decode ggla 2024-03-08 23:38:53 +00:00			`var r uint32`
refactor readseeker 2024-03-09 20:28:36 +00:00			`if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`
refactor model parsing 2024-03-13 18:03:56 +00:00			`llm.kv["r"] = r`
decode ggla 2024-03-08 23:38:53 +00:00
			`var alpha uint32`
refactor readseeker 2024-03-09 20:28:36 +00:00			`if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`
refactor model parsing 2024-03-13 18:03:56 +00:00			`llm.kv["alpha"] = alpha`
decode ggla 2024-03-08 23:38:53 +00:00
			`for {`
			`var dims uint32`
refactor readseeker 2024-03-09 20:28:36 +00:00			`if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {`
llm: speed up gguf decoding by a lot (#5246) Previously, some costly things were causing the loading of GGUF files and their metadata and tensor information to be VERY slow: * Too many allocations when decoding strings * Hitting disk for each read of each key and value, resulting in a not-okay amount of syscalls/disk I/O. The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro m3. This commit also prevents collecting large arrays of values when decoding GGUFs (if desired). When such keys are encountered, their values are null, and are encoded as such in JSON. Also, this fixes a broken test that was not encoding valid GGUF. 2024-06-25 04:47:52 +00:00			`if errors.Is(err, io.EOF) {`
			`return nil`
			`}`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`

llm: speed up gguf decoding by a lot (#5246) Previously, some costly things were causing the loading of GGUF files and their metadata and tensor information to be VERY slow: * Too many allocations when decoding strings * Hitting disk for each read of each key and value, resulting in a not-okay amount of syscalls/disk I/O. The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro m3. This commit also prevents collecting large arrays of values when decoding GGUFs (if desired). When such keys are encountered, their values are null, and are encoded as such in JSON. Also, this fixes a broken test that was not encoding valid GGUF. 2024-06-25 04:47:52 +00:00			`defer func() {`
			`if errors.Is(retErr, io.EOF) {`
			`retErr = io.ErrUnexpectedEOF`
			`}`
			`}()`

decode ggla 2024-03-08 23:38:53 +00:00			`var namesize uint32`
refactor readseeker 2024-03-09 20:28:36 +00:00			`if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`

			`var t Tensor`
refactor readseeker 2024-03-09 20:28:36 +00:00			`if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`

			`t.Shape = make([]uint64, dims)`
			`for i := 0; uint32(i) < dims; i++ {`
			`var shape32 uint32`
refactor readseeker 2024-03-09 20:28:36 +00:00			`if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil {`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`

			`t.Shape[i] = uint64(shape32)`
			`}`

			`// ggla tensor shape is reversed`
			`// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44`
			`slices.Reverse(t.Shape)`

			`name := make([]byte, namesize)`
refactor readseeker 2024-03-09 20:28:36 +00:00			`if err := binary.Read(rs, binary.LittleEndian, &name); err != nil {`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`

			`t.Name = string(name)`

refactor readseeker 2024-03-09 20:28:36 +00:00			`offset, err := rs.Seek(0, io.SeekCurrent)`
			`if err != nil {`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`

llm: speed up gguf decoding by a lot (#5246) Previously, some costly things were causing the loading of GGUF files and their metadata and tensor information to be VERY slow: * Too many allocations when decoding strings * Hitting disk for each read of each key and value, resulting in a not-okay amount of syscalls/disk I/O. The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro m3. This commit also prevents collecting large arrays of values when decoding GGUFs (if desired). When such keys are encountered, their values are null, and are encoded as such in JSON. Also, this fixes a broken test that was not encoding valid GGUF. 2024-06-25 04:47:52 +00:00			`if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {`
refactor readseeker 2024-03-09 20:28:36 +00:00			`return err`
			`}`

			`offset, err = rs.Seek(0, io.SeekCurrent)`
			`if err != nil {`
			`return err`
			`}`

			`t.Offset = uint64(offset)`
decode ggla 2024-03-08 23:38:53 +00:00
simplify safetensors reading 2024-05-20 16:47:01 +00:00			`if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {`
decode ggla 2024-03-08 23:38:53 +00:00			`return err`
			`}`

refactor model parsing 2024-03-13 18:03:56 +00:00			`llm.tensors = append(llm.tensors, &t)`
decode ggla 2024-03-08 23:38:53 +00:00			`}`
			`}`