llm: speed up gguf decoding by a lot (#5246)
Previously, some costly things were causing the loading of GGUF files and their metadata and tensor information to be VERY slow: * Too many allocations when decoding strings * Hitting disk for each read of each key and value, resulting in a not-okay amount of syscalls/disk I/O. The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro m3. This commit also prevents collecting large arrays of values when decoding GGUFs (if desired). When such keys are encountered, their values are null, and are encoded as such in JSON. Also, this fixes a broken test that was not encoding valid GGUF.
This commit is contained in:
parent
2aa91a937b
commit
cb42e607c5
13 changed files with 263 additions and 69 deletions
13
llm/ggla.go
13
llm/ggla.go
|
@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
|
||||||
return llm.tensors
|
return llm.tensors
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm *ggla) decode(rs io.ReadSeeker) error {
|
func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
|
||||||
var r uint32
|
var r uint32
|
||||||
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
|
if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
|
||||||
for {
|
for {
|
||||||
var dims uint32
|
var dims uint32
|
||||||
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
|
if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
|
||||||
|
if errors.Is(err, io.EOF) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
if errors.Is(retErr, io.EOF) {
|
||||||
|
retErr = io.ErrUnexpectedEOF
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
var namesize uint32
|
var namesize uint32
|
||||||
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
|
if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
|
if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
25
llm/ggml.go
25
llm/ggml.go
|
@ -6,6 +6,8 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/util/bufioutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
type GGML struct {
|
type GGML struct {
|
||||||
|
@ -278,7 +280,18 @@ func DetectGGMLType(b []byte) string {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
|
// DecodeGGML decodes a GGML model from the given reader.
|
||||||
|
//
|
||||||
|
// It collects array values for arrays with a size less than or equal to
|
||||||
|
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
||||||
|
// the maxArraySize is negative, all arrays are collected.
|
||||||
|
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
|
||||||
|
if maxArraySize == 0 {
|
||||||
|
maxArraySize = 1024
|
||||||
|
}
|
||||||
|
|
||||||
|
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
|
||||||
|
|
||||||
var magic uint32
|
var magic uint32
|
||||||
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
|
@ -291,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
|
||||||
case FILE_MAGIC_GGLA:
|
case FILE_MAGIC_GGLA:
|
||||||
c = &containerGGLA{}
|
c = &containerGGLA{}
|
||||||
case FILE_MAGIC_GGUF_LE:
|
case FILE_MAGIC_GGUF_LE:
|
||||||
c = &containerGGUF{ByteOrder: binary.LittleEndian}
|
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
|
||||||
case FILE_MAGIC_GGUF_BE:
|
case FILE_MAGIC_GGUF_BE:
|
||||||
c = &containerGGUF{ByteOrder: binary.BigEndian}
|
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
|
||||||
default:
|
default:
|
||||||
return nil, 0, errors.New("invalid file magic")
|
return nil, 0, errors.New("invalid file magic")
|
||||||
}
|
}
|
||||||
|
|
||||||
model, err := c.Decode(rs)
|
model, err := c.Decode(rs)
|
||||||
if errors.Is(err, io.EOF) {
|
if err != nil {
|
||||||
// noop
|
|
||||||
} else if err != nil {
|
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -321,7 +332,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||||
embedding := llm.KV().EmbeddingLength()
|
embedding := llm.KV().EmbeddingLength()
|
||||||
heads := llm.KV().HeadCount()
|
heads := llm.KV().HeadCount()
|
||||||
headsKV := llm.KV().HeadCountKV()
|
headsKV := llm.KV().HeadCountKV()
|
||||||
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
|
vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
|
||||||
|
|
||||||
embeddingHeads := llm.KV().EmbeddingHeadCount()
|
embeddingHeads := llm.KV().EmbeddingHeadCount()
|
||||||
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
|
embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
|
||||||
|
|
1
llm/ggml_test.go
Normal file
1
llm/ggml_test.go
Normal file
|
@ -0,0 +1 @@
|
||||||
|
package llm
|
130
llm/gguf.go
130
llm/gguf.go
|
@ -3,11 +3,10 @@ package llm
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"log/slog"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type containerGGUF struct {
|
type containerGGUF struct {
|
||||||
|
@ -29,6 +28,12 @@ type containerGGUF struct {
|
||||||
NumTensor uint64
|
NumTensor uint64
|
||||||
NumKV uint64
|
NumKV uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
maxArraySize int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *containerGGUF) canCollectArray(size int) bool {
|
||||||
|
return c.maxArraySize < 0 || size <= c.maxArraySize
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *containerGGUF) Name() string {
|
func (c *containerGGUF) Name() string {
|
||||||
|
@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
model := newGGUF(c)
|
model := newGGUF(c)
|
||||||
slog.Debug(fmt.Sprintf("model = %#v", model))
|
|
||||||
if err := model.Decode(rs); err != nil {
|
if err := model.Decode(rs); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -85,6 +89,8 @@ type gguf struct {
|
||||||
tensors []*Tensor
|
tensors []*Tensor
|
||||||
|
|
||||||
parameters uint64
|
parameters uint64
|
||||||
|
|
||||||
|
scratch [16 << 10]byte
|
||||||
}
|
}
|
||||||
|
|
||||||
func newGGUF(container *containerGGUF) *gguf {
|
func newGGUF(container *containerGGUF) *gguf {
|
||||||
|
@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
// decode tensors
|
// decode tensors
|
||||||
for i := 0; uint64(i) < llm.numTensor(); i++ {
|
for range llm.numTensor() {
|
||||||
name, err := readGGUFString(llm, rs)
|
name, err := readGGUFString(llm, rs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return fmt.Errorf("failed to read tensor name: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// dims is the number of dimensions in the tensor
|
// dims is the number of dimensions in the tensor
|
||||||
dims, err := readGGUF[uint32](llm, rs)
|
dims, err := readGGUF[uint32](llm, rs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return fmt.Errorf("failed to read tensor dimensions: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
shape := [4]uint64{1, 1, 1, 1}
|
shape := [4]uint64{1, 1, 1, 1}
|
||||||
for i := 0; uint32(i) < dims; i++ {
|
for i := 0; uint32(i) < dims; i++ {
|
||||||
shape[i], err = readGGUF[uint64](llm, rs)
|
shape[i], err = readGGUF[uint64](llm, rs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return fmt.Errorf("failed to read tensor shape: %w", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
kind, err := readGGUF[uint32](llm, rs)
|
kind, err := readGGUF[uint32](llm, rs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return fmt.Errorf("failed to read tensor kind: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
offset, err := readGGUF[uint64](llm, rs)
|
offset, err := readGGUF[uint64](llm, rs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return fmt.Errorf("failed to read tensor offset: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
tensor := Tensor{
|
tensor := Tensor{
|
||||||
|
@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
||||||
alignment = 32
|
alignment = 32
|
||||||
}
|
}
|
||||||
|
|
||||||
offset, err := rs.Seek(0, io.SeekCurrent)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
padding := llm.padding(offset, int64(alignment))
|
|
||||||
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tensor := range llm.tensors {
|
for _, tensor := range llm.tensors {
|
||||||
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
|
offset, err := rs.Seek(0, io.SeekCurrent)
|
||||||
return err
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to get current offset: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
padding := llm.padding(int64(tensor.Size()), int64(alignment))
|
padding := llm.padding(offset, int64(alignment))
|
||||||
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
|
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
|
||||||
return err
|
return fmt.Errorf("failed to seek to init padding: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
|
||||||
|
return fmt.Errorf("failed to seek to tensor: %w", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
|
||||||
return b.String(), nil
|
return b.String(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func discardGGUFString(llm *gguf, r io.Reader) error {
|
||||||
|
buf := llm.scratch[:8]
|
||||||
|
_, err := io.ReadFull(r, buf)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
size := int(llm.ByteOrder.Uint64(buf))
|
||||||
|
for size > 0 {
|
||||||
|
n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
size -= n
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
|
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
|
||||||
if llm.Version == 1 {
|
if llm.Version == 1 {
|
||||||
return readGGUFV1String(llm, r)
|
return readGGUFV1String(llm, r)
|
||||||
}
|
}
|
||||||
|
|
||||||
var length uint64
|
buf := llm.scratch[:8]
|
||||||
if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
|
_, err := io.ReadFull(r, buf)
|
||||||
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
var b bytes.Buffer
|
length := int(llm.ByteOrder.Uint64(buf))
|
||||||
if _, err := io.CopyN(&b, r, int64(length)); err != nil {
|
if length > len(llm.scratch) {
|
||||||
|
buf = make([]byte, length)
|
||||||
|
} else {
|
||||||
|
buf = llm.scratch[:length]
|
||||||
|
}
|
||||||
|
clear(buf)
|
||||||
|
|
||||||
|
_, err = io.ReadFull(r, buf)
|
||||||
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
return string(buf), nil
|
||||||
return b.String(), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeGGUFString(llm *gguf, w io.Writer, s string) error {
|
func writeGGUFString(llm *gguf, w io.Writer, s string) error {
|
||||||
|
@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
|
type array struct {
|
||||||
|
size int
|
||||||
|
values []any
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *array) MarshalJSON() ([]byte, error) {
|
||||||
|
return json.Marshal(a.values)
|
||||||
|
}
|
||||||
|
|
||||||
|
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
|
||||||
t, err := readGGUF[uint32](llm, r)
|
t, err := readGGUF[uint32](llm, r)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := 0; uint32(i) < n; i++ {
|
a := &array{size: int(n)}
|
||||||
|
if llm.canCollectArray(int(n)) {
|
||||||
|
a.values = make([]any, 0, int(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range n {
|
||||||
var e any
|
var e any
|
||||||
switch t {
|
switch t {
|
||||||
case ggufTypeUint8:
|
case ggufTypeUint8:
|
||||||
|
@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
a = append(a, e)
|
if a.values != nil {
|
||||||
|
a.values[i] = e
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return
|
return a, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
|
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
|
||||||
if llm.Version == 1 {
|
if llm.Version == 1 {
|
||||||
return readGGUFV1Array(llm, r)
|
return readGGUFV1Array(llm, r)
|
||||||
}
|
}
|
||||||
|
@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := 0; uint64(i) < n; i++ {
|
a := &array{size: int(n)}
|
||||||
|
if llm.canCollectArray(int(n)) {
|
||||||
|
a.values = make([]any, int(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range n {
|
||||||
var e any
|
var e any
|
||||||
switch t {
|
switch t {
|
||||||
case ggufTypeUint8:
|
case ggufTypeUint8:
|
||||||
|
@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
|
||||||
case ggufTypeBool:
|
case ggufTypeBool:
|
||||||
e, err = readGGUF[bool](llm, r)
|
e, err = readGGUF[bool](llm, r)
|
||||||
case ggufTypeString:
|
case ggufTypeString:
|
||||||
e, err = readGGUFString(llm, r)
|
if a.values != nil {
|
||||||
|
e, err = readGGUFString(llm, r)
|
||||||
|
} else {
|
||||||
|
err = discardGGUFString(llm, r)
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
return nil, fmt.Errorf("invalid array type: %d", t)
|
return nil, fmt.Errorf("invalid array type: %d", t)
|
||||||
}
|
}
|
||||||
|
@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
a = append(a, e)
|
if a.values != nil {
|
||||||
|
a.values[i] = e
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return
|
return a, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
|
func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
|
||||||
|
|
|
@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
gguf := NewGGUFV3(binary.LittleEndian)
|
gguf := NewGGUFV3(binary.LittleEndian)
|
||||||
inputLayerCount := 5
|
inputLayerCount := 5
|
||||||
|
|
||||||
tensors := []Tensor{
|
tensors := []Tensor{
|
||||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
}
|
}
|
||||||
assert.Len(t, tensors, inputLayerCount+1)
|
assert.Len(t, tensors, inputLayerCount+1)
|
||||||
err = gguf.Encode(f, KV{
|
err = gguf.Encode(f, KV{
|
||||||
|
@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) {
|
||||||
}, tensors)
|
}, tensors)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
ggml, err := LoadModel(f.Name())
|
ggml, err := LoadModel(f.Name(), 0)
|
||||||
require.NoError(t, err)
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
// Simple CPU scenario
|
// Simple CPU scenario
|
||||||
gpus := []gpu.GpuInfo{
|
gpus := []gpu.GpuInfo{
|
||||||
|
|
|
@ -60,7 +60,12 @@ type llmServer struct {
|
||||||
sem *semaphore.Weighted
|
sem *semaphore.Weighted
|
||||||
}
|
}
|
||||||
|
|
||||||
func LoadModel(model string) (*GGML, error) {
|
// LoadModel will load a model from disk. The model must be in the GGML format.
|
||||||
|
//
|
||||||
|
// It collects array values for arrays with a size less than or equal to
|
||||||
|
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
|
||||||
|
// the maxArraySize is negative, all arrays are collected.
|
||||||
|
func LoadModel(model string, maxArraySize int) (*GGML, error) {
|
||||||
if _, err := os.Stat(model); err != nil {
|
if _, err := os.Stat(model); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
|
||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
ggml, _, err := DecodeGGML(f)
|
ggml, _, err := DecodeGGML(f, maxArraySize)
|
||||||
return ggml, err
|
return ggml, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -412,7 +417,7 @@ func projectorMemoryRequirements(filename string) uint64 {
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
ggml, _, err := DecodeGGML(file)
|
ggml, _, err := DecodeGGML(file, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
|
@ -423,7 +423,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml, _, err := llm.DecodeGGML(temp)
|
ggml, _, err := llm.DecodeGGML(temp, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -63,7 +63,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
|
||||||
}
|
}
|
||||||
defer blob.Close()
|
defer blob.Close()
|
||||||
|
|
||||||
ggml, _, err := llm.DecodeGGML(blob)
|
ggml, _, err := llm.DecodeGGML(blob, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -176,7 +176,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
|
||||||
}
|
}
|
||||||
defer bin.Close()
|
defer bin.Close()
|
||||||
|
|
||||||
ggml, _, err := llm.DecodeGGML(bin)
|
ggml, _, err := llm.DecodeGGML(bin, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -210,7 +210,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
|
||||||
|
|
||||||
var offset int64
|
var offset int64
|
||||||
for offset < stat.Size() {
|
for offset < stat.Size() {
|
||||||
ggml, n, err := llm.DecodeGGML(file)
|
ggml, n, err := llm.DecodeGGML(file, 0)
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
break
|
break
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
|
|
|
@ -754,7 +754,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func getKVData(digest string, verbose bool) (llm.KV, error) {
|
func getKVData(digest string, verbose bool) (llm.KV, error) {
|
||||||
kvData, err := llm.LoadModel(digest)
|
maxArraySize := 0
|
||||||
|
if verbose {
|
||||||
|
maxArraySize = -1
|
||||||
|
}
|
||||||
|
kvData, err := llm.LoadModel(digest, maxArraySize)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -1101,11 +1105,20 @@ func Serve(ln net.Listener) error {
|
||||||
schedCtx, schedDone := context.WithCancel(ctx)
|
schedCtx, schedDone := context.WithCancel(ctx)
|
||||||
sched := InitScheduler(schedCtx)
|
sched := InitScheduler(schedCtx)
|
||||||
s := &Server{addr: ln.Addr(), sched: sched}
|
s := &Server{addr: ln.Addr(), sched: sched}
|
||||||
r := s.GenerateRoutes()
|
|
||||||
|
http.Handle("/", s.GenerateRoutes())
|
||||||
|
|
||||||
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
|
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
|
||||||
srvr := &http.Server{
|
srvr := &http.Server{
|
||||||
Handler: r,
|
// Use http.DefaultServeMux so we get net/http/pprof for
|
||||||
|
// free.
|
||||||
|
//
|
||||||
|
// TODO(bmizerany): Decide if we want to make this
|
||||||
|
// configurable so it is not exposed by default, or allow
|
||||||
|
// users to bind it to a different port. This was a quick
|
||||||
|
// and easy way to get pprof, but it may not be the best
|
||||||
|
// way.
|
||||||
|
Handler: nil,
|
||||||
}
|
}
|
||||||
|
|
||||||
// listen for a ctrl+c and stop any loaded llm
|
// listen for a ctrl+c and stop any loaded llm
|
||||||
|
|
|
@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load model for fitting
|
// Load model for fitting
|
||||||
ggml, err := llm.LoadModel(pending.model.ModelPath)
|
ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
pending.errCh <- err
|
pending.errCh <- err
|
||||||
break
|
break
|
||||||
|
|
|
@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
|
||||||
"tokenizer.ggml.scores": []float32{0},
|
"tokenizer.ggml.scores": []float32{0},
|
||||||
"tokenizer.ggml.token_type": []int32{0},
|
"tokenizer.ggml.token_type": []int32{0},
|
||||||
}, []llm.Tensor{
|
}, []llm.Tensor{
|
||||||
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
|
{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
|
||||||
})
|
})
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
fname := f.Name()
|
fname := f.Name()
|
||||||
model := &Model{Name: modelName, ModelPath: fname}
|
model := &Model{Name: modelName, ModelPath: fname}
|
||||||
scenario.ggml, err = llm.LoadModel(model.ModelPath)
|
scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
scenario.req = &LlmRequest{
|
scenario.req = &LlmRequest{
|
||||||
|
|
34
util/bufioutil/buffer_seeker.go
Normal file
34
util/bufioutil/buffer_seeker.go
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
package bufioutil
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"io"
|
||||||
|
)
|
||||||
|
|
||||||
|
type BufferedSeeker struct {
|
||||||
|
rs io.ReadSeeker
|
||||||
|
br *bufio.Reader
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
|
||||||
|
return &BufferedSeeker{
|
||||||
|
rs: rs,
|
||||||
|
br: bufio.NewReaderSize(rs, size),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *BufferedSeeker) Read(p []byte) (int, error) {
|
||||||
|
return b.br.Read(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
|
||||||
|
if whence == io.SeekCurrent {
|
||||||
|
offset -= int64(b.br.Buffered())
|
||||||
|
}
|
||||||
|
n, err := b.rs.Seek(offset, whence)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
b.br.Reset(b.rs)
|
||||||
|
return n, nil
|
||||||
|
}
|
64
util/bufioutil/buffer_seeker_test.go
Normal file
64
util/bufioutil/buffer_seeker_test.go
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
package bufioutil
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"io"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestBufferedSeeker(t *testing.T) {
|
||||||
|
const alphabet = "abcdefghijklmnopqrstuvwxyz"
|
||||||
|
|
||||||
|
bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16
|
||||||
|
|
||||||
|
checkRead := func(buf []byte, expected string) {
|
||||||
|
t.Helper()
|
||||||
|
_, err := bs.Read(buf)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if !bytes.Equal(buf, []byte(expected)) {
|
||||||
|
t.Fatalf("expected %s, got %s", expected, buf)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read the first 5 bytes
|
||||||
|
buf := make([]byte, 5)
|
||||||
|
|
||||||
|
checkRead(buf, "abcde")
|
||||||
|
|
||||||
|
// Seek back to the beginning
|
||||||
|
_, err := bs.Seek(0, io.SeekStart)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// read 'a'
|
||||||
|
checkRead(buf[:1], "a")
|
||||||
|
|
||||||
|
if bs.br.Buffered() == 0 {
|
||||||
|
t.Fatalf("totally unexpected sanity check failed")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Seek past 'b'
|
||||||
|
_, err = bs.Seek(1, io.SeekCurrent)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
checkRead(buf, "cdefg")
|
||||||
|
|
||||||
|
// Seek back to the beginning
|
||||||
|
_, err = bs.Seek(0, io.SeekStart)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
checkRead(buf, "abcde")
|
||||||
|
|
||||||
|
// Seek to the end
|
||||||
|
_, err = bs.Seek(-5, io.SeekEnd)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
checkRead(buf, "vwxyz")
|
||||||
|
}
|
Loading…
Reference in a new issue