diff --git a/llm/ggml.go b/llm/ggml.go index 608085d0..18ae4bd6 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -7,9 +7,10 @@ import ( ) type GGML struct { - magic uint32 container model + + Size int64 } const ( @@ -82,7 +83,7 @@ type model interface { type container interface { Name() string - Decode(io.Reader) (model, error) + Decode(*readOffset) (model, error) } type containerGGML struct{} @@ -91,7 +92,7 @@ func (c *containerGGML) Name() string { return "ggml" } -func (c *containerGGML) Decode(r io.Reader) (model, error) { +func (c *containerGGML) Decode(ro *readOffset) (model, error) { return nil, nil } @@ -103,9 +104,9 @@ func (c *containerGGMF) Name() string { return "ggmf" } -func (c *containerGGMF) Decode(r io.Reader) (model, error) { +func (c *containerGGMF) Decode(ro *readOffset) (model, error) { var version uint32 - binary.Read(r, binary.LittleEndian, &version) + binary.Read(ro, binary.LittleEndian, &version) switch version { case 1: @@ -125,9 +126,9 @@ func (c *containerGGJT) Name() string { return "ggjt" } -func (c *containerGGJT) Decode(r io.Reader) (model, error) { +func (c *containerGGJT) Decode(ro *readOffset) (model, error) { var version uint32 - binary.Read(r, binary.LittleEndian, &version) + binary.Read(ro, binary.LittleEndian, &version) switch version { case 1, 2, 3: @@ -139,7 +140,7 @@ func (c *containerGGJT) Decode(r io.Reader) (model, error) { // different model types may have different layouts for hyperparameters var llama llamaModel - binary.Read(r, binary.LittleEndian, &llama.hyperparameters) + binary.Read(ro, binary.LittleEndian, &llama.hyperparameters) return &llama, nil } @@ -151,9 +152,9 @@ func (c *containerLORA) Name() string { return "ggla" } -func (c *containerLORA) Decode(r io.Reader) (model, error) { +func (c *containerLORA) Decode(ro *readOffset) (model, error) { var version uint32 - binary.Read(r, binary.LittleEndian, &version) + binary.Read(ro, binary.LittleEndian, &version) switch version { case 1: @@ -180,33 +181,51 @@ const ( ) func DecodeGGML(r io.Reader) (*GGML, error) { - var ggml GGML - binary.Read(r, binary.LittleEndian, &ggml.magic) + ro := readOffset{Reader: r} - switch ggml.magic { + var magic uint32 + if err := binary.Read(&ro, binary.LittleEndian, &magic); err != nil { + return nil, err + } + + var c container + switch magic { case FILE_MAGIC_GGML: - ggml.container = &containerGGML{} + c = &containerGGML{} case FILE_MAGIC_GGMF: - ggml.container = &containerGGMF{} + c = &containerGGMF{} case FILE_MAGIC_GGJT: - ggml.container = &containerGGJT{} + c = &containerGGJT{} case FILE_MAGIC_GGLA: - ggml.container = &containerLORA{} + c = &containerLORA{} case FILE_MAGIC_GGUF_LE: - ggml.container = &containerGGUF{bo: binary.LittleEndian} + c = &containerGGUF{bo: binary.LittleEndian} case FILE_MAGIC_GGUF_BE: - ggml.container = &containerGGUF{bo: binary.BigEndian} + c = &containerGGUF{bo: binary.BigEndian} default: return nil, errors.New("invalid file magic") } - model, err := ggml.Decode(r) + model, err := c.Decode(&ro) if err != nil { return nil, err } - ggml.model = model - // final model type - return &ggml, nil + return &GGML{ + container: c, + model: model, + Size: ro.offset, + }, nil +} + +type readOffset struct { + io.Reader + offset int64 +} + +func (r *readOffset) Read(p []byte) (int, error) { + n, err := r.Reader.Read(p) + r.offset += int64(n) + return n, err } diff --git a/llm/gguf.go b/llm/gguf.go index 12f98abe..dc883187 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -23,26 +23,24 @@ type containerGGUF struct { NumTensor uint64 NumKV uint64 } - - parameters uint64 } func (c *containerGGUF) Name() string { return "gguf" } -func (c *containerGGUF) Decode(r io.Reader) (model, error) { - binary.Read(r, c.bo, &c.Version) +func (c *containerGGUF) Decode(ro *readOffset) (model, error) { + binary.Read(ro, c.bo, &c.Version) switch c.Version { case 1: - binary.Read(r, c.bo, &c.V1) + binary.Read(ro, c.bo, &c.V1) default: - binary.Read(r, c.bo, &c.V2) + binary.Read(ro, c.bo, &c.V2) } model := newGGUFModel(c) - if err := model.Decode(r); err != nil { + if err := model.Decode(ro); err != nil { return nil, err } @@ -67,9 +65,23 @@ const ( type kv map[string]any +type tensor struct { + name string + kind uint32 + offset uint64 + size uint64 + + // shape is the number of elements in each dimension + shape [4]uint64 +} + type ggufModel struct { *containerGGUF + kv + tensors []tensor + + parameters uint64 } func newGGUFModel(container *containerGGUF) *ggufModel { @@ -142,49 +154,49 @@ func (llm *ggufModel) FileType() string { return "unknown" } -func (llm *ggufModel) Decode(r io.Reader) error { +func (llm *ggufModel) Decode(ro *readOffset) error { // decode key-values for i := 0; uint64(i) < llm.NumKV(); i++ { - k, err := llm.readString(r) + k, err := llm.readString(ro) if err != nil { return err } - vtype := llm.readU32(r) + vtype := llm.readU32(ro) var v any switch vtype { case ggufTypeUint8: - v = llm.readU8(r) + v = llm.readU8(ro) case ggufTypeInt8: - v = llm.readI8(r) + v = llm.readI8(ro) case ggufTypeUint16: - v = llm.readU16(r) + v = llm.readU16(ro) case ggufTypeInt16: - v = llm.readI16(r) + v = llm.readI16(ro) case ggufTypeUint32: - v = llm.readU32(r) + v = llm.readU32(ro) case ggufTypeInt32: - v = llm.readI32(r) + v = llm.readI32(ro) case ggufTypeUint64: - v = llm.readU64(r) + v = llm.readU64(ro) case ggufTypeInt64: - v = llm.readI64(r) + v = llm.readI64(ro) case ggufTypeFloat32: - v = llm.readF32(r) + v = llm.readF32(ro) case ggufTypeFloat64: - v = llm.readF64(r) + v = llm.readF64(ro) case ggufTypeBool: - v = llm.readBool(r) + v = llm.readBool(ro) case ggufTypeString: - s, err := llm.readString(r) + s, err := llm.readString(ro) if err != nil { return err } v = s case ggufTypeArray: - a, err := llm.readArray(r) + a, err := llm.readArray(ro) if err != nil { return err } @@ -199,21 +211,84 @@ func (llm *ggufModel) Decode(r io.Reader) error { // decode tensors for i := 0; uint64(i) < llm.NumTensor(); i++ { - if _, err := llm.readString(r); err != nil { + name, err := llm.readString(ro) + if err != nil { return err } - dimensions := llm.readU32(r) + dims := llm.readU32(ro) - var elements uint64 = 1 - for i := 0; uint32(i) < dimensions; i++ { - elements *= llm.readU64(r) + shape := [4]uint64{1, 1, 1, 1} + for i := 0; uint32(i) < dims; i++ { + shape[i] = llm.readU64(ro) } - llm.readU32(r) // type - llm.readU64(r) // offset + kind := llm.readU32(ro) + offset := llm.readU64(ro) - llm.parameters += elements + var blockSize uint64 + switch { + case kind < 2: + blockSize = 1 + case kind < 10: + blockSize = 32 + default: + blockSize = 256 + } + + var typeSize uint64 + switch kind { + case 0: // FP32 + typeSize = 4 + case 1: // FP16 + typeSize = 2 + case 2: // Q4_0 + typeSize = 2 + blockSize/2 + case 3: // Q4_1 + typeSize = 2 + 2 + blockSize/2 + case 6: // Q5_0 + typeSize = 2 + 4 + blockSize/2 + case 7: // Q5_1 + typeSize = 2 + 2 + 4 + blockSize/2 + case 8: // Q8_0 + typeSize = 2 + blockSize + case 9: // Q8_1 + typeSize = 4 + 4 + blockSize + case 10: // Q2_K + typeSize = blockSize/16 + blockSize/4 + 2 + 2 + case 11: // Q3_K + typeSize = blockSize/8 + blockSize/4 + 12 + 2 + case 12: // Q4_K + typeSize = 2 + 2 + 12 + blockSize/2 + case 13: // Q5_K + typeSize = 2 + 2 + 12 + blockSize/8 + blockSize/2 + case 14: // Q6_K + typeSize = blockSize/2 + blockSize/4 + blockSize/16 + 2 + } + + parameters := shape[0] * shape[1] * shape[2] * shape[3] + size := parameters * typeSize / blockSize + + llm.tensors = append(llm.tensors, tensor{ + name: name, + kind: kind, + offset: offset, + size: size, + shape: shape, + }) + + llm.parameters += parameters + } + + alignment, ok := llm.kv["general.alignment"].(uint32) + if !ok { + alignment = 32 + } + + io.CopyN(io.Discard, ro, int64(alignment)-ro.offset%int64(alignment)) + for _, tensor := range llm.tensors { + padded := (int64(tensor.size) + int64(alignment) - 1) & ^(int64(alignment) - 1) + io.CopyN(io.Discard, ro, padded) } return nil diff --git a/server/images.go b/server/images.go index 9995fe62..6eb569a3 100644 --- a/server/images.go +++ b/server/images.go @@ -388,24 +388,38 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars } defer bin.Close() - fn(api.ProgressResponse{Status: "creating model layer"}) - ggml, err := llm.DecodeGGML(bin) - if err != nil { - return err + var offset int64 + for { + fn(api.ProgressResponse{Status: "creating model layer"}) + + bin.Seek(offset, io.SeekStart) + ggml, err := llm.DecodeGGML(bin) + if errors.Is(err, io.EOF) { + break + } else if err != nil { + return err + } + + config.ModelFormat = ggml.Name() + config.ModelFamily = ggml.ModelFamily() + config.ModelType = ggml.ModelType() + config.FileType = ggml.FileType() + + mediatype := mediatype + if ggml.ModelFamily() == "clip" { + mediatype = "application/vnd.ollama.image.projector" + } + + sr := io.NewSectionReader(bin, offset, ggml.Size) + layer, err := NewLayer(sr, mediatype) + if err != nil { + return err + } + + layers.Add(layer) + + offset += ggml.Size } - - config.ModelFormat = ggml.Name() - config.ModelFamily = ggml.ModelFamily() - config.ModelType = ggml.ModelType() - config.FileType = ggml.FileType() - - bin.Seek(0, io.SeekStart) - layer, err := NewLayer(bin, mediatype) - if err != nil { - return err - } - - layers.Add(layer) case "adapter": if strings.HasPrefix(c.Args, "@") { blobPath, err := GetBlobsPath(strings.TrimPrefix(c.Args, "@"))