package llm

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"

	"github.com/jmorganca/ollama/format"
)

type containerGGUF struct {
	bo binary.ByteOrder

	Version uint32

	V1 struct {
		NumTensor uint32
		NumKV     uint32
	}

	V2 struct {
		NumTensor uint64
		NumKV     uint64
	}
}

func (c *containerGGUF) Name() string {
	return "gguf"
}

func (c *containerGGUF) Decode(rso *readSeekOffset) (model, error) {
	binary.Read(rso, c.bo, &c.Version)

	switch c.Version {
	case 1:
		binary.Read(rso, c.bo, &c.V1)
	default:
		binary.Read(rso, c.bo, &c.V2)
	}

	model := newGGUFModel(c)
	if err := model.Decode(rso); err != nil {
		return nil, err
	}

	return model, nil
}

const (
	ggufTypeUint8 uint32 = iota
	ggufTypeInt8
	ggufTypeUint16
	ggufTypeInt16
	ggufTypeUint32
	ggufTypeInt32
	ggufTypeFloat32
	ggufTypeBool
	ggufTypeString
	ggufTypeArray
	ggufTypeUint64
	ggufTypeInt64
	ggufTypeFloat64
)

type kv map[string]any

type tensor struct {
	name   string
	kind   uint32
	offset uint64

	// shape is the number of elements in each dimension
	shape [4]uint64
}

func (t tensor) blockSize() uint64 {
	switch {
	case t.kind < 2:
		return 1
	case t.kind < 10:
		return 32
	default:
		return 256
	}
}

func (t tensor) typeSize() uint64 {
	blockSize := t.blockSize()

	switch t.kind {
	case 0: // FP32
		return 4
	case 1: // FP16
		return 2
	case 2: // Q4_0
		return 2 + blockSize/2
	case 3: // Q4_1
		return 2 + 2 + blockSize/2
	case 6: // Q5_0
		return 2 + 4 + blockSize/2
	case 7: // Q5_1
		return 2 + 2 + 4 + blockSize/2
	case 8: // Q8_0
		return 2 + blockSize
	case 9: // Q8_1
		return 4 + 4 + blockSize
	case 10: // Q2_K
		return blockSize/16 + blockSize/4 + 2 + 2
	case 11: // Q3_K
		return blockSize/8 + blockSize/4 + 12 + 2
	case 12: // Q4_K
		return 2 + 2 + 12 + blockSize/2
	case 13: // Q5_K
		return 2 + 2 + 12 + blockSize/8 + blockSize/2
	case 14: // Q6_K
		return blockSize/2 + blockSize/4 + blockSize/16 + 2
	case 15: // Q8_K
		return 2 + blockSize + 2*blockSize/16
	case 16: // IQ2_XXS
		return 2 + 2*blockSize/8
	case 17: // IQ2_XS
		return 2 + 2*blockSize/8 + blockSize/32
	case 18: // IQ3_XXS
		return 2 + 3*blockSize/8
	default:
		return 0
	}
}

func (t tensor) parameters() uint64 {
	return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3]
}

func (t tensor) size() uint64 {
	return t.parameters() * t.typeSize() / t.blockSize()
}

type ggufModel struct {
	*containerGGUF

	kv
	tensors []tensor

	parameters uint64
}

func newGGUFModel(container *containerGGUF) *ggufModel {
	return &ggufModel{
		containerGGUF: container,
		kv:            make(kv),
	}
}

func (llm *ggufModel) NumTensor() uint64 {
	if llm.Version == 1 {
		return uint64(llm.V1.NumTensor)
	}

	return llm.V2.NumTensor
}

func (llm *ggufModel) NumKV() uint64 {
	if llm.Version == 1 {
		return uint64(llm.V1.NumKV)
	}

	return llm.V2.NumKV
}

func (llm *ggufModel) ModelFamily() string {
	if t, ok := llm.kv["general.architecture"].(string); ok {
		return t
	}

	return "unknown"
}

func (llm *ggufModel) ModelType() string {
	if llm.parameters > 0 {
		return format.HumanNumber(llm.parameters)
	}

	return "unknown"
}

func (llm *ggufModel) FileType() string {
	if t, ok := llm.kv["general.file_type"].(uint32); ok {
		return fileType(t)
	}

	return "unknown"
}
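// Decode reads the GGUF metadata that follows the container header: first the
// key-value section, then the tensor descriptors (name, dimensions, kind, and
// offset). It finishes by seeking past the alignment padding and the tensor
// data blocks, leaving the reader positioned at the end of the file.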
func (llm *ggufModel) Decode(rso *readSeekOffset) error {
	// decode key-values
	for i := 0; uint64(i) < llm.NumKV(); i++ {
		k, err := llm.readString(rso)
		if err != nil {
			return err
		}

		vtype := llm.readU32(rso)

		var v any
		switch vtype {
		case ggufTypeUint8:
			v = llm.readU8(rso)
		case ggufTypeInt8:
			v = llm.readI8(rso)
		case ggufTypeUint16:
			v = llm.readU16(rso)
		case ggufTypeInt16:
			v = llm.readI16(rso)
		case ggufTypeUint32:
			v = llm.readU32(rso)
		case ggufTypeInt32:
			v = llm.readI32(rso)
		case ggufTypeUint64:
			v = llm.readU64(rso)
		case ggufTypeInt64:
			v = llm.readI64(rso)
		case ggufTypeFloat32:
			v = llm.readF32(rso)
		case ggufTypeFloat64:
			v = llm.readF64(rso)
		case ggufTypeBool:
			v = llm.readBool(rso)
		case ggufTypeString:
			s, err := llm.readString(rso)
			if err != nil {
				return err
			}

			v = s
		case ggufTypeArray:
			a, err := llm.readArray(rso)
			if err != nil {
				return err
			}

			v = a
		default:
			return fmt.Errorf("invalid type: %d", vtype)
		}

		llm.kv[k] = v
	}

	// decode tensors
	for i := 0; uint64(i) < llm.NumTensor(); i++ {
		name, err := llm.readString(rso)
		if err != nil {
			return err
		}

		// dims is the number of dimensions in the tensor
		dims := llm.readU32(rso)

		shape := [4]uint64{1, 1, 1, 1}
		for i := 0; uint32(i) < dims; i++ {
			shape[i] = llm.readU64(rso)
		}

		tensor := tensor{
			name:   name,
			kind:   llm.readU32(rso),
			offset: llm.readU64(rso),
			shape:  shape,
		}

		llm.tensors = append(llm.tensors, tensor)
		llm.parameters += tensor.parameters()
	}

	alignment, ok := llm.kv["general.alignment"].(uint32)
	if !ok {
		alignment = 32
	}

	rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent)
	for _, tensor := range llm.tensors {
		padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
		rso.Seek(padded, io.SeekCurrent)
	}

	return nil
}

func (llm *ggufModel) NumLayers() uint32 {
	value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
	if !exists {
		return 0
	}

	return value.(uint32)
}

func (llm *ggufModel) NumHead() uint32 {
	value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
	if !exists {
		return 0
	}

	return value.(uint32)
}

func (llm *ggufModel) NumEmbed() uint32 {
	value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
	if !exists {
		return 0
	}

	return value.(uint32)
}

func (llm *ggufModel) NumHeadKv() uint32 {
	value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
	if !exists {
		return 0
	}

	return value.(uint32)
}

func (llm *ggufModel) NumCtx() uint32 {
	value, exists := llm.kv[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
	if !exists {
		return 0
	}

	return value.(uint32)
}

func (llm *ggufModel) NumGQA() uint32 {
	numHeadKv := llm.NumHeadKv()
	if numHeadKv == 0 {
		return 0
	}

	return llm.NumHead() / numHeadKv
}

func (llm ggufModel) readU8(r io.Reader) uint8 {
	var u8 uint8
	binary.Read(r, llm.bo, &u8)
	return u8
}

func (llm ggufModel) readI8(r io.Reader) int8 {
	var i8 int8
	binary.Read(r, llm.bo, &i8)
	return i8
}

func (llm ggufModel) readU16(r io.Reader) uint16 {
	var u16 uint16
	binary.Read(r, llm.bo, &u16)
	return u16
}

func (llm ggufModel) readI16(r io.Reader) int16 {
	var i16 int16
	binary.Read(r, llm.bo, &i16)
	return i16
}

func (llm ggufModel) readU32(r io.Reader) uint32 {
	var u32 uint32
	binary.Read(r, llm.bo, &u32)
	return u32
}

func (llm ggufModel) readI32(r io.Reader) int32 {
	var i32 int32
	binary.Read(r, llm.bo, &i32)
	return i32
}

func (llm ggufModel) readU64(r io.Reader) uint64 {
	var u64 uint64
	binary.Read(r, llm.bo, &u64)
	return u64
}

func (llm ggufModel) readI64(r io.Reader) int64 {
	var i64 int64
	binary.Read(r, llm.bo, &i64)
	return i64
}

func (llm ggufModel) readF32(r io.Reader) float32 {
	var f32 float32
	binary.Read(r, llm.bo, &f32)
	return f32
}

func (llm ggufModel) readF64(r io.Reader) float64 {
	var f64 float64
	binary.Read(r, llm.bo, &f64)
	return f64
}

func (llm ggufModel) readBool(r io.Reader) bool {
	var b bool
	binary.Read(r, llm.bo, &b)
	return b
}
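// readStringV1 reads a GGUF v1 string: a uint32 length prefix followed by the
// string bytes, which include a trailing null byte that is stripped. Later
// versions use a uint64 length prefix and no terminator; see readString.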
return "", err } // gguf v1 strings are null-terminated b.Truncate(b.Len() - 1) return b.String(), nil } func (llm ggufModel) readString(r io.Reader) (string, error) { if llm.Version == 1 { return llm.readStringV1(r) } var nameLength uint64 binary.Read(r, llm.bo, &nameLength) var b bytes.Buffer if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil { return "", err } return b.String(), nil } func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) { atype := llm.readU32(r) n := llm.readU32(r) for i := 0; uint32(i) < n; i++ { switch atype { case ggufTypeUint8: arr = append(arr, llm.readU8(r)) case ggufTypeInt8: arr = append(arr, llm.readI8(r)) case ggufTypeUint16: arr = append(arr, llm.readU16(r)) case ggufTypeInt16: arr = append(arr, llm.readI16(r)) case ggufTypeUint32: arr = append(arr, llm.readU32(r)) case ggufTypeInt32: arr = append(arr, llm.readI32(r)) case ggufTypeFloat32: arr = append(arr, llm.readF32(r)) case ggufTypeBool: arr = append(arr, llm.readBool(r)) case ggufTypeString: s, err := llm.readStringV1(r) if err != nil { return nil, err } arr = append(arr, s) default: return nil, fmt.Errorf("invalid array type: %d", atype) } } return } func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) { if llm.Version == 1 { return llm.readArrayV1(r) } atype := llm.readU32(r) n := llm.readU64(r) for i := 0; uint64(i) < n; i++ { switch atype { case ggufTypeUint8: arr = append(arr, llm.readU8(r)) case ggufTypeInt8: arr = append(arr, llm.readI8(r)) case ggufTypeUint16: arr = append(arr, llm.readU16(r)) case ggufTypeInt16: arr = append(arr, llm.readI16(r)) case ggufTypeUint32: arr = append(arr, llm.readU32(r)) case ggufTypeInt32: arr = append(arr, llm.readI32(r)) case ggufTypeUint64: arr = append(arr, llm.readU64(r)) case ggufTypeInt64: arr = append(arr, llm.readI64(r)) case ggufTypeFloat32: arr = append(arr, llm.readF32(r)) case ggufTypeFloat64: arr = append(arr, llm.readF64(r)) case ggufTypeBool: arr = append(arr, llm.readBool(r)) case ggufTypeString: s, err := llm.readString(r) if err != nil { return nil, err } arr = append(arr, s) default: return nil, fmt.Errorf("invalid array type: %d", atype) } } return }