diff --git a/cmd/cmd.go b/cmd/cmd.go
index 9f6c928c..bf55061f 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -105,24 +105,48 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
 		zf := zip.NewWriter(tf)
 
-		files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors"))
+		files := []string{}
+
+		tfiles, err := filepath.Glob(filepath.Join(path, "pytorch_model-*.bin"))
 		if err != nil {
 			return err
+		} else if len(tfiles) == 0 {
+			tfiles, err = filepath.Glob(filepath.Join(path, "model-*.safetensors"))
+			if err != nil {
+				return err
+			}
 		}
 
+		files = append(files, tfiles...)
+
 		if len(files) == 0 {
-			return fmt.Errorf("no safetensors files were found in '%s'", path)
+			return fmt.Errorf("no models were found in '%s'", path)
 		}
-		// add the safetensor config file + tokenizer
+		// add the safetensor/torch config file + tokenizer
 		files = append(files, filepath.Join(path, "config.json"))
+		files = append(files, filepath.Join(path, "params.json"))
 		files = append(files, filepath.Join(path, "added_tokens.json"))
 		files = append(files, filepath.Join(path, "tokenizer.model"))
 
 		for _, fn := range files {
 			f, err := os.Open(fn)
-			if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") {
-				continue
+
+			// just skip whatever files aren't there
+			if os.IsNotExist(err) {
+				if strings.HasSuffix(fn, "tokenizer.model") {
+					// try the parent dir before giving up
+					parentDir := filepath.Dir(path)
+					newFn := filepath.Join(parentDir, "tokenizer.model")
+					f, err = os.Open(newFn)
+					if os.IsNotExist(err) {
+						continue
+					} else if err != nil {
+						return err
+					}
+				} else {
+					continue
+				}
 			} else if err != nil {
 				return err
 			}
diff --git a/convert/convert.go b/convert/convert.go
index fc4f3085..bf6f0bf5 100644
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -1,21 +1,16 @@
 package convert
 
 import (
-	"bytes"
 	"cmp"
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
-	"io"
 	"log/slog"
 	"os"
 	"path/filepath"
-	"regexp"
 	"slices"
+	"strings"
 
-	"github.com/d4l3k/go-bfloat16"
-	"github.com/mitchellh/mapstructure"
-	"github.com/x448/float16"
 	"google.golang.org/protobuf/proto"
 
 	"github.com/ollama/ollama/convert/sentencepiece"
@@ -45,157 +40,45 @@ type ByteOrder interface {
 	binary.AppendByteOrder
 }
 
-type MetaData struct {
-	Type    string `mapstructure:"dtype"`
-	Shape   []int  `mapstructure:"shape"`
-	Offsets []int  `mapstructure:"data_offsets"`
-}
-
 type ModelArch interface {
 	GetTensors() error
 	LoadVocab() error
 	WriteGGUF() (string, error)
 }
 
+type ModelFormat interface {
+	GetLayerName(string) (string, error)
+	GetTensors(string, *Params) ([]llm.Tensor, error)
+	GetParams(string) (*Params, error)
+	GetModelArch(string, string, *Params) (ModelArch, error)
+}
+
 type ModelData struct {
 	Path    string
 	Name    string
 	Params  *Params
 	Vocab   *Vocab
 	Tensors []llm.Tensor
+	Format  ModelFormat
 }
 
-func ReadSafeTensors(fn string, offset uint64, params *Params) ([]llm.Tensor, uint64, error) {
-	f, err := os.Open(fn)
-	if err != nil {
-		return nil, 0, err
-	}
-	defer f.Close()
-
-	var jsonSize uint64
-	if err := binary.Read(f, binary.LittleEndian, &jsonSize); err != nil {
-		return nil, 0, err
-	}
-
-	buf := make([]byte, jsonSize)
-	_, err = io.ReadFull(f, buf)
-	if err != nil {
-		return nil, 0, err
-	}
-
-	d := json.NewDecoder(bytes.NewBuffer(buf))
-	d.UseNumber()
-	var parsed map[string]interface{}
-	if err = d.Decode(&parsed); err != nil {
-		return nil, 0, err
-	}
-
-	var keys []string
-	for k := range parsed {
-		keys = append(keys, k)
-	}
-
-	slices.Sort(keys)
-
-	slog.Info("converting layers")
-
-	var tensors []llm.Tensor
-	for _, k := range keys {
-		vals := parsed[k].(map[string]interface{})
-		var data MetaData
-		if err = mapstructure.Decode(vals, &data); err != nil {
-			return nil, 0, err
-		}
-
-		var size uint64
-		var kind uint32
-		switch len(data.Shape) {
-		case 0:
-			// metadata
-			continue
-		case 1:
-			// convert to float32
-			kind = 0
-			size = uint64(data.Shape[0] * 4)
-		case 2:
-			// convert to float16
-			kind = 1
-			size = uint64(data.Shape[0] * data.Shape[1] * 2)
-		}
-
-		ggufName, err := GetTensorName(k)
-		if err != nil {
-			slog.Error("%v", err)
-			return nil, 0, err
-		}
-
-		shape := []uint64{0, 0, 0, 0}
-		for i := range data.Shape {
-			shape[i] = uint64(data.Shape[i])
-		}
-
-		t := llm.Tensor{
-			Name:   ggufName,
-			Kind:   kind,
-			Offset: offset,
-			Shape:  shape[:],
-		}
-
-		t.WriterTo = safetensorWriterTo{
-			t:        &t,
-			params:   params,
-			bo:       params.ByteOrder,
-			filename: fn,
-			start:    uint64(data.Offsets[0]),
-			end:      uint64(data.Offsets[1]),
-			padding:  8 + jsonSize,
-		}
-
-		slog.Debug(fmt.Sprintf("%v", t))
-		tensors = append(tensors, t)
-		offset += size
-	}
-	return tensors, offset, nil
-}
-
-func GetSafeTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
-	var tensors []llm.Tensor
-	files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
+func GetModelFormat(dirname string) (ModelFormat, error) {
+	files, err := filepath.Glob(filepath.Join(dirname, "*"))
 	if err != nil {
 		return nil, err
 	}
 
-	var offset uint64
-	for _, f := range files {
-		var t []llm.Tensor
-		var err error
-		t, offset, err = ReadSafeTensors(f, offset, params)
-		if err != nil {
-			slog.Error("%v", err)
-			return nil, err
+	for _, fn := range files {
+		slog.Debug(fmt.Sprintf("file = %s", fn))
+		if strings.HasSuffix(fn, ".safetensors") {
+			return &SafetensorFormat{}, nil
+		} else if strings.HasSuffix(fn, ".bin") {
+			slog.Debug("model is torch")
+			return &TorchFormat{}, nil
 		}
-		tensors = append(tensors, t...)
-	}
-	return tensors, nil
-}
-
-func GetParams(dirpath string) (*Params, error) {
-	f, err := os.Open(filepath.Join(dirpath, "config.json"))
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	var params Params
-
-	d := json.NewDecoder(f)
-	err = d.Decode(&params)
-	if err != nil {
-		return nil, err
-	}
-	params.ByteOrder = binary.LittleEndian
-	return &params, nil
+	}
+
+	return nil, fmt.Errorf("couldn't determine model format")
 }
 
 // Details on gguf's tokenizer can be found at:
@@ -206,7 +89,7 @@ type Vocab struct {
 	Types []int32
 }
 
-func LoadSentencePieceTokens(dirpath string, vocabSize int) (*Vocab, error) {
+func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
 	slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model")))
 	in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model"))
 	if err != nil {
@@ -286,8 +169,8 @@ func LoadSentencePieceTokens(dirpath string, vocabSize int) (*Vocab, error) {
 	}
 	slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
 
-	if vocabSize > len(v.Tokens) {
-		missingTokens := vocabSize - len(v.Tokens)
+	if params.VocabSize > len(v.Tokens) {
+		missingTokens := params.VocabSize - len(v.Tokens)
 		slog.Warn(fmt.Sprintf("vocab is missing %d tokens", missingTokens))
 		for cnt := 0; cnt < missingTokens; cnt++ {
 			v.Tokens = append(v.Tokens, fmt.Sprintf("<dummy%05d>", cnt+1))
@@ -298,136 +181,3 @@ func LoadSentencePieceTokens(dirpath string, vocabSize int) (*Vocab, error) {
 
 	return v, nil
 }
-
-func GetTensorName(n string) (string, error) {
-	tMap := map[string]string{
-		"model.embed_tokens.weight":                           "token_embd.weight",
-		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
-		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
-		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
-		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
-		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
-		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
-		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
-		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
-		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
-		"lm_head.weight":                                      "output.weight",
-		"model.norm.weight":                                   "output_norm.weight",
-	}
-
-	v, ok := tMap[n]
-	if ok {
-		return v, nil
-	}
-
-	// quick hack to rename the layers to gguf format
-	for k, v := range tMap {
-		re := regexp.MustCompile(k)
-		newName := re.ReplaceAllString(n, v)
-		if newName != n {
-			return newName, nil
-		}
-	}
-
-	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
-}
-
-type safetensorWriterTo struct {
-	t *llm.Tensor
-
-	params *Params
-	bo     ByteOrder
-
-	filename string
-
-	start, end, padding uint64
-	handler             func(w io.Writer, r safetensorWriterTo, f *os.File) error
-}
-
-func (r safetensorWriterTo) WriteTo(w io.Writer) (n int64, err error) {
-	f, err := os.Open(r.filename)
-	if err != nil {
-		return 0, err
-	}
-	defer f.Close()
-
-	if _, err = f.Seek(int64(r.padding+r.start), 0); err != nil {
-		return 0, err
-	}
-
-	// use the handler if one is present
-	if r.handler != nil {
-		return 0, r.handler(w, r, f)
-	}
-
-	remaining := r.end - r.start
-
-	bufSize := uint64(10240)
-	var finished bool
-	for {
-		data := make([]byte, min(bufSize, remaining))
-
-		b, err := io.ReadFull(f, data)
-		remaining -= uint64(b)
-
-		if err == io.EOF || remaining <= 0 {
-			finished = true
-		} else if err != nil {
-			return 0, err
-		}
-
-		// convert bfloat16 -> ieee float32
-		tDataF32 := bfloat16.DecodeFloat32(data)
-
-		switch r.t.Kind {
-		case 0:
-			if err := binary.Write(w, r.bo, tDataF32); err != nil {
-				return 0, err
-			}
-		case 1:
-			// convert float32 -> float16
-			tempBuf := make([]uint16, len(data)/2)
-			for cnt, v := range tDataF32 {
-				tDataF16 := float16.Fromfloat32(v)
-				tempBuf[cnt] = uint16(tDataF16)
-			}
-			if err := binary.Write(w, binary.LittleEndian, tempBuf); err != nil {
-				return 0, err
-			}
-		}
-		if finished {
-			break
-		}
-	}
-	return 0, nil
-}
-
-func GetModelArchFromParams(name, dirPath string, params *Params) (ModelArch, error) {
-	switch len(params.Architectures) {
-	case 0:
-		return nil, fmt.Errorf("No architecture specified to convert")
-	case 1:
-		switch params.Architectures[0] {
-		case "MistralForCausalLM":
-			return &MistralModel{
-				ModelData{
-					Name:   name,
-					Path:   dirPath,
-					Params: params,
-				},
-			}, nil
-		case "GemmaForCausalLM":
-			return &GemmaModel{
-				ModelData{
-					Name:   name,
-					Path:   dirPath,
-					Params: params,
-				},
-			}, nil
-		default:
-			return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0])
-		}
-	}
-
-	return nil, fmt.Errorf("Unknown error")
-}
diff --git a/convert/gemma.go b/convert/gemma.go
index 7ccc065a..648a4ad9 100644
--- a/convert/gemma.go
+++ b/convert/gemma.go
@@ -65,13 +65,14 @@ func addOnes(data []float32, vectorSize int) ([]float32, error) {
 }
 
 func (m *GemmaModel) GetTensors() error {
-	t, err := GetSafeTensors(m.Path, m.Params)
+	t, err := m.Format.GetTensors(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
 
-	m.Tensors = []llm.Tensor{}
+	slog.Debug(fmt.Sprintf("Total tensors: %d", len(t)))
 
+	m.Tensors = []llm.Tensor{}
 	for _, l := range t {
 		if strings.HasSuffix(l.Name, "norm.weight") {
 			wt := l.WriterTo.(safetensorWriterTo)
@@ -85,7 +86,7 @@ func (m *GemmaModel) GetTensors() error {
 }
 
 func (m *GemmaModel) LoadVocab() error {
-	v, err := LoadSentencePieceTokens(m.Path, m.Params.VocabSize)
+	v, err := LoadSentencePieceTokens(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
diff --git a/convert/llama.go b/convert/llama.go
new file mode 100644
index 00000000..c7f7b290
--- /dev/null
+++ b/convert/llama.go
@@ -0,0 +1,176 @@
+package convert
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"regexp"
+	"strings"
+
+	"github.com/nlpodyssey/gopickle/pytorch"
+	"github.com/pdevine/tensor"
+	"github.com/pdevine/tensor/native"
+	"github.com/x448/float16"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type LlamaModel struct {
+	ModelData
+}
+
+func llamaLayerHandler(w io.Writer, r torchWriterTo) error {
+	slog.Debug(fmt.Sprintf("repacking layer '%s'", r.t.Name))
+
+	data := r.storage.(*pytorch.HalfStorage).Data
+	tData := make([]uint16, len(data))
+	for cnt, v := range data {
+		tData[cnt] = uint16(float16.Fromfloat32(v))
+	}
+
+	var err error
+	var heads uint32
+	if strings.Contains(r.t.Name, "attn_q") {
+		heads = uint32(r.params.AttentionHeads)
+	} else if strings.Contains(r.t.Name, "attn_k") {
+		heads = uint32(r.params.KeyValHeads)
+		if heads == 0 {
+			heads = uint32(r.params.AttentionHeads)
+		}
+	} else {
+		return fmt.Errorf("unknown layer type")
+	}
+
+	slog.Debug(fmt.Sprintf("heads = %d", heads))
+
+	tData, err = llamaRepack(tData, int(heads), r.t.Shape)
+	if err != nil {
+		return err
+	}
+
+	if err = binary.Write(w, r.bo, tData); err != nil {
+		return err
+	}
+	return nil
+}
+
+func llamaRepack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
+	n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
+	origShape := n.Shape().Clone()
+
+	// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
+	if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
+		return nil, err
+	}
+
+	if err := n.T(0, 2, 1, 3); err != nil {
+		return nil, err
+	}
+
+	if err := n.Reshape(origShape...); err != nil {
+		return nil, err
+	}
+
+	if err := n.Transpose(); err != nil {
+		return nil, err
+	}
+	newN, err := native.SelectU16(n, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	var fullTensor []uint16
+	for _, v := range newN {
+		fullTensor = append(fullTensor, v...)
+	}
+	return fullTensor, nil
+}
+
+func (m *LlamaModel) GetTensors() error {
+	t, err := m.Format.GetTensors(m.Path, m.Params)
+	if err != nil {
+		return err
+	}
+
+	m.Tensors = []llm.Tensor{}
+
+	pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
+	re, err := regexp.Compile(pattern)
+	if err != nil {
+		return err
+	}
+
+	for _, l := range t {
+		matches := re.FindAllStringSubmatch(l.Name, -1)
+		if len(matches) > 0 {
+			slog.Debug(fmt.Sprintf("setting handler for: %s", l.Name))
+			wt := l.WriterTo.(torchWriterTo)
+			wt.handler = llamaLayerHandler
+			l.WriterTo = wt
+		}
+		m.Tensors = append(m.Tensors, l)
+	}
+
+	return nil
+}
+
+func (m *LlamaModel) LoadVocab() error {
+	var v *Vocab
+	var err error
+
+	slog.Debug("loading vocab")
+	v, err = LoadSentencePieceTokens(m.Path, m.Params)
+	if err != nil {
+		return err
+	}
+
+	slog.Debug("vocab loaded")
+
+	m.Vocab = v
+	return nil
+}
+
+func (m *LlamaModel) WriteGGUF() (string, error) {
+	kv := llm.KV{
+		"general.architecture":                   "llama",
+		"general.name":                           m.Name,
+		"llama.vocab_size":                       uint32(len(m.Vocab.Tokens)),
+		"llama.context_length":                   uint32(m.Params.ContextSize),
+		"llama.embedding_length":                 uint32(m.Params.HiddenSize),
+		"llama.block_count":                      uint32(m.Params.HiddenLayers),
+		"llama.feed_forward_length":              uint32(m.Params.IntermediateSize),
+		"llama.rope.dimension_count":             uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
+		"llama.attention.head_count":             uint32(m.Params.AttentionHeads),
+		"llama.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
+		"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
+		"general.file_type":                      uint32(1),
+		"tokenizer.ggml.model":                   "llama",
+
+		"tokenizer.ggml.tokens":     m.Vocab.Tokens,
+		"tokenizer.ggml.scores":     m.Vocab.Scores,
+		"tokenizer.ggml.token_type": m.Vocab.Types,
+
+		"tokenizer.ggml.bos_token_id":     uint32(m.Params.BoSTokenID),
+		"tokenizer.ggml.eos_token_id":     uint32(m.Params.EoSTokenID),
+		"tokenizer.ggml.unknown_token_id": uint32(0),
+		"tokenizer.ggml.add_bos_token":    true,
+		"tokenizer.ggml.add_eos_token":    false,
+	}
+
+	f, err := os.CreateTemp("", "ollama-gguf")
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	mod := llm.NewGGUFV3(m.Params.ByteOrder)
+	if err := mod.Encode(f, kv, m.Tensors); err != nil {
+		return "", err
+	}
+
+	slog.Debug(fmt.Sprintf("gguf file = %s", f.Name()))
+
+	return f.Name(), nil
+}
diff --git a/convert/mistral.go b/convert/mistral.go
index 51ad6729..70c92edd 100644
--- a/convert/mistral.go
+++ b/convert/mistral.go
@@ -97,7 +97,7 @@ func repack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
 }
 
 func (m *MistralModel) GetTensors() error {
-	t, err := GetSafeTensors(m.Path, m.Params)
+	t, err := m.Format.GetTensors(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
@@ -124,7 +124,7 @@ func (m *MistralModel) GetTensors() error {
 }
 
 func (m *MistralModel) LoadVocab() error {
-	v, err := LoadSentencePieceTokens(m.Path, m.Params.VocabSize)
+	v, err := LoadSentencePieceTokens(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
diff --git a/convert/safetensors.go b/convert/safetensors.go
new file mode 100644
index 00000000..468bc707
--- /dev/null
+++ b/convert/safetensors.go
@@ -0,0 +1,304 @@
+package convert
+
+import (
+	"bytes"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"regexp"
+	"slices"
+
+	"github.com/d4l3k/go-bfloat16"
+	"github.com/mitchellh/mapstructure"
+	"github.com/x448/float16"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type safetensorWriterTo struct {
+	t *llm.Tensor
+
+	params *Params
+	bo     ByteOrder
+
+	filename string
+
+	start, end, padding uint64
+	handler             func(w io.Writer, r safetensorWriterTo, f *os.File) error
+}
+
+type tensorMetaData struct {
+	Type    string `mapstructure:"dtype"`
+	Shape   []int  `mapstructure:"shape"`
+	Offsets []int  `mapstructure:"data_offsets"`
+}
+
+type SafetensorFormat struct{}
+
+func (m *SafetensorFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
+	slog.Debug("getting tensor data")
+	var tensors []llm.Tensor
+	files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors"))
+	if err != nil {
+		return nil, err
+	}
+
+	var offset uint64
+	for _, f := range files {
+		var t []llm.Tensor
+		var err error
+		t, offset, err = m.readTensors(f, offset, params)
+		if err != nil {
+			slog.Error("%v", err)
+			return nil, err
+		}
+		tensors = append(tensors, t...)
+	}
+	slog.Debug(fmt.Sprintf("all tensors = %d", len(tensors)))
+	return tensors, nil
+}
+
+func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params) ([]llm.Tensor, uint64, error) {
+	f, err := os.Open(fn)
+	if err != nil {
+		return nil, 0, err
+	}
+	defer f.Close()
+
+	var jsonSize uint64
+	if err := binary.Read(f, binary.LittleEndian, &jsonSize); err != nil {
+		return nil, 0, err
+	}
+
+	buf := make([]byte, jsonSize)
+	_, err = io.ReadFull(f, buf)
+	if err != nil {
+		return nil, 0, err
+	}
+
+	d := json.NewDecoder(bytes.NewBuffer(buf))
+	d.UseNumber()
+	var parsed map[string]interface{}
+	if err = d.Decode(&parsed); err != nil {
+		return nil, 0, err
+	}
+
+	var keys []string
+	for k := range parsed {
+		keys = append(keys, k)
+	}
+
+	slices.Sort(keys)
+
+	slog.Info("converting layers")
+
+	var tensors []llm.Tensor
+	for _, k := range keys {
+		vals := parsed[k].(map[string]interface{})
+		var data tensorMetaData
+		if err = mapstructure.Decode(vals, &data); err != nil {
+			slog.Error("couldn't decode properly")
+			return nil, 0, err
+		}
+
+		slog.Debug(fmt.Sprintf("metadata = %#v", data))
+		var size uint64
+		var kind uint32
+		switch len(data.Shape) {
+		case 0:
+			// metadata
+			continue
+		case 1:
+			// convert to float32
+			kind = 0
+			size = uint64(data.Shape[0] * 4)
+		case 2:
+			// convert to float16
+			kind = 1
+			size = uint64(data.Shape[0] * data.Shape[1] * 2)
+		}
+
+		ggufName, err := m.GetLayerName(k)
+		if err != nil {
+			slog.Error("%v", err)
+			return nil, 0, err
+		}
+
+		shape := []uint64{0, 0, 0, 0}
+		for i := range data.Shape {
+			shape[i] = uint64(data.Shape[i])
+		}
+
+		t := llm.Tensor{
+			Name:   ggufName,
+			Kind:   kind,
+			Offset: offset,
+			Shape:  shape[:],
+		}
+
+		t.WriterTo = safetensorWriterTo{
+			t:        &t,
+			params:   params,
+			bo:       params.ByteOrder,
+			filename: fn,
+			start:    uint64(data.Offsets[0]),
+			end:      uint64(data.Offsets[1]),
+			padding:  8 + jsonSize,
+		}
+
+		tensors = append(tensors, t)
+		offset += size
+	}
+	slog.Debug(fmt.Sprintf("total tensors for file = %d", len(tensors)))
+	slog.Debug(fmt.Sprintf("offset = %d", offset))
+	return tensors, offset, nil
+}
+
+func (m *SafetensorFormat) GetParams(dirpath string) (*Params, error) {
+	f, err := os.Open(filepath.Join(dirpath, "config.json"))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var params Params
+
+	d := json.NewDecoder(f)
+	err = d.Decode(&params)
+	if err != nil {
+		return nil, err
+	}
+
+	params.ByteOrder = binary.LittleEndian
+	return &params, nil
+}
+
+func (m *SafetensorFormat) GetLayerName(n string) (string, error) {
+	directMap := map[string]string{
+		"model.embed_tokens.weight": "token_embd.weight",
+		"lm_head.weight":            "output.weight",
+		"model.norm.weight":         "output_norm.weight",
+	}
+
+	tMap := map[string]string{
+		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
+		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
+		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
+		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
+		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
+		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
+		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
+		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
+		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
+	}
+
+	v, ok := directMap[n]
+	if ok {
+		return v, nil
+	}
+
+	// quick hack to rename the layers to gguf format
+	for k, v := range tMap {
+		re := regexp.MustCompile(k)
+		newName := re.ReplaceAllString(n, v)
+		if newName != n {
+			return newName, nil
+		}
+	}
+
+	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
+}
+
+func (r safetensorWriterTo) WriteTo(w io.Writer) (n int64, err error) {
+	f, err := os.Open(r.filename)
+	if err != nil {
+		return 0, err
+	}
+	defer f.Close()
+
+	if _, err = f.Seek(int64(r.padding+r.start), 0); err != nil {
+		return 0, err
+	}
+
+	// use the handler if one is present
+	if r.handler != nil {
+		return 0, r.handler(w, r, f)
+	}
+
+	remaining := r.end - r.start
+
+	bufSize := uint64(10240)
+	var finished bool
+	for {
+		data := make([]byte, min(bufSize, remaining))
+
+		b, err := io.ReadFull(f, data)
+		remaining -= uint64(b)
+
+		if err == io.EOF || remaining <= 0 {
+			finished = true
+		} else if err != nil {
+			return 0, err
+		}
+
+		// convert bfloat16 -> ieee float32
+		tDataF32 := bfloat16.DecodeFloat32(data)
+
+		switch r.t.Kind {
+		case 0:
+			if err := binary.Write(w, r.bo, tDataF32); err != nil {
+				return 0, err
+			}
+		case 1:
+			// convert float32 -> float16
+			tempBuf := make([]uint16, len(data)/2)
+			for cnt, v := range tDataF32 {
+				tDataF16 := float16.Fromfloat32(v)
+				tempBuf[cnt] = uint16(tDataF16)
+			}
+			if err := binary.Write(w, r.bo, tempBuf); err != nil {
+				return 0, err
+			}
+		}
+		if finished {
+			break
+		}
+	}
+	return 0, nil
+}
+
+func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
+	switch len(params.Architectures) {
+	case 0:
+		return nil, fmt.Errorf("No architecture specified to convert")
+	case 1:
+		switch params.Architectures[0] {
+		case "MistralForCausalLM":
+			return &MistralModel{
+				ModelData{
+					Name:   name,
+					Path:   dirPath,
+					Params: params,
+					Format: m,
+				},
+			}, nil
+		case "GemmaForCausalLM":
+			return &GemmaModel{
+				ModelData{
+					Name:   name,
+					Path:   dirPath,
+					Params: params,
+					Format: m,
+				},
+			}, nil
+		default:
+			return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0])
+		}
+	}
+
+	return nil, fmt.Errorf("Unknown error")
+}
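
Note: SafetensorFormat (above) and the new TorchFormat (below) both satisfy the ModelFormat interface added in convert/convert.go, so callers stay format-agnostic. A rough sketch of the intended call sequence, mirroring the convertModel changes in server/images.go at the end of this diff (dir and the model name are placeholders; error checks abbreviated):

	mf, err := convert.GetModelFormat(dir)             // sniffs *.safetensors vs *.bin
	params, err := mf.GetParams(dir)                   // config.json, or params.json for older torch checkpoints
	mArch, err := mf.GetModelArch("my-model", dir, params)
	err = mArch.GetTensors()                           // read tensors, renamed to GGUF layout
	err = mArch.LoadVocab()                            // SentencePiece tokenizer.model
	ggufPath, err := mArch.WriteGGUF()                 // path to the converted temp GGUF file
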
diff --git a/convert/torch.go b/convert/torch.go
new file mode 100644
index 00000000..fd237505
--- /dev/null
+++ b/convert/torch.go
@@ -0,0 +1,286 @@
+package convert
+
+import (
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"github.com/nlpodyssey/gopickle/pytorch"
+	"github.com/nlpodyssey/gopickle/types"
+	"github.com/x448/float16"
+
+	"github.com/ollama/ollama/llm"
+)
+
+type torchWriterTo struct {
+	t *llm.Tensor
+
+	params *Params
+	bo     ByteOrder
+
+	storage pytorch.StorageInterface
+	handler func(w io.Writer, r torchWriterTo) error
+}
+
+type TorchFormat struct{}
+
+func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
+	slog.Debug("getting torch tensors")
+
+	files, err := filepath.Glob(filepath.Join(dirpath, "pytorch_model-*.bin"))
+	if err != nil {
+		slog.Error("didn't find any torch files")
+		return nil, err
+	}
+
+	var offset uint64
+
+	var tensors []llm.Tensor
+	for _, fn := range files {
+		m, err := pytorch.Load(fn)
+		if err != nil {
+			slog.Error(fmt.Sprintf("error unpickling: %q", err))
+			return []llm.Tensor{}, err
+		}
+
+		for _, k := range m.(*types.Dict).Keys() {
+			if strings.HasSuffix(k.(string), "self_attn.rotary_emb.inv_freq") {
+				continue
+			}
+
+			t, _ := m.(*types.Dict).Get(k)
+			tshape := t.(*pytorch.Tensor).Size
+
+			var size uint64
+			var kind uint32
+			switch len(tshape) {
+			case 0:
+				continue
+			case 1:
+				// convert to float32
+				kind = 0
+				size = uint64(tshape[0] * 4)
+			case 2:
+				// convert to float16
+				kind = 1
+				size = uint64(tshape[0] * tshape[1] * 2)
+			}
+
+			ggufName, err := tf.GetLayerName(k.(string))
+			if err != nil {
+				slog.Error("%v", err)
+				return nil, err
+			}
+			slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))
+
+			shape := []uint64{0, 0, 0, 0}
+			for i := range tshape {
+				shape[i] = uint64(tshape[i])
+			}
+
+			tensor := llm.Tensor{
+				Name:   ggufName,
+				Kind:   kind,
+				Offset: offset, // calculate the offset
+				Shape:  shape[:],
+			}
+
+			tensor.WriterTo = torchWriterTo{
+				t:       &tensor,
+				params:  params,
+				bo:      params.ByteOrder,
+				storage: t.(*pytorch.Tensor).Source,
+			}
+
+			tensors = append(tensors, tensor)
+			offset += size
+		}
+	}
+
+	return tensors, nil
+
+}
+
+func getAltParams(dirpath string) (*Params, error) {
+	f, err := os.Open(filepath.Join(dirpath, "params.json"))
+	if err != nil {
+		slog.Error("no params.json")
+		return nil, err
+	}
+	defer f.Close()
+
+	type TorchParams struct {
+		HiddenSize     int     `json:"dim"`
+		AttentionHeads int     `json:"n_heads"`
+		KeyValHeads    int     `json:"n_kv_heads"`
+		HiddenLayers   int     `json:"n_layers"`
+		RopeTheta      int     `json:"rope_theta"`
+		NormEPS        float64 `json:"norm_eps"`
+	}
+
+	var tparams TorchParams
+
+	d := json.NewDecoder(f)
+	err = d.Decode(&tparams)
+	if err != nil {
+		return nil, err
+	}
+
+	params := &Params{
+		HiddenSize:     tparams.HiddenSize,
+		AttentionHeads: tparams.AttentionHeads,
+		KeyValHeads:    tparams.KeyValHeads,
+		HiddenLayers:   tparams.HiddenLayers,
+		NormEPS:        tparams.NormEPS,
+	}
+
+	switch {
+	case tparams.RopeTheta == 1000000:
+		// Codellama
+		params.ContextSize = 16384
+	case tparams.NormEPS == 1e-06:
+		// llama2
+		slog.Debug("Found llama2 - setting context size to 4096")
+		params.ContextSize = 4096
+	default:
+		params.ContextSize = 2048
+	}
+
+	params.ByteOrder = binary.LittleEndian
+	return params, nil
+}
+
+func (m *TorchFormat) GetParams(dirpath string) (*Params, error) {
+	f, err := os.Open(filepath.Join(dirpath, "config.json"))
+	if err != nil {
+		if os.IsNotExist(err) {
+			// try params.json instead
+			return getAltParams(dirpath)
+		} else {
+			return nil, err
+		}
+	}
+
+	var params Params
+	d := json.NewDecoder(f)
+	err = d.Decode(&params)
+	if err != nil {
+		return nil, err
+	}
+
+	params.ByteOrder = binary.LittleEndian
+	return &params, nil
+}
+
+func (m *TorchFormat) GetLayerName(n string) (string, error) {
+	directMap := map[string]string{
+		"tok_embeddings.weight":     "token_embd.weight",
+		"output.weight":             "output.weight",
+		"norm.weight":               "output_norm.weight",
+		"rope.freqs":                "rope_freqs.weight",
+		"model.embed_tokens.weight": "token_embd.weight",
+		"lm_head.weight":            "output.weight",
+		"model.norm.weight":         "output_norm.weight",
+	}
+
+	lMap := map[string]string{
+		"layers.(\\d+).attention_norm.weight":                 "blk.$1.attn_norm.weight",
+		"layers.(\\d+).attention_output_norm.weight":          "blk.$1.attn_norm.weight",
+		"layers.(\\d+).feed_forward.w2.weight":                "blk.$1.ffn_down.weight",
+		"layers.(\\d+).feed_forward.w1.weight":                "blk.$1.ffn_gate.weight",
+		"layers.(\\d+).feed_forward.w3.weight":                "blk.$1.ffn_up.weight",
+		"layers.(\\d+).ffn_norm.weight":                       "blk.$1.ffn_norm.weight",
+		"layers.(\\d+).attention.wk.weight":                   "blk.$1.attn_k.weight",
+		"layers.(\\d+).attention.wo.weight":                   "blk.$1.attn_output.weight",
+		"layers.(\\d+).attention.wq.weight":                   "blk.$1.attn_q.weight",
+		"layers.(\\d+).attention.wv.weight":                   "blk.$1.attn_v.weight",
+		"model.layers.(\\d+).input_layernorm.weight":          "blk.$1.attn_norm.weight",
+		"model.layers.(\\d+).mlp.down_proj.weight":            "blk.$1.ffn_down.weight",
+		"model.layers.(\\d+).mlp.gate_proj.weight":            "blk.$1.ffn_gate.weight",
+		"model.layers.(\\d+).mlp.up_proj.weight":              "blk.$1.ffn_up.weight",
+		"model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight",
+		"model.layers.(\\d+).self_attn.k_proj.weight":         "blk.$1.attn_k.weight",
+		"model.layers.(\\d+).self_attn.o_proj.weight":         "blk.$1.attn_output.weight",
+		"model.layers.(\\d+).self_attn.q_proj.weight":         "blk.$1.attn_q.weight",
+		"model.layers.(\\d+).self_attn.v_proj.weight":         "blk.$1.attn_v.weight",
+	}
+
+	v, ok := directMap[n]
+	if ok {
+		return v, nil
+	}
+
+	// quick hack to rename the layers to gguf format
+	for k, v := range lMap {
+		re := regexp.MustCompile(k)
+		newName := re.ReplaceAllString(n, v)
+		if newName != n {
+			return newName, nil
+		}
+	}
+
+	return "", fmt.Errorf("couldn't find a layer name for '%s'", n)
+}
+
+func (r torchWriterTo) WriteTo(w io.Writer) (n int64, err error) {
+	// use the handler if one is present
+	if r.handler != nil {
+		return 0, r.handler(w, r)
+	}
+
+	switch r.storage.(type) {
+	case *pytorch.FloatStorage:
+		slog.Warn(fmt.Sprintf("unexpected storage found for layer '%s'; skipping", r.t.Name))
+		return 0, nil
+	case *pytorch.HalfStorage:
+		switch r.t.Kind {
+		case 0:
+			data := r.storage.(*pytorch.HalfStorage).Data
+			slog.Debug(fmt.Sprintf("%35s F32 (%d)", r.t.Name, len(data)))
+			if err := binary.Write(w, r.bo, data); err != nil {
+				return 0, err
+			}
+		case 1:
+			data := r.storage.(*pytorch.HalfStorage).Data
+			tData := make([]uint16, len(data))
+			for cnt, v := range data {
+				tData[cnt] = uint16(float16.Fromfloat32(v))
+			}
+			slog.Debug(fmt.Sprintf("%35s F16 (%d)", r.t.Name, len(tData)))
+			if err := binary.Write(w, r.bo, tData); err != nil {
+				return 0, err
+			}
+		}
+	}
+
+	return 0, nil
+}
+
+func (m *TorchFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
+	switch len(params.Architectures) {
+	case 0:
+		return nil, fmt.Errorf("No architecture specified to convert")
+	case 1:
+		switch params.Architectures[0] {
+		case "LlamaForCausalLM":
+			return &LlamaModel{
+				ModelData{
+					Name:   name,
+					Path:   dirPath,
+					Params: params,
+					Format: m,
+				},
+			}, nil
+		default:
+			return nil, fmt.Errorf("Models based on '%s' are not yet supported", params.Architectures[0])
+		}
+	}
+
+	return nil, fmt.Errorf("Unknown error")
+}
diff --git a/go.mod b/go.mod
index 4047c2d7..4325d58d 100644
--- a/go.mod
+++ b/go.mod
@@ -19,7 +19,10 @@ require (
 	golang.org/x/sync v0.3.0
 )
 
-require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
+require (
+	github.com/nlpodyssey/gopickle v0.3.0
+	github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9
+)
 
 require (
 	github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect
@@ -68,7 +71,7 @@ require (
 	golang.org/x/net v0.17.0 // indirect
 	golang.org/x/sys v0.13.0
 	golang.org/x/term v0.13.0
-	golang.org/x/text v0.13.0 // indirect
+	golang.org/x/text v0.14.0 // indirect
 	google.golang.org/protobuf v1.30.0
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
diff --git a/go.sum b/go.sum
index d1a75b56..16de9170 100644
--- a/go.sum
+++ b/go.sum
@@ -122,6 +122,8 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/nlpodyssey/gopickle v0.3.0 h1:BLUE5gxFLyyNOPzlXxt6GoHEMMxD0qhsE4p0CIQyoLw=
+github.com/nlpodyssey/gopickle v0.3.0/go.mod h1:f070HJ/yR+eLi5WmM1OXJEGaTpuJEUiib19olXgYha0=
 github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
 github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
 github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4=
@@ -236,8 +238,8 @@ golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
-golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
 golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/llm/gguf.go b/llm/gguf.go
index 796642e3..d49847e3 100644
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -6,6 +6,8 @@ import (
 	"fmt"
 	"io"
 	"strings"
+
+	"log/slog"
 )
 
 type containerGGUF struct {
@@ -52,6 +54,7 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
 	}
 
 	model := newGGUF(c)
+	slog.Debug(fmt.Sprintf("model = %#v", model))
 	if err := model.Decode(rs); err != nil {
 		return nil, err
 	}
@@ -187,6 +190,8 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		llm.kv[k] = v
 	}
 
+	slog.Debug(fmt.Sprintf("general.architecture = %s", llm.kv["general.architecture"]))
+
 	// decode tensors
 	for i := 0; uint64(i) < llm.numTensor(); i++ {
 		name, err := readGGUFString(llm, rs)
@@ -451,6 +456,7 @@ var ggufKVOrder = map[string][]string{
 	"llama": {
 		"general.architecture",
 		"general.name",
+		"llama.vocab_size",
 		"llama.context_length",
 		"llama.embedding_length",
 		"llama.block_count",
@@ -509,11 +515,17 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 		return err
 	}
 
+	kvCheck := make(map[string]bool)
+	for k := range kv {
+		kvCheck[k] = false
+	}
+
 	for _, k := range ggufKVOrder["llama"] {
 		v, ok := kv[k]
 		if !ok {
 			continue
 		}
+		kvCheck[k] = true
 
 		if err := binary.Write(ws, llm.ByteOrder, uint64(len(k))); err != nil {
 			return err
@@ -567,6 +579,12 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
 		}
 	}
 
+	for k, v := range kvCheck {
+		if !v {
+			return fmt.Errorf("Didn't know how to write kv %s", k)
+		}
+	}
+
 	for _, tensor := range tensors {
 		if err := binary.Write(ws, llm.ByteOrder, uint64(len(tensor.Name))); err != nil {
 			return err
diff --git a/server/images.go b/server/images.go
index 0da51b85..74fa1a5e 100644
--- a/server/images.go
+++ b/server/images.go
@@ -322,7 +322,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 
 			pathName := realpath(modelFileDir, c.Args)
 
-			ggufName, err := convertSafetensors(name, pathName, fn)
+			ggufName, err := convertModel(name, pathName, fn)
 			if err != nil {
 				var pathErr *fs.PathError
 				switch {
@@ -633,7 +633,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, c
 	return nil
 }
 
-func convertSafetensors(name, path string, fn func(resp api.ProgressResponse)) (string, error) {
+func convertModel(name, path string, fn func(resp api.ProgressResponse)) (string, error) {
 	r, err := zip.OpenReader(path)
 	if err != nil {
 		return "", err
@@ -668,17 +668,22 @@ func convertSafetensors(name, path string, fn func(resp api.ProgressResponse)) (
 		rc.Close()
 	}
 
-	params, err := convert.GetParams(tempDir)
+	mf, err := convert.GetModelFormat(tempDir)
 	if err != nil {
 		return "", err
 	}
 
-	mArch, err := convert.GetModelArchFromParams(name, tempDir, params)
+	params, err := mf.GetParams(tempDir)
 	if err != nil {
 		return "", err
 	}
 
-	fn(api.ProgressResponse{Status: "processing safetensors"})
+	mArch, err := mf.GetModelArch(name, tempDir, params)
+	if err != nil {
+		return "", err
+	}
+
+	fn(api.ProgressResponse{Status: "processing tensors"})
 	if err := mArch.GetTensors(); err != nil {
 		return "", err
 	}
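
Note: end to end, the cmd/cmd.go changes above mean a Modelfile FROM can point at a directory holding an older torch checkpoint (pytorch_model-*.bin plus config.json or params.json, with tokenizer.model in the directory or its parent), not just a safetensors one; the client zips whichever set of files it finds, and the server-side convertModel dispatches on the zip contents. A hypothetical invocation (paths are illustrative):

	# Modelfile
	FROM /path/to/llama-2-7b

	$ ollama create my-llama2 -f Modelfile
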