diff --git a/llm/ggml.go b/llm/ggml.go index 932efe62..f40f17e5 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -381,6 +381,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui ) partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128 + case "stablelm": + fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2) + partialOffload = max( + 4*batch*(vocab+2*embedding), + fullOffload, + ) } return diff --git a/llm/gguf.go b/llm/gguf.go index b8840ecd..acdeb29f 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -248,13 +248,17 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { } padding := llm.padding(offset, int64(alignment)) - if _, err := rs.Seek(padding-offset, io.SeekCurrent); err != nil { + if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { return err } for _, tensor := range llm.tensors { - padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1) - if _, err := rs.Seek(padded, io.SeekCurrent); err != nil { + if _, err := rs.Seek(int64(tensor.size()), io.SeekCurrent); err != nil { + return err + } + + padding := llm.padding(int64(tensor.size()), int64(alignment)) + if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { return err } } @@ -623,8 +627,9 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error { return err } - padding := llm.padding(offset, 32) - if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding-offset))); err != nil { + var alignment int64 = 32 + padding := llm.padding(offset, alignment) + if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil { return err } @@ -638,8 +643,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error { return err } - padding := llm.padding(offset, 32) - if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding-offset))); err != nil { + padding := llm.padding(offset, alignment) + if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil { return err } } @@ -648,5 +653,5 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error { } func (gguf) padding(offset, align int64) int64 { - return (offset + align - 1) / align * align + return (align - offset%align) % align } diff --git a/llm/server.go b/llm/server.go index a9bfdc47..3cab6f1d 100644 --- a/llm/server.go +++ b/llm/server.go @@ -112,7 +112,6 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option var memoryLayerOutput uint64 for k, v := range layers { if !strings.HasPrefix(k, "blk.") { - slog.Info("aaa", "name", k, "size", format.HumanBytes2(v.size())) memoryLayerOutput += v.size() } } diff --git a/types/model/name.go b/types/model/name.go index 7f037b61..9c56c49a 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -521,6 +521,8 @@ func parts(s string) iter_Seq2[PartKind, string] { return } state, j, partLen = PartModel, i, 0 + case PartHost: + // noop: support for host:port default: yield(PartExtraneous, s[i+1:j]) return @@ -678,6 +680,9 @@ func isValidByteFor(kind PartKind, c byte) bool { if kind == PartNamespace && c == '.' { return false } + if kind == PartHost && c == ':' { + return true + } if c == '.' || c == '-' { return true } diff --git a/types/model/name_test.go b/types/model/name_test.go index b45dadaf..8749477a 100644 --- a/types/model/name_test.go +++ b/types/model/name_test.go @@ -40,6 +40,7 @@ var testNames = map[string]fields{ "user/model": {namespace: "user", model: "model"}, "example.com/ns/mistral:7b+Q4_0": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "Q4_0"}, "example.com/ns/mistral:7b+X": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "X"}, + "localhost:5000/ns/mistral": {host: "localhost:5000", namespace: "ns", model: "mistral"}, // invalid digest "mistral:latest@invalid256-": {},