diff --git a/llama/llama.go b/llama/llama.go
index 89943380..a092ea12 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -88,6 +88,7 @@ import (
 	"fmt"
 	"runtime"
 	"runtime/cgo"
+	"slices"
 	"strings"
 	"unsafe"
 )
@@ -260,7 +261,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 	}

 	m := Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
-	if m.c == (*C.struct_llama_model)(C.NULL) {
+	if m.c == nil {
 		return nil, fmt.Errorf("unable to load model: %s", modelPath)
 	}

@@ -276,7 +277,7 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
 		c:          C.llama_new_context_with_model(model.c, params.c),
 		numThreads: int(params.c.n_threads),
 	}
-	if c.c == (*C.struct_llama_context)(C.NULL) {
+	if c.c == nil {
 		return nil, errors.New("unable to create llama context")
 	}

@@ -300,6 +301,9 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
 	defer C.free(unsafe.Pointer(cLoraPath))

 	loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
+	if loraAdapter == nil {
+		return errors.New("unable to load lora")
+	}

 	err := -1
 	if loraAdapter != nil {
@@ -322,13 +326,25 @@ type Batch struct {
 // Creates a new batch for either word tokens or image embeddings (if embedSize is non-zero).
 // Batches cannot contain both types at the same time. batchSize is the maximum number of entries
 // that can be added per sequence
-func NewBatch(batchSize int, maxSeq int, embedSize int) *Batch {
-	return &Batch{
+func NewBatch(batchSize int, maxSeq int, embedSize int) (*Batch, error) {
+	b := Batch{
 		c:         C.llama_batch_init(C.int(batchSize*maxSeq), C.int(embedSize), C.int(maxSeq)),
 		batchSize: batchSize,
 		maxSeq:    maxSeq,
 		embedSize: embedSize,
 	}
+
+	// Check to see if any of the allocations in llama_batch_init() failed
+	nilPointer := (embedSize == 0 && b.c.token == nil) || (embedSize != 0 && b.c.embd == nil) ||
+		b.c.pos == nil || b.c.n_seq_id == nil || b.c.seq_id == nil || b.c.logits == nil ||
+		slices.Contains(unsafe.Slice(b.c.seq_id, b.allocSize()), nil)
+
+	if nilPointer {
+		C.llama_batch_free(b.c)
+		return nil, fmt.Errorf("unable to allocate batch (batchSize=%v maxSeq=%v embedSize=%v)", batchSize, maxSeq, embedSize)
+	}
+
+	return &b, nil
 }

 func (b *Batch) Size() int {
@@ -484,6 +500,9 @@ func NewClipContext(llamaContext *Context, modelPath string) (*ClipContext, erro
 	mp := C.CString(modelPath)
 	defer C.free(unsafe.Pointer(mp))
 	c := C.clip_model_load(mp, 1)
+	if c == nil {
+		return nil, fmt.Errorf("unable to load clip model: %v", modelPath)
+	}

 	projEmbedSize := int(C.clip_n_mmproj_embd(c))
 	modelEmbedSize := llamaContext.Model().NEmbd()
@@ -498,8 +517,11 @@ func (c *ClipContext) Free() {
 	C.clip_free(c.c)
 }

-func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) [][]float32 {
+func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
 	l := C.llava_image_embed_make_with_bytes(c.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
+	if l == nil {
+		return nil, errors.New("unable to make llava embedding from image")
+	}

 	numTokens := int(l.n_image_pos)
 	numEmbed := llamaContext.Model().NEmbd()
@@ -516,7 +538,7 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) [][]float32 {

 	C.llava_image_embed_free(l)

-	return embed
+	return embed, nil
 }

 type MllamaContext struct {
@@ -527,6 +549,9 @@ func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext,
 	mp := C.CString(modelPath)
 	defer C.free(unsafe.Pointer(mp))
 	c := C.mllama_model_load(mp, 1)
+	if c == nil {
+		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
+	}

 	projEmbedSize := int(C.mllama_n_embd(c))
 	modelEmbedSize := llamaContext.Model().NEmbd()
@@ -541,19 +566,25 @@ func (m *MllamaContext) Free() {
 	C.mllama_free(m.c)
 }

-func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) [][]float32 {
+func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
 	img := C.mllama_image_init()
 	defer C.mllama_image_free(img)

-	C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)
+	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
+	if !ok {
+		return nil, errors.New("unable to load mllama image data")
+	}

 	rows := make([]float32, m.EmbedSize(llamaContext))
-	C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))
+	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
+	if !ok {
+		return nil, errors.New("unable to make mllama embedding from image")
+	}

 	embed := make([][]float32, 1)
 	embed[0] = rows

-	return embed
+	return embed, nil
 }

 func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
@@ -592,7 +623,7 @@ type SamplingParams struct {
 	Grammar string
 }

-func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
+func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
 	var cparams C.struct_gpt_sampler_cparams
 	cparams.top_k = C.int32_t(params.TopK)
 	cparams.top_p = C.float(params.TopP)
@@ -615,9 +646,13 @@ func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {

 	cparams.grammar = grammar
 	context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
+	if context.c == nil {
+		return nil, errors.New("unable to create sampling context")
+	}
+
 	runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })

-	return context
+	return context, nil
 }

 func (s *SamplingContext) Reset() {
diff --git a/llama/runner/image.go b/llama/runner/image.go
index ee76f47a..1cb898d3 100644
--- a/llama/runner/image.go
+++ b/llama/runner/image.go
@@ -63,9 +63,9 @@ func (c *ImageContext) Free(modelPath string) {
 	}
 }

-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) [][]float32 {
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
 	if c == nil {
-		return nil
+		return nil, nil
 	}

 	hash := c.hashImage(data)
@@ -76,17 +76,23 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 	embed, err := c.findImage(hash)
 	if err != nil {
 		if c.mllama != nil {
-			embed = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+			if err != nil {
+				return nil, err
+			}
 		} else if c.clip != nil {
-			embed = c.clip.NewEmbed(llamaContext, data)
+			embed, err = c.clip.NewEmbed(llamaContext, data)
+			if err != nil {
+				return nil, err
+			}
 		} else {
-			return nil
+			return nil, errors.New("received image but vision model not loaded")
 		}

 		c.addImage(hash, embed)
 	}

-	return embed
+	return embed, nil
 }

 func (c *ImageContext) BatchSize(configuredBatchSize int) int {
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 041bafb3..33900bd2 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -131,7 +131,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen

 	var sc *llama.SamplingContext
 	if params.samplingParams != nil {
-		sc = llama.NewSamplingContext(s.model, *params.samplingParams)
+		sc, err = llama.NewSamplingContext(s.model, *params.samplingParams)
+		if err != nil {
+			return nil, err
+		}
 		for _, input := range inputs {
 			if input.embed == nil {
 				sc.Accept(input.token, false)
@@ -194,7 +197,11 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 				return nil, fmt.Errorf("invalid image index: %d", n)
 			}

-			embed := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
+			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
+			if err != nil {
+				return nil, err
+			}
+
 			for _, e := range embed {
 				inputs = append(inputs, input{embed: e})
 			}
@@ -305,13 +312,19 @@ func (s *Server) run(ctx context.Context) {

 	// Logically these batches are used only within the context of processBatch
 	// but it is better for performance to allocate them once here
-	tokenBatch := llama.NewBatch(s.batchSize, len(s.seqs), 0)
+	tokenBatch, err := llama.NewBatch(s.batchSize, len(s.seqs), 0)
+	if err != nil {
+		panic(err)
+	}
 	defer tokenBatch.Free()

 	var embedBatch *llama.Batch
 	embedBatchSize := s.image.BatchSize(s.batchSize)
 	if embedBatchSize != 0 {
-		embedBatch = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
+		embedBatch, err = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
+		if err != nil {
+			panic(err)
+		}
 		defer embedBatch.Free()
 	} else {
 		embedBatch = &llama.Batch{}
diff --git a/llama/sampling_ext.cpp b/llama/sampling_ext.cpp
index 98085993..3dd7edf4 100644
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@@ -5,24 +5,28 @@
 struct gpt_sampler *gpt_sampler_cinit(
     const struct llama_model *model, struct gpt_sampler_cparams *params)
 {
-    gpt_sampler_params sparams;
-    sparams.top_k = params->top_k;
-    sparams.top_p = params->top_p;
-    sparams.min_p = params->min_p;
-    sparams.tfs_z = params->tfs_z;
-    sparams.typ_p = params->typical_p;
-    sparams.temp = params->temp;
-    sparams.penalty_last_n = params->penalty_last_n;
-    sparams.penalty_repeat = params->penalty_repeat;
-    sparams.penalty_freq = params->penalty_freq;
-    sparams.penalty_present = params->penalty_present;
-    sparams.mirostat = params->mirostat;
-    sparams.mirostat_tau = params->mirostat_tau;
-    sparams.mirostat_eta = params->mirostat_eta;
-    sparams.penalize_nl = params->penalize_nl;
-    sparams.seed = params->seed;
-    sparams.grammar = params->grammar;
-    return gpt_sampler_init(model, sparams);
+    try {
+        gpt_sampler_params sparams;
+        sparams.top_k = params->top_k;
+        sparams.top_p = params->top_p;
+        sparams.min_p = params->min_p;
+        sparams.tfs_z = params->tfs_z;
+        sparams.typ_p = params->typical_p;
+        sparams.temp = params->temp;
+        sparams.penalty_last_n = params->penalty_last_n;
+        sparams.penalty_repeat = params->penalty_repeat;
+        sparams.penalty_freq = params->penalty_freq;
+        sparams.penalty_present = params->penalty_present;
+        sparams.mirostat = params->mirostat;
+        sparams.mirostat_tau = params->mirostat_tau;
+        sparams.mirostat_eta = params->mirostat_eta;
+        sparams.penalize_nl = params->penalize_nl;
+        sparams.seed = params->seed;
+        sparams.grammar = params->grammar;
+        return gpt_sampler_init(model, sparams);
+    } catch (const std::exception & err) {
+        return nullptr;
+    }
 }

 void gpt_sampler_cfree(struct gpt_sampler *sampler)
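
The new constructor signatures ripple out to every caller: code using this package now handles an explicit error instead of checking for a nil or partially initialized value. Below is a minimal sketch of the resulting calling pattern. Only the signatures shown in the diff above are taken as given; the module path `github.com/ollama/ollama/llama`, the placeholder path `model.gguf`, the zero-value `ModelParams`/`SamplingParams`, and the batch sizes are illustrative, not part of this change.

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/llama" // assumed module path, for illustration only
)

func main() {
	// LoadModelFromFile already returned an error; with this change a nil
	// handle from llama_load_model_from_file is reported the same way.
	model, err := llama.LoadModelFromFile("model.gguf", llama.ModelParams{})
	if err != nil {
		log.Fatal(err)
	}

	// NewBatch now surfaces allocation failures from llama_batch_init()
	// instead of handing back a batch with nil internal buffers.
	batch, err := llama.NewBatch(512, 1, 0)
	if err != nil {
		log.Fatal(fmt.Errorf("allocating batch: %w", err))
	}
	defer batch.Free()

	// NewSamplingContext reports a nil gpt_sampler_cinit() result (for
	// example, when construction throws on the C++ side) as a Go error.
	sc, err := llama.NewSamplingContext(model, llama.SamplingParams{})
	if err != nil {
		log.Fatal(fmt.Errorf("creating sampling context: %w", err))
	}
	_ = sc
}
```

The runner itself keeps panicking when batch allocation fails at startup (see the runner.go hunks above), since there is no useful recovery at that point; other consumers can propagate the error as sketched here.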