Merge branch 'main' into v0.2-wip

commit 436036aa67
4 changed files with 10 additions and 10 deletions

@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.75]
+
+- Update llama.cpp
+
 ## [0.1.74]
 
 ### Added
@@ -164,7 +164,7 @@ llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 # int32_t n_batch; // prompt processing batch size
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
-# float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+# const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
 # // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 # float rope_freq_base; // RoPE base frequency
@@ -192,7 +192,7 @@ class llama_context_params(Structure):
         ("n_batch", c_int32),
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
-        ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
+        ("tensor_split", POINTER(c_float)),
         ("rope_freq_base", c_float),
         ("rope_freq_scale", c_float),
         ("progress_callback", llama_progress_callback),
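
With `tensor_split` now typed as `POINTER(c_float)` instead of an inline `c_float` array, the caller owns the backing buffer and has to keep it alive for as long as the params object is used. A minimal sketch of what that might look like from Python; the even split values and the use of `llama_context_default_params()` here are illustrative, not something this diff prescribes:

```python
from ctypes import POINTER, c_float, cast

import llama_cpp

params = llama_cpp.llama_context_default_params()

# Back the pointer with a caller-owned array of LLAMA_MAX_DEVICES floats
# (an even split across all devices, purely for illustration).
n_devices = llama_cpp.LLAMA_MAX_DEVICES.value
split = (c_float * n_devices)(*([1.0 / n_devices] * n_devices))
params.tensor_split = cast(split, POINTER(c_float))

# `split` must outlive `params`: the struct now stores only a pointer, so
# dropping the array would leave llama.cpp reading freed memory.
```
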
@@ -933,22 +933,19 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None
 # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
 # /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
 # /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
 # LLAMA_API void llama_sample_classifier_free_guidance(
 #     struct llama_context * ctx,
 #     llama_token_data_array * candidates,
 #     struct llama_context * guidance_ctx,
-#     float scale,
-#     float smooth_factor);
+#     float scale);
 def llama_sample_classifier_free_guidance(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     guidance_ctx: llama_context_p,
     scale: c_float,
-    smooth_factor: c_float,
 ):
     return _lib.llama_sample_classifier_free_guidance(
-        ctx, candidates, guidance_ctx, scale, smooth_factor
+        ctx, candidates, guidance_ctx, scale
     )
 
 
@@ -957,7 +954,6 @@ _lib.llama_sample_classifier_free_guidance.argtypes = [
     llama_token_data_array_p,
     llama_context_p,
     c_float,
-    c_float,
 ]
 _lib.llama_sample_classifier_free_guidance.restype = None
 
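
With `smooth_factor` removed, the binding now takes exactly four arguments: the main context, the candidates array, the guidance context, and the guidance scale. A hypothetical call site under that signature (the `apply_cfg` helper and the default scale of 1.5 are illustrative only, not part of the library):

```python
from ctypes import c_float

import llama_cpp

def apply_cfg(ctx, candidates_p, guidance_ctx, scale: float = 1.5):
    # candidates_p: pointer to a llama_token_data_array built from the main
    # context's raw, unsorted logits.
    # guidance_ctx: a second context from the same model that has seen the
    # negative prompt followed by the same tokens as the main context.
    llama_cpp.llama_sample_classifier_free_guidance(
        ctx,
        candidates_p,
        guidance_ctx,
        c_float(scale),  # 1.0 means no guidance; higher values guide harder
    )
```
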
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "llama_cpp_python"
-version = "0.1.74"
+version = "0.1.75"
 description = "Python bindings for the llama.cpp library"
 readme = "README.md"
 license = { text = "MIT" }
vendor/llama.cpp (vendored)
@@ -1 +1 @@
-Subproject commit e782c9e735f93ab4767ffc37462c523b73a17ddc
+Subproject commit d924522a46c5ef097af4a88087d91673e8e87e4d