
Commit 7c898d5

Update llama.cpp

Parent: bb610b9

2 files changed: 26 additions, 8 deletions


llama_cpp/llama_cpp.py (25 additions, 7 deletions)
@@ -112,8 +112,8 @@ def _load_shared_library(lib_base_name: str):
 
 # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 3
-LLAMA_SESSION_VERSION = 3
+# define LLAMA_SESSION_VERSION 4
+LLAMA_SESSION_VERSION = 4
 
 
 # struct llama_model;
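
Bumping LLAMA_SESSION_VERSION from 3 to 4 means session files saved before this commit will no longer load. A minimal pre-flight check, sketched under the assumption that a session file starts with the 4-byte GGSN magic followed by a little-endian uint32 version (the layout these constants suggest; llama.cpp's state-loading code is the authority on the format):

import struct

from llama_cpp.llama_cpp import LLAMA_SESSION_MAGIC, LLAMA_SESSION_VERSION

def session_file_is_current(path: str) -> bool:
    # Read the assumed 8-byte header: uint32 magic, then uint32 version.
    with open(path, "rb") as f:
        header = f.read(8)
    if len(header) < 8:
        return False
    magic, version = struct.unpack("<II", header)
    # A file saved under version 3 fails this check after the bump to 4.
    return magic == LLAMA_SESSION_MAGIC and version == LLAMA_SESSION_VERSION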
@@ -220,6 +220,14 @@ def _load_shared_library(lib_base_name: str):
 LLAMA_ROPE_SCALING_YARN = 2
 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
 
+# enum llama_split_mode {
+#     LLAMA_SPLIT_NONE = 0, // single GPU
+#     LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+#     LLAMA_SPLIT_ROW = 2, // split rows across GPUs
+# };
+LLAMA_SPLIT_NONE = 0
+LLAMA_SPLIT_LAYER = 1
+LLAMA_SPLIT_ROW = 2
 
 # typedef struct llama_token_data {
 #     llama_token id; // token id
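
The three new module-level constants mirror llama.cpp's llama_split_mode enum. As a usage sketch, a hypothetical parse_split_mode helper (not part of the bindings) that maps a CLI-style string onto them:

from llama_cpp.llama_cpp import (
    LLAMA_SPLIT_LAYER,
    LLAMA_SPLIT_NONE,
    LLAMA_SPLIT_ROW,
)

# Hypothetical mapping from a user-facing name to the new enum values.
_SPLIT_MODES = {
    "none": LLAMA_SPLIT_NONE,    # single GPU
    "layer": LLAMA_SPLIT_LAYER,  # split layers and KV across GPUs
    "row": LLAMA_SPLIT_ROW,      # split rows across GPUs
}

def parse_split_mode(name: str) -> int:
    try:
        return _SPLIT_MODES[name.lower()]
    except KeyError:
        raise ValueError(f"unknown split mode: {name!r}") from None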
@@ -365,20 +373,28 @@ class llama_model_kv_override(Structure):
 
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
-#     int32_t main_gpu; // the GPU that is used for scratch and small tensors
-#     const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+#     enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+#
+#     // main_gpu interpretation depends on split_mode:
+#     // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+#     // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+#     // LLAMA_SPLIT_LAYER: ignored
+#     int32_t main_gpu;
+#
+#     // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+#     const float * tensor_split;
 
 #     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 #     // If the provided progress_callback returns true, model loading continues.
 #     // If it returns false, model loading is immediately aborted.
 #     llama_progress_callback progress_callback;
+#
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
 
 #     // override key-value pairs of the model meta data
 #     const struct llama_model_kv_override * kv_overrides;
 
-
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
 #     bool vocab_only; // only load the vocabulary, no weights
 #     bool use_mmap; // use mmap if possible
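
Since tensor_split now carries per-device proportions rather than a layer-split description, one plausible way to fill it is to weight each GPU by its memory budget. A sketch with made-up VRAM figures; the values are relative weights, so they need not sum to 1:

# Hypothetical per-GPU memory budgets in GiB; two devices assumed.
vram_gib = [24.0, 12.0]
total = sum(vram_gib)

# One proportion per device, e.g. [0.666..., 0.333...].
tensor_split_values = [v / total for v in vram_gib]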
@@ -389,8 +405,9 @@ class llama_model_params(Structure):
 
     Attributes:
         n_gpu_layers (int): number of layers to store in VRAM
-        main_gpu (int): the GPU that is used for scratch and small tensors
-        tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        split_mode (int): how to split the model across multiple GPUs
+        main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -400,6 +417,7 @@ class llama_model_params(Structure):
 
     _fields_ = [
         ("n_gpu_layers", c_int32),
+        ("split_mode", c_int),
         ("main_gpu", c_int32),
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),

vendor/llama.cpp (1 addition, 1 deletion): submodule commit reference updated