
Commit a94b0de

Merge branch 'main' into D4ve-R/main
2 parents: 7b1c17b + 788394c

13 files changed: 248 additions, 74 deletions


.github/workflows/build-docker.yaml

Lines changed: 1 addition & 1 deletion
@@ -47,4 +47,4 @@ jobs:
       - name: Publish to GitHub Tag
         if: steps.docker_build.outputs.digest && startsWith(github.ref, 'refs/tags/')
         run: |
-          echo "Docker image published for tag: ${{ github.ref_name }
+          echo "Docker image published for tag: ${{ github.ref_name }}"

CHANGELOG.md

Lines changed: 29 additions & 0 deletions
@@ -7,6 +7,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: ctypes definitions of llama_kv_cache_view_update and llama_kv_cache_view_free. by @e-c-d in #1028
+
+## [0.2.24]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@0e18b2e7d0b5c0a509ea40098def234b8d4a938a
+- feat: Add offload_kqv option to llama and server by @abetlen in 095c65000642a3cf73055d7428232fb18b73c6f3
+- feat: n_ctx=0 now uses the n_ctx_train of the model by @DanieleMorotti in #1015
+- feat: logits_to_logprobs supports both 2-D and 3-D logits arrays by @kddubey in #1002
+- fix: Remove f16_kv, add offload_kqv fields in low level and llama apis by @brandonrobertz in #1019
+- perf: Don't convert logprobs arrays to lists by @kddubey in #1021
+- docs: Fix README.md functionary demo typo by @evelynmitchell in #996
+- examples: Update low_level_api_llama_cpp.py to match current API by @jsoma in #1023
+
+## [0.2.23]
+
+- Update llama.cpp to ggerganov/llama.cpp@948ff137ec37f1ec74c02905917fa0afc9b97514
+- Add qwen chat format by @yhfgyyf in #1005
+- Add support for running the server with SSL by @rgerganov in #994
+- Replace logits_to_logprobs implementation with numpy equivalent to llama.cpp by @player1537 in #991
+- Fix UnsupportedOperation: fileno in suppress_stdout_stderr by @zocainViken in #961
+- Add Pygmalion chat format by @chiensen in #986
+- README.md multimodal params fix by @zocainViken in #967
+- Fix minor typo in README by @aniketmaurya in #958
+
+## [0.2.22]
+
+- Update llama.cpp to ggerganov/llama.cpp@8a7b2fa528f130631a5f43648481596ab320ed5a
+- Fix conflict with transformers library by kddubey in #952
+
 ## [0.2.21]
 
 - Update llama.cpp to ggerganov/llama.cpp@64e64aa2557d97490b2fe1262b313e2f4a1607e3
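Two of the 0.2.24 entries above change how the high-level `Llama` constructor behaves (`offload_kqv` and `n_ctx=0`); a minimal sketch of how they might be used together, assuming the high-level API and a placeholder model path:

```python
from llama_cpp import Llama

# Placeholder model path; both options follow the 0.2.24 changelog entries above.
llm = Llama(
    model_path="./models/example-7b.Q4_K_M.gguf",  # hypothetical GGUF file
    n_ctx=0,           # 0 now falls back to the model's trained context size (n_ctx_train)
    offload_kqv=True,  # offload the K, Q, V tensors to the GPU
)
```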

README.md

Lines changed: 5 additions & 3 deletions
@@ -207,7 +207,8 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
 messages = [
     {
         "role": "system",
-        "content": "A chat between a curious user and an artificial intelligence assitant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant callse functions with appropriate input when necessary"
+        "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
+
     },
     {
         "role": "user",
@@ -219,7 +220,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
         "function": {
             "name": "UserDetail",
             "parameters": {
-                "type": "object"
+                "type": "object",
                 "title": "UserDetail",
                 "properties": {
                     "name": {
@@ -265,7 +266,8 @@ Then you'll need to use a custom chat handler to load the clip model and process
 >>> llm = Llama(
       model_path="./path/to/llava/llama-model.gguf",
       chat_handler=chat_handler,
-      n_ctx=2048 # n_ctx should be increased to accomodate the image embedding
+      n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+      logits_all=True,# needed to make llava work
 )
 >>> llm.create_chat_completion(
     messages = [
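The `chat_handler` referenced in the third hunk is constructed from the clip model earlier in the README; a minimal sketch of the full setup, assuming the `Llava15ChatHandler` helper and placeholder paths:

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Placeholder paths; mmproj is the llava clip/projection model.
chat_handler = Llava15ChatHandler(clip_model_path="./path/to/llava/mmproj.bin")
llm = Llama(
    model_path="./path/to/llava/llama-model.gguf",
    chat_handler=chat_handler,
    n_ctx=2048,       # increased to accommodate the image embedding
    logits_all=True,  # needed to make llava work
)
```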

examples/low_level_api/low_level_api_llama_cpp.py

Lines changed: 7 additions & 7 deletions
@@ -73,7 +73,7 @@
     embd = []
     if len(embd_inp) <= input_consumed:
         logits = llama_cpp.llama_get_logits(ctx)
-        n_vocab = llama_cpp.llama_n_vocab(ctx)
+        n_vocab = llama_cpp.llama_n_vocab(model)
 
         _arr = (llama_cpp.llama_token_data * n_vocab)(*[
             llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
@@ -83,12 +83,12 @@
             llama_cpp.llama_token_data_array(_arr, len(_arr), False))
 
         _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
-        llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
+        llama_cpp.llama_sample_repetition_penalties(ctx, candidates_p,
             _arr,
-            last_n_repeat, repeat_penalty)
-        llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
-            _arr,
-            last_n_repeat, frequency_penalty, presence_penalty)
+            penalty_last_n=last_n_repeat,
+            penalty_repeat=repeat_penalty,
+            penalty_freq=frequency_penalty,
+            penalty_present=presence_penalty)
 
         llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
         llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
@@ -126,4 +126,4 @@
 
 llama_cpp.llama_print_timings(ctx)
 
-llama_cpp.llama_free(ctx)
+llama_cpp.llama_free(ctx)
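The first hunk tracks llama.cpp's move of `llama_n_vocab` from the context to the model object; a minimal sketch of how `model` and `ctx` are obtained through the low-level bindings (the model path is a placeholder, default parameters are assumed, and exact binding signatures may differ between versions):

```python
import llama_cpp

# Placeholder path; default model/context parameters assumed for brevity.
llama_cpp.llama_backend_init(False)
model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/example.gguf", model_params)

ctx_params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

n_vocab = llama_cpp.llama_n_vocab(model)  # now takes the model, not the context
```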

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.21"
+__version__ = "0.2.24"

llama_cpp/_utils.py

Lines changed: 16 additions & 10 deletions
@@ -17,14 +17,18 @@ def __enter__(self):
         if self.disable:
             return self
 
+        # Check if sys.stdout and sys.stderr have fileno method
+        if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
+            return self  # Return the instance without making changes
+
         self.outnull_file = self.open(self.os.devnull, "w")
         self.errnull_file = self.open(self.os.devnull, "w")
 
         self.old_stdout_fileno_undup = self.sys.stdout.fileno()
         self.old_stderr_fileno_undup = self.sys.stderr.fileno()
 
-        self.old_stdout_fileno = self.os.dup(self.sys.stdout.fileno())
-        self.old_stderr_fileno = self.os.dup(self.sys.stderr.fileno())
+        self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup)
+        self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup)
 
         self.old_stdout = self.sys.stdout
         self.old_stderr = self.sys.stderr
@@ -40,14 +44,16 @@ def __exit__(self, *_):
         if self.disable:
             return
 
-        self.sys.stdout = self.old_stdout
-        self.sys.stderr = self.old_stderr
+        # Check if sys.stdout and sys.stderr have fileno method
+        if hasattr(self.sys.stdout, 'fileno') and hasattr(self.sys.stderr, 'fileno'):
+            self.sys.stdout = self.old_stdout
+            self.sys.stderr = self.old_stderr
 
-        self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
-        self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+            self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+            self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
 
-        self.os.close(self.old_stdout_fileno)
-        self.os.close(self.old_stderr_fileno)
+            self.os.close(self.old_stdout_fileno)
+            self.os.close(self.old_stderr_fileno)
 
-        self.outnull_file.close()
-        self.errnull_file.close()
+            self.outnull_file.close()
+            self.errnull_file.close()
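The added `hasattr` checks make the context manager a no-op when the host process has replaced `sys.stdout`/`sys.stderr` with objects that expose no `fileno` attribute (the `UnsupportedOperation: fileno` fix listed under 0.2.23). A small sketch of the situation it guards against, using a hypothetical stream stand-in and assuming the `disable=` constructor argument:

```python
import sys

from llama_cpp._utils import suppress_stdout_stderr


class _NoFilenoStream:
    """Hypothetical stand-in for hosts that replace stdout with a minimal file-like object."""

    def write(self, text: str) -> int:
        return len(text)

    def flush(self) -> None:
        pass


sys.stdout = _NoFilenoStream()
with suppress_stdout_stderr(disable=False):
    pass  # previously raised UnsupportedOperation; now the manager simply returns itself
sys.stdout = sys.__stdout__  # restore the real stdout
```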

llama_cpp/llama.py

Lines changed: 32 additions & 18 deletions
@@ -2,7 +2,6 @@
 import sys
 import uuid
 import time
-import math
 import multiprocessing
 from abc import ABC, abstractmethod
 from typing import (
@@ -751,9 +750,9 @@ def __init__(
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
         mul_mat_q: bool = True,
-        f16_kv: bool = True,
         logits_all: bool = False,
         embedding: bool = False,
+        offload_kqv: bool = False,
         # Sampling Params
         last_n_tokens_size: int = 64,
         # LoRA Params
@@ -771,7 +770,7 @@ def __init__(
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
-
+
         Examples:
             Basic usage
@@ -817,9 +816,9 @@ def __init__(
             yarn_beta_fast: YaRN low correction dim
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
-            f16_kv: Use fp16 for KV cache, fp32 otherwise
             logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
+            offload_kqv: Offload K, Q, V to GPU.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
@@ -904,9 +903,9 @@ def __init__(
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
         self.context_params.mul_mat_q = mul_mat_q
-        self.context_params.f16_kv = f16_kv
         self.context_params.logits_all = logits_all
         self.context_params.embedding = embedding
+        self.context_params.offload_kqv = offload_kqv
 
         # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
@@ -923,6 +922,12 @@ def __init__(
         self._model = _LlamaModel(
             path_model=self.model_path, params=self.model_params, verbose=self.verbose
         )
+        # Set the default value for the context and correct the batch
+        if n_ctx == 0:
+            n_ctx = self._model.n_ctx_train()
+            self.n_batch = min(n_ctx, n_batch)
+            self.context_params.n_ctx = self._model.n_ctx_train()
+            self.context_params.n_batch = self.n_batch
 
         self._ctx = _LlamaContext(
             model=self._model,
@@ -1549,8 +1554,8 @@ def logit_bias_processor(
                     self.detokenize(completion_tokens[:returned_tokens])
                 )
                 token_offset = len(prompt_tokens) + returned_tokens
-                logits = self._scores[token_offset - 1, :].tolist()
-                current_logprobs = Llama.logits_to_logprobs(logits)
+                logits = self._scores[token_offset - 1, :]
+                current_logprobs = Llama.logits_to_logprobs(logits).tolist()
                 sorted_logprobs = list(
                     sorted(
                         zip(current_logprobs, range(len(current_logprobs))),
@@ -1668,8 +1673,8 @@ def logit_bias_processor(
                     self.detokenize(completion_tokens[:returned_tokens])
                 )
                 token_offset = len(prompt_tokens) + returned_tokens - 1
-                logits = self._scores[token_offset, :].tolist()
-                current_logprobs = Llama.logits_to_logprobs(logits)
+                logits = self._scores[token_offset, :]
+                current_logprobs = Llama.logits_to_logprobs(logits).tolist()
                 sorted_logprobs = list(
                     sorted(
                         zip(current_logprobs, range(len(current_logprobs))),
@@ -1782,9 +1787,8 @@ def logit_bias_processor(
                 self.detokenize([token]).decode("utf-8", errors="ignore")
                 for token in all_tokens
             ]
-            all_logprobs = [
-                Llama.logits_to_logprobs(row.tolist()) for row in self._scores
-            ][token_offset:]
+            all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:]
+            # TODO: may be able to change this loop to use np.take_along_dim
             for token, token_str, logprobs_token in zip(
                 all_tokens, all_token_strs, all_logprobs
             ):
@@ -2149,7 +2153,6 @@ def __getstate__(self):
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             mul_mat_q=self.context_params.mul_mat_q,
-            f16_kv=self.context_params.f16_kv,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embedding,
             # Sampling Params
@@ -2192,7 +2195,6 @@ def __setstate__(self, state):
             yarn_beta_slow=state["yarn_beta_slow"],
             yarn_orig_ctx=state["yarn_orig_ctx"],
             mul_mat_q=state["mul_mat_q"],
-            f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
             embedding=state["embedding"],
             # Sampling Params
@@ -2280,10 +2282,22 @@ def token_nl(self) -> int:
         return self._model.token_nl()
 
     @staticmethod
-    def logits_to_logprobs(logits: List[float]) -> List[float]:
-        exps = [math.exp(float(x)) for x in logits]
-        sum_exps = sum(exps)
-        return [math.log(x / sum_exps) for x in exps]
+    def logits_to_logprobs(
+        logits: Union[npt.NDArray[np.single], List], axis: int = -1
+    ) -> npt.NDArray[np.single]:
+        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.log_softmax.html
+        logits_maxs: np.ndarray = np.amax(logits, axis=axis, keepdims=True)
+        if logits_maxs.ndim > 0:
+            logits_maxs[~np.isfinite(logits_maxs)] = 0
+        elif not np.isfinite(logits_maxs):
+            logits_maxs = 0
+        subtract_maxs = np.subtract(logits, logits_maxs, dtype=np.single)
+        exp = np.exp(subtract_maxs)
+        # Suppress warnings about log of zero
+        with np.errstate(divide="ignore"):
+            summed = np.sum(exp, axis=axis, keepdims=True)
+            out = np.log(summed)
+        return subtract_maxs - out
 
     @staticmethod
     def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
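The rewritten `logits_to_logprobs` is a numpy log-softmax, so it accepts a full score matrix as well as a single row and returns an array rather than a list (hence the `.tolist()` moved to the call sites above). A small sketch with made-up scores:

```python
import numpy as np

from llama_cpp import Llama

# Made-up scores: 3 positions over a 5-entry vocabulary.
scores = np.array(
    [
        [0.1, 2.0, -1.0, 0.5, 0.0],
        [1.0, 1.0, 1.0, 1.0, 1.0],
        [-3.0, 0.0, 4.0, 0.2, -0.5],
    ],
    dtype=np.single,
)

row_logprobs = Llama.logits_to_logprobs(scores[0])  # 1-D in, 1-D log-probabilities out
all_logprobs = Llama.logits_to_logprobs(scores)     # 2-D in, row-wise log-softmax out

# Exponentiating each row of log-probabilities recovers a distribution that sums to 1.
assert np.allclose(np.exp(all_logprobs).sum(axis=-1), 1.0)
```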

llama_cpp/llama_chat_format.py

Lines changed: 32 additions & 0 deletions
@@ -423,6 +423,21 @@ def format_alpaca(
     _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
 
+@register_chat_format("qwen")
+def format_qwen(
+    messages: List[llama_types.ChatCompletionRequestMessage],
+    **kwargs: Any,
+) -> ChatFormatterResponse:
+    _roles = dict(user="<|im_start|>user", assistant="<|im_start|>assistant")
+    system_message="You are a helpful assistant."
+    system_template="<|im_start|>system\n{system_message}"
+    system_message=system_template.format(system_message=system_message)
+    _messages = _map_roles(messages, _roles)
+    _messages.append((_roles["assistant"], None))
+    _sep = "<|im_end|>"
+    _prompt = _format_chatml(system_message, _messages, _sep)
+    _sep2 = "<|endoftext|>"
+    return ChatFormatterResponse(prompt=_prompt,stop=_sep2)
 
 @register_chat_format("vicuna")
 def format(
@@ -637,6 +652,23 @@ def format_zephyr(
     _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
 
+
+@register_chat_format("pygmalion")
+def format_pygmalion(
+    messages: List[llama_types.ChatCompletionRequestMessage],
+    **kwargs: Any,
+) -> ChatFormatterResponse:
+    system_template = """<|system|>{system_message}"""
+    system_message = _get_system_message(messages)
+    system_message = system_template.format(system_message=system_message)
+    _roles = dict(user="<|user|>", assistant="<|model|>")
+    _sep = "\n"
+    _messages = _map_roles(messages, _roles)
+    _messages.append((_roles["assistant"], None))
+    _prompt = _format_chatml(system_message, _messages, _sep)
+    return ChatFormatterResponse(prompt=_prompt, stop=_sep)
+
+
 @register_chat_format("chatml")
 def format_chatml(
     messages: List[llama_types.ChatCompletionRequestMessage],
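Once registered, these formats can be selected by name through the high-level API; a minimal sketch assuming a placeholder Qwen GGUF file:

```python
from llama_cpp import Llama

# Placeholder model path; "qwen" is the chat format registered in the first hunk above.
llm = Llama(model_path="./models/qwen-chat.Q4_K_M.gguf", chat_format="qwen")
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
)
print(response["choices"][0]["message"]["content"])
```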
