Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/low_level_api/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def gpt_params_parse(argv = None):
parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")

parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
parser.add_argument("-p", "--prompt", type=str, default=None, help="initial prompt",dest="prompt")
parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session")
parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")
Expand Down
50 changes: 36 additions & 14 deletions examples/low_level_api/low_level_api_chat_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def __init__(self, params: GptParams) -> None:
self.multibyte_fix = []

# model load
self.lparams = llama_cpp.llama_context_default_params()
self.lparams = llama_cpp.llama_model_default_params()
self.lparams.n_ctx = self.params.n_ctx
self.lparams.n_parts = self.params.n_parts
self.lparams.seed = self.params.seed
Expand All @@ -72,7 +72,11 @@ def __init__(self, params: GptParams) -> None:

self.model = llama_cpp.llama_load_model_from_file(
self.params.model.encode("utf8"), self.lparams)
self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.lparams)

# Context Params.
self.cparams = llama_cpp.llama_context_default_params()

self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
if (not self.ctx):
raise RuntimeError(f"error: failed to load model '{self.params.model}'")

Expand Down Expand Up @@ -244,7 +248,7 @@ def __init__(self, params: GptParams) -> None:
# tokenize a prompt
def _tokenize(self, prompt, bos=True):
_arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
_n = llama_cpp.llama_tokenize(self.model, prompt.encode("utf8", errors="ignore"), len(prompt), _arr, len(_arr), bos, False)
return _arr[:_n]

def set_color(self, c):
Expand Down Expand Up @@ -304,7 +308,7 @@ def generate(self):
self.n_past += n_eval"""

if (llama_cpp.llama_eval(
self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past
) != 0):
raise Exception("Failed to llama_eval!")

Expand Down Expand Up @@ -332,7 +336,7 @@ def generate(self):
id = 0

logits = llama_cpp.llama_get_logits(self.ctx)
n_vocab = llama_cpp.llama_n_vocab(self.ctx)
n_vocab = llama_cpp.llama_n_vocab(self.model)

# Apply params.logit_bias map
for key, value in self.params.logit_bias.items():
Expand All @@ -349,12 +353,20 @@ def generate(self):
last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)

_arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
_arr,
last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
_arr,
last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
llama_cpp.llama_sample_repetition_penalties(
ctx=self.ctx,
candidates=candidates_p,
last_tokens_data = _arr,
penalty_last_n = last_n_repeat,
penalty_repeat = llama_cpp.c_float(self.params.repeat_penalty),
penalty_freq = llama_cpp.c_float(self.params.frequency_penalty),
penalty_present = llama_cpp.c_float(self.params.presence_penalty),
)

# NOT PRESENT IN CURRENT VERSION ?
# llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p,
# _arr,
# last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))

if not self.params.penalize_nl:
logits[llama_cpp.llama_token_nl()] = nl_logit
Expand Down Expand Up @@ -473,7 +485,7 @@ def exit(self):
def token_to_str(self, token_id: int) -> bytes:
size = 32
buffer = (ctypes.c_char * size)()
n = llama_cpp.llama_token_to_piece_with_model(
n = llama_cpp.llama_token_to_piece(
self.model, llama_cpp.llama_token(token_id), buffer, size)
assert n <= size
return bytes(buffer[:n])
Expand Down Expand Up @@ -532,6 +544,9 @@ def interact(self):
print(i,end="",flush=True)
self.params.input_echo = False

# Using string instead of tokens to check for antiprompt,
# It is more reliable than tokens for interactive mode.
generated_str = ""
while self.params.interactive:
self.set_color(util.CONSOLE_COLOR_USER_INPUT)
if (self.params.instruct):
Expand All @@ -546,6 +561,10 @@ def interact(self):
try:
for i in self.output():
print(i,end="",flush=True)
generated_str += i
for ap in self.params.antiprompt:
if generated_str.endswith(ap):
raise KeyboardInterrupt
except KeyboardInterrupt:
self.set_color(util.CONSOLE_COLOR_DEFAULT)
if not self.params.instruct:
Expand All @@ -561,7 +580,7 @@ def interact(self):
time_now = datetime.now()
prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
Transcript below contains only the recorded dialog between two, without any annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.

Expand All @@ -575,8 +594,11 @@ def interact(self):
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{USER_NAME}: Name a color.
{AI_NAME}: Blue
{USER_NAME}:"""
{USER_NAME}: """

params = gpt_params_parse()
if params.prompt is None and params.file is None:
params.prompt = prompt

with LLaMAInteract(params) as m:
m.interact()