88
99
1010class Llama :
11+ """High-level Python wrapper for a llama.cpp model."""
12+
1113 def __init__ (
1214 self ,
1315 model_path : str ,
@@ -18,7 +20,25 @@ def __init__(
1820 logits_all : bool = False ,
1921 vocab_only : bool = False ,
2022 n_threads : Optional [int ] = None ,
21- ):
23+ ) -> "Llama" :
24+ """Load a llama.cpp model from `model_path`.
25+
26+ Args:
27+             model_path: Path to the model file.
28+             n_ctx: Maximum context size, in tokens.
29+ n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
30+ seed: Random seed.
31+ f16_kv: Use half-precision for key/value matrices.
32+             logits_all: Return logits for all tokens, not just the last token.
33+             vocab_only: Only load the vocabulary, no weights.
34+ n_threads: Number of threads to use. If None, the number of threads is automatically determined.
35+
36+ Raises:
37+ ValueError: If the model path does not exist.
38+
39+ Returns:
40+ A Llama instance.
41+ """
2242 self .model_path = model_path
2343
2444 self .last_n = 64
@@ -56,6 +76,27 @@ def __call__(
5676 repeat_penalty : float = 1.1 ,
5777 top_k : int = 40 ,
5878 ):
79+ """Generate text from a prompt.
80+
81+ Args:
82+ prompt: The prompt to generate text from.
83+ suffix: A suffix to append to the generated text. If None, no suffix is appended.
84+ max_tokens: The maximum number of tokens to generate.
85+ temperature: The temperature to use for sampling.
86+ top_p: The top-p value to use for sampling.
87+ logprobs: The number of logprobs to return. If None, no logprobs are returned.
88+ echo: Whether to echo the prompt.
89+ stop: A list of strings to stop generation when encountered.
90+ repeat_penalty: The penalty to apply to repeated tokens.
91+ top_k: The top-k value to use for sampling.
92+
93+ Raises:
94+ ValueError: If the requested tokens exceed the context window.
95+ RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
96+
97+ Returns:
98+ Response object containing the generated text.
99+ """
59100 text = b""
60101 finish_reason = "length"
61102 completion_tokens = 0
0 commit comments