@@ -24,7 +24,7 @@ def mock_eval(*args, **kwargs):
2424 monkeypatch .setattr ("llama_cpp.llama_cpp.llama_eval" , mock_eval )
2525
2626 output_text = " jumps over the lazy dog."
27- output_tokens = llama .tokenize (output_text .encode ("utf-8" , errors = "ignore" ))
27+ output_tokens = llama .tokenize (output_text .encode ("utf-8" ))
2828 token_eos = llama .token_eos ()
2929 n = 0
3030
@@ -93,4 +93,38 @@ def test_llama_pickle():
9393
9494 text = b"Hello World"
9595
96- assert llama .detokenize (llama .tokenize (text )) == text
96+ assert llama .detokenize (llama .tokenize (text )) == text
97+
def test_utf8(monkeypatch):
    """Completion must round-trip a multibyte UTF-8 character, and must emit
    an empty string when sampling stops inside an incomplete byte sequence."""
    llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)

    ## Patch eval to a no-op so no real inference runs
    def mock_eval(*args, **kwargs):
        return 0

    monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval)

    output_text = "😀"
    output_tokens = llama.tokenize(output_text.encode("utf-8"))
    token_eos = llama.token_eos()
    n = 0

    ## Deterministic sampler: yield the emoji's tokens in order, then EOS
    def mock_sample(*args, **kwargs):
        nonlocal n
        if n >= len(output_tokens):
            return token_eos
        n += 1
        return output_tokens[n - 1]

    monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample)

    ## Test basic completion with utf8 multibyte
    n = 0  # reset
    completion = llama.create_completion("", max_tokens=4)
    assert completion["choices"][0]["text"] == output_text

    ## Test basic completion with incomplete utf8 multibyte
    n = 0  # reset
    completion = llama.create_completion("", max_tokens=1)
    assert completion["choices"][0]["text"] == ""
0 commit comments