forked from abetlen/llama-cpp-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: main.py
More file actions
36 lines (31 loc) · 908 Bytes
/
main.py
File metadata and controls
36 lines (31 loc) · 908 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import llama_cpp
import llama_cpp.llama_tokenizer

# Pull the Qwen chat model (q8_0 GGUF) from the Hugging Face Hub, pairing it
# with the matching HF tokenizer so prompts are tokenized exactly as the base
# model expects.
llm = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B"
    ),
    verbose=False,
)

# Request a completion constrained to a JSON object with "country" and
# "capital" string fields, streamed back chunk by chunk.
stream = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format={
        "type": "json_object",
        "schema": {
            "type": "object",
            "properties": {
                "country": {"type": "string"},
                "capital": {"type": "string"},
            },
            "required": ["country", "capital"],
        },
    },
    stream=True,
)

# Echo tokens as they arrive; chunks whose delta carries no "content" key
# (e.g. the initial role-only delta) are skipped.
for part in stream:
    delta = part["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)
print()