llama-cpp-python/llama_cpp/server/app.py at main · dkzdev/llama-cpp-python

History

483 lines (411 loc) · 16.8 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

import json
import multiprocessing
from functools import partial
from threading import Lock
from typing import Dict, Iterator, List, Optional, Union

import anyio
import llama_cpp
from anyio.streams.memory import MemoryObjectSendStream
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
from sse_starlette.sse import EventSourceResponse
from starlette.concurrency import iterate_in_threadpool, run_in_threadpool
from typing_extensions import Literal, TypedDict

class Settings(BaseSettings):
    """Server configuration.

    Most attributes are forwarded as keyword arguments to ``llama_cpp.Llama``
    by ``create_app``; the ``cache*`` attributes configure the optional prompt
    cache, and ``model_alias`` is the id reported by ``/v1/models``.
    """

    # --- model selection ---------------------------------------------------
    model: str = Field(
        description="The path to the model to use for generating completions."
    )
    model_alias: Optional[str] = Field(
        default=None,
        description="The alias of the model to use for generating completions.",
    )

    # --- llama.cpp runtime parameters ---------------------------------------
    n_ctx: int = Field(
        default=2048,
        ge=1,
        description="The context size.",
    )
    n_gpu_layers: int = Field(
        default=0,
        ge=0,
        description="The number of layers to put on the GPU. The rest will be on the CPU.",
    )
    n_batch: int = Field(
        default=512,
        ge=1,
        description="The batch size to use per eval.",
    )
    # Default: half of the reported CPU count, but never fewer than one thread.
    n_threads: int = Field(
        default=max(multiprocessing.cpu_count() // 2, 1),
        ge=1,
        description="The number of threads to use.",
    )
    f16_kv: bool = Field(
        default=True,
        description="Whether to use f16 key/value.",
    )
    # mlock/mmap defaults come from what the llama_cpp library reports as supported.
    use_mlock: bool = Field(
        default=llama_cpp.llama_mlock_supported(),
        description="Use mlock.",
    )
    use_mmap: bool = Field(
        default=llama_cpp.llama_mmap_supported(),
        description="Use mmap.",
    )
    embedding: bool = Field(
        default=True,
        description="Whether to use embeddings.",
    )
    last_n_tokens_size: int = Field(
        default=64,
        ge=0,
        description="Last n tokens to keep for repeat penalty calculation.",
    )
    logits_all: bool = Field(
        default=True,
        description="Whether to return logits.",
    )

    # --- optional prompt cache ----------------------------------------------
    cache: bool = Field(
        default=False,
        description="Use a cache to reduce processing times for evaluated prompts.",
    )
    cache_type: Literal["ram", "disk"] = Field(
        default="ram",
        description="The type of cache to use. Only used if cache is True.",
    )
    # 2 << 30 bytes == 2 GiB.
    cache_size: int = Field(
        default=2 << 30,
        description="The size of the cache in bytes. Only used if cache is True.",
    )

    # --- miscellaneous ------------------------------------------------------
    vocab_only: bool = Field(
        default=False,
        description="Whether to only return the vocabulary.",
    )
    verbose: bool = Field(
        default=True,
        description="Whether to print debug information.",
    )

# Router that collects the /v1/* endpoints declared in this module;
# create_app() attaches it to the FastAPI application.
router = APIRouter()

# Active server settings; None until create_app() stores them.
settings: Optional[Settings] = None

# Shared model instance; None until create_app() constructs it.
llama: Optional[llama_cpp.Llama] = None

def create_app(settings: Optional[Settings] = None):
    """Build the FastAPI application and load the model.

    Args:
        settings: Server configuration. When ``None``, ``Settings()`` is
            constructed with no arguments.

    Returns:
        The ``FastAPI`` app with CORS middleware installed and the module
        ``router`` included.

    Side effects:
        Rebinds the module-level ``llama`` (a freshly constructed
        ``llama_cpp.Llama``) and ``settings`` globals.
    """
    global llama

    if settings is None:
        settings = Settings()

    app = FastAPI(
        title="🦙 llama.cpp Python API",
        version="0.0.1",
    )
    # NOTE(review): every origin, method and header is allowed together with
    # credentials - confirm this is intended for the deployment.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    app.include_router(router)

    llama = llama_cpp.Llama(
        model_path=settings.model,
        n_gpu_layers=settings.n_gpu_layers,
        f16_kv=settings.f16_kv,
        use_mlock=settings.use_mlock,
        use_mmap=settings.use_mmap,
        embedding=settings.embedding,
        logits_all=settings.logits_all,
        n_threads=settings.n_threads,
        n_batch=settings.n_batch,
        n_ctx=settings.n_ctx,
        last_n_tokens_size=settings.last_n_tokens_size,
        vocab_only=settings.vocab_only,
        verbose=settings.verbose,
    )

    if settings.cache:
        # Honour cache_type. Previously the chosen cache was immediately
        # overwritten by an unconditional llama_cpp.LlamaCache(...), which made
        # the "disk"/"ram" setting ineffective.
        if settings.cache_type == "disk":
            cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
        else:
            cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)
        llama.set_cache(cache)

    # The parameter `settings` shadows the module global of the same name, so
    # the global is rebound from a nested function that does not shadow it.
    def set_settings(_settings: Settings):
        global settings
        settings = _settings

    set_settings(settings)
    return app

# Serialises access to the single shared model instance.
llama_lock = Lock()


def get_llama():
    """Dependency that yields the module-level ``llama`` while holding ``llama_lock``.

    The lock is acquired before the value is yielded and released when the
    generator is resumed or closed.
    """
    with llama_lock:
        yield llama

def get_settings():
    """Dependency that yields the module-level ``settings`` object."""
    yield settings

# Field definitions shared by the completion and chat-completion request
# models declared in this module (defaults, bounds and schema descriptions).

model_field = Field(description="The model to use for generating completions.")

max_tokens_field = Field(
    default=16,
    ge=1,
    le=2048,
    description="The maximum number of tokens to generate.",
)

temperature_field = Field(
    default=0.8,
    ge=0.0,
    le=2.0,
    description=(
        "Adjust the randomness of the generated text.\n\n"
        "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run."
    ),
)

top_p_field = Field(
    default=0.95,
    ge=0.0,
    le=1.0,
    description=(
        "Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n"
        "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."
    ),
)

stop_field = Field(
    default=None,
    description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
)

stream_field = Field(
    default=False,
    description="Whether to stream the results as they are generated. Useful for chatbots.",
)

top_k_field = Field(
    default=40,
    ge=0,
    description=(
        "Limit the next token selection to the K most probable tokens.\n\n"
        "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text."
    ),
)

repeat_penalty_field = Field(
    default=1.1,
    ge=0.0,
    description=(
        "A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n"
        "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient."
    ),
)

presence_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
)

frequency_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
)

mirostat_mode_field = Field(
    default=0,
    ge=0,
    le=2,
    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)",
)

mirostat_tau_field = Field(
    default=5.0,
    ge=0.0,
    le=10.0,
    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text",
)

mirostat_eta_field = Field(
    default=0.1,
    ge=0.001,
    le=1.0,
    description="Mirostat learning rate",
)

class CreateCompletionRequest(BaseModel):
    """Request body for ``POST /v1/completions``.

    The handler forwards every field to the model call except ``n``,
    ``best_of``, ``logit_bias`` and ``user``, which it excludes.
    """

    prompt: Union[str, List[str]] = Field(
        default="", description="The prompt to generate completions for."
    )
    suffix: Optional[str] = Field(
        default=None,
        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.",
    )
    max_tokens: int = max_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    echo: bool = Field(
        default=False,
        description="Whether to echo the prompt in the generated text. Useful for chatbots.",
    )
    stop: Optional[Union[str, List[str]]] = stop_field
    stream: bool = stream_field
    # Declared exactly once. A second `logprobs = Field(None)` further down the
    # class used to replace this definition, discarding the `ge=0` bound and
    # the description.
    logprobs: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of logprobs to generate. If None, no logprobs are generated.",
    )
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    best_of: Optional[int] = 1
    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field

    class Config:
        # Example payload attached to the generated schema.
        schema_extra = {
            "example": {
                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                "stop": ["\n", "###"],
            }
        }

# Pydantic model generated from llama_cpp.Completion; used as the
# response_model of the /v1/completions route.
CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)

@router.post(
    "/v1/completions",
    response_model=CreateCompletionResponse,
)
async def create_completion(
    request: Request,
    body: CreateCompletionRequest,
    llama: llama_cpp.Llama = Depends(get_llama),
):
    """Handle ``POST /v1/completions``.

    Args:
        request: Incoming request; polled for client disconnects while
            streaming.
        body: Validated completion parameters.
        llama: Model instance supplied by the ``get_llama`` dependency.

    Returns:
        The completion returned by the model call, or an
        ``EventSourceResponse`` emitting one JSON chunk per event followed by
        a ``[DONE]`` event when ``body.stream`` is true.

    Raises:
        HTTPException: 400 when ``prompt`` is a list with more than one item.
    """
    if isinstance(body.prompt, list):
        # Validate explicitly rather than with `assert`: assertions are removed
        # under `python -O` (extra prompts would then be silently dropped) and
        # otherwise surface as an opaque 500.
        if len(body.prompt) > 1:
            raise HTTPException(
                status_code=400,
                detail="Only a single prompt per request is supported.",
            )
        body.prompt = body.prompt[0] if body.prompt else ""

    # Fields accepted for API compatibility but not passed to the model call.
    exclude = {
        "n",
        "best_of",
        "logit_bias",
        "user",
    }
    kwargs = body.dict(exclude=exclude)

    if body.stream:
        send_chan, recv_chan = anyio.create_memory_object_stream(10)

        async def event_publisher(inner_send_chan: MemoryObjectSendStream):
            async with inner_send_chan:
                try:
                    # Both the model call and the iteration over its chunks are
                    # dispatched through the threadpool helpers.
                    iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs)  # type: ignore
                    async for chunk in iterate_in_threadpool(iterator):
                        await inner_send_chan.send(dict(data=json.dumps(chunk)))
                        if await request.is_disconnected():
                            raise anyio.get_cancelled_exc_class()()
                    await inner_send_chan.send(dict(data="[DONE]"))
                except anyio.get_cancelled_exc_class() as e:
                    print("disconnected")
                    # Shielded scope with a 1 second limit for the closing
                    # message before the cancellation is re-raised.
                    with anyio.move_on_after(1, shield=True):
                        print(
                            f"Disconnected from client (via refresh/close) {request.client}"
                        )
                        await inner_send_chan.send(dict(closing=True))
                        raise e

        return EventSourceResponse(
            recv_chan, data_sender_callable=partial(event_publisher, send_chan)
        )
    else:
        completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs)  # type: ignore
        return completion

class CreateEmbeddingRequest(BaseModel):
    """Request body for ``POST /v1/embeddings``."""

    model: Optional[str] = model_field
    input: Union[str, List[str]] = Field(description="The input to embed.")
    # Accepted in the payload; the handler excludes it from the embedding call.
    user: Optional[str]

    class Config:
        # Example payload attached to the generated schema.
        schema_extra = {
            "example": {
                "input": "The food was delicious and the waiter...",
            }
        }

# Pydantic model generated from llama_cpp.Embedding; used as the
# response_model of the /v1/embeddings route.
CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)

@router.post(
    "/v1/embeddings",
    response_model=CreateEmbeddingResponse,
)
async def create_embedding(
    request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
):
    """Handle ``POST /v1/embeddings``.

    Passes every request field except ``user`` to ``llama.create_embedding``
    via ``run_in_threadpool`` and returns its result.
    """
    embed_kwargs = request.dict(exclude={"user"})
    return await run_in_threadpool(llama.create_embedding, **embed_kwargs)

class ChatCompletionRequestMessage(BaseModel):
    """A single message of a chat conversation: who said it and what was said."""

    role: Literal["system", "user", "assistant"] = Field(
        default="user",
        description="The role of the message.",
    )
    content: str = Field(
        default="",
        description="The content of the message.",
    )

class CreateChatCompletionRequest(BaseModel):
    """Request body for ``POST /v1/chat/completions``.

    The handler forwards every field to ``llama.create_chat_completion``
    except ``n``, ``logit_bias`` and ``user``, which it excludes.
    """

    messages: List[ChatCompletionRequestMessage] = Field(
        default=[],
        description="A list of messages to generate completions for.",
    )
    max_tokens: int = max_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    stop: Optional[List[str]] = stop_field
    stream: bool = stream_field
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    logit_bias: Optional[Dict[str, float]] = Field(None)
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field

    class Config:
        # Example payload attached to the generated schema.
        schema_extra = {
            "example": {
                "messages": [
                    ChatCompletionRequestMessage(
                        role="system", content="You are a helpful assistant."
                    ),
                    ChatCompletionRequestMessage(
                        role="user", content="What is the capital of France?"
                    ),
                ]
            }
        }

# Pydantic model generated from llama_cpp.ChatCompletion; used as the
# response_model of the /v1/chat/completions route.
CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)

@router.post(
    "/v1/chat/completions",
    response_model=CreateChatCompletionResponse,
)
async def create_chat_completion(
    request: Request,
    body: CreateChatCompletionRequest,
    llama: llama_cpp.Llama = Depends(get_llama),
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
    """Handle ``POST /v1/chat/completions``.

    Args:
        request: Incoming request; polled for client disconnects while
            streaming.
        body: Validated chat-completion parameters.
        llama: Model instance supplied by the ``get_llama`` dependency.

    Returns:
        The result of ``llama.create_chat_completion``, or an
        ``EventSourceResponse`` emitting one JSON chunk per event followed by
        a ``[DONE]`` event when ``body.stream`` is true.
    """
    # Drop the fields that are accepted but not forwarded to the model call.
    call_kwargs = body.dict(exclude={"n", "logit_bias", "user"})

    if not body.stream:
        result: llama_cpp.ChatCompletion = await run_in_threadpool(
            llama.create_chat_completion, **call_kwargs  # type: ignore
        )
        return result

    sender, receiver = anyio.create_memory_object_stream(10)

    async def event_publisher(inner_send_chan: MemoryObjectSendStream):
        cancelled_exc = anyio.get_cancelled_exc_class()
        async with inner_send_chan:
            try:
                chunks: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **call_kwargs)  # type: ignore
                async for piece in iterate_in_threadpool(chunks):
                    await inner_send_chan.send(dict(data=json.dumps(piece)))
                    # Stop generating once the client has gone away.
                    if await request.is_disconnected():
                        raise anyio.get_cancelled_exc_class()()
                await inner_send_chan.send(dict(data="[DONE]"))
            except cancelled_exc as e:
                print("disconnected")
                # Shielded scope with a 1 second limit for the closing
                # message before the cancellation is re-raised.
                with anyio.move_on_after(1, shield=True):
                    print(
                        f"Disconnected from client (via refresh/close) {request.client}"
                    )
                    await inner_send_chan.send(dict(closing=True))
                    raise e

    return EventSourceResponse(
        receiver,
        data_sender_callable=partial(event_publisher, sender),
    )

class ModelData(TypedDict):
    """One entry of the ``data`` list returned by ``GET /v1/models``."""

    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]

class ModelList(TypedDict):
    """Envelope returned by ``GET /v1/models``: a list of ``ModelData`` entries."""

    object: Literal["list"]
    data: List[ModelData]

# Pydantic model generated from ModelList; used as the response_model of the
# /v1/models route.
GetModelResponse = create_model_from_typeddict(ModelList)

@router.get("/v1/models", response_model=GetModelResponse)
async def get_models(
    settings: Settings = Depends(get_settings),
    llama: llama_cpp.Llama = Depends(get_llama),
) -> ModelList:
    """Handle ``GET /v1/models``.

    Returns a one-element model list whose id is ``settings.model_alias``
    when an alias is configured, otherwise ``llama.model_path``.
    """
    if settings.model_alias is not None:
        model_id = settings.model_alias
    else:
        model_id = llama.model_path

    entry = {
        "id": model_id,
        "object": "model",
        "owned_by": "me",
        "permissions": [],
    }
    return {
        "object": "list",
        "data": [entry],
    }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

app.py

Latest commit

History

app.py

File metadata and controls