llama-cpp-python/llama_cpp/server/settings.py at devel · D4ve-R/llama-cpp-python

History

163 lines (156 loc) · 5.39 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

import os

import multiprocessing

from typing import Optional, List, Literal

from pydantic import Field

from pydantic_settings import BaseSettings, SettingsConfigDict

import llama_cpp

# Disable warning for model and model_alias settings.
# pydantic treats the "model_" prefix as a protected namespace by default and
# warns about fields using it; setting the protected namespaces to an empty
# tuple turns that check off.
# NOTE(review): this mutates the config dict of the shared BaseSettings base
# class in place, so it presumably affects every BaseSettings subclass created
# afterwards in this process, not only the classes in this module -- confirm
# that this is intended.
BaseSettings.model_config['protected_namespaces'] = ()

class ModelSettings(BaseSettings):
    # Settings describing a single model: the model file itself plus the
    # model, context, sampling, LoRA, backend, chat-format and cache options
    # declared below. Only `model` has no default; every other field falls
    # back to the default given in its Field(...).

    model: str = Field(
        description="The path to the model to use for generating completions.",
    )
    model_alias: Optional[str] = Field(
        default=None,
        description="The alias of the model to use for generating completions.",
    )

    # ---- Model params ----
    n_gpu_layers: int = Field(
        default=0,
        ge=-1,
        description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
    )
    main_gpu: int = Field(default=0, ge=0, description="Main GPU to use.")
    tensor_split: Optional[List[float]] = Field(
        default=None,
        description="Split layers across multiple GPUs in proportion.",
    )
    vocab_only: bool = Field(
        default=False,
        description="Whether to only return the vocabulary.",
    )
    # The mmap/mlock defaults are whatever llama_cpp reports when this class
    # body is executed (i.e. at import time).
    use_mmap: bool = Field(
        default=llama_cpp.llama_mmap_supported(), description="Use mmap."
    )
    use_mlock: bool = Field(
        default=llama_cpp.llama_mlock_supported(), description="Use mlock."
    )

    # ---- Context params ----
    seed: int = Field(
        default=llama_cpp.LLAMA_DEFAULT_SEED,
        description="Random seed. -1 for random.",
    )
    n_ctx: int = Field(
        default=2048,
        ge=1,
        description="The context size.",
    )
    n_batch: int = Field(
        default=512,
        ge=1,
        description="The batch size to use per eval.",
    )
    # Thread defaults: half of the reported CPU count, but never less than 1.
    # The value is computed once, when the class body runs.
    n_threads: int = Field(
        default=max(multiprocessing.cpu_count() // 2, 1),
        ge=1,
        description="The number of threads to use.",
    )
    # NOTE(review): the lower bound here is 0 while n_threads requires >= 1;
    # confirm the difference is intentional.
    n_threads_batch: int = Field(
        default=max(multiprocessing.cpu_count() // 2, 1),
        ge=0,
        description="The number of threads to use when batch processing.",
    )
    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED)
    rope_freq_base: float = Field(
        default=0.0,
        description="RoPE base frequency",
    )
    rope_freq_scale: float = Field(
        default=0.0,
        description="RoPE frequency scaling factor",
    )
    # The yarn_* fields carry no description in this file; presumably they are
    # YaRN context-extension parameters forwarded to llama_cpp -- TODO confirm.
    yarn_ext_factor: float = Field(default=-1.0)
    yarn_attn_factor: float = Field(default=1.0)
    yarn_beta_fast: float = Field(default=32.0)
    yarn_beta_slow: float = Field(default=1.0)
    yarn_orig_ctx: int = Field(default=0)
    mul_mat_q: bool = Field(
        default=True,
        description="if true, use experimental mul_mat_q kernels",
    )
    f16_kv: bool = Field(
        default=True,
        description="Whether to use f16 key/value.",
    )
    logits_all: bool = Field(
        default=True,
        description="Whether to return logits.",
    )
    embedding: bool = Field(
        default=True,
        description="Whether to use embeddings.",
    )

    # ---- Sampling params ----
    last_n_tokens_size: int = Field(
        default=64,
        ge=0,
        description="Last n tokens to keep for repeat penalty calculation.",
    )

    # ---- LoRA params ----
    lora_base: Optional[str] = Field(
        default=None,
        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.",
    )
    lora_path: Optional[str] = Field(
        default=None, description="Path to a LoRA file to apply to the model."
    )

    # ---- Backend params ----
    numa: bool = Field(default=False, description="Enable NUMA support.")

    # ---- Chat format params ----
    chat_format: str = Field(default="llama-2", description="Chat format to use.")
    clip_model_path: Optional[str] = Field(
        default=None,
        description="Path to a CLIP model to use for multi-modal chat completion.",
    )

    # ---- Cache params ----
    cache: bool = Field(
        default=False,
        description="Use a cache to reduce processing times for evaluated prompts.",
    )
    cache_type: Literal["ram", "disk"] = Field(
        default="ram",
        description="The type of cache to use. Only used if cache is True.",
    )
    # 2 << 30 == 2147483648 bytes (2 GiB).
    cache_size: int = Field(
        default=2 << 30,
        description="The size of the cache in bytes. Only used if cache is True.",
    )

    # ---- Misc ----
    verbose: bool = Field(
        default=True,
        description="Whether to print debug information.",
    )

class ServerSettings(BaseSettings):
    # Settings for the server process itself (listen address, request
    # handling, auxiliary paths), separate from the per-model options.

    # Values may come from a ".env" file; entries that do not match a declared
    # field are ignored (extra="ignore").
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    host: str = Field(
        default="localhost",
        description="Listen address",
    )
    port: int = Field(
        default=8000,
        description="Listen port",
    )
    interrupt_requests: bool = Field(
        default=True,
        description="Whether to interrupt requests when a new request is received.",
    )
    config: Optional[str] = Field(
        default=None,
        description="Path to config file",
    )
    plugins: Optional[str] = Field(
        default=None,
        description="Path to the plugins directory",
    )

class Settings(ModelSettings):
    # Top-level model configuration: inherits every ModelSettings field and
    # adds `models`, a list of further ModelSettings entries.

    # Values may come from a "model.env" file; entries that do not match a
    # declared field are ignored (extra="ignore").
    model_config = SettingsConfigDict(env_file="model.env", extra="ignore")

    models: Optional[List[ModelSettings]] = Field(
        default=[],
        description="Model configs, overwrites default config",
    )

# Module-level holder for the active ServerSettings. Starts as None; within
# this file it is reassigned by set_settings() and read by get_settings().
SETTINGS: Optional[ServerSettings] = None

def set_settings(settings: ServerSettings) -> None:
    """Store ``settings`` in the module-level ``SETTINGS`` variable.

    Args:
        settings: The server settings object to make current. It replaces
            whatever value ``SETTINGS`` held before.
    """
    # Rebind the module global rather than creating a function-local name.
    global SETTINGS
    SETTINGS = settings

def get_settings():
    """Yield the current module-level ``SETTINGS`` value exactly once.

    This is a generator function: calling it returns a generator, and the
    first ``next()`` on that generator produces whatever ``SETTINGS`` holds at
    that moment (``None`` if nothing has assigned it yet).

    NOTE(review): presumably consumed as a generator-style dependency by the
    web framework -- confirm against the call sites.
    """
    yield SETTINGS

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

settings.py

Latest commit

History

settings.py

File metadata and controls