Skip to content

Commit fd731d7

Browse files
committed
refactor cache settings
1 parent ec9a9db commit fd731d7

File tree

2 files changed: +10 additions, -18 deletions

llama_cpp/llama.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -759,10 +759,6 @@ def __init__(
759759
# Chat Format Params
760760
chat_format: str = "llama-2",
761761
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
762-
# Cache
763-
cache: bool = False,
764-
cache_type: str = "ram",
765-
cache_size: int = 2 << 30,
766762
# Misc
767763
verbose: bool = True,
768764
# Extra Params
@@ -820,9 +816,6 @@ def __init__(
820816
numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
821817
chat_format: String specifying the chat format to use when calling create_chat_completion.
822818
chat_handler: Optional chat handler to use when calling create_chat_completion.
823-
cache: Optional if true enables caching.
824-
cache_type: String can be "ram" or "disk".
825-
cache_size: Number of bytes to cache, defaults to 2GB
826819
verbose: Print verbose output to stderr.
827820
828821
Raises:
@@ -965,17 +958,6 @@ def __init__(
965958
(n_ctx, self._n_vocab), dtype=np.single
966959
)
967960

968-
if cache:
969-
if cache_type == "disk":
970-
if verbose:
971-
print(f"Using disk cache with size {cache_size}")
972-
cache = LlamaDiskCache(capacity_bytes=cache_size)
973-
else:
974-
if verbose:
975-
print(f"Using ram cache with size {cache_size}")
976-
cache = LlamaRAMCache(capacity_bytes=cache_size)
977-
self.set_cache(cache)
978-
979961
@property
980962
def ctx(self) -> llama_cpp.llama_context_p:
981963
assert self._ctx.ctx is not None

llama_cpp/server/model.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,16 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
7878
chat_handler=chat_handler
7979
)
8080
self._model.alias = model
81+
if settings.cache:
82+
if settings.cache_type == "disk":
83+
if settings.verbose:
84+
print(f"Using disk cache with size {settings.cache_size}")
85+
cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
86+
else:
87+
if settings.verbose:
88+
print(f"Using ram cache with size {settings.cache_size}")
89+
cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)
90+
self._model.set_cache(cache)
8191
return self._model
8292

8393
def __getitem__(self, model: str):

0 commit comments

Comments (0)