forked from abetlen/llama-cpp-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathllama_tokenizer.py
More file actions
106 lines (87 loc) · 3.37 KB
/
llama_tokenizer.py
File metadata and controls
106 lines (87 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from __future__ import annotations
import abc
from typing import (
List,
Optional,
Any,
)
import llama_cpp
from llama_cpp.llama_types import List
class BaseLlamaTokenizer(abc.ABC):
    """Abstract interface for tokenizers that map bytes to token ids and back.

    Subclasses must provide both ``tokenize`` and ``detokenize``; the class
    cannot be instantiated until they do.
    """

    @abc.abstractmethod
    def tokenize(
        self,
        text: bytes,
        add_bos: bool = True,
        special: bool = True,
    ) -> List[int]:
        """Turn ``text`` into a list of token ids.

        Args:
            text: The text to tokenize, as bytes.
            add_bos: Whether a beginning-of-sequence token should be added.
            special: Controls whether the text is tokenized literally or as
                special tokens.

        Returns:
            The token ids for ``text``.

        Raises:
            NotImplementedError: If the base implementation is invoked.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
    ) -> bytes:
        """Turn ``tokens`` back into text.

        Args:
            tokens: The token ids to convert.
            prev_tokens: The tokens that came before ``tokens`` when
                ``tokens`` continues an earlier sequence; ``None`` otherwise.

        Returns:
            The text for ``tokens``, as bytes.

        Raises:
            NotImplementedError: If the base implementation is invoked.
        """
        raise NotImplementedError
class LlamaTokenizer(BaseLlamaTokenizer):
    """Tokenizer that delegates to the model object held by a ``llama_cpp.Llama``."""

    def __init__(self, llama: llama_cpp.Llama):
        """Store the ``_model`` attribute of ``llama`` for later delegation."""
        # Reaches into a private attribute of Llama, hence the type-checker opt-out.
        self._model = llama._model  # type: ignore

    def tokenize(
        self,
        text: bytes,
        add_bos: bool = True,
        special: bool = True,
    ) -> List[int]:
        """Tokenize ``text`` by forwarding every argument to the wrapped model."""
        token_ids = self._model.tokenize(text, add_bos=add_bos, special=special)
        return token_ids

    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
    ) -> bytes:
        """Detokenize ``tokens`` with the wrapped model.

        ``prev_tokens`` is accepted to satisfy the base-class signature but is
        not passed on to the model.
        """
        raw_text = self._model.detokenize(tokens)
        return raw_text

    def encode(
        self,
        text: str,
        add_bos: bool = True,
        special: bool = True,
    ) -> List[int]:
        """Tokenize a ``str`` by UTF-8 encoding it and calling ``tokenize``.

        Characters that cannot be encoded are dropped (``errors="ignore"``).
        """
        utf8_text = text.encode("utf-8", errors="ignore")
        return self.tokenize(utf8_text, add_bos=add_bos, special=special)

    def decode(self, tokens: List[int]) -> str:
        """Detokenize ``tokens`` and return the result as a ``str``.

        Bytes that are not valid UTF-8 are dropped (``errors="ignore"``).
        """
        raw_text = self.detokenize(tokens)
        return raw_text.decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Build a tokenizer from the model file at ``path``.

        The ``llama_cpp.Llama`` instance is created with ``vocab_only=True``.
        """
        vocab_only_llama = llama_cpp.Llama(model_path=path, vocab_only=True)
        return cls(vocab_only_llama)
class LlamaHFTokenizer(BaseLlamaTokenizer):
    """Adapter exposing a Hugging Face tokenizer through the bytes-based interface.

    The wrapped object only needs ``encode(text, add_special_tokens=...)`` and
    ``decode(token_ids)`` methods that work on ``str``.
    """

    def __init__(self, hf_tokenizer: Any):
        """Wrap ``hf_tokenizer``; it is stored as ``self.hf_tokenizer``."""
        self.hf_tokenizer = hf_tokenizer

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize UTF-8 ``text`` with the wrapped tokenizer.

        Args:
            text: UTF-8 bytes; undecodable bytes are dropped (``errors="ignore"``).
            add_bos: Accepted to satisfy the base-class signature; it is not
                forwarded to the wrapped tokenizer.
            special: Forwarded as ``add_special_tokens``.

        Returns:
            Whatever the wrapped tokenizer's ``encode`` returns.
        """
        return self.hf_tokenizer.encode(
            text.decode("utf-8", errors="ignore"), add_special_tokens=special
        )

    def _decode_to_bytes(self, token_ids: Any) -> bytes:
        """Decode ``token_ids`` with the wrapped tokenizer and UTF-8 encode the text."""
        return self.hf_tokenizer.decode(token_ids).encode("utf-8", errors="ignore")

    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
    ) -> bytes:
        """Detokenize ``tokens`` into UTF-8 bytes.

        Args:
            tokens: The token ids to convert.
            prev_tokens: Tokens preceding ``tokens``. When given, the whole
                sequence ``prev_tokens + tokens`` is decoded and the bytes
                obtained from decoding ``prev_tokens`` alone are cut off the
                front, so only the contribution of ``tokens`` is returned.

        Returns:
            The decoded text for ``tokens`` as UTF-8 bytes.
        """
        if prev_tokens is None:
            return self._decode_to_bytes(tokens)

        # Normalise to lists so a list/tuple mix concatenates instead of
        # raising TypeError.
        previous = list(prev_tokens)
        full_text = self._decode_to_bytes(previous + list(tokens))
        previous_text = self._decode_to_bytes(previous)
        # NOTE(review): assumes the decoding of prev_tokens is a byte prefix
        # of the decoding of the full sequence; confirm for the tokenizers used.
        return full_text[len(previous_text):]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
        """Create an instance from ``AutoTokenizer.from_pretrained``.

        Args:
            pretrained_model_name_or_path: Passed through unchanged to
                ``transformers.AutoTokenizer.from_pretrained``.

        Raises:
            ImportError: If ``transformers`` cannot be imported; the original
                import failure is attached as ``__cause__``.
        """
        try:
            # Imported lazily so the module works without `transformers`.
            from transformers import AutoTokenizer
        except ImportError as err:
            # The trailing space keeps the two sentences from running together.
            raise ImportError(
                "The `transformers` library is required to use the `LlamaHFTokenizer`. "
                "You can install it with `pip install transformers`."
            ) from err
        hf_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path
        )
        return cls(hf_tokenizer)