-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
executable file
·81 lines (58 loc) · 2.52 KB
/
model.py
File metadata and controls
executable file
·81 lines (58 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import logging
import torch
from transformers import BertModel, BertTokenizer
logger = logging.getLogger(__name__)
class Handler:
    """
    The model handler is responsible for loading the model and tokenizer from the transformers library.
    It also provides methods to embed and compute the similarity between two input texts.
    """
    def __init__(self, model_name: str = "bert-base-uncased") -> None:
        """
        Initialize the model handler.
        A tokenizer and a model are loaded from the Hugging Face Transformers library given the model name.
        :param model_name: Model name to use for loading model with transformers.
        """
        # Load the model and tokenizer with the Hugging Face Transformers library
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        logger.info("Handler initialisation completed. ")
    def embed(self, text: str) -> list[float]:
        """
        Embed the input text.
        Embed the input text using the model and return the embedding as a list of floats.
        :param text: The text to embed.
        :return: The embedding of the input text.
        """
        # Use the module-level logger (not the root logger via `logging.info`) so
        # this message honours handlers configured for this module, and use lazy
        # %-style args so the slice/format only happens when INFO is enabled.
        logger.info("Generating embedding for: %s...", text[0:10])
        # Forward pass
        tensors = self.forward(text)
        return tensors.tolist()
    def similarity(self, text_1: str, text_2: str) -> float:
        """
        Compute the cosine similarity between two input texts.
        Embed both the input texts using the model and compute the cosine similarity between the two embeddings.
        :param text_1: The first text.
        :param text_2: The second text.
        :return: The similarity score between the two input texts.
        """
        # Forward pass
        tensor_1 = self.forward(text_1)
        tensor_2 = self.forward(text_2)
        # Compute the similarity along the single embedding dimension (dim=0,
        # since forward() returns a 1-D tensor).
        return torch.nn.functional.cosine_similarity(tensor_1, tensor_2, dim=0).item()
    @torch.no_grad()
    def forward(self, text: str) -> torch.Tensor:
        """
        Forward pass of the model.
        Forward the input text through the model and return the last pooling layer.
        This is a tensor of shape (768,).
        :param text: Text to embed.
        :return: Tensor with the last pooling layer.
        """
        # Tokenize the input
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True
        )
        # Forward pass: pooler_output is (batch, hidden); take the first (only)
        # row so callers get a 1-D embedding tensor.
        return self.model(**inputs).pooler_output[0]