-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Expand file tree
/
Copy pathspeaker.py
More file actions
133 lines (115 loc) · 4.23 KB
/
speaker.py
File metadata and controls
133 lines (115 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""Specifies the inference interfaces for speaker recognition modules.
Authors:
* Aku Rouhe 2021
* Peter Plantinga 2021
* Loren Lugosch 2020
* Mirco Ravanelli 2020
* Titouan Parcollet 2021
* Abdel Heba 2021
* Andreas Nautsch 2022, 2023
* Pooneh Mousavi 2023
* Sylvain de Langen 2023
* Adel Moumen 2023
* Pradnya Kandarkar 2023
"""
import torch
from speechbrain.inference.classifiers import EncoderClassifier
class SpeakerRecognition(EncoderClassifier):
    """A ready-to-use model for speaker recognition.

    Speaker verification is performed by embedding two utterances and
    comparing the embeddings with cosine similarity, see ``verify_batch()``
    and ``verify_files()``.

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded unchanged to the parent class constructor.

    Example
    -------
    >>> import torchaudio
    >>> from speechbrain.inference.speaker import SpeakerRecognition
    >>> # Model is downloaded from the speechbrain HuggingFace repo
    >>> tmpdir = getfixture("tmpdir")
    >>> verification = SpeakerRecognition.from_hparams(
    ...     source="speechbrain/spkrec-ecapa-voxceleb",
    ...     savedir=tmpdir,
    ... )
    >>> # Perform verification
    >>> from speechbrain.dataio import audio_io
    >>> signal, fs = audio_io.load("tests/samples/single-mic/example1.wav")
    >>> signal2, fs = audio_io.load("tests/samples/single-mic/example2.flac")
    >>> score, prediction = verification.verify_batch(signal, signal2)
    """

    # Names of the modules this interface relies on.
    # NOTE(review): presumably validated by the parent class at load time;
    # confirm against ``Pretrained``.
    MODULES_NEEDED = [
        "compute_features",
        "mean_var_norm",
        "embedding_model",
        "mean_var_norm_emb",
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Embeddings are compared along their last dimension.
        self.similarity = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)

    def verify_batch(
        self, wavs1, wavs2, wav1_lens=None, wav2_lens=None, threshold=0.25
    ):
        """Performs speaker verification with cosine similarity.

        Both batches are encoded with ``encode_batch`` (``normalize=False``)
        and the resulting embeddings are compared pairwise.

        Arguments
        ---------
        wavs1 : torch.Tensor
            torch.Tensor containing the speech waveform1 (batch, time).
            Make sure the sample rate is fs=16000 Hz.
        wavs2 : torch.Tensor
            torch.Tensor containing the speech waveform2 (batch, time).
            Make sure the sample rate is fs=16000 Hz.
        wav1_lens : torch.Tensor
            torch.Tensor containing the relative length for each sentence
            in the batch (e.g., [0.8 0.6 1.0]). Passed to ``encode_batch``
            as is; ``None`` by default.
        wav2_lens : torch.Tensor
            torch.Tensor containing the relative length for each sentence
            in the batch (e.g., [0.8 0.6 1.0]). Passed to ``encode_batch``
            as is; ``None`` by default.
        threshold : float
            Threshold applied to the cosine similarity. A pair is accepted
            as the same speaker only when its score is strictly greater
            than this value.

        Returns
        -------
        score
            The cosine similarity between the two embeddings; higher means
            more similar.
        prediction
            The result of ``score > threshold``: True if the two signals in
            input are judged to be from the same speaker, False otherwise.
        """
        emb1 = self.encode_batch(wavs1, wav1_lens, normalize=False)
        emb2 = self.encode_batch(wavs2, wav2_lens, normalize=False)
        score = self.similarity(emb1, emb2)
        return score, score > threshold

    def verify_files(self, path_x, path_y, *, threshold=0.25, **kwargs):
        """Speaker verification of two audio files with cosine similarity.

        Each file is loaded with ``load_audio``, wrapped into a batch of
        size one and scored with ``verify_batch``.

        Arguments
        ---------
        path_x : str
            Path to file x
        path_y : str
            Path to file y
        threshold : float
            Decision threshold forwarded to ``verify_batch``. Keyword-only;
            it is consumed here and not forwarded to ``load_audio``.
        **kwargs : dict
            Arguments to ``load_audio``

        Returns
        -------
        score
            The cosine similarity for the pair (first element of the
            batch score).
        prediction
            True if the two files are judged to be from the same speaker
            (``score > threshold``), False otherwise.
        """
        waveform_x = self.load_audio(path_x, **kwargs)
        waveform_y = self.load_audio(path_y, **kwargs)
        # Fake batches: add a leading batch dimension of size one.
        batch_x = waveform_x.unsqueeze(0)
        batch_y = waveform_y.unsqueeze(0)
        # Verify:
        score, decision = self.verify_batch(
            batch_x, batch_y, threshold=threshold
        )
        # Drop the fake batch dimension again.
        return score[0], decision[0]