diffusers/src/diffusers/models/attention.py at main · python273/diffusers

History

226 lines (176 loc) · 8.35 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

import math

import torch

import torch.nn.functional as F

from torch import nn

class AttentionBlock(nn.Module):
    """
    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
    to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    Uses three q, k, v linear layers to compute attention.

    Parameters:
        channels: number of channels of the input feature map; also the width of the q/k/v/output projections.
        num_head_channels: channels per attention head. The number of heads is `channels // num_head_channels`;
            when `None`, a single head is used.
        num_groups: number of groups passed to the `GroupNorm` applied before attention.
        rescale_output_factor: the sum of the attention output and the residual is divided by this value.
        eps: epsilon passed to the `GroupNorm`.
    """

    def __init__(
        self,
        channels,
        num_head_channels=None,
        num_groups=32,
        rescale_output_factor=1.0,
        eps=1e-5,
    ):
        super().__init__()
        self.channels = channels

        self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
        self.num_head_size = num_head_channels
        self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=num_groups, eps=eps, affine=True)

        # define q,k,v as linear layers
        self.query = nn.Linear(channels, channels)
        self.key = nn.Linear(channels, channels)
        self.value = nn.Linear(channels, channels)

        self.rescale_output_factor = rescale_output_factor
        # Output projection. The former third positional argument (`1`) was only a truthy `bias` flag,
        # so the default (bias enabled) is equivalent.
        self.proj_attn = nn.Linear(channels, channels)

    def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into heads and move the head axis in front of the sequence axis."""
        new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
        # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
        return new_projection

    def forward(self, hidden_states):
        """
        Apply group norm and multi-head self-attention over the spatial positions, then add the residual.

        `hidden_states` must be 4-D, unpacked as (batch, channel, height, width); the result has the same shape.
        """
        residual = hidden_states
        batch, channel, height, width = hidden_states.shape

        # norm
        hidden_states = self.group_norm(hidden_states)

        # flatten the spatial grid into a sequence: (B, C, H, W) -> (B, H * W, C)
        hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)

        # proj to q, k, v
        query_proj = self.query(hidden_states)
        key_proj = self.key(hidden_states)
        value_proj = self.value(hidden_states)

        # transpose
        query_states = self.transpose_for_scores(query_proj)
        key_states = self.transpose_for_scores(key_proj)
        value_states = self.transpose_for_scores(value_proj)

        # get scores
        # The scale is applied to both q and k, so each carries the fourth root of the per-head width
        # and the product is scaled by 1 / sqrt(channels / num_heads).
        scale = 1 / math.sqrt(math.sqrt(self.channels / self.num_heads))
        attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale)
        # softmax is computed in float32 and cast back to the dtype of the scores
        attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)

        # compute attention output
        context_states = torch.matmul(attention_probs, value_states)

        # merge heads back: (B, H, T, D) -> (B, T, H, D) -> (B, T, C)
        context_states = context_states.permute(0, 2, 1, 3).contiguous()
        new_context_states_shape = context_states.size()[:-2] + (self.channels,)
        context_states = context_states.view(new_context_states_shape)

        # compute next hidden_states
        hidden_states = self.proj_attn(context_states)
        hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)

        # res connect and rescale
        hidden_states = (hidden_states + residual) / self.rescale_output_factor
        return hidden_states

class SpatialTransformer(nn.Module):
    """
    Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply
    standard transformer action. Finally, reshape to image.

    Parameters:
        in_channels: number of channels of the input (and output) feature map. It is normalized with a
            32-group `GroupNorm`.
        n_heads: number of attention heads in each transformer block.
        d_head: channels per attention head; the transformer width is `n_heads * d_head`.
        depth: number of `BasicTransformerBlock` layers applied in sequence.
        dropout: dropout probability forwarded to each transformer block.
        context_dim: feature size of the optional `context`, forwarded to each transformer block.
    """

    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None):
        super().__init__()
        self.n_heads = n_heads
        self.d_head = d_head
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)

        # 1x1 convolution: in_channels -> inner_dim
        self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)

        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
                for _ in range(depth)
            ]
        )

        # 1x1 convolution: inner_dim -> in_channels, so the residual sum in `forward` is shape-compatible
        self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, context=None):
        """
        Run the transformer blocks over the flattened spatial positions of `x` and add the input back.

        `x` must be 4-D, unpacked as (batch, channel, height, width); the result has the same shape.
        `context` is handed unchanged to every transformer block.
        """
        # note: if no context is given, cross-attention defaults to self-attention
        batch, _, height, width = x.shape
        residual = x
        x = self.norm(x)
        x = self.proj_in(x)
        # After proj_in the channel count is n_heads * d_head, which need not equal in_channels,
        # so the reshapes must use the projected width rather than the input channel count.
        inner_dim = x.shape[1]
        # (B, D, H, W) -> (B, H, W, D) -> (B, H * W, D)
        x = x.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
        for block in self.transformer_blocks:
            x = block(x, context=context)
        # (B, H * W, D) -> (B, H, W, D) -> (B, D, H, W)
        x = x.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2)
        x = self.proj_out(x)
        return x + residual

class BasicTransformerBlock(nn.Module):
    """
    One transformer layer: self-attention, then attention over an optional context, then a feed-forward
    network. Each sub-layer is preceded by its own `LayerNorm` and followed by a residual addition.

    Parameters:
        dim: feature size of the tokens processed by the block.
        n_heads: number of attention heads in both attention layers.
        d_head: channels per attention head.
        dropout: dropout probability forwarded to the attention and feed-forward layers.
        context_dim: feature size of the optional context consumed by the second attention layer.
        gated_ff: forwarded to `FeedForward` as its `glu` argument.
        checkpoint: stored on the instance; it is not read anywhere inside this class.
    """

    def __init__(self, dim, n_heads, d_head, dropout=0.0, context_dim=None, gated_ff=True, checkpoint=True):
        super().__init__()
        # keyword arguments common to both attention layers
        attn_kwargs = dict(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout)

        # first attention layer: always self-attention (no context_dim given)
        self.attn1 = CrossAttention(**attn_kwargs)
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        # second attention layer: attends over `context`, or over the tokens themselves when it is None
        self.attn2 = CrossAttention(context_dim=context_dim, **attn_kwargs)

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None):
        """Apply the three pre-normalized residual sub-layers to `x` and return the result."""
        self_attended = self.attn1(self.norm1(x))
        x = self_attended + x

        cross_attended = self.attn2(self.norm2(x), context=context)
        x = cross_attended + x

        fed_forward = self.ff(self.norm3(x))
        return fed_forward + x

class CrossAttention(nn.Module):
    """
    Multi-head attention of a query sequence over a context sequence.

    Parameters:
        query_dim: feature size of the query tokens, and of the output.
        context_dim: feature size of the context tokens; defaults to `query_dim` when `None`.
        heads: number of attention heads.
        dim_head: channels per attention head; the inner width is `dim_head * heads`.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = context_dim if context_dim is not None else query_dim

        self.scale = dim_head**-0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))

    def reshape_heads_to_batch_dim(self, tensor):
        """Fold the heads into the batch axis: (B, T, H * D) -> (B * H, T, D), batch-major."""
        batch_size, seq_len, dim = tensor.shape
        head_size = self.heads
        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
        return tensor

    def reshape_batch_dim_to_heads(self, tensor):
        """Inverse of `reshape_heads_to_batch_dim`: (B * H, T, D) -> (B, T, H * D)."""
        batch_size, seq_len, dim = tensor.shape
        head_size = self.heads
        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
        return tensor

    def forward(self, x, context=None, mask=None):
        """
        Attend from `x` to `context` (or to `x` itself when `context` is None).

        `mask`, when given, is reshaped to (batch, -1) and interpreted as a boolean keep-mask over the
        context positions: positions where it is False are excluded from the softmax.
        Returns a tensor with the batch and sequence dimensions of `x` and `query_dim` features.
        """
        batch_size = x.shape[0]

        q = self.to_q(x)
        context = context if context is not None else x
        k = self.to_k(context)
        v = self.to_v(context)

        q = self.reshape_heads_to_batch_dim(q)
        k = self.reshape_heads_to_batch_dim(k)
        v = self.reshape_heads_to_batch_dim(v)

        sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale

        if mask is not None:
            mask = mask.reshape(batch_size, -1).to(torch.bool)
            max_neg_value = -torch.finfo(sim.dtype).max
            # Heads were folded batch-major (row index = b * heads + head), so every sample's mask has to
            # be repeated `heads` times consecutively. Tiling the whole batch with `.repeat(heads, 1, 1)`
            # would pair rows with the masks of other samples.
            mask = mask[:, None, :].repeat_interleave(self.heads, dim=0)
            sim = sim.masked_fill(~mask, max_neg_value)

        # attention, what we cannot get enough of
        attn = sim.softmax(dim=-1)

        out = torch.einsum("b i j, b j d -> b i d", attn, v)
        out = self.reshape_batch_dim_to_heads(out)
        return self.to_out(out)

class FeedForward(nn.Module):
    """
    Position-wise feed-forward network: a GEGLU input projection, dropout, and a linear output projection.

    Parameters:
        dim: feature size of the input.
        dim_out: feature size of the output; defaults to `dim` when `None`.
        mult: the hidden width is `int(dim * mult)`.
        glu: accepted for interface compatibility; the GEGLU projection is used regardless of its value.
        dropout: dropout probability applied between the two projections.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0):
        super().__init__()
        hidden_dim = int(dim * mult)
        if dim_out is None:
            dim_out = dim

        layers = [
            GEGLU(dim, hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim_out),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the feed-forward stack."""
        output = self.net(x)
        return output

# GEGLU: gated-GELU input projection used by the feed-forward network

class GEGLU(nn.Module):
    """
    Gated linear unit with a GELU gate.

    A single linear layer produces `2 * dim_out` features; the first half is the value and the second
    half, passed through GELU, is the gate that multiplies it.

    Parameters:
        dim_in: feature size of the input.
        dim_out: feature size of the output.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        # one projection yields both the value half and the gate half
        self.proj = nn.Linear(dim_in, 2 * dim_out)

    def forward(self, x):
        """Project `x`, split the result in two along the last axis, and gate the first half."""
        projected = self.proj(x)
        value, gate = torch.chunk(projected, 2, dim=-1)
        gated = value * F.gelu(gate)
        return gated

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

attention.py

Latest commit

History

attention.py

File metadata and controls