SD v1.5: v1-inference.yaml
model:
base_learning_rate: 1.0e-04
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.00085
linear_end: 0.0120
num_timesteps_cond: 1
log_every_t: 200
timesteps: 1000
first_stage_key: "jpg"
cond_stage_key: "txt"
image_size: 64
channels: 4
cond_stage_trainable: false # Note: different from the one we trained before
conditioning_key: crossattn
monitor: val/loss_simple_ema
scale_factor: 0.18215
use_ema: False
scheduler_config: # 10000 warmup steps
target: ldm.lr_scheduler.LambdaLinearScheduler
params:
warm_up_steps: [ 10000 ]
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
f_start: [ 1.e-6 ]
f_max: [ 1. ]
f_min: [ 1. ]
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:
image_size: 32 # unused
in_channels: 4
out_channels: 4
model_channels: 320
attention_resolutions: [ 4, 2, 1 ]
num_res_blocks: 2
channel_mult: [ 1, 2, 4, 4 ]
num_heads: 8
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
use_checkpoint: True
legacy: False
first_stage_config:
target: ldm.models.autoencoder.AutoencoderKL
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
double_z: true
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 4
- 4
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity
cond_stage_config:
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
modules/initialize.py
Thread(target=load_model).start()
load_model->shared.sd_model
modules/shared_items.py
class Shared:
sd_model property -> modules.sd_models.model_data.get_sd_model()
sd_models.py
SdModelData:
get_sd_model()->load_model()
model_data = SdModelData()
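For context, the lazy-load pattern behind this, simplified from sd_models.py (the real class also takes a lock and handles load errors):
class SdModelData:
    def __init__(self):
        self.sd_model = None
    def get_sd_model(self):
        if self.sd_model is None:   # first access triggers the full load below
            load_model()
        return self.sd_model
model_data = SdModelData()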
sd_models.py load_model()
load_model(checkpoint_info,already_loaded_state_dict)->
state_dict = get_checkpoint_state_dict(checkpoint_info,..)
- torch.load()
checkpoint_config = sd_models_config.find_checkpoint_config(state_dict,checkpoint_info)
# the state_dict weights are loaded at this point; entries look like:
'model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight': tensor([0.8882, 0.9307, 0.8149, 0.8799, 0.8374, 0.8779, 0.8208, 0.7705, 0.7871,
0.6953, 0.8354, 0.8594, 0.7881, 0.8018, 0.8442, 0.7744, 0.7969, 0.7715, ...])
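To poke at this by hand, a checkpoint can be opened the same way get_checkpoint_state_dict does (path hypothetical; .safetensors files go through the safetensors reader instead of torch.load):
import torch
sd = torch.load("v1-5-pruned-emaonly.ckpt", map_location="cpu")
state_dict = sd.get("state_dict", sd)   # some checkpoints nest weights under "state_dict"
for k, v in list(state_dict.items())[:3]:
    print(k, tuple(v.shape))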
sd_models_config.py
find_checkpoint_config(state_dict,info)
guess_model_config_from_state_dict(state_dict,info.filename)
- config_default # choose the matching yaml from the configs above based on characteristic keys in the state_dict
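A simplified sketch of that dispatch (the real guess_model_config_from_state_dict checks more keys and tensor shapes; these are characteristic keys of typical checkpoints):
def guess_model_config_from_state_dict(sd, filename):
    if sd.get('conditioner.embedders.1.model.ln_final.weight') is not None:
        return config_sdxl      # SDXL: two text encoders under `conditioner`
    if sd.get('cond_stage_model.model.ln_final.weight') is not None:
        return config_sd2       # SD 2.x: OpenCLIP text encoder
    return config_default       # SD 1.x -> v1-inference.yaml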
sd_models.py load_model()
sd_config = OmegaConf.load(checkpoint_config)
Creating model from config: /root/autodl-tmp/stable-diffusion-webui/configs/v1-inference.yaml
sd_model = instantiate_from_config(sd_config.model)
A quick note on the ldm code layout:
models holds the code that strings the whole pipeline together (e.g. DDPM); modules holds the concrete building-block code
repositories/stable-diffusion-stability-ai/ldm/util.py
get_obj_from_str(config["target"])(**config.get("params", dict()))
module: ldm.models.diffusion.ddpm, cls: LatentDiffusion
importlib.import_module(module, package=None)->
<module 'ldm.models.diffusion.ddpm' from '/root/autodl-tmp/stable-diffusion-webui/repositories/stable-diffusion-stability-ai/ldm/models/diffusion/ddpm.py'>
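instantiate_from_config is the tiny factory that consumes every target:/params: pair in the yaml; ldm/util.py is essentially:
import importlib
def get_obj_from_str(string):
    module, cls = string.rsplit(".", 1)
    return getattr(importlib.import_module(module, package=None), cls)
def instantiate_from_config(config):
    if "target" not in config:
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))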
sd_models.py load_model()
sd_model = instantiate_from_config(sd_config.model)
# sd_model = LatentDiffusion
repositories/stable-diffusion-stability-ai/ldm/models/diffusion/ddpm.py LatentDiffusion()
self.instantiate_first_stage()
- model = instantiate_from_config(config)
- self.first_stage_model = model.eval()
self.instantiate_cond_stage()
- model = instantiate_from_config(config)
- self.cond_stage_model = model.eval()
self.model = DiffusionWrapper(unet_config,..)
- self.diffusion_model = instantiate_from_config(diff_model_config)
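Both sub-models are frozen right after instantiation; instantiate_first_stage in ddpm.py does (instantiate_cond_stage is analogous when cond_stage_trainable is false):
def instantiate_first_stage(self, config):
    model = instantiate_from_config(config)
    self.first_stage_model = model.eval()
    self.first_stage_model.train = disabled_train   # makes .train() a no-op
    for param in self.first_stage_model.parameters():
        param.requires_grad = False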
sd_model.first_stage_model:
AutoencoderKL(
(encoder): Encoder(
(conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(down): ModuleList(
(0): Module(
(block): ModuleList(
(0-1): 2 x ResnetBlock(
(norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(downsample): Downsample(
(conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2))
)
)
(1): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(downsample): Downsample(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))
)
)
(2): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))
)
(1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(downsample): Downsample(
(conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2))
)
)
(3): Module(
(block): ModuleList(
(0-1): 2 x ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
)
)
(mid): Module(
(block_1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(attn_1): AttnBlock(
(norm): GroupNorm(32, 512, eps=1e-06, affine=True)
(q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
)
(block_2): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(norm_out): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv_out): Conv2d(512, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(decoder): Decoder(
(conv_in): Conv2d(4, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(mid): Module(
(block_1): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(attn_1): AttnBlock(
(norm): GroupNorm(32, 512, eps=1e-06, affine=True)
(q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
(proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
)
(block_2): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(up): ModuleList(
(0): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nin_shortcut): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
)
(1-2): 2 x ResnetBlock(
(norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
)
(1): Module(
(block): ModuleList(
(0): ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(nin_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1-2): 2 x ResnetBlock(
(norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(upsample): Upsample(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(2-3): 2 x Module(
(block): ModuleList(
(0-2): 3 x ResnetBlock(
(norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
(dropout): Dropout(p=0.0, inplace=False)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(attn): ModuleList()
(upsample): Upsample(
(conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(norm_out): GroupNorm(32, 128, eps=1e-06, affine=True)
(conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(loss): Identity()
(quant_conv): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1))
(post_quant_conv): Conv2d(4, 4, kernel_size=(1, 1), stride=(1, 1))
)
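Shape-wise the VAE maps a 512x512 RGB image to a 4x64x64 latent and back; a minimal sketch (scale_factor 0.18215 from the yaml, assuming an fp32 model on CPU):
import torch
x = torch.randn(1, 3, 512, 512)                          # image in [-1, 1]
posterior = sd_model.first_stage_model.encode(x)         # DiagonalGaussianDistribution
z = posterior.sample() * 0.18215                         # -> [1, 4, 64, 64]
x_rec = sd_model.first_stage_model.decode(z / 0.18215)   # -> [1, 3, 512, 512]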
sd_model.cond_stage_model:
FrozenCLIPEmbedder(
(transformer): CLIPTextModel(
(text_model): CLIPTextTransformer(
(embeddings): CLIPTextEmbeddings(
(token_embedding): Embedding(49408, 768)
(position_embedding): Embedding(77, 768)
)
(encoder): CLIPEncoder(
(layers): ModuleList(
(0-11): 12 x CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
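The text encoder turns a prompt into the 77x768 context consumed by the UNet cross-attention (context_dim: 768 in the unet_config above):
cond = sd_model.get_learned_conditioning(["a photo of a cat"])
print(cond.shape)   # torch.Size([1, 77, 768])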
sd_model.model: DiffusionWrapper (its diffusion_model is ldm's UNetModel, built from the unet_config above)
sd_models.py load_model_weights
load_model_weights(sd_model,checkpoint_info,state_dict,...)->
model.is_sdxl
model.is_sd1
model.is_sd2
model.load_state_dict(state_dict,strict=False)
vae = model.first_stage_model
model.first_stage_model = None
model.half()
model.first_stage_model = vae
sd_vae.load_vae(model,vae_file,vae_source)
sd_models.py load_model
send_model_to_device(sd_model)
sd_hijack.model_hijack.hijack(sd_model)
modules/sd_hijack.py
StableDiffusionModelHijack.hijack(self, m)  # m = sd_model
if type(m.cond_stage_model) == ldm.modules.encoders.modules.FrozenCLIPEmbedder:
model_embeddings = m.cond_stage_model.transformer.text_model.embeddings
model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self) # 49408, 768
m.cond_stage_model = sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords(m.cond_stage_model, self)
apply_weighted_forward(m)
self.apply_optimizations()
self.clip = m.cond_stage_model
self.layers = flatten(m)
ldm.modules.diffusionmodules.openaimodel.copy_of_UNetModel_forward_for_webui = ldm.modules.diffusionmodules.openaimodel.UNetModel.forward
ldm.modules.diffusionmodules.openaimodel.UNetModel.forward = sd_unet.UNetModel_forward
modules/sd_hijack_clip.py
FrozenCLIPEmbedderWithCustomWords()->
self.tokenizer = wrapped.tokenizer
vocab = self.tokenizer.get_vocab()
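The wrapper keeps the underlying HF CLIPTokenizer so it can do prompt chunking and attention weighting itself; roughly:
tokenizer = sd_model.cond_stage_model.wrapped.tokenizer   # HF CLIPTokenizer
vocab = tokenizer.get_vocab()                             # token string -> id, 49408 entries
ids = tokenizer("a photo of a cat", add_special_tokens=False)["input_ids"]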
sd_models.py load_model
sd_model.eval()
model_data.set_sd_model(sd_model)
sd_hijack.model_hijack.embedding_db.load_textual_inversion_embeddings(force_reload=True)
script_callbacks.model_loaded_callback(sd_model)
sd_model.cond_stage_model_empty_prompt = get_empty_cond(sd_model)
Model loaded in 3004.5s (calculate hash: 175.0s, load weights from disk: 0.2s, find config: 13.4s, create model: 0.4s, apply weights to model: 667.5s, apply half(): 298.5s, apply dtype to VAE: 15.6s, load VAE: 101.6s, load weights from state dict: 69.7s, move model to device: 21.8s, hijack: 1429.6s, load textual inversion embeddings: 114.8s, scripts callbacks: 53.8s, calculate empty prompt: 42.5s).
SDXL: sd_xl_base.yaml
model:
target: sgm.models.diffusion.DiffusionEngine
params:
scale_factor: 0.13025
disable_first_stage_autocast: True
denoiser_config:
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
params:
num_idx: 1000
weighting_config:
target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
scaling_config:
target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
discretization_config:
target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
network_config:
target: sgm.modules.diffusionmodules.openaimodel.UNetModel
params:
adm_in_channels: 2816
num_classes: sequential
use_checkpoint: True
in_channels: 4
out_channels: 4
model_channels: 320
attention_resolutions: [4, 2]
num_res_blocks: 2
channel_mult: [1, 2, 4]
num_head_channels: 64
use_spatial_transformer: True
use_linear_in_transformer: True
transformer_depth: [1, 2, 10] # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
context_dim: 2048
spatial_transformer_attn_type: softmax-xformers
legacy: False
conditioner_config:
target: sgm.modules.GeneralConditioner
params:
emb_models:
# crossattn cond
- is_trainable: False
input_key: txt
target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
params:
layer: hidden
layer_idx: 11
# crossattn and vector cond
- is_trainable: False
input_key: txt
target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
params:
arch: ViT-bigG-14
version: laion2b_s39b_b160k
freeze: True
layer: penultimate
always_return_pooled: True
legacy: False
# vector cond
- is_trainable: False
input_key: original_size_as_tuple
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by two
# vector cond
- is_trainable: False
input_key: crop_coords_top_left
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by two
# vector cond
- is_trainable: False
input_key: target_size_as_tuple
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
params:
outdim: 256 # multiplied by two
first_stage_config:
target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
attn_type: vanilla-xformers
double_z: true
z_channels: 4
resolution: 256
in_channels: 3
out_ch: 3
ch: 128
ch_mult: [1, 2, 4, 4]
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity
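One non-obvious number above is adm_in_channels: 2816. It is the width of the concatenated vector conditioning: the pooled OpenCLIP ViT-bigG embedding (1280) plus three 2-tuple size/crop conditionings, each scalar embedded to outdim 256:
pooled = 1280                # FrozenOpenCLIPEmbedder2 with always_return_pooled
size_conds = 3 * 2 * 256     # (orig size, crop top-left, target size) x (h, w) x outdim
print(pooled + size_conds)   # 2816 == adm_in_channels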
sd_models.py load_model_weights()
sd_models_xl.extend_sdxl(model)
sd_models_xl.py
model.model.conditioning_key = "crossattn"
discretization = sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization()
sgm.models.diffusion.DiffusionEngine.get_learned_conditioning = get_learned_conditioning
sgm.models.diffusion.DiffusionEngine.apply_model = apply_model
sgm.models.diffusion.DiffusionEngine.get_first_stage_encoding = get_first_stage_encoding
The sgm code in generative-models has the same layout as ldm: models holds the overall pipeline code, modules holds the concrete module code.
repositories/generative-models/sgm/models/diffusion.py
model = instantiate_from_config(network_config)
self.model = get_obj_from_str(network_wrapper or OPENAIUNETWRAPPER)(model, ...)  # wraps the UNet in OpenAIWrapper
self.denoiser = instantiate_from_config(denoiser_config)
self.conditioner = instantiate_from_config(conditioner_config)
self.first_stage_model = instantiate_from_config(first_stage_config).eval()
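Unlike ldm's LatentDiffusion, the UNet is always called through self.denoiser. With EpsScaling the network predicts noise and the denoised sample is rebuilt from preconditioning coefficients; sgm's denoiser_scaling is essentially:
import torch
def eps_scaling(sigma: torch.Tensor):
    c_skip = torch.ones_like(sigma)
    c_out = -sigma
    c_in = 1.0 / (sigma**2 + 1.0) ** 0.5
    c_noise = sigma.clone()
    return c_skip, c_out, c_in, c_noise
# denoised = network(x * c_in, c_noise, cond) * c_out + x * c_skip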
model.conditioner:
GeneralConditioner(
(embedders): ModuleList(
(0): FrozenCLIPEmbedder(
(transformer): CLIPTextModel(
(text_model): CLIPTextTransformer(
(embeddings): CLIPTextEmbeddings(
(token_embedding): Embedding(49408, 768)
(position_embedding): Embedding(77, 768)
)
(encoder): CLIPEncoder(
(layers): ModuleList(
(0-11): 12 x CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): QuickGELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
)
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(1): FrozenOpenCLIPEmbedder2(
(model): CLIP(
(transformer): Transformer(
(resblocks): ModuleList(
(0-31): 32 x ResidualAttentionBlock(
(ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1280, out_features=5120, bias=True)
(gelu): GELUHijack(approximate='none')
(c_proj): Linear(in_features=5120, out_features=1280, bias=True)
)
(ls_2): Identity()
)
)
)
(token_embedding): Embedding(49408, 1280)
(ln_final): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(2-4): 3 x ConcatTimestepEmbedderND(
(timestep): Timestep()
)
)
(wrapped): Module()
)
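GeneralConditioner runs each embedder on its input_key and concatenates the outputs by type; a hedged sketch of what it returns for SDXL (batch keys as in the yaml above):
import torch
batch = {
    "txt": ["a photo of a cat"],
    "original_size_as_tuple": torch.tensor([[1024, 1024]]),
    "crop_coords_top_left": torch.tensor([[0, 0]]),
    "target_size_as_tuple": torch.tensor([[1024, 1024]]),
}
c = model.conditioner(batch)
# c["crossattn"]: [1, 77, 2048] = 768 (CLIP ViT-L) concat 1280 (OpenCLIP bigG) per token
# c["vector"]:    [1, 2816]     = pooled 1280 + three 2x256 size/crop embeddings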
model.first_stage_model: AutoencoderKLInferenceWrapper (the same AutoencoderKL architecture printed for SD v1.5 above)
model.model:
OpenAIWrapper(
(diffusion_model): UNetModel(
(time_embed): Sequential(
(0): Linear(in_features=320, out_features=1280, bias=True)
(1): SiLU()
(2): Linear(in_features=1280, out_features=1280, bias=True)
)
(label_emb): Sequential(
(0): Sequential(
(0): Linear(in_features=2816, out_features=1280, bias=True)
(1): SiLU()
(2): Linear(in_features=1280, out_features=1280, bias=True)
)
)
(input_blocks): ModuleList(
(0): TimestepEmbedSequential(
(0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1-2): 2 x TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
)
(3): TimestepEmbedSequential(
(0): Downsample(
(op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
(4): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(0-1): 2 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=2048, out_features=640, bias=False)
(to_v): Linear(in_features=2048, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
)
(5): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(0-1): 2 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=2048, out_features=640, bias=False)
(to_v): Linear(in_features=2048, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
)
(6): TimestepEmbedSequential(
(0): Downsample(
(op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
(7): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0-9): 10 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=2048, out_features=1280, bias=False)
(to_v): Linear(in_features=2048, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
(8): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0-9): 10 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=2048, out_features=1280, bias=False)
(to_v): Linear(in_features=2048, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
)
(middle_block): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0-9): 10 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=2048, out_features=1280, bias=False)
(to_v): Linear(in_features=2048, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
(2): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
)
(output_blocks): ModuleList(
(0-1): 2 x TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0-9): 10 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=2048, out_features=1280, bias=False)
(to_v): Linear(in_features=2048, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
)
(2): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Linear(in_features=1280, out_features=1280, bias=True)
(transformer_blocks): ModuleList(
(0-9): 10 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=2048, out_features=1280, bias=False)
(to_v): Linear(in_features=2048, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=1280, out_features=1280, bias=True)
)
(2): Upsample(
(conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(3): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(0-1): 2 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=2048, out_features=640, bias=False)
(to_v): Linear(in_features=2048, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
)
(4): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(0-1): 2 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=2048, out_features=640, bias=False)
(to_v): Linear(in_features=2048, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
)
(5): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 960, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Linear(in_features=640, out_features=640, bias=True)
(transformer_blocks): ModuleList(
(0-1): 2 x BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=2048, out_features=640, bias=False)
(to_v): Linear(in_features=2048, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Linear(in_features=640, out_features=640, bias=True)
)
(2): Upsample(
(conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(6): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 960, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(7-8): 2 x TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
)
(out): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
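Putting the shapes together, the wrapped UNet can be smoke-tested like this (a hedged sketch: 1024x1024 images live in 128x128 latents; OpenAIWrapper forwards crossattn as context and vector as y):
import torch
x = torch.randn(1, 4, 128, 128)       # latent for a 1024x1024 image
t = torch.tensor([500])               # diffusion timestep
context = torch.randn(1, 77, 2048)    # crossattn cond (context_dim: 2048)
y = torch.randn(1, 2816)              # vector cond -> label_emb
eps = model.model.diffusion_model(x, timesteps=t, context=context, y=y)
print(eps.shape)                      # torch.Size([1, 4, 128, 128])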
)