diff --git a/README.md b/README.md
index 9d72b85..c06f3c0 100644
--- a/README.md
+++ b/README.md
@@ -158,7 +158,7 @@ cd ../../
 ```
 and the finetuned SD2.1 checkpoint [+++prelim private upload on HF+++] from [https://huggingface.co/stabilityai/stable-unclip-preview](https://huggingface.co/stabilityai/stable-unclip-preview), and put the ckpt into the `checkpoints folder`
 
-The, run
+Then, run
 
 ```
 streamlit run scripts/streamlit/stablekarlo.py
diff --git a/configs/stable-diffusion/v2-1-stable-karlo-inference.yaml b/configs/stable-diffusion/v2-1-stable-karlo-inference.yaml
index 5aeb176..ea8fa93 100644
--- a/configs/stable-diffusion/v2-1-stable-karlo-inference.yaml
+++ b/configs/stable-diffusion/v2-1-stable-karlo-inference.yaml
@@ -23,11 +23,20 @@ model:
       params:
         model: "ViT-L/14"
 
+    noise_aug_config:
+      target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
+      params:
+        clip_stats_path: "checkpoints/karlo_models/ViT-L-14_stats.th"
+        timestep_dim: 768
+        noise_schedule_config:
+          timesteps: 1000
+          beta_schedule: squaredcos_cap_v2
+
     unet_config:
       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
       params:
         num_classes: "sequential"
-        adm_in_channels: 768
+        adm_in_channels: 1536
         use_checkpoint: True
         image_size: 32 # unused
         in_channels: 4
diff --git a/ldm/models/diffusion/ddpm.py b/ldm/models/diffusion/ddpm.py
index bde253f..81bc6d3 100644
--- a/ldm/models/diffusion/ddpm.py
+++ b/ldm/models/diffusion/ddpm.py
@@ -1796,11 +1796,13 @@ class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
 
 
 class ImageEmbeddingConditionedLatentDiffusion(LatentDiffusion):
-    def __init__(self, embedder_config, embedding_key="jpg", embedding_dropout=0.5, freeze_embedder=True, *args, **kwargs):
+    def __init__(self, embedder_config, embedding_key="jpg", embedding_dropout=0.5,
+                 freeze_embedder=True, noise_aug_config=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.embed_key = embedding_key
         self.embedding_dropout = embedding_dropout
         self._init_embedder(embedder_config, freeze_embedder)
+        self._init_noise_aug(noise_aug_config)
 
     def _init_embedder(self, config, freeze=True):
         embedder = instantiate_from_config(config)
@@ -1810,12 +1812,29 @@ class ImageEmbeddingConditionedLatentDiffusion(LatentDiffusion):
             for param in self.embedder.parameters():
                 param.requires_grad = False
 
+    def _init_noise_aug(self, config):
+        """Set ``self.noise_augmentor`` from ``config``; None when no config is given."""
+        if config is None:
+            self.noise_augmentor = None
+            return
+        # use the KARLO schedule for noise augmentation on CLIP image embeddings
+        augmentor = instantiate_from_config(config)
+        assert isinstance(augmentor, nn.Module)
+        augmentor = augmentor.eval()
+        # override train() with disabled_train (presumably a no-op that keeps eval mode)
+        augmentor.train = disabled_train
+        self.noise_augmentor = augmentor
+
     def get_input(self, batch, k, cond_key=None, bs=None, **kwargs):
         outputs = LatentDiffusion.get_input(self, batch, k, bs=bs, **kwargs)
         z, c = outputs[0], outputs[1]
         img = batch[self.embed_key][:bs]
         img = rearrange(img, 'b h w c -> b c h w')
         c_adm = self.embedder(img)
+        if self.noise_augmentor is not None:
+            c_adm, noise_level_emb = self.noise_augmentor(c_adm)
+            # the second output is assumed to be an embedding of the noise levels
+            c_adm = torch.cat((c_adm, noise_level_emb), dim=1)
         if self.training:
             c_adm = torch.bernoulli((1. - self.embedding_dropout) * torch.ones(c_adm.shape[0], device=c_adm.device)[:, None]) * c_adm
 
diff --git a/ldm/modules/diffusionmodules/openaimodel.py b/ldm/modules/diffusionmodules/openaimodel.py
index 157d3b2..b5da99a 100644
--- a/ldm/modules/diffusionmodules/openaimodel.py
+++ b/ldm/modules/diffusionmodules/openaimodel.py
@@ -409,6 +409,18 @@ class QKVAttention(nn.Module):
         return count_flops_attn(model, _x, y)
 
 
+class Timestep(nn.Module):
+    """Module wrapper around ``timestep_embedding`` with a fixed embedding size."""
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, t):
+        """Return ``timestep_embedding(t, self.dim)``."""
+        return timestep_embedding(t, self.dim)
+
+
 class UNetModel(nn.Module):
     """
     The full UNet model with attention and timestep embedding.
diff --git a/ldm/modules/diffusionmodules/util.py b/ldm/modules/diffusionmodules/util.py
index 637363d..99f6829 100644
--- a/ldm/modules/diffusionmodules/util.py
+++ b/ldm/modules/diffusionmodules/util.py
@@ -34,6 +34,13 @@ def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2,
         betas = 1 - alphas[1:] / alphas[:-1]
         betas = np.clip(betas, a_min=0, a_max=0.999)
 
+    elif schedule == "squaredcos_cap_v2":  # used for karlo prior
+        # return early: the betas_for_alpha_bar result is handed back as-is
+        return betas_for_alpha_bar(
+            n_timestep,
+            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
+        )
+
     elif schedule == "sqrt_linear":
         betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
     elif schedule == "sqrt":
diff --git a/ldm/modules/encoders/modules.py b/ldm/modules/encoders/modules.py
index 520bb27..fcc5826 100644
--- a/ldm/modules/encoders/modules.py
+++ b/ldm/modules/encoders/modules.py
@@ -170,7 +170,6 @@ class ClipImageEmbedder(nn.Module):
         return out
 
 
-
 class FrozenOpenCLIPEmbedder(AbstractEncoder):
     """
     Uses the OpenCLIP transformer encoder for text
@@ -251,3 +250,56 @@ class FrozenCLIPT5Encoder(AbstractEncoder):
         return [clip_z, t5_z]
 
 
+from ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
+from ldm.modules.diffusionmodules.openaimodel import Timestep
+
+
+class CLIPEmbeddingNoiseAugmentation(ImageConcatWithNoiseAugmentation):
+    """Noise augmentation for CLIP image embeddings.
+
+    Embeddings are standardized with stored mean/std statistics, noised with the
+    inherited ``q_sample`` at a sampled (or given) noise level, and mapped back
+    to the original statistics. The noise level is returned as a timestep
+    embedding alongside the noised embedding.
+    """
+
+    def __init__(self, *args, clip_stats_path=None, timestep_dim=256, **kwargs):
+        """
+        :param clip_stats_path: file readable by ``torch.load`` that holds a
+            ``(mean, std)`` pair of tensors (presumably 1-D); None selects identity
+            statistics (mean 0, std 1) so that no stats file is required.
+        :param timestep_dim: size of the noise-level embedding.
+        Remaining arguments are forwarded to ImageConcatWithNoiseAugmentation.
+        """
+        super().__init__(*args, **kwargs)
+        if clip_stats_path is None:
+            # shape (1,) broadcasts against any embedding width
+            clip_mean, clip_std = torch.zeros(1), torch.ones(1)
+        else:
+            # NOTE: torch.load unpickles the file; only point this at trusted checkpoints
+            clip_mean, clip_std = torch.load(clip_stats_path, map_location="cpu")
+        self.register_buffer("data_mean", clip_mean[None, :], persistent=False)
+        self.register_buffer("data_std", clip_std[None, :], persistent=False)
+        self.time_embed = Timestep(timestep_dim)
+
+    def scale(self, x):
+        """Standardize ``x`` with the stored mean and std."""
+        return (x - self.data_mean) / self.data_std
+
+    def unscale(self, x):
+        """Invert ``scale``: map ``x`` back to the original data statistics."""
+        return x * self.data_std + self.data_mean
+
+    def forward(self, x, noise_level=None):
+        """Return ``(noised_x, noise_level_embedding)``.
+
+        :param x: batch of embeddings; the first dimension is the batch.
+        :param noise_level: optional tensor of noise levels; when None, one level
+            per sample is drawn uniformly from ``[0, self.max_noise_level)``.
+        """
+        if noise_level is None:
+            noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long()
+        else:
+            assert isinstance(noise_level, torch.Tensor)
+        z = self.unscale(self.q_sample(self.scale(x), noise_level))
+        return z, self.time_embed(noise_level)