From cddd65d51f5e06e24e95174033d0da8aba54a73d Mon Sep 17 00:00:00 2001
From: Robin Rombach
Date: Sun, 29 Jan 2023 23:21:30 +0100
Subject: [PATCH] make it work

---
 README.md                                | 35 +++++++++++++++----
 .../v2-1-stable-unclip-h-inference.yaml  |  1 -
 .../v2-1-stable-unclip-l-inference.yaml  |  1 -
 ldm/modules/diffusionmodules/util.py     |  3 +-
 ldm/modules/encoders/modules.py          |  2 +-
 scripts/streamlit/stableunclip.py        | 10 ++----
 6 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index eb44b04..fef1008 100644
--- a/README.md
+++ b/README.md
@@ -149,12 +149,33 @@ We provide two models, trained on OpenAI CLIP-L and OpenCLIP-H image embeddings,
 _[TODO: +++prelim private upload on HF+++]_ from [https://huggingface.co/stabilityai/stable-unclip-preview](https://huggingface.co/stabilityai/stable-unclip-preview).
 To use them, download from Hugging Face, and put and the weights into the `checkpoints` folder.
 #### Image Variations
-![image-variations-h](assets/stable-samples/stable-unclip/castle.jpg)
-![image-variations-h](assets/stable-samples/stable-unclip/cornmen.jpg)
+![image-variations-l-1](assets/stable-samples/stable-unclip/houses_out.jpeg)
+![image-variations-l-2](assets/stable-samples/stable-unclip/plates_out.jpeg)
 
-_++TODO: Input images from the DIV2K dataset. Proceed with care++_
+_++TODO: Input images from the DIV2K dataset. check license++_
 
-#### Stable Diffusion Meets Karlo
+Run
+
+```
+streamlit run scripts/streamlit/stableunclip.py
+```
+to launch a streamlit script that can be used to make image variations with both models (CLIP-L and OpenCLIP-H).
+These models can process a `noise_level`, which specifies an amount of Gaussian noise added to the CLIP embeddings.
+This can be used to increase output variance as in the following examples.
+
+**noise_level = 0**
+![image-variations-l-3](assets/stable-samples/stable-unclip/oldcar000.jpeg)
+
+**noise_level = 500**
+![image-variations-l-4](assets/stable-samples/stable-unclip/oldcar500.jpeg)
+
+**noise_level = 800**
+![image-variations-l-6](assets/stable-samples/stable-unclip/oldcar800.jpeg)
+
+
+
+
+### Stable Diffusion Meets Karlo
 ![panda](assets/stable-samples/stable-unclip/panda.jpg)
 
 Recently, [KakaoBrain](https://kakaobrain.com/) openly released [Karlo](https://github.com/kakaobrain/karlo), a pretrained, large-scale replication of [unCLIP](https://arxiv.org/abs/2204.06125).
@@ -174,10 +195,10 @@ and the finetuned SD2.1 unCLIP-L checkpoint _[TODO: +++prelim private upload on
 Then, run
 
 ```
-streamlit run scripts/streamlit/stablekarlo.py
+streamlit run scripts/streamlit/stableunclip.py
 ```
-
-The script optionally supports sampling from the full Karlo model. To do so, you need to download the 64x64 decoder and 64->256 upscaler
+and pick the `use_karlo` option in the GUI.
+The script optionally supports sampling from the full Karlo model. To use it, download the 64x64 decoder and 64->256 upscaler
 via
 ```shell
 cd checkpoints/karlo_models
diff --git a/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml b/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml
index 6c41acc..060a6d7 100644
--- a/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml
+++ b/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml
@@ -46,7 +46,6 @@ model:
         use_linear_in_transformer: True
         transformer_depth: 1
         context_dim: 1024
-        spatial_transformer_attn_type: "softmax-xformers"
         legacy: False
 
     first_stage_config:
diff --git a/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml b/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml
index ea8fa93..335fd61 100644
--- a/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml
+++ b/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml
@@ -50,7 +50,6 @@ model:
         use_linear_in_transformer: True
         transformer_depth: 1
         context_dim: 1024
-        spatial_transformer_attn_type: "softmax-xformers"
         legacy: False
 
     first_stage_config:
diff --git a/ldm/modules/diffusionmodules/util.py b/ldm/modules/diffusionmodules/util.py
index 99f6829..daf35da 100644
--- a/ldm/modules/diffusionmodules/util.py
+++ b/ldm/modules/diffusionmodules/util.py
@@ -225,6 +225,7 @@ class GroupNorm32(nn.GroupNorm):
     def forward(self, x):
         return super().forward(x.float()).type(x.dtype)
 
+
 def conv_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D convolution module.
@@ -274,4 +275,4 @@ class HybridConditioner(nn.Module):
 def noise_like(shape, device, repeat=False):
     repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
     noise = lambda: torch.randn(shape, device=device)
-    return repeat_noise() if repeat else noise()
\ No newline at end of file
+    return repeat_noise() if repeat else noise()
diff --git a/ldm/modules/encoders/modules.py b/ldm/modules/encoders/modules.py
index d89481f..595228f 100644
--- a/ldm/modules/encoders/modules.py
+++ b/ldm/modules/encoders/modules.py
@@ -132,7 +132,6 @@ class FrozenCLIPEmbedder(AbstractEncoder):
         return self(text)
 
 
-from clip import load as load_clip
 class ClipImageEmbedder(nn.Module):
     def __init__(
             self,
@@ -143,6 +142,7 @@ class ClipImageEmbedder(nn.Module):
             ucg_rate=0.
     ):
         super().__init__()
+        from clip import load as load_clip
         self.model, _ = load_clip(name=model, device=device, jit=jit)
         self.antialias = antialias
 
diff --git a/scripts/streamlit/stableunclip.py b/scripts/streamlit/stableunclip.py
index 4af9051..c999605 100644
--- a/scripts/streamlit/stableunclip.py
+++ b/scripts/streamlit/stableunclip.py
@@ -298,15 +298,11 @@ if __name__ == "__main__":
     seed_everything(seed)
 
     ucg_schedule = None
-    sampler = st.sidebar.selectbox("Sampler", ["DDIM", "PLMS", "DPM"], 0)
+    sampler = st.sidebar.selectbox("Sampler", ["DDIM", "DPM"], 0)
     if version == "Full Karlo":
        pass
     else:
-        if sampler == "PLMS":
-            st.warning("NOTE: Some models (such as v-pred) currently only support DDIM/DPM sampling here")
-            sampler = PLMSSampler(state["model"])
-        elif sampler == "DPM":
-            st.warning("NOTE: Using DPM sampler with default sampling parameters (DPM-2)")
+        if sampler == "DPM":
            sampler = DPMSolverSampler(state["model"])
        elif sampler == "DDIM":
            sampler = DDIMSampler(state["model"])
@@ -342,7 +338,6 @@ if __name__ == "__main__":
        init_img = get_init_img(batch_size=number_cols)
        with torch.no_grad():
            adm_cond = state["model"].embedder(init_img)
-            adm_uc = torch.zeros_like(adm_cond)
            if state["model"].noise_augmentor is not None:
                noise_level = st.number_input("Noise Augmentation for CLIP embeddings", min_value=0,
                                              max_value=state["model"].noise_augmentor.max_noise_level - 1, value=0)
@@ -350,6 +345,7 @@ if __name__ == "__main__":
                    torch.tensor([noise_level]).to(state["model"].device), '1 -> b', b=number_cols))
                # assume this gives embeddings of noise levels
                adm_cond = torch.cat((c_adm, noise_level_emb), 1)
+            adm_uc = torch.zeros_like(adm_cond)
 
    if st.button("Sample"):
        print("running prompt:", prompt)
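
The last two `scripts/streamlit/stableunclip.py` hunks move `adm_uc = torch.zeros_like(adm_cond)` from directly after the CLIP image embedding to after the optional noise augmentation. Below is a minimal sketch of why that ordering matters, assuming the conditioning flow described in the README section above; the function and variable names (`build_adm_conditioning`, `clip_img_emb`, `noise_level_emb`) are illustrative and not taken from the script:

```python
import torch


def build_adm_conditioning(clip_img_emb, noise_level_emb=None):
    """Sketch: build the image conditioning and its unconditional counterpart."""
    adm_cond = clip_img_emb  # e.g. a [B, 1024] CLIP image embedding
    if noise_level_emb is not None:
        # Noise augmentation appends an embedding of the chosen noise_level,
        # so the conditioning grows, e.g. [B, 1024] -> [B, 1024 + d_noise].
        adm_cond = torch.cat((adm_cond, noise_level_emb), dim=1)
    # Creating the zero "unconditional" embedding only now guarantees it has the
    # same shape as the (possibly augmented) conditioning; building it from the
    # raw CLIP embedding, as the removed line did, mismatches once augmentation runs.
    adm_uc = torch.zeros_like(adm_cond)
    return adm_cond, adm_uc
```

Either way, with or without a `noise_level`, `adm_uc` then matches the final shape of `adm_cond`, which is what the relocated line in the last hunk guarantees.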