diff --git a/README.md b/README.md index 2dfddca..66a2110 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ new checkpoints. The following list provides an overview of all currently availa *Stable UnCLIP 2.1* -- New stable diffusion finetune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD). +- New stable diffusion fine-tune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD). 
- A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine) diff --git a/ldm/util.py b/ldm/util.py index 9ede259..b20fda8 100644 --- a/ldm/util.py +++ b/ldm/util.py @@ -9,6 +9,7 @@ from PIL import Image, ImageDraw, ImageFont def autocast(f): + """ Decorator for autocasting inside a function """ def do_autocast(*args, **kwargs): with torch.cuda.amp.autocast(enabled=True, dtype=torch.get_autocast_gpu_dtype(), @@ -19,6 +20,7 @@ def autocast(f): def log_txt_as_img(wh, xc, size=10): + """ Convert a list of strings to a list of images """ # wh a tuple of (width, height) # xc a list of captions to plot b = len(xc) diff --git a/scripts/gradio/depth2img.py b/scripts/gradio/depth2img.py index c791a4d..d559a95 100644 --- a/scripts/gradio/depth2img.py +++ b/scripts/gradio/depth2img.py @@ -17,6 +17,7 @@ torch.set_grad_enabled(False) def initialize_model(config, ckpt): + """ Initialize model from config and checkpoint. """ config = OmegaConf.load(config) model = instantiate_from_config(config.model) model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False) @@ -35,6 +36,7 @@ def make_batch_sd( num_samples=1, model_type="dpt_hybrid" ): + """ Make batch for sampling from image and text. """ image = np.array(image.convert("RGB")) image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 # sample['jpg'] is tensor hwc in [-1, 1] at this point @@ -54,6 +56,7 @@ def make_batch_sd( def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None, do_full_sample=False): + """ Paint image from text prompt. """ device = torch.device( "cuda") if torch.cuda.is_available() else torch.device("cpu") model = sampler.model @@ -113,6 +116,7 @@ def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=No def pad_image(input_image): + """ Pad image to an integer multiple of 64 (minimum 128 per side). 
""" pad_w, pad_h = np.max(((2, 2), np.ceil( np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size im_padded = Image.fromarray( @@ -121,6 +125,7 @@ def pad_image(input_image): def predict(input_image, prompt, steps, num_samples, scale, seed, eta, strength): + """ Predict image from text prompt. """ init_image = input_image.convert("RGB") image = pad_image(init_image) # resize to integer multiple of 32 diff --git a/scripts/gradio/inpainting.py b/scripts/gradio/inpainting.py index 09d44f3..493c4be 100644 --- a/scripts/gradio/inpainting.py +++ b/scripts/gradio/inpainting.py @@ -17,6 +17,7 @@ torch.set_grad_enabled(False) def put_watermark(img, wm_encoder=None): + """ Put watermark on image. """ if wm_encoder is not None: img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) img = wm_encoder.encode(img, 'dwtDct') @@ -25,6 +26,7 @@ def put_watermark(img, wm_encoder=None): def initialize_model(config, ckpt): + """ Initialize model from config and checkpoint. """ config = OmegaConf.load(config) model = instantiate_from_config(config.model) @@ -44,6 +46,7 @@ def make_batch_sd( txt, device, num_samples=1): + """ Make batch for sampling from image and text. """ image = np.array(image.convert("RGB")) image = image[None].transpose(0, 3, 1, 2) image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 @@ -67,6 +70,7 @@ def make_batch_sd( def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512): + """ Inpaint image with prompt. """ device = torch.device( "cuda") if torch.cuda.is_available() else torch.device("cpu") model = sampler.model @@ -135,6 +139,7 @@ def pad_image(input_image): return im_padded def predict(input_image, prompt, ddim_steps, num_samples, scale, seed): + """ Predict with prompt. 
""" init_image = input_image["image"].convert("RGB") init_mask = input_image["mask"].convert("RGB") image = pad_image(init_image) # resize to integer multiple of 32 diff --git a/scripts/gradio/superresolution.py b/scripts/gradio/superresolution.py index 3d08fbf..2939486 100644 --- a/scripts/gradio/superresolution.py +++ b/scripts/gradio/superresolution.py @@ -18,6 +18,7 @@ torch.set_grad_enabled(False) def initialize_model(config, ckpt): + """ Initialize model from config and checkpoint. """ config = OmegaConf.load(config) model = instantiate_from_config(config.model) model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False) @@ -35,6 +36,7 @@ def make_batch_sd( device, num_samples=1, ): + """ Make batch for sampling from image and text. """ image = np.array(image.convert("RGB")) image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 batch = { @@ -47,6 +49,7 @@ def make_batch_sd( def make_noise_augmentation(model, batch, noise_level=None): + """ Make noise augmentation for low scale model. """ x_low = batch[model.low_scale_key] x_low = x_low.to(memory_format=torch.contiguous_format).float() x_aug, noise_level = model.low_scale_model(x_low, noise_level) @@ -54,6 +57,7 @@ def make_noise_augmentation(model, batch, noise_level=None): def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None): + """ Paint image from text prompt. """ device = torch.device( "cuda") if torch.cuda.is_available() else torch.device("cpu") model = sampler.model @@ -120,6 +124,7 @@ def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callb def pad_image(input_image): + """ Pad image to an integer multiple of 64 (minimum 128 per side). 
""" pad_w, pad_h = np.max(((2, 2), np.ceil( np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size im_padded = Image.fromarray( @@ -128,6 +133,7 @@ def pad_image(input_image): def predict(input_image, prompt, steps, num_samples, scale, seed, eta, noise_level): + """ Predict image from text prompt. """ init_image = input_image.convert("RGB") image = pad_image(init_image) # resize to integer multiple of 32 width, height = image.size