Added method comments

Balanagireddy M 2023-06-26 21:48:37 -07:00
parent cf1d67a6fd
commit 35459486f7
5 changed files with 19 additions and 1 deletion

View file

@@ -13,7 +13,7 @@ new checkpoints. The following list provides an overview of all currently availa
 *Stable UnCLIP 2.1*
-- New stable diffusion finetune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).
+- New stable diffusion fine-tune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).
 - A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine)

View file

@@ -9,6 +9,7 @@ from PIL import Image, ImageDraw, ImageFont
 def autocast(f):
+    """ Decorator for autocasting inside a function """
     def do_autocast(*args, **kwargs):
         with torch.cuda.amp.autocast(enabled=True,
                                      dtype=torch.get_autocast_gpu_dtype(),
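
The docstring added here describes a standard decorator: the wrapped function's body runs inside torch.cuda.amp.autocast. A minimal usage sketch, assuming the file is ldm/util.py as in the upstream stablediffusion layout (the matmul_fp16 function below is hypothetical):

    import torch
    from ldm.util import autocast  # the decorator documented above

    @autocast
    def matmul_fp16(x):
        # body executes under the global CUDA autocast dtype
        return x @ x.T

    # requires a CUDA device:
    # y = matmul_fp16(torch.randn(8, 8, device="cuda"))
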
@@ -19,6 +20,7 @@ def autocast(f):
 def log_txt_as_img(wh, xc, size=10):
+    """ Convert a list of strings to a list of images """
     # wh a tuple of (width, height)
     # xc a list of captions to plot
     b = len(xc)
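
Per the inline comments, wh is a (width, height) tuple and xc a list of captions; a hedged call sketch (the return is one rendered-text image per caption, per the new docstring; the captions below are illustrative):

    captions = ["a photo of a cat", "a photo of a dog"]
    imgs = log_txt_as_img((256, 256), captions, size=10)
    # expected: one 256x256 rendered-text image per caption
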

View file

@@ -17,6 +17,7 @@ torch.set_grad_enabled(False)
 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)
     model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
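
The body relies on instantiate_from_config, which maps a config node carrying a dotted "target" path and optional "params" onto a constructed object. A minimal re-implementation sketch of that convention (illustrative, not the repo's exact code):

    import importlib

    def instantiate_from_config_sketch(config):
        # config["target"] is a dotted path such as "ldm.models.diffusion.ddpm.LatentDiffusion"
        module_name, cls_name = config["target"].rsplit(".", 1)
        cls = getattr(importlib.import_module(module_name), cls_name)
        return cls(**config.get("params", dict()))
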
@@ -35,6 +36,7 @@ def make_batch_sd(
         num_samples=1,
         model_type="dpt_hybrid"
 ):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
     # sample['jpg'] is tensor hwc in [-1, 1] at this point
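
The "/ 127.5 - 1.0" step is what puts the tensor into [-1, 1], as the inline comment notes; a quick check of the endpoints:

    import torch

    px = torch.tensor([0.0, 127.5, 255.0])  # uint8 extremes and midpoint
    print(px / 127.5 - 1.0)                 # tensor([-1., 0., 1.])
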
@@ -54,6 +56,7 @@ def make_batch_sd(
 def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
           do_full_sample=False):
+    """ Paint image from text prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model
@@ -113,6 +116,7 @@ def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=No
 def pad_image(input_image):
+    """ Pad image to integer multiple of 32. """
     pad_w, pad_h = np.max(((2, 2), np.ceil(
         np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
     im_padded = Image.fromarray(
@@ -121,6 +125,7 @@ def pad_image(input_image):
 def predict(input_image, prompt, steps, num_samples, scale, seed, eta, strength):
+    """ Predict image from text prompt. """
     init_image = input_image.convert("RGB")
     image = pad_image(init_image)  # resize to integer multiple of 32
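
Worth noting: although the new docstring (and the existing inline comment) say "integer multiple of 32", the expression in pad_image rounds each side up to a multiple of 64, with a floor of 128. A worked example with a hypothetical 500x300 input:

    import numpy as np

    size = np.array([500, 300])  # (width, height)
    target = np.max(((2, 2), np.ceil(size / 64).astype(int)), axis=0) * 64
    print(target)         # [512 320] -- each side a multiple of 64, at least 128
    print(target - size)  # [12 20]   -- pad widths, applied to the right/bottom edges
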

View file

@@ -17,6 +17,7 @@ torch.set_grad_enabled(False)
 def put_watermark(img, wm_encoder=None):
+    """ Put watermark on image. """
     if wm_encoder is not None:
         img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         img = wm_encoder.encode(img, 'dwtDct')
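
The wm_encoder here is presumably a WatermarkEncoder from the invisible-watermark package, which is where the 'dwtDct' method name comes from; a hedged construction sketch (the payload string is illustrative):

    from imwatermark import WatermarkEncoder

    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', 'SDV2'.encode('utf-8'))  # illustrative payload
    # img = put_watermark(img, wm_encoder)  # img: a PIL RGB image
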
@@ -25,6 +26,7 @@ def put_watermark(img, wm_encoder=None):
 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)
@@ -44,6 +46,7 @@ def make_batch_sd(
         txt,
         device,
         num_samples=1):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
@@ -67,6 +70,7 @@ def make_batch_sd(
 def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512):
+    """ Inpaint image with prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model
@@ -135,6 +139,7 @@ def pad_image(input_image):
     return im_padded

 def predict(input_image, prompt, ddim_steps, num_samples, scale, seed):
+    """ Predict with prompt. """
     init_image = input_image["image"].convert("RGB")
     init_mask = input_image["mask"].convert("RGB")
     image = pad_image(init_image)  # resize to integer multiple of 32
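
This predict expects input_image as a dict with "image" and "mask" entries, the format Gradio's sketch tool produces; a hedged example of assembling one manually, with placeholder paths and values:

    from PIL import Image

    input_image = {
        "image": Image.open("photo.png"),  # placeholder path
        "mask": Image.open("mask.png"),    # placeholder; typically white marks the region to repaint
    }
    # results = predict(input_image, "a red sofa", ddim_steps=50,
    #                   num_samples=2, scale=9.0, seed=42)
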

View file

@@ -18,6 +18,7 @@ torch.set_grad_enabled(False)
 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)
     model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
@@ -35,6 +36,7 @@ def make_batch_sd(
         device,
         num_samples=1,
 ):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
     batch = {
@@ -47,6 +49,7 @@ def make_batch_sd(
 def make_noise_augmentation(model, batch, noise_level=None):
+    """ Make noise augmentation for low scale model. """
     x_low = batch[model.low_scale_key]
     x_low = x_low.to(memory_format=torch.contiguous_format).float()
     x_aug, noise_level = model.low_scale_model(x_low, noise_level)
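
Conceptually, the low-scale model perturbs the low-resolution conditioning image with a chosen amount of noise and returns that noise level so it can be fed to the network as extra conditioning. A DDPM-style q-sample sketch of the idea (illustrative, not the repository's low_scale_model):

    import torch

    def q_sample_sketch(x_low, t, sqrt_ac, sqrt_1m_ac):
        # t: integer noise level / timestep
        # sqrt_ac / sqrt_1m_ac: per-timestep sqrt(alpha_cumprod) and sqrt(1 - alpha_cumprod)
        noise = torch.randn_like(x_low)
        x_aug = sqrt_ac[t] * x_low + sqrt_1m_ac[t] * noise
        return x_aug, t
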
@@ -54,6 +57,7 @@ def make_noise_augmentation(model, batch, noise_level=None):
 def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
+    """ Paint image from text prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model
@@ -120,6 +124,7 @@ def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callb
 def pad_image(input_image):
+    """ Pad image to integer multiple of 32. """
     pad_w, pad_h = np.max(((2, 2), np.ceil(
         np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
     im_padded = Image.fromarray(
@@ -128,6 +133,7 @@ def pad_image(input_image):
 def predict(input_image, prompt, steps, num_samples, scale, seed, eta, noise_level):
+    """ Predict image from text prompt. """
     init_image = input_image.convert("RGB")
     image = pad_image(init_image)  # resize to integer multiple of 32
     width, height = image.size
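
Putting the pieces together, a hedged end-to-end call for this last script (the path and argument values are illustrative, and predict assumes the script's module-level sampler has already been set up via initialize_model):

    from PIL import Image

    lowres = Image.open("lowres.png")  # placeholder path
    results = predict(lowres, "a detailed photograph", steps=50, num_samples=1,
                      scale=9.0, seed=42, eta=0.0, noise_level=20)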