Added method comments

Balanagireddy M 2023-06-26 21:48:37 -07:00
parent cf1d67a6fd
commit 35459486f7
5 changed files with 19 additions and 1 deletion

View file

@@ -13,7 +13,7 @@ new checkpoints. The following list provides an overview of all currently availa
 *Stable UnCLIP 2.1*
-- New stable diffusion finetune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).
+- New stable diffusion fine-tune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).
 - A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine)

View file

@@ -9,6 +9,7 @@ from PIL import Image, ImageDraw, ImageFont
 def autocast(f):
+    """ Decorator for autocasting inside a function """
     def do_autocast(*args, **kwargs):
         with torch.cuda.amp.autocast(enabled=True,
                                      dtype=torch.get_autocast_gpu_dtype(),
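
The docstring added here describes a standard decorator: the wrapped function's body runs inside torch.cuda.amp.autocast. A minimal usage sketch, assuming the file is ldm/util.py as in the upstream stablediffusion layout (the matmul_fp16 function below is hypothetical):

    import torch
    from ldm.util import autocast  # the decorator documented above

    @autocast
    def matmul_fp16(x):
        # body executes under the global CUDA autocast dtype
        return x @ x.T

    # requires a CUDA device:
    # y = matmul_fp16(torch.randn(8, 8, device="cuda"))
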
@@ -19,6 +20,7 @@ def autocast(f):
 def log_txt_as_img(wh, xc, size=10):
+    """ Convert a list of strings to a list of images """
     # wh a tuple of (width, height)
     # xc a list of captions to plot
     b = len(xc)
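
Per the inline comments, wh is a (width, height) tuple and xc a list of captions; a hedged call sketch (the return is one rendered-text image per caption, per the new docstring; the captions below are illustrative):

    captions = ["a photo of a cat", "a photo of a dog"]
    imgs = log_txt_as_img((256, 256), captions, size=10)
    # expected: one 256x256 rendered-text image per caption
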

View file

@@ -17,6 +17,7 @@ torch.set_grad_enabled(False)
 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)
     model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
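
The body relies on instantiate_from_config, which maps a config node carrying a dotted "target" path and optional "params" onto a constructed object. A minimal re-implementation sketch of that convention (illustrative, not the repo's exact code):

    import importlib

    def instantiate_from_config_sketch(config):
        # config["target"] is a dotted path such as "ldm.models.diffusion.ddpm.LatentDiffusion"
        module_name, cls_name = config["target"].rsplit(".", 1)
        cls = getattr(importlib.import_module(module_name), cls_name)
        return cls(**config.get("params", dict()))
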
@@ -35,6 +36,7 @@ def make_batch_sd(
         num_samples=1,
         model_type="dpt_hybrid"
 ):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
     # sample['jpg'] is tensor hwc in [-1, 1] at this point
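
The "/ 127.5 - 1.0" step is what puts the tensor into [-1, 1], as the inline comment notes; a quick check of the endpoints:

    import torch

    px = torch.tensor([0.0, 127.5, 255.0])  # uint8 extremes and midpoint
    print(px / 127.5 - 1.0)                 # tensor([-1., 0., 1.])
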
@@ -54,6 +56,7 @@ def make_batch_sd(
 def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
           do_full_sample=False):
+    """ Paint image from text prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model
@@ -113,6 +116,7 @@ def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=No
 def pad_image(input_image):
+    """ Pad image to integer multiple of 32. """
     pad_w, pad_h = np.max(((2, 2), np.ceil(
         np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
     im_padded = Image.fromarray(
@@ -121,6 +125,7 @@ def pad_image(input_image):
 def predict(input_image, prompt, steps, num_samples, scale, seed, eta, strength):
+    """ Predict image from text prompt. """
     init_image = input_image.convert("RGB")
     image = pad_image(init_image)  # resize to integer multiple of 32
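
Worth noting: although the new docstring (and the existing inline comment) say "integer multiple of 32", the expression in pad_image rounds each side up to a multiple of 64, with a floor of 128. A worked example with a hypothetical 500x300 input:

    import numpy as np

    size = np.array([500, 300])  # (width, height)
    target = np.max(((2, 2), np.ceil(size / 64).astype(int)), axis=0) * 64
    print(target)         # [512 320] -- each side a multiple of 64, at least 128
    print(target - size)  # [12 20]   -- pad widths, applied to the right/bottom edges
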

View file

@@ -17,6 +17,7 @@ torch.set_grad_enabled(False)
 def put_watermark(img, wm_encoder=None):
+    """ Put watermark on image. """
     if wm_encoder is not None:
         img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         img = wm_encoder.encode(img, 'dwtDct')
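
The wm_encoder here is presumably a WatermarkEncoder from the invisible-watermark package, which is where the 'dwtDct' method name comes from; a hedged construction sketch (the payload string is illustrative):

    from imwatermark import WatermarkEncoder

    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', 'SDV2'.encode('utf-8'))  # illustrative payload
    # img = put_watermark(img, wm_encoder)  # img: a PIL RGB image
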
@@ -25,6 +26,7 @@ def put_watermark(img, wm_encoder=None):
 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)
@@ -44,6 +46,7 @@ def make_batch_sd(
         txt,
         device,
         num_samples=1):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
@@ -67,6 +70,7 @@ def make_batch_sd(
 def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512):
+    """ Inpaint image with prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model
@@ -135,6 +139,7 @@ def pad_image(input_image):
     return im_padded

 def predict(input_image, prompt, ddim_steps, num_samples, scale, seed):
+    """ Predict with prompt. """
     init_image = input_image["image"].convert("RGB")
     init_mask = input_image["mask"].convert("RGB")
     image = pad_image(init_image)  # resize to integer multiple of 32
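
This predict expects input_image as a dict with "image" and "mask" entries, the format Gradio's sketch tool produces; a hedged example of assembling one manually, with placeholder paths and values:

    from PIL import Image

    input_image = {
        "image": Image.open("photo.png"),  # placeholder path
        "mask": Image.open("mask.png"),    # placeholder; typically white marks the region to repaint
    }
    # results = predict(input_image, "a red sofa", ddim_steps=50,
    #                   num_samples=2, scale=9.0, seed=42)
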

View file

@@ -18,6 +18,7 @@ torch.set_grad_enabled(False)
 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)
     model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
@@ -35,6 +36,7 @@ def make_batch_sd(
         device,
         num_samples=1,
 ):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
     batch = {
@@ -47,6 +49,7 @@ def make_batch_sd(
 def make_noise_augmentation(model, batch, noise_level=None):
+    """ Make noise augmentation for low scale model. """
     x_low = batch[model.low_scale_key]
     x_low = x_low.to(memory_format=torch.contiguous_format).float()
     x_aug, noise_level = model.low_scale_model(x_low, noise_level)
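
Conceptually, the low-scale model perturbs the low-resolution conditioning image with a chosen amount of noise and returns that noise level so it can be fed to the network as extra conditioning. A DDPM-style q-sample sketch of the idea (illustrative, not the repository's low_scale_model):

    import torch

    def q_sample_sketch(x_low, t, sqrt_ac, sqrt_1m_ac):
        # t: integer noise level / timestep
        # sqrt_ac / sqrt_1m_ac: per-timestep sqrt(alpha_cumprod) and sqrt(1 - alpha_cumprod)
        noise = torch.randn_like(x_low)
        x_aug = sqrt_ac[t] * x_low + sqrt_1m_ac[t] * noise
        return x_aug, t
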
@@ -54,6 +57,7 @@ def make_noise_augmentation(model, batch, noise_level=None):
 def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
+    """ Paint image from text prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model
@@ -120,6 +124,7 @@ def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callb
 def pad_image(input_image):
+    """ Pad image to integer multiple of 32. """
     pad_w, pad_h = np.max(((2, 2), np.ceil(
         np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
     im_padded = Image.fromarray(
@@ -128,6 +133,7 @@ def pad_image(input_image):
 def predict(input_image, prompt, steps, num_samples, scale, seed, eta, noise_level):
+    """ Predict image from text prompt. """
     init_image = input_image.convert("RGB")
     image = pad_image(init_image)  # resize to integer multiple of 32
     width, height = image.size
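
Putting the pieces together, a hedged end-to-end call for this last script (the path and argument values are illustrative, and predict assumes the script's module-level sampler has already been set up via initialize_model):

    from PIL import Image

    lowres = Image.open("lowres.png")  # placeholder path
    results = predict(lowres, "a detailed photograph", steps=50, num_samples=1,
                      scale=9.0, seed=42, eta=0.0, noise_level=20)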