Mirror of https://github.com/Stability-AI/stablediffusion.git, synced 2024-12-22 15:44:58 +00:00

Added method comments

parent cf1d67a6fd
commit 35459486f7

5 changed files with 19 additions and 1 deletion
@@ -13,7 +13,7 @@ new checkpoints. The following list provides an overview of all currently available

 *Stable UnCLIP 2.1*

-- New stable diffusion finetune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).
+- New stable diffusion fine-tune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).

 - A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine)

@@ -9,6 +9,7 @@ from PIL import Image, ImageDraw, ImageFont


 def autocast(f):
+    """ Decorator for autocasting inside a function """
     def do_autocast(*args, **kwargs):
         with torch.cuda.amp.autocast(enabled=True,
                                      dtype=torch.get_autocast_gpu_dtype(),
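For context on this hunk: autocast is a decorator that reruns the wrapped call inside torch.cuda.amp.autocast, so mixed precision applies only within that function. A minimal runnable sketch follows; the closing lines of do_autocast and the scaled_matmul example are assumptions for illustration, since the hunk is truncated.

    import torch

    def autocast(f):
        """ Decorator for autocasting inside a function """
        def do_autocast(*args, **kwargs):
            # Run the wrapped call with CUDA mixed precision enabled.
            with torch.cuda.amp.autocast(enabled=True,
                                         dtype=torch.get_autocast_gpu_dtype()):
                return f(*args, **kwargs)
        return do_autocast

    @autocast
    def scaled_matmul(a, b):
        # Hypothetical usage: on GPU, the matmul runs in the autocast dtype.
        return a @ b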
@@ -19,6 +20,7 @@ def autocast(f):


 def log_txt_as_img(wh, xc, size=10):
+    """ Convert a list of strings to a list of images """
     # wh a tuple of (width, height)
     # xc a list of captions to plot
     b = len(xc)
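log_txt_as_img renders a list of caption strings into images so they can be logged alongside samples. The hunk shows only the signature and first lines; the sketch below fills in one plausible implementation under stated assumptions (default PIL font, HWC-to-CHW transpose, [-1, 1] scaling) and is not the repository's exact code.

    import numpy as np
    import torch
    from PIL import Image, ImageDraw, ImageFont

    def log_txt_as_img(wh, xc, size=10):
        """ Convert a list of strings to a list of images """
        # wh: (width, height) of each rendered image; xc: list of captions;
        # size is the intended font size (ignored here, where the default font is used).
        txts = []
        for caption in xc:
            txt = Image.new("RGB", wh, color="white")
            draw = ImageDraw.Draw(txt)
            font = ImageFont.load_default()  # assumption: the repo likely loads a bundled .ttf instead
            draw.text((0, 0), caption, fill="black", font=font)
            arr = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0  # HWC -> CHW in [-1, 1]
            txts.append(arr)
        return torch.from_numpy(np.stack(txts))  # shape (len(xc), 3, h, w)

    # Example: log_txt_as_img((256, 64), ["a caption", "another caption"])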
@@ -17,6 +17,7 @@ torch.set_grad_enabled(False)


 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)
     model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)

@@ -35,6 +36,7 @@ def make_batch_sd(
         num_samples=1,
         model_type="dpt_hybrid"
 ):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
     # sample['jpg'] is tensor hwc in [-1, 1] at this point
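The normalization line in make_batch_sd maps 8-bit pixel values into the [-1, 1] range the diffusion model expects (0 maps to -1.0, 255 to 1.0). A quick self-contained check of that mapping; the 2x2 test image is purely illustrative.

    import numpy as np
    import torch
    from PIL import Image

    # A 2x2 test image holding the extreme and middle pixel values.
    pixels = np.array([[0, 127], [128, 255]], dtype=np.uint8)
    image = Image.fromarray(pixels).convert("RGB")

    x = torch.from_numpy(np.array(image)).to(dtype=torch.float32) / 127.5 - 1.0
    print(x.min().item(), x.max().item())  # -1.0 and 1.0; 127 maps to ~-0.004, 128 to ~0.004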
@@ -54,6 +56,7 @@ def make_batch_sd(

 def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
           do_full_sample=False):
+    """ Paint image from text prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model

@@ -113,6 +116,7 @@ def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,


 def pad_image(input_image):
+    """ Pad image to integer multiple of 32. """
     pad_w, pad_h = np.max(((2, 2), np.ceil(
         np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
     im_padded = Image.fromarray(
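The pad_image arithmetic rounds each dimension of input_image.size up to the next multiple of 64 (with a floor of 128 coming from the elementwise max with (2, 2)), even though the new docstring and the inline comments mention 32. A short worked check with an illustrative 500x400 input:

    import numpy as np

    size = np.array((500, 400))  # (width, height) of a hypothetical input image
    target = np.max(((2, 2), np.ceil(size / 64).astype(int)), axis=0) * 64
    pad_w, pad_h = target - size
    print(target, pad_w, pad_h)  # [512 448] 12 48 -> the image is padded to 512x448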
@@ -121,6 +125,7 @@ def pad_image(input_image):


 def predict(input_image, prompt, steps, num_samples, scale, seed, eta, strength):
+    """ Predict image from text prompt. """
     init_image = input_image.convert("RGB")
     image = pad_image(init_image) # resize to integer multiple of 32

@@ -17,6 +17,7 @@ torch.set_grad_enabled(False)


 def put_watermark(img, wm_encoder=None):
+    """ Put watermark on image. """
     if wm_encoder is not None:
         img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
         img = wm_encoder.encode(img, 'dwtDct')
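put_watermark embeds an invisible watermark before an image is returned: the PIL image is converted to an OpenCV BGR array, encoded with the DWT-DCT method, and converted back to PIL. A hedged round-trip sketch, assuming the imwatermark WatermarkEncoder and a placeholder payload string:

    import cv2
    import numpy as np
    from PIL import Image
    from imwatermark import WatermarkEncoder

    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', "SDV2".encode('utf-8'))  # assumed payload, for illustration

    def put_watermark(img, wm_encoder=None):
        """ Put watermark on image. """
        if wm_encoder is not None:
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)   # PIL RGB -> OpenCV BGR
            img = wm_encoder.encode(img, 'dwtDct')                  # embed invisible watermark
            img = Image.fromarray(img[:, :, ::-1])                  # BGR -> RGB, back to PIL
        return img

    # Example: watermarked = put_watermark(Image.open("sample.png"), wm_encoder)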
@@ -25,6 +26,7 @@ def put_watermark(img, wm_encoder=None):


 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)

@@ -44,6 +46,7 @@ def make_batch_sd(
         txt,
         device,
         num_samples=1):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
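In this make_batch_sd the image is also moved to channels-first layout: image[None] adds a batch axis and transpose(0, 3, 1, 2) reorders HWC to CHW before the same [-1, 1] normalization. A small shape check with a dummy array:

    import numpy as np
    import torch

    image = np.zeros((512, 512, 3), dtype=np.uint8)   # H x W x C, as returned by np.array(PIL image)
    image = image[None].transpose(0, 3, 1, 2)          # add batch axis, reorder to N x C x H x W
    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
    print(image.shape)  # torch.Size([1, 3, 512, 512])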
@@ -67,6 +70,7 @@ def make_batch_sd(


 def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512):
+    """ Inpaint image with prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model

@@ -135,6 +139,7 @@ def pad_image(input_image):
     return im_padded

 def predict(input_image, prompt, ddim_steps, num_samples, scale, seed):
+    """ Predict with prompt. """
     init_image = input_image["image"].convert("RGB")
     init_mask = input_image["mask"].convert("RGB")
     image = pad_image(init_image) # resize to integer multiple of 32
@@ -18,6 +18,7 @@ torch.set_grad_enabled(False)


 def initialize_model(config, ckpt):
+    """ Initialize model from config and checkpoint. """
     config = OmegaConf.load(config)
     model = instantiate_from_config(config.model)
     model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)

@@ -35,6 +36,7 @@ def make_batch_sd(
         device,
         num_samples=1,
 ):
+    """ Make batch for sampling from image and text. """
     image = np.array(image.convert("RGB"))
     image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
     batch = {

@@ -47,6 +49,7 @@ def make_batch_sd(


 def make_noise_augmentation(model, batch, noise_level=None):
+    """ Make noise augmentation for low scale model. """
     x_low = batch[model.low_scale_key]
     x_low = x_low.to(memory_format=torch.contiguous_format).float()
     x_aug, noise_level = model.low_scale_model(x_low, noise_level)

@@ -54,6 +57,7 @@ def make_noise_augmentation(model, batch, noise_level=None):


 def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
+    """ Paint image from text prompt. """
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
     model = sampler.model

@@ -120,6 +124,7 @@ def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):


 def pad_image(input_image):
+    """ Pad image to integer multiple of 32. """
     pad_w, pad_h = np.max(((2, 2), np.ceil(
         np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
     im_padded = Image.fromarray(

@@ -128,6 +133,7 @@ def pad_image(input_image):


 def predict(input_image, prompt, steps, num_samples, scale, seed, eta, noise_level):
+    """ Predict image from text prompt. """
     init_image = input_image.convert("RGB")
     image = pad_image(init_image) # resize to integer multiple of 32
     width, height = image.size
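Taken together, the newly commented helpers trace the Gradio prediction path: initialize the model once, pad the input image, then sample with paint. A hedged sketch of that flow, under the assumptions that initialize_model returns a ready-to-use sampler and paint returns the generated images; the config path, checkpoint path, prompt, and numeric settings are placeholders, not values from the diff.

    from PIL import Image

    # Hypothetical glue code; initialize_model, pad_image and paint are the helpers
    # shown in the hunks above, and the paths below are assumed, not from the commit.
    sampler = initialize_model("configs/stable-diffusion/x4-upscaling.yaml",   # assumed config path
                               "checkpoints/x4-upscaler-ema.ckpt")             # assumed checkpoint path

    input_image = Image.open("example.png")
    init_image = input_image.convert("RGB")
    image = pad_image(init_image)   # pad both sides up to a multiple of 64
    width, height = image.size

    samples = paint(sampler, image=image, prompt="a photograph of a lighthouse",
                    seed=42, scale=9.0, h=height, w=width, steps=50)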