diff --git a/README.md b/README.md
index 04de29ca..7720fc43 100644
--- a/README.md
+++ b/README.md
@@ -283,6 +283,16 @@ wget https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pt
 
 After that follow the instructions in the `Manual instructions` section starting at step `:: clone repositories for Stable Diffusion and (optionally) CodeFormer`.
 
+### img2img alternative test
+- find it in the scripts section
+- put a description of the input image into the Original prompt field
+- use the Euler sampler only
+- recommended: 50 steps, low CFG scale between 1 and 2
+- denoising strength and seed don't matter
+- decode CFG scale between 0 and 1
+- decode steps: 50
+- the original blue-haired-woman example reproduces almost exactly with a CFG scale of 1.8
+
 ## Credits
 - Stable Diffusion - https://github.com/CompVis/stable-diffusion, https://github.com/CompVis/taming-transformers
 - k-diffusion - https://github.com/crowsonkb/k-diffusion.git
diff --git a/modules/interrogate.py b/modules/interrogate.py
index 7ebb79fc..06862fcc 100644
--- a/modules/interrogate.py
+++ b/modules/interrogate.py
@@ -11,7 +11,7 @@ from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
 
 import modules.shared as shared
-from modules import devices, paths
+from modules import devices, paths, lowvram
 
 blip_image_eval_size = 384
 blip_model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
@@ -75,19 +75,28 @@ class InterrogateModels:
 
         self.dtype = next(self.clip_model.parameters()).dtype
 
-    def unload(self):
+    def send_clip_to_ram(self):
         if not shared.opts.interrogate_keep_models_in_memory:
             if self.clip_model is not None:
                 self.clip_model = self.clip_model.to(devices.cpu)
 
+    def send_blip_to_ram(self):
+        if not shared.opts.interrogate_keep_models_in_memory:
             if self.blip_model is not None:
                 self.blip_model = self.blip_model.to(devices.cpu)
 
-        devices.torch_gc()
+    def unload(self):
+        self.send_clip_to_ram()
+        self.send_blip_to_ram()
+
+        devices.torch_gc()
 
     def rank(self, image_features, text_array, top_count=1):
         import clip
 
+        if shared.opts.interrogate_clip_dict_limit != 0:
+            text_array = text_array[0:int(shared.opts.interrogate_clip_dict_limit)]
+
         top_count = min(top_count, len(text_array))
         text_tokens = clip.tokenize([text for text in text_array]).to(shared.device)
         text_features = self.clip_model.encode_text(text_tokens).type(self.dtype)
@@ -117,16 +126,24 @@ class InterrogateModels:
         res = None
 
         try:
+
+            if shared.cmd_opts.lowvram or shared.cmd_opts.medvram:
+                lowvram.send_everything_to_cpu()
+                devices.torch_gc()
+
             self.load()
 
             caption = self.generate_caption(pil_image)
+            self.send_blip_to_ram()
+            devices.torch_gc()
+
             res = caption
 
-            images = self.clip_preprocess(pil_image).unsqueeze(0).type(self.dtype).to(shared.device)
+            clip_image = self.clip_preprocess(pil_image).unsqueeze(0).type(self.dtype).to(shared.device)
 
             precision_scope = torch.autocast if shared.cmd_opts.precision == "autocast" else contextlib.nullcontext
             with torch.no_grad(), precision_scope("cuda"):
-                image_features = self.clip_model.encode_image(images).type(self.dtype)
+                image_features = self.clip_model.encode_image(clip_image).type(self.dtype)
 
                 image_features /= image_features.norm(dim=-1, keepdim=True)
 
@@ -143,6 +160,7 @@ class InterrogateModels:
         except Exception:
             print(f"Error interrogating", file=sys.stderr)
             print(traceback.format_exc(), file=sys.stderr)
+            res += "<error>"
 
         self.unload()
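Note on the `modules/interrogate.py` changes above: `rank()` now truncates its candidate list to `interrogate_clip_dict_limit` entries before tokenizing, bounding the cost of CLIP text encoding on large artist/flavor files. A minimal sketch of the ranking flow under that limit (simplified from the real method; `rank_sketch` is a hypothetical name, and `image_features` is assumed to be already normalized, as it is in `interrogate()`):

```python
import torch
import clip  # OpenAI CLIP, imported lazily inside rank() in the real code


def rank_sketch(clip_model, image_features, text_array, device, top_count=1, dict_limit=1500):
    # Mirror the new option: cap the candidate list before tokenizing (0 = no limit).
    if dict_limit != 0:
        text_array = text_array[:dict_limit]
    top_count = min(top_count, len(text_array))

    # Encode and normalize the candidate strings.
    text_tokens = clip.tokenize(text_array).to(device)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_tokens).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Cosine similarity against the already-normalized image features,
        # then keep the best top_count matches.
        similarity = image_features.float() @ text_features.T
        top = similarity[0].topk(top_count)

    return [(text_array[i], 100.0 * v) for i, v in zip(top.indices.tolist(), top.values.tolist())]
```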
diff --git a/modules/lowvram.py b/modules/lowvram.py
index 079386c3..7eba1349 100644
--- a/modules/lowvram.py
+++ b/modules/lowvram.py
@@ -5,6 +5,16 @@ module_in_gpu = None
 cpu = torch.device("cpu")
 device = gpu = get_optimal_device()
 
+
+def send_everything_to_cpu():
+    global module_in_gpu
+
+    if module_in_gpu is not None:
+        module_in_gpu.to(cpu)
+
+    module_in_gpu = None
+
+
 def setup_for_low_vram(sd_model, use_medvram):
     parents = {}
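For context on `send_everything_to_cpu()`: under `--lowvram`/`--medvram`, `setup_for_low_vram` keeps at most one large sub-module of the model on the GPU at a time, swapping modules in through forward pre-hooks; the new function gives outside callers (here, the interrogator) a way to evict whatever currently occupies VRAM. A condensed illustration of that swap pattern (a sketch, not the repository's actual hook code):

```python
import torch

cpu = torch.device("cpu")
gpu = torch.device("cuda")
module_in_gpu = None


def send_everything_to_cpu():
    global module_in_gpu
    if module_in_gpu is not None:
        module_in_gpu.to(cpu)
    module_in_gpu = None


def swap_into_gpu_on_use(module: torch.nn.Module):
    # Before `module` runs, evict the current occupant and move `module` in,
    # so at most one wrapped module lives on the GPU at any moment.
    def pre_hook(mod, _inputs):
        global module_in_gpu
        if module_in_gpu is not mod:
            send_everything_to_cpu()
            mod.to(gpu)
            module_in_gpu = mod

    module.register_forward_pre_hook(pre_hook)
```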
diff --git a/modules/shared.py b/modules/shared.py
index f9509a70..38d24fae 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -13,8 +13,6 @@ from modules.devices import get_optimal_device
 import modules.styles
 import modules.interrogate
 
-config_filename = "config.json"
-
 sd_model_file = os.path.join(script_path, 'model.ckpt')
 if not os.path.exists(sd_model_file):
     sd_model_file = "models/ldm/stable-diffusion-v1/model.ckpt"
@@ -43,6 +41,8 @@ parser.add_argument("--port", type=int, help="launch gradio with given server po
 parser.add_argument("--show-negative-prompt", action='store_true', help="does not do anything", default=False)
 parser.add_argument("--ui-config-file", type=str, help="filename to use for ui configuration", default=os.path.join(script_path, 'ui-config.json'))
 parser.add_argument("--hide-ui-dir-config", action='store_true', help="hide directory configuration from webui", default=False)
+parser.add_argument("--ui-settings-file", type=str, help="filename to use for ui settings", default=os.path.join(script_path, 'config.json'))
+parser.add_argument("--gradio-debug", action='store_true', help="launch gradio with --debug option")
 
 cmd_opts = parser.parse_args()
 
@@ -51,6 +51,7 @@ device = get_optimal_device()
 
 batch_cond_uncond = cmd_opts.always_batch_cond_uncond or not (cmd_opts.lowvram or cmd_opts.medvram)
 parallel_processing_allowed = not cmd_opts.lowvram and not cmd_opts.medvram
+config_filename = cmd_opts.ui_settings_file
 
 class State:
     interrupted = False
@@ -129,11 +130,12 @@ class Options:
         "multiple_tqdm": OptionInfo(True, "Add a second progress bar to the console that shows progress for an entire job. Broken in PyCharm console."),
         "face_restoration_model": OptionInfo(None, "Face restoration model", gr.Radio, lambda: {"choices": [x.name() for x in face_restorers]}),
         "code_former_weight": OptionInfo(0.5, "CodeFormer weight parameter; 0 = maximum effect; 1 = minimum effect", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01}),
-        "interrogate_keep_models_in_memory": OptionInfo(True, "Interrogate: keep models in VRAM"),
+        "interrogate_keep_models_in_memory": OptionInfo(False, "Interrogate: keep models in VRAM"),
         "interrogate_use_builtin_artists": OptionInfo(True, "Interrogate: use artists from artists.csv"),
         "interrogate_clip_num_beams": OptionInfo(1, "Interrogate: num_beams for BLIP", gr.Slider, {"minimum": 1, "maximum": 16, "step": 1}),
         "interrogate_clip_min_length": OptionInfo(24, "Interrogate: minimum description length (excluding artists, etc..)", gr.Slider, {"minimum": 1, "maximum": 128, "step": 1}),
         "interrogate_clip_max_length": OptionInfo(48, "Interrogate: maximum description length", gr.Slider, {"minimum": 1, "maximum": 256, "step": 1}),
+        "interrogate_clip_dict_limit": OptionInfo(1500, "Interrogate: maximum number of lines in text file (0 = no limit)"),
     }
 
     def __init__(self):
diff --git a/modules/ui.py b/modules/ui.py
index 3b7eb9bb..3a28bdab 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -270,7 +270,7 @@ def create_ui(txt2img, img2img, run_extras, run_pnginfo):
                 batch_count = gr.Slider(minimum=1, maximum=cmd_opts.max_batch_count, step=1, label='Batch count', value=1)
                 batch_size = gr.Slider(minimum=1, maximum=8, step=1, label='Batch size', value=1)
 
-                cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.0)
+                cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label='CFG Scale', value=7.0)
 
                 with gr.Group():
                     height = gr.Slider(minimum=64, maximum=2048, step=64, label="Height", value=512)
@@ -413,7 +413,7 @@ def create_ui(txt2img, img2img, run_extras, run_pnginfo):
                     batch_size = gr.Slider(minimum=1, maximum=8, step=1, label='Batch size', value=1)
 
                 with gr.Group():
-                    cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.0)
+                    cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label='CFG Scale', value=7.0)
                     denoising_strength = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label='Denoising strength', value=0.75)
                     denoising_strength_change_factor = gr.Slider(minimum=0.9, maximum=1.1, step=0.01, label='Denoising strength change factor', value=1, visible=False)
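The `modules/ui.py` change raises the CFG Scale ceiling from 15 to 30 in both txt2img and img2img. For reference, the scale is the extrapolation factor in classifier-free guidance, the same combination that appears in the new script below (`apply_cfg` is an illustrative name, not a function in the codebase):

```python
import torch


def apply_cfg(denoised_uncond: torch.Tensor, denoised_cond: torch.Tensor, cfg_scale: float) -> torch.Tensor:
    # cfg_scale = 1 reproduces the conditional prediction; larger values
    # extrapolate further away from the unconditional one.
    return denoised_uncond + (denoised_cond - denoised_uncond) * cfg_scale
```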
diff --git a/scripts/img2imgalt.py b/scripts/img2imgalt.py
new file mode 100644
index 00000000..16a2fdf6
--- /dev/null
+++ b/scripts/img2imgalt.py
@@ -0,0 +1,104 @@
+import numpy as np
+from tqdm import trange
+
+import modules.scripts as scripts
+import gradio as gr
+
+from modules import processing, shared, sd_samplers
+from modules.processing import Processed
+from modules.sd_samplers import samplers
+from modules.shared import opts, cmd_opts, state
+
+import torch
+import k_diffusion as K
+
+from PIL import Image
+from torch import autocast
+from einops import rearrange, repeat
+
+
+def find_noise_for_image(p, cond, uncond, cfg_scale, steps):
+    x = p.init_latent
+
+    s_in = x.new_ones([x.shape[0]])
+    dnw = K.external.CompVisDenoiser(shared.sd_model)
+    sigmas = dnw.get_sigmas(steps).flip(0)
+
+    shared.state.sampling_steps = steps
+
+    for i in trange(1, len(sigmas)):
+        shared.state.sampling_step += 1
+
+        x_in = torch.cat([x] * 2)
+        sigma_in = torch.cat([sigmas[i] * s_in] * 2)
+        cond_in = torch.cat([uncond, cond])
+
+        c_out, c_in = [K.utils.append_dims(k, x_in.ndim) for k in dnw.get_scalings(sigma_in)]
+        t = dnw.sigma_to_t(sigma_in)
+
+        eps = shared.sd_model.apply_model(x_in * c_in, t, cond=cond_in)
+        denoised_uncond, denoised_cond = (x_in + eps * c_out).chunk(2)
+
+        denoised = denoised_uncond + (denoised_cond - denoised_uncond) * cfg_scale
+
+        d = (x - denoised) / sigmas[i]
+        dt = sigmas[i] - sigmas[i - 1]
+
+        x = x + d * dt
+
+        sd_samplers.store_latent(x)
+
+        # This shouldn't be necessary, but solved some VRAM issues
+        del x_in, sigma_in, cond_in, c_out, c_in, t
+        del eps, denoised_uncond, denoised_cond, denoised, d, dt
+
+    shared.state.nextjob()
+
+    return x / x.std()
+
+
+cache = [None, None, None, None, None]
+
+
+class Script(scripts.Script):
+    def title(self):
+        return "img2img alternative test"
+
+    def show(self, is_img2img):
+        return is_img2img
+
+    def ui(self, is_img2img):
+        original_prompt = gr.Textbox(label="Original prompt", lines=1)
+        cfg = gr.Slider(label="Decode CFG scale", minimum=0.1, maximum=3.0, step=0.1, value=1.0)
+        st = gr.Slider(label="Decode steps", minimum=1, maximum=150, step=1, value=50)
+
+        return [original_prompt, cfg, st]
+
+    def run(self, p, original_prompt, cfg, st):
+        p.batch_size = 1
+        p.batch_count = 1
+
+        def sample_extra(x, conditioning, unconditional_conditioning):
+            lat = tuple([int(x * 10) for x in p.init_latent.cpu().numpy().flatten().tolist()])
+
+            if cache[0] is not None and cache[1] == cfg and cache[2] == st and len(cache[3]) == len(lat) and sum(np.array(cache[3]) - np.array(lat)) < 100 and cache[4] == original_prompt:
+                noise = cache[0]
+            else:
+                shared.state.job_count += 1
+                cond = p.sd_model.get_learned_conditioning(p.batch_size * [original_prompt])
+                noise = find_noise_for_image(p, cond, unconditional_conditioning, cfg, st)
+                cache[0] = noise
+                cache[1] = cfg
+                cache[2] = st
+                cache[3] = lat
+                cache[4] = original_prompt
+
+            sampler = samplers[p.sampler_index].constructor(p.sd_model)
+
+            samples_ddim = sampler.sample(p, noise, conditioning, unconditional_conditioning)
+            return samples_ddim
+
+        p.sample = sample_extra
+
+        processed = processing.process_images(p)
+
+        return processed
+
diff --git a/webui.py b/webui.py
index ca809f79..35c8362b 100644
--- a/webui.py
+++ b/webui.py
@@ -115,7 +115,7 @@ def webui():
         run_pnginfo=modules.extras.run_pnginfo
     )
 
-    demo.launch(share=cmd_opts.share, server_name="0.0.0.0" if cmd_opts.listen else None, server_port=cmd_opts.port)
+    demo.launch(share=cmd_opts.share, server_name="0.0.0.0" if cmd_opts.listen else None, server_port=cmd_opts.port, debug=cmd_opts.gradio_debug)
 
 
 if __name__ == "__main__":
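A closing note on the technique in `scripts/img2imgalt.py`: an Euler step integrates `x = x + d * dt` along a descending sigma schedule, so `find_noise_for_image` flips the schedule (`sigmas = dnw.get_sigmas(steps).flip(0)`) and runs the same update, walking the sampler backwards from the input image toward the noise that would approximately generate it (the result is rescaled to unit standard deviation before reuse). A toy one-dimensional demonstration with a stand-in denoiser (not Stable Diffusion, and only approximate because of Euler discretization error):

```python
import torch


def denoiser(x, sigma):
    # Placeholder for illustration; the script uses
    # K.external.CompVisDenoiser(shared.sd_model) here.
    return x / (1.0 + sigma ** 2)


def euler(x, sigmas):
    # Same form of update as the loop in find_noise_for_image, minus CFG.
    for i in range(len(sigmas) - 1):
        d = (x - denoiser(x, sigmas[i])) / sigmas[i]
        x = x + d * (sigmas[i + 1] - sigmas[i])
    return x


sigmas = torch.linspace(10.0, 0.1, 50)
start = torch.randn(4) * sigmas[0]
sample = euler(start, sigmas)                # forward: noise -> sample
recovered = euler(sample, sigmas.flip(0))    # flipped schedule: sample -> noise
print((start - recovered).abs().max())       # residual from discretization error
```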