Enhance PowerPaint (#2093)

* Update gradio_PowerPaint.py * Update gradio_PowerPaint.py * Update gradio_PowerPaint.py * Update gradio_PowerPaint.py * Update README.md * Update README.md * Update gradio_PowerPaint.py * Update gradio_PowerPaint.py * Update README.md
open-mmlab · Dec 18, 2023 · b08b6ca · b08b6ca
1 parent ab610dd
commit b08b6ca
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 73 deletions.
diff --git a/projects/powerpaint/README.md b/projects/powerpaint/README.md
@@ -4,8 +4,20 @@
 
 This README provides a step-by-step guide to downloading the repository, setting up the required virtual environment named "PowerPaint" using conda, and running PowerPaint with or without ControlNet.
 
+## News
+
+**December 18, 2023**
+
+*Enhanced PowerPaint Model*
+
+- We are delighted to announce the release of more stable model weights. These refined weights can now be accessed on [Hugging Face](https://huggingface.co/JunhaoZhuang/PowerPaint-v1/tree/main). The `gradio_PowerPaint.py` file and [Online Demo](https://openxlab.org.cn/apps/detail/rangoliu/PowerPaint) have also been updated as part of this release.
+
+
+## Next
+
 **Stronger Model Weights Coming Soon!**
 
+________________
 <img src='https://github.com/open-mmlab/mmagic/assets/12782558/acd01391-c73f-4997-aafd-0869aebcc915'/>
 
 ## Getting Started

diff --git a/projects/powerpaint/gradio_PowerPaint.py b/projects/powerpaint/gradio_PowerPaint.py
@@ -11,7 +11,6 @@
     StableDiffusionInpaintPipeline as Pipeline
 from pipeline.pipeline_PowerPaint_ControlNet import \
     StableDiffusionControlNetInpaintPipeline as controlnetPipeline
-from safetensors.torch import load_file
 from transformers import DPTFeatureExtractor, DPTForDepthEstimation
 from utils.utils import TokenizerWrapper, add_tokens
 
@@ -21,8 +20,7 @@
 global pipe
 pipe = Pipeline.from_pretrained(
     'runwayml/stable-diffusion-inpainting',
-    torch_dtype=weight_dtype,
-    safety_checker=None)
+    torch_dtype=weight_dtype)
 pipe.tokenizer = TokenizerWrapper(
     from_pretrained='runwayml/stable-diffusion-v1-5',
     subfolder='tokenizer',
@@ -34,14 +32,13 @@
     placeholder_tokens=['P_ctxt', 'P_shape', 'P_obj'],
     initialize_tokens=['a', 'a', 'a'],
     num_vectors_per_token=10)
-pipe.unet.load_state_dict(
-    load_file(
-        './models/unet/diffusion_pytorch_model.safetensors', device='cuda'),
-    strict=False)
-pipe.text_encoder.load_state_dict(
-    torch.load('./models/text_encoder/pytorch_model.bin'), strict=False)
+
+from safetensors.torch import load_model
+load_model(pipe.unet, "./models/unet/diffusion_pytorch_model.safetensors")
+pipe.text_encoder.load_state_dict(torch.load("./models/text_encoder/pytorch_model.bin"), strict=False)
 pipe = pipe.to('cuda')
 
+
 depth_estimator = DPTForDepthEstimation.from_pretrained(
     'Intel/dpt-hybrid-midas').to('cuda')
 feature_extractor = DPTFeatureExtractor.from_pretrained(
@@ -51,7 +48,7 @@
 
 global current_control
 current_control = 'canny'
-controlnet_conditioning_scale = 0.5
+# controlnet_conditioning_scale = 0.8
 
 
 def set_seed(seed):
@@ -94,8 +91,8 @@ def add_task(prompt, negative_prompt, control_type):
     elif control_type == 'shape-guided':
         promptA = prompt + ' P_shape'
         promptB = prompt + ' P_ctxt'
-        negative_promptA = negative_prompt + ' P_shape'
-        negative_promptB = negative_prompt + ' P_ctxt'
+        negative_promptA = negative_prompt
+        negative_promptB = negative_prompt
     elif control_type == 'image-outpainting':
         promptA = prompt + ' P_ctxt'
         promptB = prompt + ' P_ctxt'
@@ -104,18 +101,18 @@ def add_task(prompt, negative_prompt, control_type):
     else:
         promptA = prompt + ' P_obj'
         promptB = prompt + ' P_obj'
-        negative_promptA = negative_prompt + ' P_obj'
-        negative_promptB = negative_prompt + ' P_obj'
+        negative_promptA = negative_prompt
+        negative_promptB = negative_prompt
 
     return promptA, promptB, negative_promptA, negative_promptB
 
 
+
 def predict(input_image, prompt, fitting_degree, ddim_steps, scale, seed,
-            negative_prompt, task, vertical_expansion_ratio,
-            horizontal_expansion_ratio):
+            negative_prompt, task,vertical_expansion_ratio,horizontal_expansion_ratio):
     size1, size2 = input_image['image'].convert('RGB').size
 
-    if task != 'image-outpainting':
+    if task!='image-outpainting':
         if size1 < size2:
             input_image['image'] = input_image['image'].convert('RGB').resize(
                 (640, int(size2 / size1 * 640)))
@@ -130,44 +127,34 @@ def predict(input_image, prompt, fitting_degree, ddim_steps, scale, seed,
             input_image['image'] = input_image['image'].convert('RGB').resize(
                 (int(size1 / size2 * 512), 512))
 
-    if (vertical_expansion_ratio is not None) and (horizontal_expansion_ratio
-                                                   is not None):  # noqa
-        o_W, o_H = input_image['image'].convert('RGB').size
-        c_W = int(horizontal_expansion_ratio * o_W)
-        c_H = int(vertical_expansion_ratio * o_H)
+    if vertical_expansion_ratio!=None and horizontal_expansion_ratio!=None:
+        o_W,o_H = input_image['image'].convert('RGB').size
+        c_W = int(horizontal_expansion_ratio*o_W)
+        c_H = int(vertical_expansion_ratio*o_H)
 
-        expand_img = np.ones((c_H, c_W, 3), dtype=np.uint8) * 127
+        expand_img = np.ones((c_H, c_W,3), dtype=np.uint8)*127
         original_img = np.array(input_image['image'])
-        expand_img[int((c_H - o_H) / 2.0):int((c_H - o_H) / 2.0) + o_H,
-                   int((c_W - o_W) / 2.0):int((c_W - o_W) / 2.0) +
-                   o_W, :] = original_img
+        expand_img[int((c_H-o_H)/2.0):int((c_H-o_H)/2.0)+o_H,int((c_W-o_W)/2.0):int((c_W-o_W)/2.0)+o_W,:] = original_img
 
         blurry_gap = 10
 
-        expand_mask = np.ones((c_H, c_W, 3), dtype=np.uint8) * 255
-        if vertical_expansion_ratio == 1 and horizontal_expansion_ratio != 1:
-            expand_mask[int((c_H - o_H) / 2.0):int((c_H - o_H) / 2.0) + o_H,
-                        int((c_W - o_W) / 2.0) +
-                        blurry_gap:int((c_W - o_W) / 2.0) + o_W -
-                        blurry_gap, :] = 0
-        elif vertical_expansion_ratio != 1 and horizontal_expansion_ratio != 1:
-            expand_mask[int((c_H - o_H) / 2.0) +
-                        blurry_gap:int((c_H - o_H) / 2.0) + o_H - blurry_gap,
-                        int((c_W - o_W) / 2.0) +
-                        blurry_gap:int((c_W - o_W) / 2.0) + o_W -
-                        blurry_gap, :] = 0
-        elif vertical_expansion_ratio != 1 and horizontal_expansion_ratio == 1:
-            expand_mask[int((c_H - o_H) / 2.0) +
-                        blurry_gap:int((c_H - o_H) / 2.0) + o_H - blurry_gap,
-                        int((c_W - o_W) /
-                            2.0):int((c_W - o_W) / 2.0) + o_W, :] = 0
+        expand_mask = np.ones((c_H, c_W,3), dtype=np.uint8)*255
+        if vertical_expansion_ratio == 1 and horizontal_expansion_ratio!=1:
+            expand_mask[int((c_H-o_H)/2.0):int((c_H-o_H)/2.0)+o_H,int((c_W-o_W)/2.0)+blurry_gap:int((c_W-o_W)/2.0)+o_W-blurry_gap,:] = 0 #noqa
+        elif vertical_expansion_ratio != 1 and horizontal_expansion_ratio!=1:
+            expand_mask[int((c_H-o_H)/2.0)+blurry_gap:int((c_H-o_H)/2.0)+o_H-blurry_gap,int((c_W-o_W)/2.0)+blurry_gap:int((c_W-o_W)/2.0)+o_W-blurry_gap,:] = 0 #noqa
+        elif vertical_expansion_ratio != 1 and horizontal_expansion_ratio==1:
+            expand_mask[int((c_H-o_H)/2.0)+blurry_gap:int((c_H-o_H)/2.0)+o_H-blurry_gap,int((c_W-o_W)/2.0):int((c_W-o_W)/2.0)+o_W,:] = 0 #noqa
 
         input_image['image'] = Image.fromarray(expand_img)
         input_image['mask'] = Image.fromarray(expand_mask)
 
+
+
+
     promptA, promptB, negative_promptA, negative_promptB = add_task(
         prompt, negative_prompt, task)
-    # print(promptA, promptB, negative_promptA, negative_promptB)
+    print(promptA, promptB, negative_promptA, negative_promptB)
     img = np.array(input_image['image'].convert('RGB'))
 
     W = int(np.shape(img)[0] - np.shape(img)[0] % 8)
@@ -191,8 +178,8 @@ def predict(input_image, prompt, fitting_degree, ddim_steps, scale, seed,
         num_inference_steps=ddim_steps).images[0]
     mask_np = np.array(input_image['mask'].convert('RGB'))
     red = np.array(result).astype('float') * 1
-    red[:, :, 0] = 0
-    red[:, :, 2] = 180.0
+    red[:, :, 0] = 180.0
+    red[:, :, 2] = 0
     red[:, :, 1] = 0
     result_m = np.array(result)
     result_m = Image.fromarray(
@@ -208,15 +195,17 @@ def predict(input_image, prompt, fitting_degree, ddim_steps, scale, seed,
 
     dict_res = [input_image['mask'].convert('RGB'), result_m]
 
-    return result_paste, dict_res
+    dict_out = [input_image['image'].convert('RGB'), result_paste]
+
+    return dict_out, dict_res
 
 
 def predict_controlnet(input_image, input_control_image, control_type, prompt,
-                       ddim_steps, scale, seed, negative_prompt):
+                       ddim_steps, scale, seed, negative_prompt,controlnet_conditioning_scale):
     promptA = prompt + ' P_obj'
     promptB = prompt + ' P_obj'
-    negative_promptA = negative_prompt + ' P_obj'
-    negative_promptB = negative_prompt + ' P_obj'
+    negative_promptA = negative_prompt
+    negative_promptB = negative_prompt
     size1, size2 = input_image['image'].convert('RGB').size
 
     if size1 < size2:
@@ -289,6 +278,7 @@ def predict_controlnet(input_image, input_control_image, control_type, prompt,
         width=H,
         height=W,
         guidance_scale=scale,
+        controlnet_conditioning_scale = controlnet_conditioning_scale,
         num_inference_steps=ddim_steps).images[0]
     red = np.array(result).astype('float') * 1
     red[:, :, 0] = 180.0
@@ -307,29 +297,29 @@ def predict_controlnet(input_image, input_control_image, control_type, prompt,
     ours_np = np.asarray(result) / 255.0
     ours_np = ours_np * m_img + (1 - m_img) * img_np
     result_paste = Image.fromarray(np.uint8(ours_np * 255))
-    return result_paste, [controlnet_image, result_m]
+    return [input_image['image'].convert('RGB'), result_paste], [controlnet_image, result_m]
 
 
 def infer(input_image, text_guided_prompt, text_guided_negative_prompt,
           shape_guided_prompt, shape_guided_negative_prompt, fitting_degree,
           ddim_steps, scale, seed, task, enable_control, input_control_image,
-          control_type, vertical_expansion_ratio, horizontal_expansion_ratio,
-          outpaint_prompt, outpaint_negative_prompt):
+          control_type,vertical_expansion_ratio,horizontal_expansion_ratio,outpaint_prompt,
+          outpaint_negative_prompt,controlnet_conditioning_scale,removal_prompt,
+          removal_negative_prompt):
     if task == 'text-guided':
         prompt = text_guided_prompt
         negative_prompt = text_guided_negative_prompt
     elif task == 'shape-guided':
         prompt = shape_guided_prompt
         negative_prompt = shape_guided_negative_prompt
     elif task == 'object-removal':
-        prompt = ''
-        negative_prompt = ''
+        prompt = removal_prompt
+        negative_prompt = removal_negative_prompt
     elif task == 'image-outpainting':
         prompt = outpaint_prompt
         negative_prompt = outpaint_negative_prompt
         return predict(input_image, prompt, fitting_degree, ddim_steps, scale,
-                       seed, negative_prompt, task, vertical_expansion_ratio,
-                       horizontal_expansion_ratio)
+                       seed, negative_prompt, task,vertical_expansion_ratio,horizontal_expansion_ratio)
     else:
         task = 'text-guided'
         prompt = text_guided_prompt
@@ -338,10 +328,10 @@ def infer(input_image, text_guided_prompt, text_guided_negative_prompt,
     if enable_control and task == 'text-guided':
         return predict_controlnet(input_image, input_control_image,
                                   control_type, prompt, ddim_steps, scale,
-                                  seed, negative_prompt)
+                                  seed, negative_prompt,controlnet_conditioning_scale)
     else:
         return predict(input_image, prompt, fitting_degree, ddim_steps, scale,
-                       seed, negative_prompt, task, None, None)
+                       seed, negative_prompt, task,None,None)
 
 
 def select_tab_text_guided():
@@ -351,7 +341,6 @@ def select_tab_text_guided():
 def select_tab_object_removal():
     return 'object-removal'
 
-
 def select_tab_image_outpainting():
     return 'image-outpainting'
 
@@ -371,16 +360,16 @@ def select_tab_shape_guided():
             "<a href='https://arxiv.org/abs/2312.03594/'>Paper</a> &ensp;"
             "<a href='https://github.com/open-mmlab/mmagic/tree/main/projects/powerpaint'>Code</a> </font></div>"  # noqa
         )
-
+    with gr.Row():
+        gr.Markdown(
+            "**Note:** Due to network-related factors, the page may experience occasional bugs！ If the inpainting results deviate significantly from expectations, consider toggling between task options to refresh the content."  # noqa
+        )
     with gr.Row():
         with gr.Column():
             gr.Markdown('### Input image and draw mask')
             input_image = gr.Image(source='upload', tool='sketch', type='pil')
 
-            task = gr.Radio([
-                'text-guided', 'object-removal', 'shape-guided',
-                'image-outpainting'
-            ],
+            task = gr.Radio(['text-guided', 'object-removal', 'shape-guided', 'image-outpainting'],
                             show_label=False,
                             visible=False)
 
@@ -397,6 +386,13 @@ def select_tab_shape_guided():
                 enable_control = gr.Checkbox(
                     label='Enable controlnet',
                     info='Enable this if you want to use controlnet')
+                controlnet_conditioning_scale = gr.Slider(
+                    label='controlnet conditioning scale',
+                    minimum=0,
+                    maximum=1,
+                    step=0.05,
+                    value=0.5,
+                )
                 control_type = gr.Radio(['canny', 'pose', 'depth', 'hed'],
                                         label='Control type')
                 input_control_image = gr.Image(source='upload', type='pil')
@@ -408,7 +404,13 @@ def select_tab_shape_guided():
                 enable_object_removal = gr.Checkbox(
                     label='Enable object removal inpainting',
                     value=True,
+                    info='The recommended configuration for the Guidance Scale is 10 or higher. \
+                    If undesired objects appear in the masked area, \
+                    you can address this by specifically increasing the Guidance Scale.',
                     interactive=False)
+                removal_prompt = gr.Textbox(label='Prompt')
+                removal_negative_prompt = gr.Textbox(
+                    label='negative_prompt')
             tab_object_removal.select(
                 fn=select_tab_object_removal, inputs=None, outputs=task)
 
@@ -417,6 +419,9 @@ def select_tab_shape_guided():
                 enable_object_removal = gr.Checkbox(
                     label='Enable image outpainting',
                     value=True,
+                    info='The recommended configuration for the Guidance Scale is 10 or higher. \
+                    If unwanted random objects appear in the extended image region, \
+                        you can enhance the cleanliness of the extension area by increasing the Guidance Scale.',
                     interactive=False)
                 outpaint_prompt = gr.Textbox(label='Outpainting_prompt')
                 outpaint_negative_prompt = gr.Textbox(
@@ -463,10 +468,7 @@ def select_tab_shape_guided():
                     label='Steps', minimum=1, maximum=50, value=45, step=1)
                 scale = gr.Slider(
                     label='Guidance Scale',
-                    info='For object removal, \
-                        it is recommended to set the value at 10 or above, \
-                        while for image outpainting, \
-                            it is advisable to set it at 18 or above.',
+                    info='For object removal and image outpainting, it is recommended to set the value at 10 or above.', #noqa
                     minimum=0.1,
                     maximum=30.0,
                     value=7.5,
@@ -480,19 +482,21 @@ def select_tab_shape_guided():
                 )
         with gr.Column():
             gr.Markdown('### Inpainting result')
-            inpaint_result = gr.Image()
+            inpaint_result = gr.Gallery(
+                label='Generated images', show_label=False, columns=2)
             gr.Markdown('### Mask')
             gallery = gr.Gallery(
-                label='Generated images', show_label=False, columns=2)
+                label='Generated masks', show_label=False, columns=2)
 
     run_button.click(
         fn=infer,
         inputs=[
             input_image, text_guided_prompt, text_guided_negative_prompt,
             shape_guided_prompt, shape_guided_negative_prompt, fitting_degree,
             ddim_steps, scale, seed, task, enable_control, input_control_image,
-            control_type, vertical_expansion_ratio, horizontal_expansion_ratio,
-            outpaint_prompt, outpaint_negative_prompt
+            control_type,vertical_expansion_ratio,horizontal_expansion_ratio,
+            outpaint_prompt,outpaint_negative_prompt,controlnet_conditioning_scale,
+            removal_prompt,removal_negative_prompt
         ],
         outputs=[inpaint_result, gallery])