Datawhale X 魔搭 AI夏令营-第四期 AIGC-Task 2

认识AI助手——通义千问

操作指南

主要功能模块

精读baseline代码

分析代码的主体架构

逐行解释代码

其他疑问-向AI追问

进行实战——基于话剧的连环画制作

提示词准备

执行Task1的30分钟速通Baseline

修改提示词

生成图片

测试美学打分

更多尝试——scepter webui

认识AI助手——通义千问

操作指南

主要功能模块

对话，支持文字输入，文件上传等模式，我们本次课程主要使用当前模块；

效率，各种学习办公小工具；

智能体，通义的智能体应用市场，大家可以根据自己的需求找到很多有意思的小应用。

精读baseline代码

先看一下baseline代码的框架结构：

baseline中所有代码如下：

!pip install simple-aesthetics-predictor

!pip install -v -e data-juicer

!pip uninstall pytorch-lightning -y
!pip install peft lightning pandas torchvision

!pip install -e DiffSynth-Studio

from modelscope.msdatasets import MsDataset

ds = MsDataset.load(
    'AI-ModelScope/lowres_anime',
    subset_name='default',
    split='train',
    cache_dir="/mnt/workspace/kolors/data"
)

import json, os
from data_juicer.utils.mm_utils import SpecialTokens
from tqdm import tqdm


os.makedirs("./data/lora_dataset/train", exist_ok=True)
os.makedirs("./data/data-juicer/input", exist_ok=True)
with open("./data/data-juicer/input/metadata.jsonl", "w") as f:
    for data_id, data in enumerate(tqdm(ds)):
        image = data["image"].convert("RGB")
        image.save(f"/mnt/workspace/kolors/data/lora_dataset/train/{data_id}.jpg")
        metadata = {"text": "二次元", "image": [f"/mnt/workspace/kolors/data/lora_dataset/train/{data_id}.jpg"]}
        f.write(json.dumps(metadata))
        f.write("\n")

data_juicer_config = """
# global parameters
project_name: 'data-process'
dataset_path: './data/data-juicer/input/metadata.jsonl'  # path to your dataset directory or file
np: 4  # number of subprocess to process your dataset

text_keys: 'text'
image_key: 'image'
image_special_token: '<__dj__image>'

export_path: './data/data-juicer/output/result.jsonl'

# process schedule
# a list of several process operators with their arguments
process:
    - image_shape_filter:
        min_width: 1024
        min_height: 1024
        any_or_all: any
    - image_aspect_ratio_filter:
        min_ratio: 0.5
        max_ratio: 2.0
        any_or_all: any
"""
with open("data/data-juicer/data_juicer_config.yaml", "w") as file:
    file.write(data_juicer_config.strip())

!dj-process --config data/data-juicer/data_juicer_config.yaml

import pandas as pd
import os, json
from PIL import Image
from tqdm import tqdm


texts, file_names = [], []
os.makedirs("./data/data-juicer/output/images", exist_ok=True)
with open("./data/data-juicer/output/result.jsonl", "r") as f:
    for line in tqdm(f):
        metadata = json.loads(line)
        texts.append(metadata["text"])
        file_names.append(metadata["image"][0])

df = pd.DataFrame({"text": texts, "file_name": file_names})
df.to_csv("./data/data-juicer/output/result.csv", index=False)

df

from transformers import CLIPProcessor, CLIPModel
import torch

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

images = [Image.open(img_path) for img_path in df["file_name"]]
inputs = processor(text=df["text"].tolist(), images=images, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the probabilities

probs

from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, df, processor):
        self.texts = df["text"].tolist()
        self.images = [Image.open(img_path) for img_path in df["file_name"]]
        self.processor = processor

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.processor(text=self.texts[idx], images=self.images[idx], return_tensors="pt", padding=True)
        return inputs

dataset = CustomDataset(df, processor)
dataloader = DataLoader(dataset, batch_size=8)

for batch in dataloader:
    outputs = model(**batch)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)
    print(probs)

import torch
from diffusers import StableDiffusionPipeline

torch.manual_seed(1)
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v-1-4", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

prompt = "二次元，一个紫色长发小女孩穿着粉色吊带漏肩连衣裙，在练习室练习唱歌，手持话筒"
negative_prompt = "丑陋、变形、嘈杂、模糊、低对比度"
guidance_scale = 4
num_inference_steps = 50

image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    guidance_scale=guidance_scale,
    num_inference_steps=num_inference_steps,
    height=1024,
    width=1024,
).images[0]

image.save("example_image.png")
image

from PIL import Image

torch.manual_seed(1)
image = pipe(
    prompt="二次元，日系动漫，演唱会的观众席，人山人海，一个紫色短发小女孩穿着粉色吊带漏肩连衣裙坐在演唱会的观众席，舞台上衣着华丽的歌星们在唱歌",
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度",
    cfg_scale=4,
    num_inference_steps=50, height=1024, width=1024,
)
image.save("1.jpg")

torch.manual_seed(1)
image = pipe(
    prompt="二次元，一个紫色短发小女孩穿着粉色吊带漏肩连衣裙坐在演唱会的观众席，露出憧憬的神情",
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度，色情擦边",
    cfg_scale=4,
    num_inference_steps=50, height=1024, width=1024,
)
image.save("2.jpg")

torch.manual_seed(2)
image = pipe(
    prompt="二次元，一个紫色短发小女孩穿着粉色吊带漏肩连衣裙坐在演唱会的观众席，露出憧憬的神情",
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度，色情擦边",
    cfg_scale=4,
    num_inference_steps=50, height=1024, width=1024,
)
image.save("3.jpg")

torch.manual_seed(5)
image = pipe(
    prompt="二次元，一个紫色短发小女孩穿着粉色吊带漏肩连衣裙，对着流星许愿，闭着眼睛，十指交叉，侧面",
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度，扭曲的手指，多余的手指",
    cfg_scale=4,
    num_inference_steps=50, height=1024, width=1024,
)
image.save("4.jpg")

torch.manual_seed(0)
image = pipe(
    prompt="二次元，一个紫色中等长度头发小女孩穿着粉色吊带漏肩连衣裙，在练习室练习唱歌",
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度",
    cfg_scale=4,
    num_inference_steps=50, height=1024, width=1024,
)
image.save("5.jpg")

torch.manual_seed(1)
image = pipe(
    prompt="二次元，一个紫色长发小女孩穿着粉色吊带漏肩连衣裙，在练习室练习唱歌，手持话筒",
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度",
    cfg_scale=4,
    num_inference_steps=50, height=1024, width=1024,
)
image.save("6.jpg")

torch.manual_seed(7)
image = pipe(
    prompt="二次元，紫色长发少女，穿着黑色连衣裙，试衣间，心情忐忑",
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度",
    cfg_scale=4,
    num_inference_steps=50, height=1024, width=1024,
)
image.save("7.jpg")

torch.manual_seed(0)
image = pipe(
    prompt="二次元，紫色长发少女，穿着黑色礼服，连衣裙，在台上唱歌",
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度",
    cfg_scale=4,
    num_inference_steps=50, height=1024, width=1024,
)
image.save("8.jpg")

import numpy as np
from PIL import Image


images = [np.array(Image.open(f"{i}.jpg")) for i in range(1, 9)]
image = np.concatenate([
    np.concatenate(images[0:2], axis=1),
    np.concatenate(images[2:4], axis=1),
    np.concatenate(images[4:6], axis=1),
    np.concatenate(images[6:8], axis=1),
], axis=0)
image = Image.fromarray(image).resize((1024, 2048))
image

分析代码的主体架构

输入Prompt

你是一个优秀的python开发工程师，现在我们需要你帮我们分析这个代码的主体框架，你需

要把代码按照工作流分成几部分，用中文回答我的问题。{此处替换前面的代码}

返回的结果

这段代码可以分为以下几个主要的工作流程部分：

1. 环境准备与依赖安装
这一部分通过命令行指令安装所需的Python包和库：
- 使用 `!pip install` 命令来安装或卸载所需的库。
- 安装的库包括 `simple-aesthetics-predictor`, `data-juicer`, `peft`, `lightning`, `pandas`, `torchvision`, 和 `DiffSynth-Studio`。

2. 数据集加载与预处理
这部分代码从 ModelScope 加载了一个名为 `lowres_anime` 的数据集，并将其保存到指定目录下：
- 使用 `MsDataset.load()` 方法加载数据集。
- 遍历数据集中的每条数据，将图像转换为 RGB 模式并保存到磁盘上。
- 将每条数据的元数据（包括文本描述和图像路径）写入 JSONL 文件中。

3. 数据清洗与过滤
在这一步，使用 `data-juicer` 对数据进行进一步的处理：
- 创建一个 YAML 配置文件，定义数据处理的参数，如图像尺寸过滤和宽高比过滤。
- 使用 `dj-process` 命令行工具根据配置文件对数据进行处理。

4. 数据整理与特征提取
这部分代码读取经过处理的数据，并利用 CLIP 模型计算图像与文本之间的相似度：
- 从处理后的输出文件中读取数据并创建 DataFrame。
- 使用 CLIP 模型和处理器来计算图像和文本的相似度得分。

5. 数据集构建
这里构建了一个 PyTorch 数据集类，并创建了 DataLoader 以供后续训练使用：
- 定义了一个自定义的 `CustomDataset` 类继承自 `torch.utils.data.Dataset`。
- 使用 `DataLoader` 从自定义的数据集中加载批次数据。

6. 图像生成
这部分代码利用 `diffusers` 库中的 `StableDiffusionPipeline` 生成二次元风格的图像：
- 加载预训练模型。
- 设置随机种子以确保生成结果的一致性。
- 通过调用 `pipe` 方法并传递不同的提示来生成多张图像。
- 保存生成的图像到磁盘。

7. 图像拼接展示
最后，代码将生成的多个图像拼接成一张大图以便展示：
- 使用 NumPy 将图像数组拼接在一起。
- 使用 PIL 的 `Image.fromarray` 方法创建最终的拼接图像，并调整其大小。

综上所述，这段代码实现了从数据集加载到数据处理，再到特征提取、数据集构建，以及最终的图像生成与展示的完整流程。

逐行解释代码

输入Prompt

你是一个优秀的python开发工程师，现在我们需要你帮我们逐行分析这个代码，用中文回答我的问题。{此处替换前面的代码}

返回的结果

# 安装 Data-Juicer 和 DiffSynth-Studio
!pip install simple-aesthetics-predictor # 安装simple-aesthetics-predictor
!pip install -v -e data-juicer # 安装data-juicer
!pip uninstall pytorch-lightning -y # 卸载pytorch-lightning
!pip install peft lightning pandas torchvision # 安装 peft lightning pandas torchvision
!pip install -e DiffSynth-Studio # 安装DiffSynth-Studio

# 从魔搭数据集中下载数据集AI-ModelScope/lowres_anime
from modelscope.msdatasets import MsDataset  #引入数据集模块msdatasets
ds = MsDataset.load(
    'AI-ModelScope/lowres_anime',
    subset_name='default',
    split='train',
    cache_dir="/mnt/workspace/kolors/data" # 指定缓存目录
) # 从魔搭数据集中下载数据集AI-ModelScope/lowres_anime，赋值给参数ds

# 生成数据集
import json, os # 导入json和os模块
from data_juicer.utils.mm_utils import SpecialTokens # 导入SpecialTokens
from tqdm import tqdm # 导入tqdm进度条管理
os.makedirs("./data/lora_dataset/train", exist_ok=True) # 创建文件夹./data/lora_dataset/train
os.makedirs("./data/data-juicer/input", exist_ok=True) # 创建文件夹./data/data-juicer/input
with open("./data/data-juicer/input/metadata.jsonl", "w") as f:
    for data_id, data in enumerate(tqdm(ds)): # 遍历数据集ds
        image = data["image"].convert("RGB") # 将数据集的图片转换为RGB
        image.save(f"/mnt/workspace/kolors/data/lora_dataset/train/{data_id}.jpg") # 保存数据集的图片
        metadata = {"text": "二次元", "image": [f"/mnt/workspace/kolors/data/lora_dataset/train/{data_id}.jpg"]} # 生成当前图片的索引数据
        f.write(json.dumps(metadata)) # 将索引数据写入文件./data/data-juicer/input/metadata.jsonl
        f.write("\n")

# 配置data-juicer，并进行数据筛选过滤
# 配置过滤的规则
data_juicer_config = """
# global parameters
project_name: 'data-process' # 名称
dataset_path: './data/data-juicer/input/metadata.jsonl'  # 你前面生成的数据的索引文件
np: 4  # 线程数

text_keys: 'text' # 文件./data/data-juicer/input/metadata.jsonl的描述的字段名
image_key: 'image' # 文件./data/data-juicer/input/metadata.jsonl的图片字段名
image_special_token: '<__dj__image>'

export_path: './data/data-juicer/output/result.jsonl' # 筛选通过的图片结果保存的的索引文件

# process schedule
# a list of several process operators with their arguments
# 过滤的规则
process:
    - image_shape_filter: # 图片尺寸过滤
        min_width: 1024 # 最小宽度1024
        min_height: 1024 # 最小高度1024
        any_or_all: any # 符合前面条件的图片才会被保留
    - image_aspect_ratio_filter: # 图片长宽比过滤
        min_ratio: 0.5 # 最小长宽比0.5
        max_ratio: 2.0 # 最大长宽比2.0
        any_or_all: any # 符合前面条件的图片才会被保留
"""

# 保存data-juicer配置到data/data-juicer/data_juicer_config.yaml
with open("data/data-juicer/data_juicer_config.yaml", "w") as file:
    file.write(data_juicer_config.strip())
# data-juicer开始执行数据筛选
!dj-process --config data/data-juicer/data_juicer_config.yaml


# 通过前面通过data-juicer筛选的图片索引信息./data/data-juicer/output/result.jsonl，生成数据集
import pandas as pd # 导入pandas
import os, json # 导入os和json
from PIL import Image # 导入Image
from tqdm import tqdm # 导入tqdm进度条管理
texts, file_names = [], [] # 定义两个空列表，分别存储图片描述和图片名称
os.makedirs("./data/lora_dataset_processed/train", exist_ok=True) # 创建文件夹./data/lora_dataset_processed/train
with open("./data/data-juicer/output/result.jsonl", "r") as file: # 打开前面data-juicer筛选的图片索引文件./data/data-juicer/output/result.jsonl
    for data_id, data in enumerate(tqdm(file.readlines())): # 遍历文件./data/data-juicer/output/result.jsonl
        data = json.loads(data) # 将json字符串转换为对象
        text = data["text"] # 获取对象中的text属性，也就是图片的描述信息
        texts.append(text) # 将图片的描述信息添加到texts列表中
        image = Image.open(data["image"][0]) # 获取对象中的image属性，也就是图片的路径,然后用这个路径打开图片
        image_path = f"./data/lora_dataset_processed/train/{data_id}.jpg" # 生成保存图片的路径
        image.save(image_path) # 将图片保存到./data/lora_dataset_processed/train文件夹中
        file_names.append(f"{data_id}.jpg") # 将图片名称添加到file_names列表中
data_frame = pd.DataFrame() # 创建空的DataFrame
data_frame["file_name"] = file_names # 将图片名称添加到data_frame中
data_frame["text"] = texts # 将图片描述添加到data_frame中
data_frame.to_csv("./data/lora_dataset_processed/train/metadata.csv", index=False, encoding="utf-8-sig") # 将data_frame保存到./data/lora_dataset_processed/train/metadata.csv
data_frame # 查看data_frame


# 下载可图模型
from diffsynth import download_models # 导入download_models
download_models(["Kolors", "SDXL-vae-fp16-fix"]) # 下载可图模型
# DiffSynth-Studio提供了可图的Lora训练脚本，查看脚本信息
!python DiffSynth-Studio/examples/train/kolors/train_kolors_lora.py -h


# 执行可图Lora训练
import os
cmd = """
python DiffSynth-Studio/examples/train/kolors/train_kolors_lora.py \ # 选择使用可图的Lora训练脚本DiffSynth-Studio/examples/train/kolors/train_kolors_lora.py
  --pretrained_unet_path models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors \ # 选择unet模型
  --pretrained_text_encoder_path models/kolors/Kolors/text_encoder \ # 选择text_encoder
  --pretrained_fp16_vae_path models/sdxl-vae-fp16-fix/diffusion_pytorch_model.safetensors \ # 选择vae模型
  --lora_rank 16 \ # lora_rank 16 表示在权衡模型表达能力和训练效率时，选择了使用 16 作为秩，适合在不显著降低模型性能的前提下，通过 LoRA 减少计算和内存的需求
  --lora_alpha 4.0 \ # 设置 LoRA 的 alpha 值，影响调整的强度
  --dataset_path data/lora_dataset_processed \ # 指定数据集路径，用于训练模型
  --output_path ./models \ # 指定输出路径，用于保存模型
  --max_epochs 1 \ # 设置最大训练轮数为 1
  --center_crop \ # 启用中心裁剪，这通常用于图像预处理
  --use_gradient_checkpointing \ # 启用梯度检查点技术，以节省内存
  --precision "16-mixed" # 指定训练时的精度为混合 16 位精度（half precision），这可以加速训练并减少显存使用
""".strip()
os.system(cmd) # 执行可图Lora训练


# 加载lora微调后的模型
from diffsynth import ModelManager, SDXLImagePipeline # 导入ModelManager和SDXLImagePipeline
from peft import LoraConfig, inject_adapter_in_model # 导入LoraConfig和inject_adapter_in_model
import torch # 导入torch
# 加载LoRA配置并注入模型
def load_lora(model, lora_rank, lora_alpha, lora_path):
    lora_config = LoraConfig(
        r=lora_rank, # 设置LoRA的秩(rank)
        lora_alpha=lora_alpha, # 设置LoRA的alpha值，控制LoRA的影响权重
        init_lora_weights="gaussian", # 初始化LoRA权重为高斯分布
        target_modules=["to_q", "to_k", "to_v", "to_out"], # 指定要应用LoRA的模块
    )
    model = inject_adapter_in_model(lora_config, model) # 将LoRA配置注入到模型中
    state_dict = torch.load(lora_path, map_location="cpu") # 加载LoRA微调后的权重
    model.load_state_dict(state_dict, strict=False) # 将权重加载到模型中，允许部分权重不匹配
    return model # 返回注入LoRA后的模型
# 加载预训练模型
model_manager = ModelManager(
    torch_dtype=torch.float16, # 设置模型的数据类型为float16，减少显存占用
    device="cuda", # 指定使用GPU进行计算
    file_path_list=[
        "models/kolors/Kolors/text_encoder", # 文本编码器的路径
        "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors", # UNet模型的路径
        "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors" # VAE模型的路径
    ]
)
# 初始化图像生成管道
pipe = SDXLImagePipeline.from_model_manager(model_manager) # 从模型管理器中加载模型并初始化管道
# 加载并应用LoRA权重到UNet模型
pipe.unet = load_lora(
    pipe.unet, 
    lora_rank=16, # 设置LoRA的秩(rank)，与训练脚本中的参数保持一致
    lora_alpha=2.0, # 设置LoRA的alpha值，控制LoRA对模型的影响权重
    lora_path="models/lightning_logs/version_0/checkpoints/epoch=0-step=500.ckpt" # 指定LoRA权重的文件路径
)


# 生成图像
torch.manual_seed(0) # 设置随机种子，确保生成的图像具有可重复性。如果想要每次生成不同的图像，可以将种子值改为随机值。
image = pipe(
    prompt="二次元，一个紫色短发小女孩，在家中沙发上坐着，双手托着腮，很无聊，全身，粉色连衣裙", # 设置正向提示词，用于指导模型生成图像的内容
    negative_prompt="丑陋、变形、嘈杂、模糊、低对比度", # 设置负向提示词，模型会避免生成包含这些特征的图像
    cfg_scale=4, # 设置分类自由度 (Classifier-Free Guidance) 的比例，数值越高，模型越严格地遵循提示词
    num_inference_steps=50, # 设置推理步数，步数越多，生成的图像细节越丰富，但生成时间也更长
    height=1024, width=1024, # 设置生成图像的高度和宽度，这里生成 1024x1024 像素的图像
)
image.save("1.jpg") # 将生成的图像保存为 "1.jpg" 文件


# 图像拼接，展示总体拼接大图
import numpy as np  # 导入numpy库，用于处理数组和数值计算
from PIL import Image  # 导入PIL库中的Image模块，用于图像处理
images = [np.array(Image.open(f"{i}.jpg")) for i in range(1, 9)]  # 读取1.jpg到8.jpg的图像，转换为numpy数组，并存储在列表images中
image = np.concatenate([  # 将四组图像在垂直方向上拼接
    np.concatenate(images[0:2], axis=1),  # 将第1组（images[0:2]）的两张图像在水平方向上拼接
    np.concatenate(images[2:4], axis=1),  # 将第2组（images[2:4]）的两张图像在水平方向上拼接
    np.concatenate(images[4:6], axis=1),  # 将第3组（images[4:6]）的两张图像在水平方向上拼接
    np.concatenate(images[6:8], axis=1),  # 将第4组（images[6:8]）的两张图像在水平方向上拼接
], axis=0)  # 将四组拼接后的图像在垂直方向上拼接
image = Image.fromarray(image).resize((1024, 2048))  # 将拼接后的numpy数组转换为图像对象，并调整大小为1024x2048像素
image  # 输出最终生成的图像对象，用于显示图像

其他疑问-向AI追问

可以向AI追问不懂的问题，例如：

我对其中{替换成你的问题}还是不太理解，给我再详细介绍一下

用好AI，可以帮助我们更高效地学习。

进行实战——基于话剧的连环画制作

提示词准备

你是一个文生图专家，我们现在要做一个实战项目，就是要编排一个文生图话剧
话剧由8张场景图片生成，你需要输出每张图片的生图提示词

具体的场景图片
1、女主正在上课
2、开始睡着了
3、进入梦乡，梦到自己站在路旁
4、王子骑马而来
5、两人相谈甚欢
6、一起坐在马背上
7、下课了，梦醒了
8、又回到了学习生活中

生图提示词要求
1、风格为古风
2、根据场景确定是使用全身还是上半身
3、人物描述
4、场景描述
5、做啥事情

例子：
古风，水墨画，一个黑色长发少女，坐在教室里，盯着黑板，深思，上半身，红色长裙

询问通义【可以多问几次，修改到自己理想的状态】

根据AI助手的回答，按以下格式整理出场景表格

图片编号

场景描述

正向提示词

反向提示词

...... ...... ...... ......

执行Task1的30分钟速通Baseline

Datawhale X 魔搭 AI夏令营-第四期 AIGC-Task 1-CSDN博客

修改提示词

双击进入baseline文件

找到生成图像的板块

依次替换8张图片的正向提示词和反向提示词