MuseV-test / mmcm /utils /table_util.py
kevinwang676's picture
Upload folder using huggingface_hub
6755a2d verified
raw
history blame
8.73 kB
import io
from typing import List, Dict, Tuple, BinaryIO
from copy import deepcopy
import pandas as pd
from PIL import Image
from xlsxwriter.utility import xl_col_to_name
def read_image_data(path: str, target_image_height: int) -> Tuple[BinaryIO, int, int]:
"""读取图像,按照目标高做resize处理,并转化成二进制格式,返回最终图像的宽和高
Args:
path (str): 图像路径
target_image_height (int): 目标高
Returns:
Tuple[BinaryIO, int, int]: 图像二进制格式,返回最终图像的宽和高
"""
image = Image.open(path)
image_width_in_excel = int(image.width / (image.height / target_image_height))
image = image.resize(size=(image_width_in_excel, target_image_height))
image_byte = io.BytesIO()
image.save(image_byte, format="png")
return image_byte, image.width, image.height
def set_text_column_dynamic_width(worksheet, df, format, default_width=50):
"""将df在excel workshhet中的列按照实际内容长度设置列宽以及文本格式
Args:
worksheet (_type_): 待处理的excel worksheet
df (_type_): worksheet中原来对应的DataFrame格式
format (_type_): 对应列的文本格式
default_width (int, optional): 默认目标宽度. Defaults to 50.
"""
for column in df:
column_width = max(df[column].astype(str).map(len).max(), len(column))
col_idx = df.columns.get_loc(column)
width = min(column_width, default_width)
worksheet.set_column(col_idx, col_idx, width, format)
def convert_tasks2clean(tasks):
tasks = [{"prompt": task["prompt"]} for task in tasks]
return tasks
def split_tasks_by_images_lst(tasks, save_images_path_key: str = "save_images_path"):
new_tasks = []
for task in tasks:
for image_path in task[save_images_path_key]:
new_task = deepcopy(task)
new_task[save_images_path_key] = image_path
new_tasks.append(new_task)
return new_tasks
def save_texts_images_2_csv(tasks: List[Dict], save_path: str):
"""存储相关结果为csv表格
tasks (List[Dict]): 待转换的字典列表
save_path (str): 表格存储路径
"""
df = pd.DataFrame(tasks)
df.to_csv(save_path, encoding="utf_8_sig", index=False)
def add_multi_data_validation(workbook, worksheet, validates, validate_idxs, n_rows):
for i, validate in enumerate(validates):
validate_idx = validate_idxs[i]
worksheet = add_data_validation(
workbook=workbook,
worksheet=worksheet,
col=validate_idx,
head=validate["col_name"],
candidates=validate["candidates"],
colors=validate["colors"],
n_rows=n_rows,
)
return worksheet
def add_data_validation(
workbook, worksheet, col: int, head, candidates, n_rows, colors
):
col = xl_col_to_name(col)
# Adding the header and Datavalidation list
worksheet.write('{}1'.format(col), head)
colors_fmt = [workbook.add_format({'bg_color': color}) for color in colors]
for row in range(n_rows):
cell_idx = '{}{}'.format(col, row + 2)
worksheet.data_validation(cell_idx, {'validate': 'list', 'source': candidates})
for i_c in range(len(candidates)):
worksheet.conditional_format(
cell_idx,
{
'type': 'formula',
'criteria': '=${}=\"{}\"'.format(cell_idx, candidates[i_c]),
'format': colors_fmt[i_c],
},
)
return worksheet
def insert_cell_image(
worksheet,
row,
col,
image_path,
image_height_in_table,
text_format,
row_ratio,
col_ratio,
):
image_byte, new_image_width, new_image_height = read_image_data(
image_path, target_image_height=image_height_in_table
)
# TODO:现在的图像列并不是预期内的和图像等宽,而是宽了很多
worksheet.set_column(
col,
col,
int(new_image_width / col_ratio),
)
worksheet.insert_image(
row,
col,
image_path,
{"image_data": image_byte},
)
worksheet.set_row(row, int(new_image_height / row_ratio), text_format)
return worksheet
def save_texts_images_2_excel(
tasks: List[Dict],
save_path: str,
image_height_in_table: int = 120,
row_ratio: float = 1.3,
col_ratio: float = 5,
validates: List = None,
):
"""将任务列表和生成的图像统一存储在表格中,方便观看对比实验结果。
Args:
tasks (List[Dict]): 待转换的字典列表
save_path (str): 表格存储路径
image_height_in_table (int, optional): 表格中缩略图的高. Defaults to 120.
row_ratio (float, optional): excel的单元格宽高和实际图像边长需要做比例转换. Defaults to 1.2.
col_ratio (float, optional): excel的单元格宽高和实际图像边长需要做比例转换. Defaults to 7.5.
need_add_checker_column (bool, optional): 是否新增一列用于审核检查状态. Defaults to False.
"""
df = pd.DataFrame(tasks)
# 先找到需要插入图像的列,插入图像列
keys_with_image = [
k for k in tasks[0].keys() if "images_path" in k and k != "save_images_path"
]
high_priority_col_idx = 0
# 默认save_images_path是生成图像,放在后面
if "save_images_path" in tasks[0]:
keys_with_image.append("save_images_path")
for img_key in keys_with_image:
maxlen_img_key_value = max(
[
len(task[img_key]) if isinstance(task[img_key], list) else 1
for task in tasks
]
)
for i in range(maxlen_img_key_value):
column = "{}_{}".format(img_key, i)
if column not in df.columns:
df.insert(
loc=high_priority_col_idx,
column=column,
value="",
)
high_priority_col_idx += 1
validate_start_idx = high_priority_col_idx
if validates is not None:
for i, validate in enumerate(validates):
if validate["col_name"] not in df.columns:
col_idx = validate_start_idx + i
df.insert(loc=col_idx, column=validate["col_name"], value="")
validate_idxs = range(validate_start_idx, validate_start_idx + len(validates))
writer = pd.ExcelWriter(save_path, engine="xlsxwriter")
# Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name="Sheet1", index=False)
# Get the xlsxwriter workbook and worksheet objects.
workbook = writer.book
worksheet = writer.sheets["Sheet1"]
n_col = len(df.columns)
n_row = len(df)
text_format = workbook.add_format({"text_wrap": True})
set_text_column_dynamic_width(worksheet=worksheet, df=df, format=text_format)
# Insert an image.
for row in range(len(df)):
task = tasks[row]
cell_col = 0
for im_idx, img_key in enumerate(keys_with_image):
images_path = task[img_key]
if not isinstance(images_path, list):
cell_row = 1 + row
worksheet.write(cell_row, cell_col, img_key)
if len(images_path) == 0:
continue
worksheet = insert_cell_image(
worksheet=worksheet,
row=cell_row,
col=cell_col,
image_path=images_path,
image_height_in_table=image_height_in_table,
text_format=text_format,
row_ratio=row_ratio,
col_ratio=col_ratio,
)
cell_col += 1
else:
for i, image_path in enumerate(images_path):
worksheet = insert_cell_image(
worksheet=worksheet,
row=cell_row,
col=cell_col,
image_path=image_path,
image_height_in_table=image_height_in_table,
text_format=text_format,
row_ratio=row_ratio,
col_ratio=col_ratio,
)
cell_col += 1
if validates is not None:
worksheet = add_multi_data_validation(
workbook,
worksheet,
validates=validates,
validate_idxs=validate_idxs,
n_rows=len(df),
)
# Close the Pandas Excel writer and output the Excel file.
writer.save()