lhoestq's picture
lhoestq HF staff
wip
4fbb557
raw
history blame
4.97 kB
from itertools import count, islice
from typing import Any, Iterable
import gradio as gr
import pandas as pd
import requests
from gradio_huggingfacehub_search import HuggingfaceHubSearch
session = requests.Session()
empty_dataframe = pd.DataFrame({"1": [], "2": [], "3": []})
NUM_ROWS_PREVIEW = 5
with gr.Blocks() as demo:
gr.Markdown(
"# 🤗 Dataset ReWriter ✍️✨\n\n"
"Adjust, translate or transform completely existing datasets.\n\n"
)
with gr.Row():
with gr.Column(scale=3):
dataset_search = HuggingfaceHubSearch(
label="Hub Dataset ID",
placeholder="Search for dataset id on Huggingface",
search_type="dataset",
)
subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False)
split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
input_query = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
rewrite_button = gr.Button("ReWrite Dataset", variant="primary")
gr.Markdown("### Input")
input_preview = gr.DataFrame(interactive=False, wrap=True)
gr.Markdown("### Output")
output_preview = gr.DataFrame(interactive=False, wrap=True)
save_button = gr.Button("Save ReWriten Dataset", interactive=False)
############
#
# Utils
#
###########
def stream_rows(dataset: str, subset: str, split: str, batch_size: int = 100) -> Iterable[dict[str, Any]]:
for i in count():
rows_resp = session.get(f"https://datasets-server.huggingface.co/rows?dataset={dataset}&config={subset}&split={split}&offset={i * batch_size}&length={batch_size}", timeout=10).json()
if "error" in rows_resp:
raise RuntimeError(rows_resp["error"])
if not rows_resp["rows"]:
break
for row_item in rows_resp["rows"]:
yield row_item["row"]
############
#
# Events
#
###########
def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str) -> dict:
if "/" not in dataset.strip().strip("/"):
return None, None, {
subset_dropdown: gr.Dropdown(visible=False),
split_dropdown: gr.Dropdown(visible=False),
}
info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
if "error" in info_resp:
return None, None, {
subset_dropdown: gr.Dropdown(visible=False),
split_dropdown: gr.Dropdown(visible=False),
}
subsets: list[str] = list(info_resp["dataset_info"])
subset = default_subset if default_subset in subsets else subsets[0]
splits: list[str] = info_resp["dataset_info"][subset]["splits"]
split = default_split if default_split in splits else splits[0]
return subset, split, {
subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
}
def _show_input_preview(dataset: str, default_subset: str, default_split: str) -> dict:
subset, split, output = _resolve_dataset_selection(dataset, default_subset=default_subset, default_split=default_split)
if subset is None or split is None:
return output
return {
input_preview: pd.DataFrame(islice(({
k: str(v) for k, v in row.items()}
for row in stream_rows(dataset, subset, split, batch_size=NUM_ROWS_PREVIEW)
), NUM_ROWS_PREVIEW)),
**output
}
@dataset_search.change(inputs=[dataset_search], outputs=[input_preview, subset_dropdown, split_dropdown])
def show_input_from_dataset_search(dataset: str) -> dict:
return _show_input_preview(dataset, default_subset="default", default_split="train")
@subset_dropdown.change(inputs=[dataset_search, subset_dropdown], outputs=[input_preview, subset_dropdown, split_dropdown])
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
return _show_input_preview(dataset, default_subset=subset, default_split="train")
@split_dropdown.change(inputs=[dataset_search, subset_dropdown, split_dropdown], outputs=[input_preview, subset_dropdown, split_dropdown])
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
return _show_input_preview(dataset, default_subset=subset, default_split=split)
@rewrite_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown, input_preview], outputs=[output_preview])
def rewrite(dataset: str, subset: str, split: str, input_preview_df: pd.DataFrame) -> dict:
# TODO: implement
return {output_preview: pd.DataFrame([{"TODO": ["implement"]}])}
demo.launch()