# Spaces: Running  (web-page header residue captured with the source; not part of the program)
import json
from io import BytesIO

import gradio as gr
import numpy as np
import pandas as pd
import requests
# Seconds to wait for a remote Parquet download before giving up, so a
# stalled server cannot block a worker forever.
_URL_TIMEOUT_SECONDS = 60


def _read_upload(input_file):
    """Return ``(file_bytes, file_name)`` for an uploaded file.

    ``input_file`` may be a file-like object exposing ``read()`` and ``name``,
    or a filesystem path; the path form is used when the attribute access
    raises ``AttributeError``.
    """
    try:
        return input_file.read(), input_file.name
    except AttributeError:
        with open(input_file, "rb") as f:
            return f.read(), input_file


def _to_jsonable(value):
    """Recursively convert ``value`` into something ``json.dumps`` accepts.

    * ``bytes`` are decoded as UTF-8 (undecodable bytes are replaced);
    * ``dict`` / ``list`` / ``tuple`` are walked element by element;
    * NumPy arrays become (nested) lists and NumPy scalars become the
      matching built-in Python scalar.

    Anything else is returned unchanged.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    if isinstance(value, dict):
        return {key: _to_jsonable(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(item) for item in value]
    if isinstance(value, np.ndarray):
        # tolist() may still contain bytes (object arrays), so recurse.
        return _to_jsonable(value.tolist())
    if isinstance(value, np.generic):
        return value.item()
    return value


def dataset_converter(input_file, conversion_type, parquet_url):
    """Convert an uploaded (or downloaded) dataset and describe the result.

    Args:
        input_file: Uploaded file, either a file-like object with ``read()``
            and ``name`` or a filesystem path. May be ``None``.
        conversion_type: One of ``"CSV to Parquet"``, ``"Parquet to CSV"``,
            ``"CSV to JSONL"`` or ``"Parquet to JSONL"``.
        parquet_url: URL of a Parquet file. Only consulted for
            ``"Parquet to JSONL"`` when no file was uploaded.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the written file
        and a text summary containing a preview of the first 10 rows.

    Raises:
        ValueError: If the conversion type is unknown, or the required input
            is missing or has the wrong extension.
        requests.HTTPError: If downloading ``parquet_url`` fails.

    NOTE: output files are written to fixed names in the current working
    directory, so concurrent conversions overwrite each other's output.
    """
    file_bytes = None
    file_name = None
    file_extension = None

    if input_file is not None:
        file_bytes, file_name = _read_upload(input_file)
        file_extension = file_name.lower().split('.')[-1]

    if conversion_type == "CSV to Parquet":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to Parquet conversion, please upload a CSV file. π")
        df = pd.read_csv(BytesIO(file_bytes))
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"

    elif conversion_type == "Parquet to CSV":
        if input_file is None or file_extension != "parquet":
            raise ValueError("For Parquet to CSV conversion, please upload a Parquet file. π")
        df = pd.read_parquet(BytesIO(file_bytes))
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    elif conversion_type == "CSV to JSONL":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to JSONL conversion, please upload a CSV file. π")
        # Read CSV with latin1 encoding
        df = pd.read_csv(BytesIO(file_bytes), encoding='latin1')
        output_file = "metadata.jsonl"
        lines = []
        for _, row in df.iterrows():
            # Rows of an all-numeric frame hold NumPy scalars, which
            # json.dumps rejects; convert them to built-in types first.
            data = {column: _to_jsonable(row[column]) for column in df.columns}
            row_data = {
                "file_name": data.get("file_name"),
                "ground_truth": json.dumps(data),
            }
            lines.append(json.dumps(row_data) + '\n')
        # Serialize everything before opening the file so a failure does not
        # leave a truncated output behind; write mode overwrites old data.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(lines)
        converted_format = "JSONL"

    elif conversion_type == "Parquet to JSONL":
        # Use uploaded file if available; otherwise try the provided URL
        url = parquet_url.strip() if parquet_url else ""
        if input_file is not None:
            df = pd.read_parquet(BytesIO(file_bytes))
        elif url:
            # NOTE(review): the URL is user-supplied and fetched server-side;
            # restrict allowed hosts if this runs on a private network.
            response = requests.get(url, timeout=_URL_TIMEOUT_SECONDS)
            response.raise_for_status()  # Ensure the request was successful
            df = pd.read_parquet(BytesIO(response.content))
            file_name = "from_url.parquet"
        else:
            raise ValueError("For Parquet to JSONL conversion, please upload a file or provide a URL. π")
        output_file = "output.jsonl"
        records = df.to_dict(orient="records")
        with open(output_file, "w", encoding="utf-8") as f:
            for record in records:
                sanitized_record = _to_jsonable(record)
                f.write(json.dumps(sanitized_record, ensure_ascii=False) + "\n")
        converted_format = "JSONL"

    else:
        raise ValueError("Invalid conversion type selected. β οΈ")

    # Every conversion previews the same thing: the first 10 input rows.
    preview_str = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {file_name if file_name is not None else 'N/A'}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview_str}\n\n"
        "Community: https://discord.gg/openfreeai π"
    )
    return output_file, info_message
# Custom CSS for a modern and sleek look.
# Passed to gr.Blocks(css=...) below; the ".gradio-button" rules apply to the
# convert button through its elem_classes.
custom_css = """
body {
    background-color: #f4f4f4;
    font-family: 'Helvetica Neue', Arial, sans-serif;
}
.gradio-container {
    max-width: 900px;
    margin: 40px auto;
    padding: 20px;
    background-color: #ffffff;
    border-radius: 12px;
    box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}
h1, h2 {
    color: #333333;
}
.gradio-input, .gradio-output {
    margin-bottom: 20px;
}
.gradio-button {
    background-color: #4CAF50 !important;
    color: white !important;
    border: none !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    border-radius: 6px !important;
    cursor: pointer;
}
.gradio-button:hover {
    background-color: #45a049 !important;
}
"""
# Build the UI: inputs on top (file upload + conversion type, optional URL),
# a convert button, then the converted file and a text preview.
# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed from the `with` statements — confirm against the deployed app.
with gr.Blocks(css=custom_css, title="Datasets Convertor") as demo:
    gr.Markdown("# Datasets Convertor π")
    gr.Markdown(
        "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL conversion) "
        "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows. β¨"
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(label="Upload CSV or Parquet File π")
        with gr.Column(scale=1):
            conversion_type = gr.Radio(
                choices=["CSV to Parquet", "Parquet to CSV", "CSV to JSONL", "Parquet to JSONL"],
                label="Conversion Type π",
            )

    # Optional URL input for Parquet to JSONL conversion
    parquet_url = gr.Textbox(
        label="Parquet File URL (Optional) π",
        placeholder="Enter URL if not uploading a file",
    )

    convert_button = gr.Button("Convert β‘", elem_classes=["gradio-button"])

    with gr.Row():
        output_file = gr.File(label="Converted File πΎ")
        preview = gr.Textbox(label="Preview (Top 10 Rows) π", lines=15)

    # Inputs are passed positionally to dataset_converter; its two return
    # values fill the file download and the preview box, in that order.
    convert_button.click(
        fn=dataset_converter,
        inputs=[input_file, conversion_type, parquet_url],
        outputs=[output_file, preview],
    )

    gr.Markdown("**Join our Community:** [https://discord.gg/openfreeai](https://discord.gg/openfreeai) π€")

demo.launch()