Datasets-Convertor

Running

App Files Files Community

openfree committed 9 days ago

Commit

0df8fba

verified ·

1 Parent(s): 90f89f0

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -37

app.py CHANGED Viewed

@@ -3,62 +3,96 @@ import pandas as pd
 import requests
 from io import BytesIO
-def convert_hf_dataset(file_url: str):
-    file_url = file_url.strip()
-    # Check that the URL is from Hugging Face
-    if "huggingface.co" not in file_url:
-        raise ValueError("Please provide a URL from Hugging Face datasets.")
-    # Ensure the URL has a scheme; if not, add "https://"
-    if not file_url.lower().startswith(("http://", "https://")):
-        file_url = "https://" + file_url
-    # Download the content from the URL
-    response = requests.get(file_url)
-    response.raise_for_status()
-    content = response.content
-    # Determine file type from URL extension and convert accordingly
-    if file_url.lower().endswith(".csv"):
-        # If it's a CSV, read it and convert to Parquet
-        df = pd.read_csv(BytesIO(content))
-        output_file = "output.parquet"
         df.to_parquet(output_file, index=False)
-        converted_format = "Parquet"
-    elif file_url.lower().endswith(".parquet"):
-        # If it's a Parquet file, read it and convert to CSV
-        df = pd.read_parquet(BytesIO(content))
-        output_file = "output.csv"
-        df.to_csv(output_file, index=False)
-        converted_format = "CSV"
     else:
-        raise ValueError("The URL must point to a .csv or .parquet file.")
-    # Create a preview of the top 10 rows
     preview = df.head(10).to_string(index=False)
     info_message = (
-        f"Input file: {file_url.split('/')[-1]}\n"
         f"Converted file format: {converted_format}\n\n"
         f"Preview (Top 10 Rows):\n{preview}"
     )
     return output_file, info_message
 demo = gr.Interface(
     fn=convert_hf_dataset,
-    inputs=gr.Textbox(
-        label="Hugging Face Dataset URL",
-        placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv"
-    ),
     outputs=[
         gr.File(label="Converted File"),
         gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
     ],
     title="Hugging Face CSV <-> Parquet Converter",
     description=(
-        "Enter the URL of a Hugging Face dataset file (must end with .csv or .parquet). "
-        "The app will automatically detect the file type, convert it to the opposite format, "
-        "and display a preview of the top 10 rows."
     )
 )

 import requests
 from io import BytesIO
def convert_hf_dataset(input_file, file_url):
    """Convert a CSV file to Parquet, or a Parquet file to CSV.

    The input is either an uploaded file or a file hosted on huggingface.co.
    When both are supplied, the upload wins and the URL is ignored. The
    direction of the conversion is chosen from the file extension.

    Args:
        input_file: The upload, or None. May be a filesystem path (str or
            os.PathLike) or an object exposing the path as ``name`` and,
            optionally, a ``read()`` method returning the raw bytes.
        file_url: URL of a ``.csv`` or ``.parquet`` file on huggingface.co
            (the scheme may be omitted), or None / empty when uploading.

    Returns:
        A tuple ``(output_file, info_message)``: the path of the converted
        file ("output.parquet" or "output.csv") and a text summary holding
        the input name, the target format and the first 10 rows.

    Raises:
        ValueError: If neither input is given, the URL host is not
            huggingface.co (or a subdomain of it), or the extension is not
            ``.csv`` / ``.parquet``.
        requests.RequestException: If the download fails, times out or
            returns an HTTP error status.
    """
    import os
    from urllib.parse import urlparse

    supported = (".csv", ".parquet")
    has_url = bool(file_url and file_url.strip())

    if input_file is None and not has_url:
        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

    if input_file is not None:
        # Uploads may arrive as a plain path or as a file-like wrapper that
        # carries the path in `.name`; accept both.
        if isinstance(input_file, (str, os.PathLike)):
            path = os.fspath(input_file)
            reader = None
        else:
            path = input_file.name
            reader = getattr(input_file, "read", None)

        # Show only the file name, matching what the URL branch reports.
        source = os.path.basename(path)
        extension = os.path.splitext(source)[1].lower()
        if extension not in supported:
            raise ValueError("Uploaded file must have a .csv or .parquet extension.")

        if callable(reader):
            content = reader()
        else:
            with open(path, "rb") as handle:
                content = handle.read()
    else:
        url = file_url.strip()
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url

        # Validate the real host: a substring test would also accept
        # "https://evil.example/huggingface.co/x.csv".
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
        if host != "huggingface.co" and not host.endswith(".huggingface.co"):
            raise ValueError("Please provide a URL from Hugging Face datasets.")

        # Take the extension from the path so that query strings such as
        # "?download=true" do not hide it, and reject unsupported files
        # before spending time on the download.
        source = parsed.path.rsplit("/", 1)[-1]
        extension = os.path.splitext(source)[1].lower()
        if extension not in supported:
            raise ValueError("The URL must point to a .csv or .parquet file.")

        # A timeout keeps a stalled server from blocking the worker forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        content = response.content

    # Convert the file: if CSV, convert to Parquet; if Parquet, convert to CSV.
    if extension == ".csv":
        df = pd.read_csv(BytesIO(content))
        converted_format = "Parquet"
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
    else:
        df = pd.read_parquet(BytesIO(content))
        converted_format = "CSV"
        output_file = "output.csv"
        df.to_csv(output_file, index=False)

    # Create a preview (top 10 rows) of the DataFrame.
    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {source}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message
 demo = gr.Interface(
     fn=convert_hf_dataset,
+    inputs=[
+        gr.File(label="Uploaded File (Optional)"),
+        gr.Textbox(
+            label="Hugging Face Dataset URL (Optional)",
+            placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv"
+        )
+    ],
     outputs=[
         gr.File(label="Converted File"),
         gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
     ],
     title="Hugging Face CSV <-> Parquet Converter",
     description=(
+        "Upload a file or enter the URL of a Hugging Face dataset file. "
+        "The app automatically detects the file type (.csv or .parquet), converts it to the opposite format, "
+        "and displays a preview of the top 10 rows."
     )
 )