openfree committed on
Commit
4089d90
·
verified ·
1 Parent(s): 5322786

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -9
app.py CHANGED
@@ -3,6 +3,27 @@ import pandas as pd
3
  import json
4
  from io import BytesIO
5
  import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def dataset_converter(input_file, conversion_type, parquet_url):
8
  # Initialize variables for file data and extension
@@ -45,7 +66,6 @@ def dataset_converter(input_file, conversion_type, parquet_url):
45
  elif conversion_type == "CSV to JSONL":
46
  if input_file is None or file_extension != "csv":
47
  raise ValueError("For CSV to JSONL conversion, please upload a CSV file. 📄")
48
- # Read CSV with latin1 encoding
49
  df = pd.read_csv(BytesIO(file_bytes), encoding='latin1')
50
  output_file = "metadata.jsonl"
51
  total_data = []
@@ -58,7 +78,6 @@ def dataset_converter(input_file, conversion_type, parquet_url):
58
  data[column] = row[column]
59
  row_data = {"file_name": file_name_val, "ground_truth": json.dumps(data)}
60
  total_data.append(row_data)
61
- # Write JSONL output (using write mode so previous data is overwritten)
62
  with open(output_file, 'w', encoding='utf-8') as f:
63
  for row_data in total_data:
64
  f.write(json.dumps(row_data) + '\n')
@@ -67,19 +86,17 @@ def dataset_converter(input_file, conversion_type, parquet_url):
67
 
68
  # Conversion: Parquet to JSONL
69
  elif conversion_type == "Parquet to JSONL":
70
- # Use uploaded file if available; otherwise try the provided URL
71
  if input_file is not None:
72
  df = pd.read_parquet(BytesIO(file_bytes))
73
  elif parquet_url:
74
  response = requests.get(parquet_url)
75
- response.raise_for_status() # Ensure the request was successful
76
  df = pd.read_parquet(BytesIO(response.content))
77
  file_name = "from_url.parquet"
78
  else:
79
  raise ValueError("For Parquet to JSONL conversion, please upload a file or provide a URL. 🌐")
80
 
81
  output_file = "output.jsonl"
82
- # Recursive function to decode bytes to UTF-8 strings
83
  def recursive_sanitize(val):
84
  if isinstance(val, bytes):
85
  return val.decode("utf-8", errors="replace")
@@ -98,6 +115,29 @@ def dataset_converter(input_file, conversion_type, parquet_url):
98
  converted_format = "JSONL"
99
  preview_str = df.head(10).to_string(index=False)
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  else:
102
  raise ValueError("Invalid conversion type selected. ⚠️")
103
 
@@ -116,7 +156,7 @@ body {
116
  font-family: 'Helvetica Neue', Arial, sans-serif;
117
  }
118
  .gradio-container {
119
- max-width: 900px;
120
  margin: 40px auto;
121
  padding: 20px;
122
  background-color: #ffffff;
@@ -146,7 +186,7 @@ h1, h2 {
146
  with gr.Blocks(css=custom_css, title="Datasets Convertor") as demo:
147
  gr.Markdown("# Datasets Convertor 🚀")
148
  gr.Markdown(
149
- "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL conversion) "
150
  "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows. ✨"
151
  )
152
 
@@ -155,11 +195,10 @@ with gr.Blocks(css=custom_css, title="Datasets Convertor") as demo:
155
  input_file = gr.File(label="Upload CSV or Parquet File 📄")
156
  with gr.Column(scale=1):
157
  conversion_type = gr.Radio(
158
- choices=["CSV to Parquet", "Parquet to CSV", "CSV to JSONL", "Parquet to JSONL"],
159
  label="Conversion Type 🔄"
160
  )
161
 
162
- # Optional URL input for Parquet to JSONL conversion
163
  parquet_url = gr.Textbox(label="Parquet File URL (Optional) 🌐", placeholder="Enter URL if not uploading a file")
164
 
165
  convert_button = gr.Button("Convert ⚡", elem_classes=["gradio-button"])
 
3
  import json
4
  from io import BytesIO
5
  import requests
6
+ import re
7
+ from openpyxl import Workbook
8
+
9
+ def sanitize_value(val):
10
+ """
11
+ Convert complex types to a string and remove illegal characters
12
+ that Excel does not accept.
13
+ """
14
+ if isinstance(val, bytes):
15
+ try:
16
+ s = val.decode("utf-8", errors="replace")
17
+ except Exception:
18
+ s = str(val)
19
+ # Remove control characters (except newline and tab if desired)
20
+ return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', s)
21
+ elif isinstance(val, str):
22
+ return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', val)
23
+ elif isinstance(val, (dict, list)):
24
+ return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', str(val))
25
+ else:
26
+ return val
27
 
28
  def dataset_converter(input_file, conversion_type, parquet_url):
29
  # Initialize variables for file data and extension
 
66
  elif conversion_type == "CSV to JSONL":
67
  if input_file is None or file_extension != "csv":
68
  raise ValueError("For CSV to JSONL conversion, please upload a CSV file. 📄")
 
69
  df = pd.read_csv(BytesIO(file_bytes), encoding='latin1')
70
  output_file = "metadata.jsonl"
71
  total_data = []
 
78
  data[column] = row[column]
79
  row_data = {"file_name": file_name_val, "ground_truth": json.dumps(data)}
80
  total_data.append(row_data)
 
81
  with open(output_file, 'w', encoding='utf-8') as f:
82
  for row_data in total_data:
83
  f.write(json.dumps(row_data) + '\n')
 
86
 
87
  # Conversion: Parquet to JSONL
88
  elif conversion_type == "Parquet to JSONL":
 
89
  if input_file is not None:
90
  df = pd.read_parquet(BytesIO(file_bytes))
91
  elif parquet_url:
92
  response = requests.get(parquet_url)
93
+ response.raise_for_status()
94
  df = pd.read_parquet(BytesIO(response.content))
95
  file_name = "from_url.parquet"
96
  else:
97
  raise ValueError("For Parquet to JSONL conversion, please upload a file or provide a URL. 🌐")
98
 
99
  output_file = "output.jsonl"
 
100
  def recursive_sanitize(val):
101
  if isinstance(val, bytes):
102
  return val.decode("utf-8", errors="replace")
 
115
  converted_format = "JSONL"
116
  preview_str = df.head(10).to_string(index=False)
117
 
118
+ # Conversion: Parquet to XLS
119
+ elif conversion_type == "Parquet to XLS":
120
+ if input_file is not None:
121
+ df = pd.read_parquet(BytesIO(file_bytes))
122
+ elif parquet_url:
123
+ response = requests.get(parquet_url)
124
+ response.raise_for_status()
125
+ df = pd.read_parquet(BytesIO(response.content))
126
+ file_name = "from_url.parquet"
127
+ else:
128
+ raise ValueError("For Parquet to XLS conversion, please upload a file or provide a URL. 🌐")
129
+
130
+ output_file = "output.xlsx"
131
+ wb = Workbook(write_only=True)
132
+ ws = wb.create_sheet()
133
+ ws.append(list(df.columns))
134
+ for row in df.itertuples(index=False, name=None):
135
+ sanitized_row = [sanitize_value(cell) for cell in row]
136
+ ws.append(sanitized_row)
137
+ wb.save(output_file)
138
+ converted_format = "XLS"
139
+ preview_str = df.head(10).to_string(index=False)
140
+
141
  else:
142
  raise ValueError("Invalid conversion type selected. ⚠️")
143
 
 
156
  font-family: 'Helvetica Neue', Arial, sans-serif;
157
  }
158
  .gradio-container {
159
+ max-width: 1000px;
160
  margin: 40px auto;
161
  padding: 20px;
162
  background-color: #ffffff;
 
186
  with gr.Blocks(css=custom_css, title="Datasets Convertor") as demo:
187
  gr.Markdown("# Datasets Convertor 🚀")
188
  gr.Markdown(
189
+ "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL/XLS conversion) "
190
  "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows. ✨"
191
  )
192
 
 
195
  input_file = gr.File(label="Upload CSV or Parquet File 📄")
196
  with gr.Column(scale=1):
197
  conversion_type = gr.Radio(
198
+ choices=["CSV to Parquet", "Parquet to CSV", "CSV to JSONL", "Parquet to JSONL", "Parquet to XLS"],
199
  label="Conversion Type 🔄"
200
  )
201
 
 
202
  parquet_url = gr.Textbox(label="Parquet File URL (Optional) 🌐", placeholder="Enter URL if not uploading a file")
203
 
204
  convert_button = gr.Button("Convert ⚡", elem_classes=["gradio-button"])