taesiri committed
Commit 227a2de · 1 Parent(s): b84aa2b
Files changed (3)
  1. app.py +892 -0
  2. requirements.txt +1 -0
  3. utils.py +140 -0
app.py ADDED
@@ -0,0 +1,892 @@
import gradio as gr
import base64
import json
import os
import shutil
import uuid
import glob
from huggingface_hub import CommitScheduler, HfApi, snapshot_download
from pathlib import Path
import git
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature
import threading
import time
from utils import process_and_push_dataset

api = HfApi(token=os.environ["HF_TOKEN"])
DATASET_REPO = "taesiri/ZB"
FINAL_REPO = "taesiri/DatasetOfHardQuestions5"


# Download existing data from hub
def sync_with_hub():
    """
    Synchronize local data with the hub by cloning the dataset repo
    """
    print("Starting sync with hub...")
    data_dir = Path("./data")
    if data_dir.exists():
        # Backup existing data
        backup_dir = Path("./data_backup")
        if backup_dir.exists():
            shutil.rmtree(backup_dir)
        shutil.copytree(data_dir, backup_dir)

    # Clone/pull latest data from hub
    repo_url = f"https://huggingface.co/datasets/{DATASET_REPO}"
    hub_data_dir = Path("hub_data")

    if hub_data_dir.exists():
        # If repo exists, do a git pull
        print("Pulling latest changes...")
        repo = git.Repo(hub_data_dir)
        origin = repo.remotes.origin
        origin.pull()
    else:
        # Clone the repo
        print("Cloning repository...")
        git.Repo.clone_from(repo_url, hub_data_dir)

    # Merge hub data with local data
    hub_data_source = hub_data_dir / "data"
    if hub_data_source.exists():
        # Create data dir if it doesn't exist
        data_dir.mkdir(exist_ok=True)

        # Copy files from hub
        for item in hub_data_source.glob("*"):
            if item.is_dir():
                dest = data_dir / item.name
                if not dest.exists():  # Only copy if doesn't exist locally
                    shutil.copytree(item, dest)

    # Clean up cloned repo
    if hub_data_dir.exists():
        shutil.rmtree(hub_data_dir)
    print("Finished syncing with hub!")


scheduler = CommitScheduler(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    folder_path="./data",
    path_in_repo="data",
    every=1,
)


def load_existing_questions():
    """
    Load all existing questions from the data directory
    Returns a list of tuples (question_id, question_preview)
    """
    questions = []
    data_dir = "./data"
    if not os.path.exists(data_dir):
        return questions

    for question_dir in glob.glob(os.path.join(data_dir, "*")):
        if os.path.isdir(question_dir):
            json_path = os.path.join(question_dir, "question.json")
            if os.path.exists(json_path):
                try:
                    with open(json_path, "r", encoding="utf-8") as f:
                        data = json.loads(f.read().strip())
                    question_id = os.path.basename(question_dir)
                    preview = (
                        f"{data['question'][:100]}..."
                        if len(data["question"]) > 100
                        else data["question"]
                    )
                    questions.append((question_id, f"{question_id}: {preview}"))
                except Exception:
                    continue

    return sorted(questions, key=lambda x: x[1])


def load_question_data(question_id):
    """
    Load a specific question's data
    Returns a tuple of all form fields
    """
    if not question_id:
        return [None] * 24 + [None]  # 24 form fields + question ID (plain None instead of gr.State)

    # Extract the ID part before the colon from the dropdown selection
    question_id = (
        question_id.split(":")[0].strip() if ":" in question_id else question_id
    )

    json_path = os.path.join("./data", question_id, "question.json")
    if not os.path.exists(json_path):
        print(f"Question file not found: {json_path}")
        return [None] * 24 + [None]

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.loads(f.read().strip())

        # Load images
        def load_image(image_path):
            if not image_path:
                return None
            full_path = os.path.join(
                "./data", question_id, os.path.basename(image_path)
            )
            return full_path if os.path.exists(full_path) else None

        question_images = data.get("question_images", [])
        rationale_images = data.get("rationale_images", [])

        return [
            data["author_info"]["name"],
            data["author_info"]["email_address"],
            data["author_info"]["institution"],
            (
                ",".join(data["question_categories"])
                if isinstance(data["question_categories"], list)
                else data["question_categories"]
            ),
            data.get("subquestions_1_text", "N/A"),
            data.get("subquestions_1_answer", "N/A"),
            data.get("subquestions_2_text", "N/A"),
            data.get("subquestions_2_answer", "N/A"),
            data.get("subquestions_3_text", "N/A"),
            data.get("subquestions_3_answer", "N/A"),
            data.get("subquestions_4_text", "N/A"),
            data.get("subquestions_4_answer", "N/A"),
            data.get("subquestions_5_text", "N/A"),
            data.get("subquestions_5_answer", "N/A"),
            data["question"],
            data["final_answer"],
            data.get("rationale_text", ""),
            data["image_attribution"],
            load_image(question_images[0] if question_images else None),
            load_image(question_images[1] if len(question_images) > 1 else None),
            load_image(question_images[2] if len(question_images) > 2 else None),
            load_image(question_images[3] if len(question_images) > 3 else None),
            load_image(rationale_images[0] if rationale_images else None),
            load_image(rationale_images[1] if len(rationale_images) > 1 else None),
            question_id,  # Changed from gr.State(value=question_id) to just question_id
        ]
    except Exception as e:
        print(f"Error loading question {question_id}: {str(e)}")
        return [None] * 24 + [None]


def generate_json_files(
    name,
    email_address,
    institution,
    question_categories,
    subquestion_1_text,
    subquestion_1_answer,
    subquestion_2_text,
    subquestion_2_answer,
    subquestion_3_text,
    subquestion_3_answer,
    subquestion_4_text,
    subquestion_4_answer,
    subquestion_5_text,
    subquestion_5_answer,
    question,
    final_answer,
    rationale_text,
    image_attribution,
    image1,
    image2,
    image3,
    image4,
    rationale_image1,
    rationale_image2,
    existing_id=None,  # New parameter for updating existing questions
):
    """
    For each request:
    1) Create a unique folder under ./data/ (or use existing if updating)
    2) Copy uploaded images (question + rationale) into that folder
    3) Produce JSON file with question data
    4) Return path to the JSON file
    """

    # Use existing ID if updating, otherwise generate new one
    request_id = existing_id if existing_id else str(uuid.uuid4())

    # Create parent data folder if it doesn't exist
    parent_data_folder = "./data"
    os.makedirs(parent_data_folder, exist_ok=True)

    # Create or clean request folder
    request_folder = os.path.join(parent_data_folder, request_id)
    if os.path.exists(request_folder):
        # If updating, remove old image files but only if new images are provided
        for f in glob.glob(os.path.join(request_folder, "*.png")):
            # Only remove if we have a new image to replace it
            filename = os.path.basename(f)
            if (
                ("question_image_1" in filename and image1)
                or ("question_image_2" in filename and image2)
                or ("question_image_3" in filename and image3)
                or ("question_image_4" in filename and image4)
                or ("rationale_image_1" in filename and rationale_image1)
                or ("rationale_image_2" in filename and rationale_image2)
            ):
                os.remove(f)
    else:
        os.makedirs(request_folder)

    # Convert None strings
    def safe_str(val):
        return val if val is not None else ""

    name = safe_str(name)
    email_address = safe_str(email_address)
    institution = safe_str(institution)
    image_attribution = safe_str(image_attribution)
    # Convert question_categories to list
    question_categories = (
        [cat.strip() for cat in safe_str(question_categories).split(",")]
        if question_categories
        else []
    )
    subquestion_1_text = safe_str(subquestion_1_text)
    subquestion_1_answer = safe_str(subquestion_1_answer)
    subquestion_2_text = safe_str(subquestion_2_text)
    subquestion_2_answer = safe_str(subquestion_2_answer)
    subquestion_3_text = safe_str(subquestion_3_text)
    subquestion_3_answer = safe_str(subquestion_3_answer)
    subquestion_4_text = safe_str(subquestion_4_text)
    subquestion_4_answer = safe_str(subquestion_4_answer)
    subquestion_5_text = safe_str(subquestion_5_text)
    subquestion_5_answer = safe_str(subquestion_5_answer)
    question = safe_str(question)
    final_answer = safe_str(final_answer)
    rationale_text = safe_str(rationale_text)

    # Collect image-like fields so we can process them in one loop
    all_images = [
        ("question_image_1", image1),
        ("question_image_2", image2),
        ("question_image_3", image3),
        ("question_image_4", image4),
        ("rationale_image_1", rationale_image1),
        ("rationale_image_2", rationale_image2),
    ]

    # If updating, load existing images that haven't been replaced
    if existing_id:
        json_path = os.path.join(parent_data_folder, existing_id, "question.json")
        if os.path.exists(json_path):
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    existing_data = json.loads(f.read().strip())
                existing_question_images = existing_data.get("question_images", [])
                existing_rationale_images = existing_data.get(
                    "rationale_images", []
                )

                # Keep existing images if no new ones provided
                if not image1 and existing_question_images:
                    all_images[0] = (
                        "question_image_1",
                        existing_question_images[0],
                    )
                if not image2 and len(existing_question_images) > 1:
                    all_images[1] = (
                        "question_image_2",
                        existing_question_images[1],
                    )
                if not image3 and len(existing_question_images) > 2:
                    all_images[2] = (
                        "question_image_3",
                        existing_question_images[2],
                    )
                if not image4 and len(existing_question_images) > 3:
                    all_images[3] = (
                        "question_image_4",
                        existing_question_images[3],
                    )
                if not rationale_image1 and existing_rationale_images:
                    all_images[4] = (
                        "rationale_image_1",
                        existing_rationale_images[0],
                    )
                if not rationale_image2 and len(existing_rationale_images) > 1:
                    all_images[5] = (
                        "rationale_image_2",
                        existing_rationale_images[1],
                    )
            except Exception:
                pass

    files_list = []
    for idx, (img_label, img_obj) in enumerate(all_images):
        if img_obj is not None:
            temp_path = os.path.join(request_folder, f"{img_label}.png")
            if isinstance(img_obj, str):
                # If image is a file path
                if os.path.exists(img_obj):
                    if (
                        img_obj != temp_path
                    ):  # Only copy if source and destination are different
                        shutil.copy2(img_obj, temp_path)
                    files_list.append((img_label, temp_path))
            else:
                # If image is a numpy array
                gr.processing_utils.save_image(img_obj, temp_path)
                files_list.append((img_label, temp_path))

    # Build the user content, referencing images by local file path
    # We'll store text fields as simple dictionaries, and then images separately.
    content_list_urls = [
        {"type": "field", "label": "name", "value": name},
        {"type": "field", "label": "email_address", "value": email_address},
        {"type": "field", "label": "institution", "value": institution},
        {"type": "field", "label": "question_categories", "value": question_categories},
        {"type": "field", "label": "image_attribution", "value": image_attribution},
        {"type": "field", "label": "subquestion_1_text", "value": subquestion_1_text},
        {
            "type": "field",
            "label": "subquestion_1_answer",
            "value": subquestion_1_answer,
        },
        {"type": "field", "label": "subquestion_2_text", "value": subquestion_2_text},
        {
            "type": "field",
            "label": "subquestion_2_answer",
            "value": subquestion_2_answer,
        },
        {"type": "field", "label": "subquestion_3_text", "value": subquestion_3_text},
        {
            "type": "field",
            "label": "subquestion_3_answer",
            "value": subquestion_3_answer,
        },
        {"type": "field", "label": "subquestion_4_text", "value": subquestion_4_text},
        {
            "type": "field",
            "label": "subquestion_4_answer",
            "value": subquestion_4_answer,
        },
        {"type": "field", "label": "subquestion_5_text", "value": subquestion_5_text},
        {
            "type": "field",
            "label": "subquestion_5_answer",
            "value": subquestion_5_answer,
        },
        {"type": "field", "label": "question", "value": question},
        {"type": "field", "label": "final_answer", "value": final_answer},
        {"type": "field", "label": "rationale_text", "value": rationale_text},
    ]

    # Append image references
    for img_label, file_path in files_list:
        # 1) Local path (URL) version
        rel_path = os.path.join(".", os.path.basename(file_path))
        content_list_urls.append(
            {
                "type": "image_url",
                "label": img_label,
                "image_url": {"url": {"data:image/png;path": rel_path}},
            }
        )

    # Build the final JSON structure
    # URLs (local file path) version
    item_urls = {
        "custom_id": f"question___{request_id}",
        # Metadata at top level
        "author_info": {
            "name": name,
            "email_address": email_address,
            "institution": institution,
        },
        "question_categories": question_categories,
        "image_attribution": image_attribution,
        "question": question,
        "question_images": [
            item["image_url"]["url"]["data:image/png;path"]
            for item in content_list_urls
            if item.get("type") == "image_url"
            and "question_image" in item.get("label", "")
        ],
        "final_answer": final_answer,
        "rationale_text": rationale_text,
        "rationale_images": [
            item["image_url"]["url"]["data:image/png;path"]
            for item in content_list_urls
            if item.get("type") == "image_url"
            and "rationale_image" in item.get("label", "")
        ],
        "subquestions_1_text": subquestion_1_text,
        "subquestions_1_answer": subquestion_1_answer,
        "subquestions_2_text": subquestion_2_text,
        "subquestions_2_answer": subquestion_2_answer,
        "subquestions_3_text": subquestion_3_text,
        "subquestions_3_answer": subquestion_3_answer,
        "subquestions_4_text": subquestion_4_text,
        "subquestions_4_answer": subquestion_4_answer,
        "subquestions_5_text": subquestion_5_text,
        "subquestions_5_answer": subquestion_5_answer,
    }

    # Convert to JSON line format
    urls_json_line = json.dumps(item_urls, ensure_ascii=False)

    # 3) Write out JSON file in request_folder
    urls_jsonl_path = os.path.join(request_folder, "question.json")

    with open(urls_jsonl_path, "w", encoding="utf-8") as f:
        f.write(urls_json_line + "\n")

    return urls_jsonl_path


# Build the Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# BugsBunny Eval Builder")
    # Add a global state variable at the top level
    loaded_question_id = gr.State()

    with gr.Accordion("Instructions", open=True):
        gr.HTML(
            """
            <h3>Instructions:</h3>
            <p>Welcome to the Hugging Face space for collecting questions for the BugsBunny benchmark.</p>
            TBA
            """
        )
    gr.Markdown("## Author Information")
    with gr.Row():
        name_input = gr.Textbox(label="Name", lines=1)
        email_address_input = gr.Textbox(label="Email Address", lines=1)
        institution_input = gr.Textbox(
            label="Institution or 'Independent'",
            lines=1,
            placeholder="e.g. MIT, Google, Independent, etc.",
        )

    gr.Markdown("## Question Information")

    # image
    gr.Markdown("### Images Attribution")
    image_attribution_input = gr.Textbox(
        label="Images Attribution",
        lines=1,
        placeholder="Include attribution information for the images used in this question (or 'Own' if you created/took them)",
    )

    # Question Images - Individual Tabs
    with gr.Tabs():
        with gr.Tab("Image 1"):
            image1 = gr.Image(label="Question Image 1", type="filepath")
        with gr.Tab("Image 2 (Optional)"):
            image2 = gr.Image(label="Question Image 2", type="filepath")
        with gr.Tab("Image 3 (Optional)"):
            image3 = gr.Image(label="Question Image 3", type="filepath")
        with gr.Tab("Image 4 (Optional)"):
            image4 = gr.Image(label="Question Image 4", type="filepath")

    question_input = gr.Textbox(
        label="Question", lines=15, placeholder="Type your question here..."
    )

    question_categories_input = gr.Textbox(
        label="Question Categories",
        lines=1,
        placeholder="Comma-separated tags, e.g. math, geometry",
    )

    # Answer Section
    gr.Markdown("## Answer ")

    final_answer_input = gr.Textbox(
        label="Final Answer",
        lines=1,
        placeholder="Enter the short/concise final answer...",
    )

    rationale_text_input = gr.Textbox(
        label="Rationale Text",
        lines=5,
        placeholder="Enter the reasoning or explanation for the answer...",
    )

    # Rationale Images - Individual Tabs
    with gr.Tabs():
        with gr.Tab("Rationale 1 (Optional)"):
            rationale_image1 = gr.Image(label="Rationale Image 1", type="filepath")
        with gr.Tab("Rationale 2 (Optional)"):
            rationale_image2 = gr.Image(label="Rationale Image 2", type="filepath")

    # Subquestions Section
    gr.Markdown("## Subquestions")
    with gr.Row():
        subquestion_1_text_input = gr.Textbox(
            label="Subquestion 1 Text",
            lines=2,
            placeholder="First sub-question...",
            value="N/A",
        )
        subquestion_1_answer_input = gr.Textbox(
            label="Subquestion 1 Answer",
            lines=2,
            placeholder="Answer to sub-question 1...",
            value="N/A",
        )

    with gr.Row():
        subquestion_2_text_input = gr.Textbox(
            label="Subquestion 2 Text",
            lines=2,
            placeholder="Second sub-question...",
            value="N/A",
        )
        subquestion_2_answer_input = gr.Textbox(
            label="Subquestion 2 Answer",
            lines=2,
            placeholder="Answer to sub-question 2...",
            value="N/A",
        )

    with gr.Row():
        subquestion_3_text_input = gr.Textbox(
            label="Subquestion 3 Text",
            lines=2,
            placeholder="Third sub-question...",
            value="N/A",
        )
        subquestion_3_answer_input = gr.Textbox(
            label="Subquestion 3 Answer",
            lines=2,
            placeholder="Answer to sub-question 3...",
            value="N/A",
        )

    with gr.Row():
        subquestion_4_text_input = gr.Textbox(
            label="Subquestion 4 Text",
            lines=2,
            placeholder="Fourth sub-question...",
            value="N/A",
        )
        subquestion_4_answer_input = gr.Textbox(
            label="Subquestion 4 Answer",
            lines=2,
            placeholder="Answer to sub-question 4...",
            value="N/A",
        )

    with gr.Row():
        subquestion_5_text_input = gr.Textbox(
            label="Subquestion 5 Text",
            lines=2,
            placeholder="Fifth sub-question...",
            value="N/A",
        )
        subquestion_5_answer_input = gr.Textbox(
            label="Subquestion 5 Answer",
            lines=2,
            placeholder="Answer to sub-question 5...",
            value="N/A",
        )

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Form")

    with gr.Row():
        output_file_urls = gr.File(
            label="Download URLs JSON", interactive=False, visible=False
        )
        output_file_base64 = gr.File(
            label="Download Base64 JSON", interactive=False, visible=False
        )

    with gr.Accordion("Load Existing Question", open=False):
        gr.Markdown("## Load Existing Question")

        with gr.Row():
            existing_questions = gr.Dropdown(
                label="Load Existing Question",
                choices=load_existing_questions(),
                type="value",
                allow_custom_value=False,
            )
            refresh_button = gr.Button("🔄 Refresh")
            load_button = gr.Button("Load Selected Question")

    def refresh_questions():
        return gr.Dropdown(choices=load_existing_questions())

    refresh_button.click(fn=refresh_questions, inputs=[], outputs=[existing_questions])

    # Load button functionality
    load_button.click(
        fn=load_question_data,
        inputs=[existing_questions],
        outputs=[
            name_input,
            email_address_input,
            institution_input,
            question_categories_input,
            subquestion_1_text_input,
            subquestion_1_answer_input,
            subquestion_2_text_input,
            subquestion_2_answer_input,
            subquestion_3_text_input,
            subquestion_3_answer_input,
            subquestion_4_text_input,
            subquestion_4_answer_input,
            subquestion_5_text_input,
            subquestion_5_answer_input,
            question_input,
            final_answer_input,
            rationale_text_input,
            image_attribution_input,
            image1,
            image2,
            image3,
            image4,
            rationale_image1,
            rationale_image2,
            loaded_question_id,
        ],
    )

    # Modify validate_and_generate to handle updates
    def validate_and_generate(
        nm,
        em,
        inst,
        qcats,
        sq1t,
        sq1a,
        sq2t,
        sq2a,
        sq3t,
        sq3a,
        sq4t,
        sq4a,
        sq5t,
        sq5a,
        q,
        fa,
        rt,
        ia,
        i1,
        i2,
        i3,
        i4,
        ri1,
        ri2,
        stored_question_id,  # Add this parameter
    ):
        # Validation code remains the same
        missing_fields = []
        if not nm or not nm.strip():
            missing_fields.append("Name")
        if not em or not em.strip():
            missing_fields.append("Email Address")
        if not inst or not inst.strip():
            missing_fields.append("Institution")
        if not q or not q.strip():
            missing_fields.append("Question")
        if not fa or not fa.strip():
            missing_fields.append("Final Answer")
        if not i1:
            missing_fields.append("First Question Image")
        if not ia or not ia.strip():
            missing_fields.append("Image Attribution")
        if not sq1t or not sq1t.strip() or not sq1a or not sq1a.strip():
            missing_fields.append("First Sub-question and Answer")
        if not sq2t or not sq2t.strip() or not sq2a or not sq2a.strip():
            missing_fields.append("Second Sub-question and Answer")
        if not sq3t or not sq3t.strip() or not sq3a or not sq3a.strip():
            missing_fields.append("Third Sub-question and Answer")
        if not sq4t or not sq4t.strip() or not sq4a or not sq4a.strip():
            missing_fields.append("Fourth Sub-question and Answer")
        if not sq5t or not sq5t.strip() or not sq5a or not sq5a.strip():
            missing_fields.append("Fifth Sub-question and Answer")

        if missing_fields:
            warning_msg = f"Required fields missing: {', '.join(missing_fields)} ⛔️"
            gr.Warning(warning_msg, duration=5)
            return gr.Button(interactive=True), gr.Dropdown(
                choices=load_existing_questions()
            )

        # Use the stored ID instead of extracting from dropdown
        existing_id = stored_question_id if stored_question_id else None

        results = generate_json_files(
            nm,
            em,
            inst,
            qcats,
            sq1t,
            sq1a,
            sq2t,
            sq2a,
            sq3t,
            sq3a,
            sq4t,
            sq4a,
            sq5t,
            sq5a,
            q,
            fa,
            rt,
            ia,
            i1,
            i2,
            i3,
            i4,
            ri1,
            ri2,
            existing_id,
        )

        action = "updated" if existing_id else "created"
        gr.Info(
            f"Dataset item {action} successfully! 🎉 Clear the form to submit a new one"
        )

        return gr.update(interactive=False), gr.Dropdown(
            choices=load_existing_questions()
        )

    # Update submit button click handler to match inputs/outputs correctly
    submit_button.click(
        fn=validate_and_generate,
        inputs=[
            name_input,
            email_address_input,
            institution_input,
            question_categories_input,
            subquestion_1_text_input,
            subquestion_1_answer_input,
            subquestion_2_text_input,
            subquestion_2_answer_input,
            subquestion_3_text_input,
            subquestion_3_answer_input,
            subquestion_4_text_input,
            subquestion_4_answer_input,
            subquestion_5_text_input,
            subquestion_5_answer_input,
            question_input,
            final_answer_input,
            rationale_text_input,
            image_attribution_input,
            image1,
            image2,
            image3,
            image4,
            rationale_image1,
            rationale_image2,
            loaded_question_id,
        ],
        outputs=[submit_button, existing_questions],
    )

    # Fix the clear_form_fields function
    def clear_form_fields(name, email, inst, *args):
        outputs = [
            name,  # Preserve name
            email,  # Preserve email
            inst,  # Preserve institution
            gr.update(value=""),  # Clear question categories
            gr.update(value="N/A"),  # Reset subquestion 1 text to N/A
            gr.update(value="N/A"),  # Reset subquestion 1 answer to N/A
            gr.update(value="N/A"),  # Reset subquestion 2 text to N/A
            gr.update(value="N/A"),  # Reset subquestion 2 answer to N/A
            gr.update(value="N/A"),  # Reset subquestion 3 text to N/A
            gr.update(value="N/A"),  # Reset subquestion 3 answer to N/A
            gr.update(value="N/A"),  # Reset subquestion 4 text to N/A
            gr.update(value="N/A"),  # Reset subquestion 4 answer to N/A
            gr.update(value="N/A"),  # Reset subquestion 5 text to N/A
            gr.update(value="N/A"),  # Reset subquestion 5 answer to N/A
            gr.update(value=""),  # Clear question
            gr.update(value=""),  # Clear final answer
            gr.update(value=""),  # Clear rationale text
            gr.update(value=""),  # Clear image attribution
            None,  # Clear image1
            None,  # Clear image2
            None,  # Clear image3
            None,  # Clear image4
            None,  # Clear rationale image1
            None,  # Clear rationale image2
            None,  # Clear output file urls
            gr.Button(interactive=True),  # Re-enable submit button
            gr.update(choices=load_existing_questions()),  # Update dropdown
            None,  # Changed from gr.State(value=None) to just None
        ]
        gr.Info("Form cleared! Ready for new submission 🔄")
        return outputs

    # Update the clear button click handler
    clear_button.click(
        fn=clear_form_fields,
        inputs=[
            name_input,
            email_address_input,
            institution_input,
        ],
        outputs=[
            name_input,
            email_address_input,
            institution_input,
            question_categories_input,
            subquestion_1_text_input,
            subquestion_1_answer_input,
            subquestion_2_text_input,
            subquestion_2_answer_input,
            subquestion_3_text_input,
            subquestion_3_answer_input,
            subquestion_4_text_input,
            subquestion_4_answer_input,
            subquestion_5_text_input,
            subquestion_5_answer_input,
            question_input,
            final_answer_input,
            rationale_text_input,
            image_attribution_input,
            image1,
            image2,
            image3,
            image4,
            rationale_image1,
            rationale_image2,
            output_file_urls,
            submit_button,
            existing_questions,
            loaded_question_id,
        ],
    )


def process_thread():
    while True:
        try:
            process_and_push_dataset(
                "./data",
                FINAL_REPO,
                token=os.environ["HF_TOKEN"],
                private=True,
            )
        except Exception as e:
            print(f"Error in process thread: {e}")
        time.sleep(120)  # Sleep for 2 minutes


if __name__ == "__main__":
    print("Initializing app...")
    sync_with_hub()  # Sync before launching the app
    print("Starting Gradio interface...")

    # Start the processing thread when the app starts
    processing_thread = threading.Thread(target=process_thread, daemon=True)
    processing_thread.start()

    demo.launch()
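
A quick way to sanity-check what the app writes: each submission ends up as a single JSON line in ./data/<request_id>/question.json, with the uploaded images copied next to it. This mirrors the layout produced by generate_json_files above; the loop below is only a local inspection sketch, not part of the app itself.

import glob
import json

# Print a short summary of every saved submission under ./data
for path in glob.glob("./data/*/question.json"):
    with open(path, "r", encoding="utf-8") as f:
        record = json.loads(f.read().strip())
    print(record["custom_id"])
    print("  question:", record["question"][:80])
    print("  final answer:", record["final_answer"])
    print("  images:", record.get("question_images", []))
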
requirements.txt ADDED
@@ -0,0 +1 @@
GitPython
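
Note that requirements.txt pins only GitPython, while app.py and utils.py also import gradio, huggingface_hub, datasets, and pandas. On a Gradio Space the SDK image already provides gradio (which itself depends on pandas and huggingface_hub), but datasets likely needs to be declared explicitly, so a fuller file would presumably look something like:

GitPython
datasets
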
utils.py ADDED
@@ -0,0 +1,140 @@
import os
import json
import pandas as pd
from pathlib import Path
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push to Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    # List to store all records
    all_records = []

    # Walk through all subdirectories in data_dir
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    # Read the JSON file
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                    # Get the folder path for this record
                    folder_path = os.path.dirname(file_path)

                    # Fix image paths to include full path
                    if "question_images" in record:
                        record["question_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["question_images"]
                            if img_path
                        ]

                    if "rationale_images" in record:
                        record["rationale_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["rationale_images"]
                            if img_path
                        ]

                    # Flatten author_info dictionary
                    author_info = record.pop("author_info", {})
                    record.update(
                        {f"author_{k}": v for k, v in author_info.items()}
                    )

                    # Add the record
                    all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(all_records)

    # Sort by custom_id for consistency
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Ensure all required columns exist with default values
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }

    for col, default_value in required_columns.items():
        if col not in df.columns:
            # Broadcast the default to every row; also works for list defaults
            df[col] = [default_value] * len(df)

    # Define features
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )

    # Convert DataFrame to dict of lists (Hugging Face Dataset format)
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    # Create Dataset directly from dict
    dataset = Dataset.from_dict(dataset_dict, features=features)

    # Push to hub
    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)

    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")

    return dataset
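
process_and_push_dataset can also be run on its own, outside the background thread in app.py. A minimal sketch, assuming a local ./data folder and using "your-username/your-dataset" as a placeholder repo id:

import os

from utils import process_and_push_dataset

# One-off push of everything under ./data to a (private) dataset repo
dataset = process_and_push_dataset(
    data_dir="./data",
    hub_repo="your-username/your-dataset",  # placeholder repo id
    token=os.environ["HF_TOKEN"],
    private=True,
)
print(dataset)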