import yaml
import json
from sklearn.model_selection import train_test_split

# Split a categorized YAML dataset into validation and test JSON files.
#
# The input is expected to be a YAML sequence of mappings, each carrying a
# "category" key. Items whose category is one of CATEGORIES are split in half
# per category, so both output files keep the same category mix. Items in any
# other category are left out of both output files.

INPUT_PATH = "data/data.yaml"
VALIDATION_PATH = "validation.json"
TEST_PATH = "test.json"

# Order matters: the output files list items category by category in this order.
CATEGORIES = ("easy", "medium", "hard")

# Fraction of each category that goes to the test set, and the shuffle seed
# that makes the split reproducible between runs.
TEST_SIZE = 0.5
RANDOM_STATE = 42


def group_by_category(data: list) -> dict:
    """Bucket the items of *data* by their "category" value in a single pass.

    Args:
        data: Iterable of mappings, each with a "category" key.

    Returns:
        A dict with one list per name in CATEGORIES (possibly empty), each
        preserving the input order. Items in other categories are skipped.

    Raises:
        KeyError: If an item has no "category" key.
    """
    groups = {name: [] for name in CATEGORIES}
    for item in data:
        category = item["category"]
        # Tuple membership compares with ==, so unhashable category values
        # are skipped rather than raising.
        if category in CATEGORIES:
            groups[category].append(item)
    return groups


def split_category(items: list) -> tuple:
    """Split one category's items into ``(validation, test)`` lists.

    Categories with fewer than two items cannot be handed to
    ``train_test_split`` (it raises ValueError when one side would be empty),
    so they are handled here: an empty category yields two empty lists, and a
    single item goes to the test set, matching scikit-learn's rounding where
    the test side receives the larger share of an odd-sized split.

    Args:
        items: The items belonging to a single category.

    Returns:
        A ``(validation, test)`` tuple of lists.
    """
    if len(items) < 2:
        return [], list(items)
    validation, test = train_test_split(
        items, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    return validation, test


def main() -> None:
    """Read INPUT_PATH, split every category, and write both JSON files."""
    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere.
    with open(INPUT_PATH, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty document; treat that as no items.
        data = yaml.safe_load(f) or []

    groups = group_by_category(data)

    validation = []
    test = []
    for name in CATEGORIES:
        val_part, test_part = split_category(groups[name])
        validation += val_part
        test += test_part

    with open(VALIDATION_PATH, "w", encoding="utf-8") as f:
        json.dump(validation, f)

    with open(TEST_PATH, "w", encoding="utf-8") as f:
        json.dump(test, f)


if __name__ == "__main__":
    main()