"""Peek at the first few examples of a streamed Hugging Face dataset.

Streams the "train" split of the dataset, collects the first
``NUM_EXAMPLES`` records and prints every field of each record. The
``audio`` field is expanded key by key, and binary payloads inside it are
not echoed to the terminal.
"""
import json
from collections.abc import Mapping
from itertools import islice

from datasets import load_dataset

DATASET_NAME = "asierhv/composite_corpus_eu_v2.1"
NUM_EXAMPLES = 3

# Stream the split so the records are read lazily instead of materializing
# the whole dataset first.
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)

# islice stops as soon as NUM_EXAMPLES records were read; an
# enumerate-and-break loop would pull one extra record from the stream
# just to discard it.
examples = list(islice(dataset, NUM_EXAMPLES))

# Print the structure and content of each collected example.
for i, example in enumerate(examples, start=1):
    print(f"\nExample {i}:")
    for key, value in example.items():
        # Only expand the audio field when it really is a mapping. The value
        # may be None or (depending on the installed `datasets` version --
        # TODO confirm) a decoder object without .keys()/.items(); those fall
        # through to the generic branch instead of raising AttributeError.
        if key == "audio" and isinstance(value, Mapping):
            print(f"audio keys: {value.keys()}")
            for audio_key, audio_value in value.items():
                if isinstance(audio_value, (bytes, memoryview)):
                    # Binary payload: print only the key, never the raw bytes.
                    print(f" {audio_key}: ")
                else:
                    print(f" {audio_key}: {audio_value}")
        else:
            print(f"{key}: {value}")