import json
from itertools import islice

from datasets import load_dataset
# Peek at the first few records of the corpus to inspect its schema.
# streaming=True asks `datasets` for an iterable dataset, so records are
# fetched as they are iterated rather than materialising the split up front.
NUM_EXAMPLES = 3
dataset = load_dataset("asierhv/composite_corpus_eu_v2.1", split="train", streaming=True)

# islice stops after exactly NUM_EXAMPLES records; an enumerate/break loop
# would pull one extra record from the stream before noticing the limit.
examples = list(islice(dataset, NUM_EXAMPLES))

# Print the structure and content of each collected example.
for index, example in enumerate(examples, start=1):
    print(f"\nExample {index}:")
    for key, value in example.items():
        # Only expand the audio field when it is a plain dict; any other
        # representation (or None) falls through to the generic print below
        # instead of failing on .keys()/.items().
        if key == "audio" and isinstance(value, dict):
            print(f"audio keys: {value.keys()}")
            for audio_key, audio_value in value.items():
                # Raw byte payloads are unreadable and can be huge: show a
                # placeholder instead of dumping them to the terminal.
                if isinstance(audio_value, (bytes, memoryview)):
                    print(f" {audio_key}: <binary data>")
                else:
                    print(f" {audio_key}: {audio_value}")
        else:
            print(f"{key}: {value}")