Spaces:
Sleeping
Sleeping
import gradio as gr | |
from datasets import load_dataset | |
import tempfile | |
import re | |
# Abbreviated titles and name suffixes that end with a period. process_text
# checks membership in this set so that the period on one of these words is
# not treated as the end of a sentence.
TITLES = {
    "Dr.",
    "Jr.",
    "Mr.",
    "Mrs.",
    "Ms.",
    "Prof.",
    "Rev.",
    "Sr.",
}
def is_latin(text):
    """Return True if *text* contains only ASCII characters.

    Despite the name, this is an ASCII check: only code points in the
    range U+0000-U+007F are accepted. Accented Latin letters (for example
    the "e" with an acute accent in "cafe") make the function return False,
    exactly like non-Latin scripts do.

    Args:
        text: The string to inspect.

    Returns:
        True when every character is ASCII (an empty string counts as
        ASCII), False otherwise.
    """
    # str.isascii() is the built-in equivalent of searching for any
    # character outside [\x00-\x7F]; no regex needed.
    return text.isascii()
def clean_text(text):
    """Strip ``**`` markers and drop sentences rejected by ``is_latin``.

    The text is first stripped of every literal ``**`` sequence, then split
    into sentences at whitespace that follows ``.``, ``!`` or ``?``. Each
    sentence for which ``is_latin`` returns False is discarded as a whole;
    the surviving sentences are joined back together with single spaces.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text as a single string.
    """
    without_markers = re.sub(r'\*\*', '', text)
    # The lookbehind keeps the terminating punctuation attached to the
    # sentence that precedes the split point.
    candidates = re.split(r'(?<=[.!?])\s+', without_markers)
    return ' '.join(filter(is_latin, candidates))
def process_text(text, *, start_chapter=3, titles=None):
    """Break text into one sentence per line and number chapter headings.

    The text is split on whitespace and rebuilt word by word:

    * a word found in ``titles`` (e.g. ``Mr.``) is followed by a space, so
      its period does not start a new line;
    * any other word ending in ``.`` is followed by a newline;
    * the three-word sequence ``### Simplified Version`` is replaced by
      ``Chapter N``, where N starts at ``start_chapter`` and increases by
      one for each heading;
    * every other word (including words ending in ``."``) is followed by a
      space.

    Args:
        text: The text to reformat.
        start_chapter: Number given to the first chapter heading. The
            default of 3 matches the value that used to be hard-coded here.
        titles: Collection of period-terminated words that must not trigger
            a line break. Defaults to the module-level ``TITLES`` set.

    Returns:
        The reformatted text with leading and trailing whitespace removed.
    """
    if titles is None:
        titles = TITLES

    words = text.split()
    pieces = []
    chapter_number = start_chapter
    # Number of upcoming words to drop because they belong to a heading
    # marker that has already been replaced.
    pending_skips = 0

    for index, word in enumerate(words):
        if pending_skips:
            # Really skip the marker's trailing words. Blanking them instead
            # would still emit a space for each one and leave a run of
            # spaces after "Chapter N".
            pending_skips -= 1
            continue

        if word in titles:
            pieces.append(word + " ")
        elif word.endswith("."):
            # A word ending in '."' ends with a quote, not a period, so it
            # never reaches this branch and stays on the current line.
            pieces.append(word + "\n")
        elif word == "###" and words[index + 1:index + 3] == ["Simplified", "Version"]:
            pieces.append(f"Chapter {chapter_number} ")
            chapter_number += 1
            pending_skips = 2
        else:
            pieces.append(word + " ")

    # Join once at the end instead of growing a string inside the loop.
    return "".join(pieces).strip()
def combine_dataset_texts(dataset_name, split, text_column):
    """Load a dataset, merge one text column, and write it to a temp file.

    The values of ``text_column`` across all rows of the requested split
    are joined with single spaces, passed through ``clean_text`` and
    ``process_text``, and written to a ``.txt`` temporary file that is left
    on disk (``delete=False``) so the caller can serve it.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Split name passed to ``load_dataset``.
        text_column: Name of the column holding the text to combine.

    Returns:
        Path of the temporary text file that was written.

    Raises:
        gr.Error: If ``text_column`` is not one of the dataset's columns,
            or if loading, processing or writing fails for any other
            reason (the original exception is chained as the cause).
    """
    try:
        # Load the dataset from Hugging Face Hub
        dataset = load_dataset(dataset_name, split=split)

        # Verify the text column exists
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")

        # Combine all texts into a single string without separating datapoints
        combined_text = " ".join(example[text_column] for example in dataset)

        # Clean, then reformat (line breaks after sentences, chapter headings)
        cleaned_text = clean_text(combined_text)
        processed_text = process_text(cleaned_text)

        # delete=False keeps the file on disk after the handle is closed.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as f:
            f.write(processed_text)
        return f.name
    except gr.Error:
        # Already a user-facing error (e.g. the missing-column check above);
        # let it through instead of re-wrapping it with a second prefix.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}") from e
# Gradio UI: three text inputs, a button, and a file output. Clicking the
# button runs combine_dataset_texts and offers the returned path as a download.
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")

    # Input row: which dataset, which split, which column.
    with gr.Row():
        dataset_input = gr.Textbox(
            label="Dataset Name",
            placeholder="username/dataset-name",
        )
        split_input = gr.Textbox(label="Split", value="train")
        column_input = gr.Textbox(label="Text Column", value="text")

    submit_btn = gr.Button("Combine Texts")

    # Output row.
    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        # Hidden textbox; no event handler in this section reads or writes it.
        error_out = gr.Textbox(label="Error Output", visible=False)

    # api_name sets the name under which this handler is exposed via the API.
    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts",
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()