TeacherPuffy committed on
Commit
4e3915c
·
verified ·
1 Parent(s): 1153ecb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -3
app.py CHANGED
@@ -1,6 +1,26 @@
1
  import gradio as gr
2
  from datasets import load_dataset
3
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def combine_dataset_texts(dataset_name, split, text_column):
6
  try:
@@ -11,12 +31,18 @@ def combine_dataset_texts(dataset_name, split, text_column):
11
  if text_column not in dataset.column_names:
12
  raise gr.Error(f"Column '{text_column}' not found in dataset")
13
 
14
- # Combine all texts
15
- combined_text = "\n\n".join([example[text_column] for example in dataset])
 
 
 
 
 
 
16
 
17
  # Create a temporary file
18
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
19
- f.write(combined_text)
20
  return f.name
21
 
22
  except Exception as e:
 
1
  import gradio as gr
2
  from datasets import load_dataset
3
  import tempfile
4
+ import re
5
+ from langdetect import detect
6
+
7
def is_english(text: str) -> bool:
    """Return True when language detection classifies ``text`` as English.

    Args:
        text: The string to classify.

    Returns:
        True if ``detect(text)`` returns ``'en'``; False if it returns any
        other language code or if detection raises an error.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort: a detection failure is treated as "not English".
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate rather than being
        # silently turned into a False result.
        return False
13
+
14
def clean_text(text):
    """Strip ``**`` markers and keep only the sentences judged to be English.

    The text is split on whitespace that follows ``.``, ``!`` or ``?``; each
    resulting piece is passed to ``is_english`` and the pieces that pass are
    re-joined with single spaces.
    """
    # Drop every literal "**" before splitting into sentences.
    without_markers = re.sub(r'\*\*', '', text)

    # The lookbehind keeps the terminating punctuation attached to each piece.
    pieces = re.split(r'(?<=[.!?])\s+', without_markers)

    # Keep only the pieces that is_english accepts, preserving their order.
    return ' '.join(filter(is_english, pieces))
24
 
25
  def combine_dataset_texts(dataset_name, split, text_column):
26
  try:
 
31
  if text_column not in dataset.column_names:
32
  raise gr.Error(f"Column '{text_column}' not found in dataset")
33
 
34
+ # Combine all texts into a single string without separating datapoints
35
+ combined_text = " ".join([example[text_column] for example in dataset])
36
+
37
+ # Clean the text: remove non-English and **
38
+ cleaned_text = clean_text(combined_text)
39
+
40
+ # Insert a newline after each period (.) except for ."
41
+ processed_text = re.sub(r'\.(?!")', '.\n', cleaned_text)
42
 
43
  # Create a temporary file
44
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
45
+ f.write(processed_text)
46
  return f.name
47
 
48
  except Exception as e: