Raubachm committed on
Commit 93c0a02 · verified · 1 Parent(s): b78862e

Update README.md

Files changed (1):
  1. README.md +141 -85

README.md CHANGED
@@ -56,9 +56,21 @@ pip install -U sentence-transformers
  Then you can implement like this:

  ```python
- ---
- license: mit
- ---
  import nltk
  from nltk.tokenize import sent_tokenize
  from sentence_transformers import SentenceTransformer
@@ -66,94 +78,138 @@ from sklearn.metrics.pairwise import cosine_similarity
  import numpy as np
  import matplotlib.pyplot as plt

- # Load and tokenize the text
- def load_and_tokenize(file_path):
-     with open(file_path, 'r') as f:
-         text = f.read()
-     return sent_tokenize(text)
-
- # Combine sentences with their neighbors
- def combine_sentences(sentences, buffer=1):
-     combined = []
-     for i in range(len(sentences)):
-         start = max(0, i - buffer)
-         end = min(len(sentences), i + buffer + 1)
-         combined.append(' '.join(sentences[start:end]))
-     return combined
-
- # Calculate cosine distances between embeddings
- def calc_cosine_distances(embeddings):
-     distances = []
-     for i in range(len(embeddings) - 1):
-         sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
-         distances.append(1 - sim)
-     return distances
-
- # Find breakpoints based on distance threshold
- def find_breakpoints(distances, percentile=95):
-     threshold = np.percentile(distances, percentile)
-     return [i for i, d in enumerate(distances) if d > threshold]
-
- # Create chunks based on breakpoints
- def create_chunks(sentences, breakpoints):
-     chunks = []
-     start = 0
-     for bp in breakpoints:
-         chunks.append(' '.join(sentences[start:bp + 1]))
-         start = bp + 1
-     chunks.append(' '.join(sentences[start:]))
-     return chunks
-
- # Merge small chunks with their most similar neighbor
- def merge_small_chunks(chunks, embeddings, min_size=3):
-     merged = [chunks[0]]
-     merged_emb = [embeddings[0]]
-
-     for i in range(1, len(chunks) - 1):
-         if len(chunks[i].split('. ')) < min_size:
-             prev_sim = cosine_similarity([embeddings[i]], [merged_emb[-1]])[0][0]
-             next_sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]

-             if prev_sim > next_sim:
-                 merged[-1] += ' ' + chunks[i]
-                 merged_emb[-1] = (merged_emb[-1] + embeddings[i]) / 2
              else:
-                 chunks[i + 1] = chunks[i] + ' ' + chunks[i + 1]
-                 embeddings[i + 1] = (embeddings[i] + embeddings[i + 1]) / 2
-         else:
-             merged.append(chunks[i])
-             merged_emb.append(embeddings[i])
-
-     merged.append(chunks[-1])
-     merged_emb.append(embeddings[-1])
-     return merged, merged_emb
-
- # Main process
- def chunk_text(file_path):
-     # Load the model
-     model = SentenceTransformer('sentence-transformers/all-mpnet-base-v1')
-
-     # Process the text
-     sentences = load_and_tokenize(file_path)
-     combined = combine_sentences(sentences)
-     embeddings = model.encode(combined)
-
-     # Find breakpoints and create initial chunks
-     distances = calc_cosine_distances(embeddings)
-     breakpoints = find_breakpoints(distances)
-     chunks = create_chunks(sentences, breakpoints)

-     # Merge small chunks
-     chunk_embeddings = model.encode(chunks)
-     final_chunks, _ = merge_small_chunks(chunks, chunk_embeddings)

-     return final_chunks

  if __name__ == "__main__":
-     file_path = "/path/to/your/text/file.txt"
-     result = chunk_text(file_path)
-     print(f"Number of chunks: {len(result)}")
-     print("First chunk:", result[0][:100] + "...")
  ```
  ## Evaluation Results

  Then you can implement like this:

  ```python
+ """
+ Text Chunking Utility
+
+ This module provides functionality to intelligently chunk text documents into semantically coherent sections
+ using sentence embeddings and cosine similarity. It's particularly useful for processing large documents
+ while maintaining contextual relationships between sentences.
+
+ Requirements:
+ - nltk
+ - sentence-transformers
+ - scikit-learn
+ - numpy
+ - matplotlib
+ """
+
  import nltk
  from nltk.tokenize import sent_tokenize
  from sentence_transformers import SentenceTransformer
  from sklearn.metrics.pairwise import cosine_similarity
  import numpy as np
  import matplotlib.pyplot as plt

+
+ class TextChunker:
+     def __init__(self, model_name='sentence-transformers/all-mpnet-base-v1'):
+         """Initialize the TextChunker with a specified sentence transformer model."""
+         self.model = SentenceTransformer(model_name)
+
+     def process_file(self, file_path, context_window=1, percentile_threshold=95, min_chunk_size=3):
+         """
+         Process a text file and split it into semantically meaningful chunks.
+
+         Args:
+             file_path: Path to the text file
+             context_window: Number of sentences to consider on either side for context
+             percentile_threshold: Percentile threshold for identifying breakpoints
+             min_chunk_size: Minimum number of sentences in a chunk
+
+         Returns:
+             list: Semantically coherent text chunks
+         """
+         # Process the text file
+         sentences = self._load_text(file_path)
+         contextualized = self._add_context(sentences, context_window)
+         embeddings = self.model.encode(contextualized)
+
+         # Create and refine chunks
+         distances = self._calculate_distances(embeddings)
+         breakpoints = self._identify_breakpoints(distances, percentile_threshold)
+         initial_chunks = self._create_chunks(sentences, breakpoints)
+
+         # Merge small chunks for better coherence
+         chunk_embeddings = self.model.encode(initial_chunks)
+         final_chunks = self._merge_small_chunks(initial_chunks, chunk_embeddings, min_chunk_size)
+
+         return final_chunks
+
+     def _load_text(self, file_path):
+         """Load and tokenize text from a file."""
+         with open(file_path, 'r', encoding='utf-8') as file:
+             text = file.read()
+         return sent_tokenize(text)
+
+     def _add_context(self, sentences, window_size):
+         """Combine sentences with their neighbors for better context."""
+         contextualized = []
+         for i in range(len(sentences)):
+             start = max(0, i - window_size)
+             end = min(len(sentences), i + window_size + 1)
+             context = ' '.join(sentences[start:end])
+             contextualized.append(context)
+         return contextualized
+
+     def _calculate_distances(self, embeddings):
+         """Calculate cosine distances between consecutive embeddings."""
+         distances = []
+         for i in range(len(embeddings) - 1):
+             similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+             distance = 1 - similarity
+             distances.append(distance)
+         return distances
+
+     def _identify_breakpoints(self, distances, threshold_percentile):
+         """Find natural breaking points in the text based on semantic distances."""
+         threshold = np.percentile(distances, threshold_percentile)
+         return [i for i, dist in enumerate(distances) if dist > threshold]
+
+     def _create_chunks(self, sentences, breakpoints):
+         """Create initial text chunks based on identified breakpoints."""
+         chunks = []
+         start_idx = 0
+
+         for breakpoint in breakpoints:
+             chunk = ' '.join(sentences[start_idx:breakpoint + 1])
+             chunks.append(chunk)
+             start_idx = breakpoint + 1
+
+         # Add the final chunk
+         final_chunk = ' '.join(sentences[start_idx:])
+         chunks.append(final_chunk)
+
+         return chunks
+
+     def _merge_small_chunks(self, chunks, embeddings, min_size):
+         """Merge small chunks with their most similar neighbor."""
+         final_chunks = [chunks[0]]
+         merged_embeddings = [embeddings[0]]
+
+         for i in range(1, len(chunks) - 1):
+             current_chunk_size = len(chunks[i].split('. '))
+
+             if current_chunk_size < min_size:
+                 # Calculate similarities
+                 prev_similarity = cosine_similarity([embeddings[i]], [merged_embeddings[-1]])[0][0]
+                 next_similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+
+                 if prev_similarity > next_similarity:
+                     # Merge with previous chunk
+                     final_chunks[-1] = f"{final_chunks[-1]} {chunks[i]}"
+                     merged_embeddings[-1] = (merged_embeddings[-1] + embeddings[i]) / 2
+                 else:
+                     # Merge with next chunk
+                     chunks[i + 1] = f"{chunks[i]} {chunks[i + 1]}"
+                     embeddings[i + 1] = (embeddings[i] + embeddings[i + 1]) / 2
              else:
+                 final_chunks.append(chunks[i])
+                 merged_embeddings.append(embeddings[i])
+
+         final_chunks.append(chunks[-1])
+         return final_chunks
+
+
+ def main():
+     """Example usage of the TextChunker class."""
+     # Initialize the chunker
+     chunker = TextChunker()

+     # Process a text file
+     file_path = "path/to/your/document.txt"
+     chunks = chunker.process_file(
+         file_path,
+         context_window=1,
+         percentile_threshold=95,
+         min_chunk_size=3
+     )

+     # Print results
+     print(f"Successfully split text into {len(chunks)} chunks")
+     print("\nFirst chunk preview:")
+     print(f"{chunks[0][:200]}...")
+

  if __name__ == "__main__":
+     main()
  ```
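
Note that `sent_tokenize` relies on NLTK's punkt tokenizer data, which the snippet above does not download, and `matplotlib` is listed in the requirements but not used directly. A minimal setup-and-inspection sketch, assuming the `TextChunker` class above is in scope (the tokenizer package name, the placeholder path, and the plotting choices are illustrative assumptions):

```python
import nltk
import numpy as np
import matplotlib.pyplot as plt

# One-time setup: sent_tokenize needs the punkt models
# (newer NLTK releases may also ask for 'punkt_tab').
nltk.download('punkt')

# Plot neighbour-to-neighbour cosine distances and the percentile threshold
# used by _identify_breakpoints (the path below is a placeholder).
chunker = TextChunker()
sentences = chunker._load_text("path/to/your/document.txt")
contextualized = chunker._add_context(sentences, window_size=1)
distances = chunker._calculate_distances(chunker.model.encode(contextualized))

plt.plot(distances, label="cosine distance")
plt.axhline(np.percentile(distances, 95), linestyle="--", label="95th percentile threshold")
plt.xlabel("sentence index")
plt.ylabel("distance")
plt.legend()
plt.show()
```

Distances that rise above the dashed threshold line are the positions where `_identify_breakpoints` places a chunk boundary.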
  ## Evaluation Results