ruggsea committed
Commit a37b18d · 1 Parent(s): 0070556
.gitignore ADDED
@@ -0,0 +1,24 @@
+ # Exclude PDF files
+ *.pdf
+
+ # Exclude Word files
+ *.docx
+
+ # Exclude txt files
+ *.txt
+
+ # Exclude Python cache files
+ __pycache__/
+
+ # Exclude EPUB files
+ *.epub
+
+ # Exclude .vscode folder
+ .vscode/
+
+ # exclude zip and txt files
+ *.zip
+ *.txt
+
+ # exclude json files
+ freud_index/*.json
.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
+ [server]
+ port = 7860
+ address = "0.0.0.0"
+
+ [browser]
+ serverAddress = "0.0.0.0"
app.py ADDED
@@ -0,0 +1,115 @@
+ import streamlit as st
+ from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.core import Settings
+ from llama_index.retrievers.bm25 import BM25Retriever
+ from llama_index.core.retrievers import QueryFusionRetriever
+
+ # Page config
+ st.set_page_config(
+     page_title="Freud Works Search",
+     page_icon="📚",
+     layout="wide"
+ )
+
+ # Title
+ st.title("Freud Works Hybrid Search")
+ st.markdown("""
+ This demo allows you to search through Freud's complete works using a hybrid approach combining:
+ - BM25 (keyword-based search)
+ - Vector search (semantic similarity)
+ """)
+
+ @st.cache_resource
+ def load_indices():
+     """Load the index and create retrievers"""
+     # Load embeddings
+     embed_model = HuggingFaceEmbedding(model_name="multi-qa-MiniLM-L6-cos-v1")
+     Settings.embed_model = embed_model
+
+     # Load index
+     storage_context = StorageContext.from_defaults(persist_dir="freud_index")
+     index = load_index_from_storage(storage_context=storage_context)
+
+     # Create retrievers
+     vector_retriever = index.as_retriever(similarity_top_k=10)
+     bm25_retriever = BM25Retriever.from_defaults(
+         index, similarity_top_k=10
+     )
+
+     # Create hybrid retriever
+     hybrid_retriever = QueryFusionRetriever(
+         [vector_retriever, bm25_retriever],
+         similarity_top_k=10,
+         num_queries=1,  # set this to 1 to disable query generation
+         mode="reciprocal_rerank",
+         use_async=True,
+         verbose=True,
+     )
+
+     return index, vector_retriever, bm25_retriever, hybrid_retriever
+
+ # Load indices
+ index, vector_retriever, bm25_retriever, hybrid_retriever = load_indices()
+
+ # Search interface
+ search_query = st.text_input("Enter your search query:", placeholder="e.g. Oedipus complex")
+
+ # Add top_k selector
+ top_k = st.slider("Number of results to return:", min_value=1, max_value=20, value=10)
+
+ # Update retrievers with new top_k
+ vector_retriever.similarity_top_k = top_k
+ bm25_retriever.similarity_top_k = top_k
+ hybrid_retriever.similarity_top_k = top_k
+
+ # Search type selector
+ search_type = st.radio(
+     "Select search method:",
+     ["Hybrid", "Vector", "BM25"],
+     horizontal=True,
+     help="""
+     - **BM25**: Keyword-based search that works best for exact matches and specific terms. Similar to traditional search engines.
+     - **Vector**: Semantic search that understands the meaning of your query, even if it uses different words than the source text.
+     - **Hybrid**: Combines both approaches for better overall results, balancing exact matches with semantic understanding.
+     """
+ )
+
+ if search_query:
+     with st.spinner('Searching...'):
+         if search_type == "Hybrid":
+             nodes = hybrid_retriever.retrieve(search_query)
+         elif search_type == "Vector":
+             nodes = vector_retriever.retrieve(search_query)
+         else:  # BM25
+             nodes = bm25_retriever.retrieve(search_query)
+
+     # Display results
+     st.subheader("Search Results")
+
+     for i, node in enumerate(nodes, 1):
+         # Create a preview of the text (first 200 characters)
+         preview = node.text[:200] + "..." if len(node.text) > 200 else node.text
+
+         # Format score to 3 decimal places (the score can be None)
+         score = f"{node.score:.3f}" if getattr(node, 'score', None) is not None else "N/A"
+
+         # Create expandable container with new title format
+         with st.expander(f"Result {i} (score: {score})\n\n{preview}", expanded=False):
+             st.markdown(node.text)
+             if node.metadata:
+                 st.markdown("---")
+                 st.markdown("**Source:**")
+                 st.json(node.metadata)
+
+ # Add sidebar with information
+ with st.sidebar:
+     st.header("About")
+     st.markdown("""
+     This demo searches through Freud's complete works using:
+
+     - **BM25**: Traditional keyword-based search
+     - **Vector Search**: Semantic similarity using embeddings
+     - **Hybrid**: Combines both approaches
+     """)
+
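The QueryFusionRetriever above merges the BM25 and vector result lists with mode="reciprocal_rerank", i.e. reciprocal rank fusion (RRF): a document's fused score is the sum of 1/(k + rank) over every list it appears in. A minimal sketch of the idea (illustrative only, not llama_index's actual implementation; k=60 is the constant commonly used in the RRF literature):

# Reciprocal rank fusion over ranked lists of document ids.
def reciprocal_rank_fusion(ranked_lists, k=60):
    scores = {}
    for ranked in ranked_lists:
        for rank, doc_id in enumerate(ranked, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # Best fused score first.
    return sorted(scores, key=scores.get, reverse=True)

# Fusing a BM25 ranking with a vector ranking:
# reciprocal_rank_fusion([['d1', 'd2', 'd3'], ['d2', 'd3', 'd1']]) == ['d2', 'd1', 'd3']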
data_preparation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
epub2txt-all.py ADDED
@@ -0,0 +1,609 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ # Copied verbatim from https://github.com/soskek/bookcorpus
+
+ # Requirements:
+ #
+ # beautifulsoup4>=4.6.3
+ # html2text>=2018.1.9
+ # blingfire>=0.0.9
+ # progressbar>=2.5
+ # lxml>=4.3.2
+
+ import re
+ import os
+ import sys
+ import urllib
+ try:
+     from urllib import unquote
+ except ImportError:
+     from urllib.parse import unquote
+ import zipfile
+
+ import xml.parsers.expat
+ import html2text
+ from glob import glob
+ from pprint import pprint as pp
+
+ from natsort import natsorted
+
+ import json
+
+
+ class ContainerParser():
+     def __init__(self, xmlcontent=None):
+         self.rootfile = ""
+         self.xml = xmlcontent
+
+     def startElement(self, name, attributes):
+         if name == "rootfile":
+             self.buffer = ""
+             self.rootfile = attributes["full-path"]
+
+     def parseContainer(self):
+         parser = xml.parsers.expat.ParserCreate()
+         parser.StartElementHandler = self.startElement
+         parser.Parse(self.xml, 1)
+         return self.rootfile
+
+
+ class BookParser():
+     def __init__(self, xmlcontent=None):
+         self.xml = xmlcontent
+         self.title = ""
+         self.author = ""
+         self.inTitle = 0
+         self.inAuthor = 0
+         self.ncx = ""
+
+     def startElement(self, name, attributes):
+         if name == "dc:title":
+             self.buffer = ""
+             self.inTitle = 1
+         elif name == "dc:creator":
+             self.buffer = ""
+             self.inAuthor = 1
+         elif name == "item":
+             if attributes["id"] == "ncx" or attributes["id"] == "toc" or attributes["id"] == "ncxtoc":
+                 self.ncx = attributes["href"]
+
+     def characters(self, data):
+         if self.inTitle:
+             self.buffer += data
+         elif self.inAuthor:
+             self.buffer += data
+
+     def endElement(self, name):
+         if name == "dc:title":
+             self.inTitle = 0
+             self.title = self.buffer
+             self.buffer = ""
+         elif name == "dc:creator":
+             self.inAuthor = 0
+             self.author = self.buffer
+             self.buffer = ""
+
+     def parseBook(self):
+         parser = xml.parsers.expat.ParserCreate()
+         parser.StartElementHandler = self.startElement
+         parser.EndElementHandler = self.endElement
+         parser.CharacterDataHandler = self.characters
+         parser.Parse(self.xml, 1)
+         return self.title, self.author, self.ncx
+
+
+ class NavPoint():
+     def __init__(self, id=None, playorder=None, level=0, content=None, text=None):
+         self.id = id
+         self.content = content
+         self.playorder = playorder
+         self.level = level
+         self.text = text
+
+
+ class TocParser():
+     def __init__(self, xmlcontent=None):
+         self.xml = xmlcontent
+         self.currentNP = None
+         self.stack = []
+         self.inText = 0
+         self.toc = []
+
+     def startElement(self, name, attributes):
+         # TODO: what to do when no navpoints? Example: https://imgur.com/gAWuSaf
+         if name == "navPoint":
+             level = len(self.stack)
+             self.currentNP = NavPoint(
+                 attributes["id"], attributes["playOrder"], level)
+             self.stack.append(self.currentNP)
+             self.toc.append(self.currentNP)
+         elif name == "content":
+             self.currentNP.content = unquote(attributes["src"])
+         elif name == "text":
+             self.buffer = ""
+             self.inText = 1
+
+     def characters(self, data):
+         if self.inText:
+             self.buffer += data
+
+     def endElement(self, name):
+         if name == "navPoint":
+             self.currentNP = self.stack.pop()
+         elif name == "text":
+             if self.inText and self.currentNP:
+                 self.currentNP.text = self.buffer
+             self.inText = 0
+
+     def parseToc(self):
+         parser = xml.parsers.expat.ParserCreate()
+         parser.StartElementHandler = self.startElement
+         parser.EndElementHandler = self.endElement
+         parser.CharacterDataHandler = self.characters
+         parser.Parse(self.xml, 1)
+         return self.toc
+
+ def epub_name_matches(pattern, name):
+     rx = re.compile(r'[^a-zA-Z_\-/.]', re.IGNORECASE)
+     norm = re.sub(rx, '', name)
+     return re.search(pattern, norm)
+
+ def epub_toc_file(filelist):
+     for name in filelist:
+         if epub_name_matches(r'\b(toc|table.?of)', name):
+             return name
+
+ def extract_markdown_links(text):
+     for match in re.finditer(r'(?![!])\[(.*?)\]\((.*?)\)', text):
+         yield match.groups()
+
+ def extract_html_links(text):
+     for match in re.finditer(r'"([^"]+?[.][a-zA-Z]{2,}(?:[#][^"]+)?)"', text):
+         yield match.groups()
+
+ def html_links(text):
+     return [x[0] for x in extract_html_links(text)]
+
+ def flatten(xs):
+     r = []
+     for x in xs:
+         if isinstance(x, list):
+             r.extend(flatten(x))
+         else:
+             r.append(x)
+     return r
+
+ def string_bucket(buckets, strings, flat=False):
+     strings = [x for x in strings]
+     results = []
+     for bucket in buckets:
+         if isinstance(bucket, str):
+             bucket = bucket.split(',')
+         out = []
+         for pattern in bucket:
+             for s in strings:
+                 if s not in out and epub_name_matches(pattern, s):
+                     out.append(s)
+         for string in out:
+             strings.remove(string)
+         results.append(out)
+     results.append(strings)
+     if flat:
+         results = flatten(results)
+     return results
+
+ def sort_epub_files(filelist):
+     *front, outro, chapters, other = string_bucket([
+         'cover',
+         'title',
+         'copyright',
+         'toc,table.?of,contents',
+         'frontmatter,acknowledge',
+         'intro,forward',
+         'index,outro,epilogue',
+         '[.]htm[l]?$',
+     ], natsorted(filelist), flat=False)
+     return flatten(front) + chapters + outro + other
+
+
+ def extract_epub_rootfile(file):
+     filelist = [x.filename for x in file.filelist]
+     if 'META-INF/container.xml' in filelist:
+         result = file.read('META-INF/container.xml').decode('utf8')
+         result = re.sub("='(.*?)'", r'="\1"', result)
+         return result
+
+
+ def extract_epub_opf(file, meta=None):
+     if meta is None:
+         meta = extract_epub_rootfile(file)
+     root = [line for line in meta.split('\n') if '<rootfile ' in line]
+     assert len(root) > 0
+     rootpath = html_links(root[0])[0]
+     assert rootpath.endswith('opf')
+     result = file.read(rootpath).decode('utf8')
+     result = re.sub("""='(.*?)'""", r'="\1"', result)
+     return result
+
+
+ def rmblanklines(text):
+     return '\n'.join([x for x in text.split('\n') if len(x.strip()) > 0])
+
+
+ def extract_epub_section(name, file, opf=None):
+     if opf is None:
+         opf = extract_epub_opf(file)
+     result = re.sub(re.compile('.*<{name}.*?>(.*?)</{name}>.*'.format(name=name), re.DOTALL), r'\1', opf)
+     return rmblanklines(result)
+
+
+ def extract_epub_guide(file, opf=None):
+     return extract_epub_section("guide", file=file, opf=opf)
+
+
+
+ def extract_epub_manifest(file, opf=None):
+     return extract_epub_section("manifest", file=file, opf=opf)
+
+ def xmlnode(element, text):
+     # strip html comments
+     text = re.sub(re.compile(r'<!--.*?-->', re.DOTALL), '', text)
+     rx = r'<(?P<tag>{element})\s?(?P<props>.*?)(?:/>|>(?P<value>.*?)</{element}>)'.format(element=element)
+     rx = re.compile(rx, re.DOTALL)
+     items = re.finditer(rx, text)
+     items = [item.groupdict() for item in items]
+     for item in items:
+         props = dict(re.findall(r'([^\s]*?)="(.*?)"', item['props']))
+         #item['props'] = props # ehh, just merge it
+         del item['props']
+         item.update(props)
+     return items
+
+ def extract_epub_items(file, opf=None):
+     manifest = extract_epub_manifest(file, opf=opf)
+     return xmlnode('item', manifest)
+
+ def extract_epub_spine(file, opf=None):
+     spine = extract_epub_section("spine", file=file, opf=opf)
+     return xmlnode('itemref', spine)
+
+ # def extract_epub_ids(file, opf=None):
+ #     items = extract_epub_items(file, opf=opf)
+ #     return {x['id']: x['href'] for x in items}
+
+ def href2filename(file, href, filelist):
+     href = href.split('#', 1)[0]  # strip anchor
+     href = unquote(href)  # urldecode
+     for name in filelist:
+         if name == href or name.endswith('/' + href):
+             return name
+     sys.stderr.write('href2filename: failed to find href {href!r} in epub {epub!r} with filelist {filelist!r}\n'.format(epub=file.filename, href=href, filelist=filelist))
+     if args.debug:
+         import pdb; pdb.set_trace()
+
+
+ def htmlfiles(filelist):
+     return [filename for filename in filelist if filename.endswith('htm') or filename.endswith('html')]
+
+ def extract_epub_order(file, opf=None):
+     filelist = sort_epub_files([x.filename for x in file.filelist])
+     items = extract_epub_items(file, opf=opf)
+     spine = extract_epub_spine(file, opf=opf)
+     ids = {x['id']: x['href'] for x in items}
+     try:
+         found = uniq([href2filename(file, ids[ref['idref']], filelist) for ref in spine])
+     except KeyError as e:
+         sys.stderr.write('error: KeyError for {!r}: ids is {!r}, filelist is {!r}\n'.format(file.filename, ids, filelist))
+         if args.debug:
+             import pdb; pdb.set_trace()
+         raise e
+     found = [x for x in found if x is not None]  # href2filename can fail
+     for filename in found:
+         if filename not in filelist:
+             sys.stderr.write('Unknown found filename for {!r}: {!r}, filelist is {!r}\n'.format(file.filename, filename, filelist))
+         else:
+             filelist.remove(filename)
+     # This file seems to be unreferenced by anything else, sometimes.
+     if 'META-INF/nav.xhtml' in filelist:
+         filelist.remove('META-INF/nav.xhtml')
+     hfiles = htmlfiles(filelist)
+     if len(hfiles) > 0:
+         sys.stderr.write('Leftover HTML files for {!r}: {!r}\n'.format(file.filename, hfiles))
+         if args.debug:
+             import pdb; pdb.set_trace()
+     return found + filelist
+
+
+ def extract_epub_toc(file):
+     filelist = [x.filename for x in file.filelist]
+     meta = extract_epub_rootfile(file)
+     if meta is not None:
+         opf = extract_epub_opf(file, meta)
+         guide = extract_epub_guide(file, opf)
+         links = html_links(guide)
+         leftover = sort_epub_files(filelist)
+         result = []
+         for link in links:
+             link = link.split('#', 1)[0]  # strip anchor
+             for name in leftover:
+                 if name.endswith('/' + link):
+                     result.append(name)
+                     leftover.remove(name)
+                     break
+         return result, leftover
+     else:
+         return None, None
+
+
+ def epub_html_files(file):
+     files, leftover = extract_epub_toc(file)
+     if files is not None and leftover is not None:
+         return [x for x in files + leftover if x.endswith('htm') or x.endswith('html')]
+
+
+ def uniq(xs):
+     r = []
+     for x in xs:
+         if x not in r:
+             r.append(x)
+     return r
+
+ def subst_1(pattern, replacement, lines, ignore=None):
+     for line in lines:
+         if ignore is None or not re.match(ignore, line):
+             line = re.sub(pattern, replacement, line)
+         yield line
+
+
+ def subst(pattern, replacement, lines, ignore=None):
+     if isinstance(lines, str):
+         return '\n'.join(subst_1(pattern, replacement, lines.split('\n'), ignore=ignore))
+     else:
+         return subst_1(pattern, replacement, lines, ignore=ignore)
+
+
+ from io import BytesIO
+
+
+ class epub2txt():
+     def __init__(self, epubfile=None):
+         self.epub = epubfile if epubfile != '-' and not epubfile.startswith('/dev/fd/') else BytesIO(sys.stdin.buffer.read())
+         self.epub_name = epubfile
+
+     def convert(self):
+         # print "Processing %s ..." % self.epub
+         file = zipfile.ZipFile(self.epub, "r")
+         # rootfile = ContainerParser(
+         #     file.read("META-INF/container.xml")).parseContainer()
+         # title, author, ncx = BookParser(file.read(rootfile)).parseBook()
+         # ops = "/".join(rootfile.split("/")[:-1])
+         # if ops != "":
+         #     ops = ops+"/"
+         # toc = TocParser(file.read(ops + ncx)).parseToc()
+
+
+         # filelist = [x.filename for x in file.filelist]
+         # tocfile = epub_toc_file(filelist)
+         # if not tocfile:
+         #     import pdb; pdb.set_trace()
+         # toc = file.read(tocfile).decode('utf-8')
+         # pp(list(extract_html_links(toc)))
+         # import pdb; pdb.set_trace()
+         # files = {x.filename: file.read(x).decode('utf-8') for x in file.filelist if x.filename.endswith('htm') or x.filename.endswith('html')}
+         # from natsort import natsorted
+         # import json
+         # file_order = natsorted(list(files.keys()))
+
+         if False:
+             files = {x.filename: file.read(x).decode('utf-8') for x in file.filelist if x.filename.endswith('htm') or x.filename.endswith('html')}
+             file_order = natsorted(list(files.keys()))
+             # file_order = list(files.keys())
+         else:
+             meta = extract_epub_rootfile(file)
+             if meta is None: import pdb; pdb.set_trace()
+             opf = extract_epub_opf(file, meta=meta)
+             if opf is None: import pdb; pdb.set_trace()
+             # file_order = epub_html_files(file)
+             file_order = htmlfiles(extract_epub_order(file, opf=opf))
+             if file_order is None: import pdb; pdb.set_trace()
+
+         files = {x: file.read(x).decode('utf-8') for x in file_order}
+
+         content = []
+         for xmlfile in file_order:
+             html = files[xmlfile]
+             if not args.quiet:
+                 sys.stderr.write(self.epub_name+'/'+xmlfile + '\n')
+             h = html2text.HTML2Text()
+             h.body_width = 0
+             text = h.handle(html)
+             if not text.endswith('\n'):
+                 text += '\n'
+             filename = self.epub_name+'/'+xmlfile
+             #name, ext = os.path.splitext(filename)
+             bookname = filename + '.md'
+             if not args.no_metadata:
+                 content.append('<|file name={}|>'.format(json.dumps(bookname)) + '\n')
+             content.append(text)
+             if not args.no_metadata:
+                 content.append('<|/file name={}|>'.format(json.dumps(bookname)) + '\n')
+
+         file.close()
+         result = ''.join(content)
+         # final postprocessing fixups: tables come out all weird, so
+         # fix them with a hack.
+         result = result.replace('\n\n| \n\n', ' | ')
+
+         if args.ftfy:
+             import ftfy
+             result = ftfy.fix_text(result)
+             # replace unicode … with ... which ftfy doesn't do by default
+             # NOTE: this departs from openai's convention of calling
+             # ftfy.fix_text() with default arguments. In particular,
+             # OpenAI's GPT-2 models do generate unicode ellipses.
+             # Nonetheless, we replace unicode ellipses with ... to
+             # increase the chances of semantic understanding.
+             result = result.replace(' …', '...')  # first pass: convert "foo …" to "foo..."
+             #result = result.replace(' …', '...')  # second pass: convert "foo …" to "foo..."
+             result = result.replace('…', '...')  # final pass: convert "foo…" to "foo..."
+
+         result = result.split('\n')  # split into lines for performance in the following sections.
+
+         ignore_ul_item = r'[*]\s'
+         ignore_ol_item = r'[0-9]+[.]\s'
+         ignore_li = '(?!(' + ignore_ul_item + ')|(' + ignore_ol_item + '))'
+         ignore_code = '^[ ]{4,}' + ignore_li + r'[^\s]'
+
+         def sub(pattern, replacement, text):
+             return subst(pattern, replacement, text, ignore=ignore_code)
+
+         if args.plain_text:
+             #result = unmark(result)
+             # get rid of images
+             result = sub(r'[!]\s*[\[].*?[\]][(].*?[)]', ' ', result)
+             # remove reference links, e.g. [3](e9781429926119_bm01.html#end_en12)
+             result = sub(r'\[([0-9]+?)\][(].*?[)]', '', result)
+             # replace [foo](www.example.com) with foo
+             result = sub(r'[!]?\[(.*?)\][(].*?[)]', r'\1', result)
+
+             # fix up cases like this:
+             #
+             # 1\. foo
+             #
+             # 2\. bar
+             #
+             # For [\n][0-9]+[\\][.], strip the backslash.
+             #result = re.sub(r'[\n]([0-9]+)[\\][.][ ]', r'\1. ', result)
+             result = sub(re.compile(r'([0-9]+)[\\][.][ ]', re.DOTALL), r'\1. ', result)
+
+         # convert lines back to text
+         result = '\n'.join(result)
+
+         if not args.no_collapse_blanks:
+             # replace long runs of blank lines with three blank lines
+             rx = re.compile(r'([\r\t ]*[\n]+){2,}', re.DOTALL)
+             result = re.sub(rx, r'\n\n', result)
+
+         # fix up cases like this:
+         #
+         # ... some text...
+         # ## Chapter 1
+         #
+         # Put a newline before the "## Chapter 1", to have a blank
+         # line before headings.
+         result = re.sub(r'\n([^\n]+)[\n]#', r'\n\1\n\n#', result)
+
+         if not args.no_collapse_blanks:
+             # replace long runs of blank lines with three blank lines
+             rx = re.compile(r'([\r\t ]*[\n]+){3,}', re.DOTALL)
+             result = re.sub(rx, r'\n\n\n', result)
+
+         if args.append is not None:
+             append = str.encode(args.append).decode('unicode-escape')
+             result += append
+
+         return result
+
+
+ # # https://stackoverflow.com/questions/761824/python-how-to-convert-markdown-formatted-text-to-text
+
+ # from markdown import Markdown
+ # from io import StringIO
+
+
+ # def unmark_element(element, stream=None):
+ #     if stream is None:
+ #         stream = StringIO()
+ #     if element.text:
+ #         stream.write(element.text)
+ #     for sub in element:
+ #         unmark_element(sub, stream)
+ #     if element.tail:
+ #         stream.write(element.tail)
+ #     return stream.getvalue()
+
+
+ # def unmark(text):
+ #     # patching Markdown
+ #     Markdown.output_formats["plain"] = unmark_element
+ #     __md = Markdown(output_format="plain")
+ #     __md.stripTopLevelTags = False
+ #     return __md.convert(text)
+
+
+ #==============================================================================
+ # Cmdline
+ #==============================================================================
+ import argparse
+
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
+                                  description="""
+ TODO
+ """)
+
+ parser.add_argument('infile', default='-', nargs='?')
+ parser.add_argument('outfile', default='-', nargs='?')
+
+ parser.add_argument('-v', '--verbose',
+                     action="store_true",
+                     help="verbose output")
+
+ parser.add_argument('-n', '--no-metadata',
+                     action="store_true",
+                     help="Don't output <|file name=...|>")
+
+ parser.add_argument('-f', '--ftfy',
+                     action="store_true",
+                     help="Run text through ftfy.fix_text()")
+
+ parser.add_argument('-a', '--append',
+                     default=None,
+                     help="Append this string to the end of the text (useful for adding <|endoftext|>)")
+
+ parser.add_argument('-p', '--plain-text',
+                     action="store_true",
+                     help="Convert markdown to plain text")
+
+ parser.add_argument('-q', '--quiet',
+                     action="store_true",
+                     help="Don't output ToC info to stderr")
+
+ parser.add_argument('-nc', '--no-collapse-blanks',
+                     action="store_true",
+                     help="Don't collapse long runs of blank lines into three blank lines")
+
+ parser.add_argument('--debug',
+                     action="store_true",
+                     help="pdb.set_trace() on error conditions")
+
+ args = None
+
+
+ import time
+
+ def main():
+     global args
+     if not args:
+         args, leftovers = parser.parse_known_args()
+         args.args = leftovers
+     filenames = glob(args.infile) if '*' in args.infile else [args.infile]
+     out = None
+     for filename in filenames:
+         try:
+             txt = epub2txt(filename).convert()
+         except:
+             sys.stderr.write('Error converting {!r}:\n'.format(filename))
+             raise
+         if len(txt.strip()) > 0:
+             if out is None:
+                 out = open(args.outfile, "w", encoding="utf-8") if args.outfile != '-' else sys.stdout
+             out.write(txt)
+             out.flush()
+
+ if __name__ == "__main__":
+     main()
+
+ # TODO: Look into this bug: https://i.imgur.com/zDora9Y.png
+ # TODO: 2020-09-03 bug https://imgur.com/BVMXhEs
+ # TODO: 2020-09-03 bug album: https://imgur.com/a/3n7BPBQ
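In normal operation the converter orders an EPUB's HTML files by the OPF spine (extract_epub_order); sort_epub_files is the fallback heuristic, built on string_bucket: each pattern group claims the filenames it matches, in order, and whatever is left over lands in a final bucket. A self-contained sketch of that bucketing step (the filenames below are hypothetical):

import re

def bucket(patterns, names):
    # Each pattern claims its matches, in order; leftovers form the last bucket.
    names = list(names)
    out = []
    for pattern in patterns:
        hits = [n for n in names if re.search(pattern, n)]
        for n in hits:
            names.remove(n)
        out.append(hits)
    out.append(names)
    return out

print(bucket([r'cover', r'toc', r'chapter'],
             ['cover.html', 'toc.html', 'chapter01.html', 'chapter02.html', 'notes.html']))
# [['cover.html'], ['toc.html'], ['chapter01.html', 'chapter02.html'], ['notes.html']]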
epubs_to_txts.py ADDED
@@ -0,0 +1,22 @@
+ import os
+ import subprocess
+ import sys
+
+ def convert_epubs_to_txt(epubs_dir, txt_dir):
+     if not os.path.exists(txt_dir):
+         os.makedirs(txt_dir)
+
+     for root, dirs, files in os.walk(epubs_dir):
+         for file in files:
+             if file.endswith('.epub'):
+                 epub_path = os.path.join(root, file)
+                 txt_filename = os.path.splitext(file)[0] + '.txt'
+                 txt_path = os.path.join(txt_dir, txt_filename)
+
+                 # Run the epub2txt-all.py script with the current interpreter
+                 subprocess.run([sys.executable, 'epub2txt-all.py', epub_path, txt_path, "-q"])
+
+ if __name__ == "__main__":
+     epubs_directory = './epubs'  # Change this to the directory containing your EPUB files
+     txt_directory = './txt'
+     convert_epubs_to_txt(epubs_directory, txt_directory)
unpack_german_freud.py ADDED
@@ -0,0 +1,52 @@
+ import zipfile
+ import os
+ import shutil
+ import re
+
+ def extract_epubs_from_zip(zip_path, extract_to):
+     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+         for file_info in zip_ref.infolist():
+             if file_info.filename.endswith('.epub'):
+                 # Extract the EPUB (possibly under a subdirectory) into the target directory
+                 zip_ref.extract(file_info, extract_to)
+                 # Flatten it to the top level of the target directory
+                 extracted_path = os.path.join(extract_to, file_info.filename)
+                 shutil.move(extracted_path, os.path.join(extract_to, os.path.basename(file_info.filename)))
+
+ def unpack_zips_and_extract_epubs(root_dir, epubs_dir):
+     if not os.path.exists(epubs_dir):
+         os.makedirs(epubs_dir)
+
+     for root, dirs, files in os.walk(root_dir):
+         for file in files:
+             if file.endswith('.zip'):
+                 zip_path = os.path.join(root, file)
+                 temp_extract_dir = os.path.join(root, 'temp_extract')
+
+                 with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                     zip_ref.extractall(temp_extract_dir)
+
+                 for temp_root, temp_dirs, temp_files in os.walk(temp_extract_dir):
+                     for temp_file in temp_files:
+                         if temp_file.endswith('.epub'):
+                             epub_path = os.path.join(temp_root, temp_file)
+                             shutil.move(epub_path, os.path.join(epubs_dir, os.path.basename(epub_path)))
+                         elif temp_file.endswith('.zip'):
+                             nested_zip_path = os.path.join(temp_root, temp_file)
+                             if zipfile.is_zipfile(nested_zip_path):
+                                 extract_epubs_from_zip(nested_zip_path, epubs_dir)
+
+                 # Clean up the temporary extraction directory
+                 shutil.rmtree(temp_extract_dir)
+
+     # Remove any empty directories in the epubs directory
+     for root, dirs, files in os.walk(epubs_dir, topdown=False):
+         for dir in dirs:
+             dir_path = os.path.join(root, dir)
+             if not os.listdir(dir_path):
+                 os.rmdir(dir_path)
+
+ if __name__ == "__main__":
+     root_directory = '.'  # Change this to the directory containing your ZIP files
+     epubs_directory = './epubs'
+     unpack_zips_and_extract_epubs(root_directory, epubs_directory)
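Taken together, the scripts in this commit form a pipeline whose order is implied by their default paths: unpack the zipped EPUB editions into ./epubs, convert those EPUBs to text in ./txt, build the freud_index/ store (presumably in data_preparation.ipynb, whose diff is not rendered above), and serve search over it with app.py. A sketch of the scripted steps, assuming everything is run from the repository root:

from unpack_german_freud import unpack_zips_and_extract_epubs
from epubs_to_txts import convert_epubs_to_txt

unpack_zips_and_extract_epubs('.', './epubs')  # zips -> ./epubs/*.epub
convert_epubs_to_txt('./epubs', './txt')       # EPUBs -> ./txt/*.txt
# data_preparation.ipynb then builds freud_index/ from the txt files, which
# app.py loads via StorageContext.from_defaults(persist_dir="freud_index").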