|
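"""epub2txt: convert EPUB ebooks to Markdown-flavored plain text.

Reads the EPUB's OPF package file to recover the spine (reading order),
runs each (X)HTML chapter through html2text, and concatenates the results,
optionally wrapping each chapter in <|file name=...|> markers.
"""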
import re
import sys
import json
import zipfile
import xml.parsers.expat
from io import BytesIO
from glob import glob

# unquote lives in urllib.parse on Python 3, urllib on Python 2
try:
    from urllib.parse import unquote
except ImportError:
    from urllib import unquote

import html2text
from natsort import natsorted
|
|
|
|
|
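# Expat-based parser for META-INF/container.xml: extracts the "full-path"
# attribute of <rootfile>, which points at the OPF package file.  (convert()
# below actually uses the regex helpers instead of these expat classes.)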
class ContainerParser(): |
|
def __init__(self, xmlcontent=None): |
|
self.rootfile = "" |
|
self.xml = xmlcontent |
|
|
|
def startElement(self, name, attributes): |
|
if name == "rootfile": |
|
self.buffer = "" |
|
self.rootfile = attributes["full-path"] |
|
|
|
def parseContainer(self): |
|
parser = xml.parsers.expat.ParserCreate() |
|
parser.StartElementHandler = self.startElement |
|
parser.Parse(self.xml, 1) |
|
return self.rootfile |
|
|
|
|
|
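# Expat-based parser for the OPF package file: collects the Dublin Core
# title and creator, plus the href of the NCX table-of-contents item.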
class BookParser(): |
|
def __init__(self, xmlcontent=None): |
|
self.xml = xmlcontent |
|
self.title = "" |
|
self.author = "" |
|
self.inTitle = 0 |
|
self.inAuthor = 0 |
|
self.ncx = "" |
|
|
|
def startElement(self, name, attributes): |
|
if name == "dc:title": |
|
self.buffer = "" |
|
self.inTitle = 1 |
|
elif name == "dc:creator": |
|
self.buffer = "" |
|
self.inAuthor = 1 |
|
elif name == "item": |
|
            if attributes["id"] in ("ncx", "toc", "ncxtoc"):
|
self.ncx = attributes["href"] |
|
|
|
    def characters(self, data):
        if self.inTitle or self.inAuthor:
            self.buffer += data
|
|
|
def endElement(self, name): |
|
if name == "dc:title": |
|
self.inTitle = 0 |
|
self.title = self.buffer |
|
self.buffer = "" |
|
elif name == "dc:creator": |
|
self.inAuthor = 0 |
|
self.author = self.buffer |
|
self.buffer = "" |
|
|
|
def parseBook(self): |
|
parser = xml.parsers.expat.ParserCreate() |
|
parser.StartElementHandler = self.startElement |
|
parser.EndElementHandler = self.endElement |
|
parser.CharacterDataHandler = self.characters |
|
parser.Parse(self.xml, 1) |
|
return self.title, self.author, self.ncx |
|
|
|
|
|
class NavPoint(): |
|
def __init__(self, id=None, playorder=None, level=0, content=None, text=None): |
|
self.id = id |
|
self.content = content |
|
self.playorder = playorder |
|
self.level = level |
|
self.text = text |
|
|
|
|
|
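# Expat-based parser for toc.ncx: flattens the navMap into a list of
# NavPoints, tracking each point's nesting depth with a stack.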
class TocParser(): |
|
def __init__(self, xmlcontent=None): |
|
self.xml = xmlcontent |
|
self.currentNP = None |
|
self.stack = [] |
|
self.inText = 0 |
|
self.toc = [] |
|
|
|
def startElement(self, name, attributes): |
|
|
|
if name == "navPoint": |
|
level = len(self.stack) |
|
self.currentNP = NavPoint( |
|
attributes["id"], attributes["playOrder"], level) |
|
self.stack.append(self.currentNP) |
|
self.toc.append(self.currentNP) |
|
elif name == "content": |
|
self.currentNP.content = unquote(attributes["src"]) |
|
elif name == "text": |
|
self.buffer = "" |
|
self.inText = 1 |
|
|
|
def characters(self, data): |
|
if self.inText: |
|
self.buffer += data |
|
|
|
    def endElement(self, name):
        if name == "navPoint":
            # Pop the closing navPoint and restore its parent as current
            self.stack.pop()
            self.currentNP = self.stack[-1] if self.stack else None
        elif name == "text":
            if self.inText and self.currentNP:
                self.currentNP.text = self.buffer
            self.inText = 0
|
|
|
def parseToc(self): |
|
parser = xml.parsers.expat.ParserCreate() |
|
parser.StartElementHandler = self.startElement |
|
parser.EndElementHandler = self.endElement |
|
parser.CharacterDataHandler = self.characters |
|
parser.Parse(self.xml, 1) |
|
return self.toc |
|
|
|
def epub_name_matches(pattern, name):
    # Drop everything but letters, '_', '-', '/' and '.' so that e.g.
    # "Table of Contents.html" normalizes to "TableofContents.html",
    # then match case-insensitively.
    norm = re.sub(r'[^a-zA-Z_\-/.]', '', name)
    return re.search(pattern, norm, re.IGNORECASE)
|
|
|
def epub_toc_file(filelist): |
|
for name in filelist: |
|
if epub_name_matches(r'\b(toc|table.?of)', name): |
|
return name |
|
|
|
def extract_markdown_links(text):
    # Yield (text, target) for each Markdown link; the lookbehind skips
    # image links like ![alt](src).
    for match in re.finditer(r'(?<![!])\[(.*?)\]\((.*?)\)', text):
        yield match.groups()
|
|
|
def extract_html_links(text): |
|
for match in re.finditer(r'"([^"]+?[.][a-zA-Z]{2,}(?:[#][^"]+)?)"', text): |
|
yield match.groups() |
|
|
|
def html_links(text): |
|
return [x[0] for x in extract_html_links(text)] |
|
|
|
def flatten(xs): |
|
r = [] |
|
for x in xs: |
|
if isinstance(x, list): |
|
r.extend(flatten(x)) |
|
else: |
|
r.append(x) |
|
return r |
|
|
|
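# Partition `strings` into buckets by regex; each bucket is a list of
# patterns or a comma-separated pattern string.  Matched strings leave the
# pool, and whatever remains becomes the final bucket, e.g.
#   string_bucket(['a', 'b'], ['apple', 'berry', 'cherry'])
#   -> [['apple'], ['berry'], ['cherry']]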
def string_bucket(buckets, strings, flat=False): |
|
strings = [x for x in strings] |
|
results = [] |
|
for bucket in buckets: |
|
if isinstance(bucket, str): |
|
bucket = bucket.split(',') |
|
out = [] |
|
for pattern in bucket: |
|
for s in strings: |
|
if s not in out and epub_name_matches(pattern, s): |
|
out.append(s) |
|
for string in out: |
|
strings.remove(string) |
|
results.append(out) |
|
results.append(strings) |
|
if flat: |
|
results = flatten(results) |
|
return results |
|
|
|
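# Heuristic reading order: front matter (cover, title, copyright, ToC,
# acknowledgements, intro) first, then chapter HTML, then back matter
# (index/outro/epilogue), then everything else, each group natural-sorted.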
def sort_epub_files(filelist): |
|
*front, outro, chapters, other = string_bucket([ |
|
'cover', |
|
'title', |
|
'copyright', |
|
'toc,table.?of,contents', |
|
'frontmatter,acknowledge', |
|
'intro,forward', |
|
'index,outro,epilogue', |
|
'[.]htm[l]?$', |
|
], natsorted(filelist), flat=False) |
|
return flatten(front) + chapters + outro + other |
|
|
|
|
|
def extract_epub_rootfile(file): |
|
filelist = [x.filename for x in file.filelist] |
|
if 'META-INF/container.xml' in filelist: |
|
result = file.read('META-INF/container.xml').decode('utf8') |
|
        # Normalize single-quoted XML attributes to double quotes
        result = re.sub("='(.*?)'", r'="\1"', result)
|
return result |
|
|
|
|
|
def extract_epub_opf(file, meta=None): |
|
if meta is None: |
|
meta = extract_epub_rootfile(file) |
|
root = [line for line in meta.split('\n') if '<rootfile ' in line] |
|
assert len(root) > 0 |
|
rootpath = html_links(root[0])[0] |
|
assert rootpath.endswith('opf') |
|
result = file.read(rootpath).decode('utf8') |
|
result = re.sub("""='(.*?)'""", r'="\1"', result) |
|
return result |
|
|
|
|
|
def rmblanklines(text): |
|
return '\n'.join([x for x in text.split('\n') if len(x.strip()) > 0]) |
|
|
|
|
|
def extract_epub_section(name, file, opf=None): |
|
if opf is None: |
|
opf = extract_epub_opf(file) |
|
    rx = re.compile('.*<{name}.*?>(.*?)</{name}>.*'.format(name=name), re.DOTALL)
    result = re.sub(rx, r'\1', opf)
|
return rmblanklines(result) |
|
|
|
|
|
def extract_epub_guide(file, opf=None): |
|
return extract_epub_section("guide", file=file, opf=opf) |
|
|
|
|
|
|
|
def extract_epub_manifest(file, opf=None): |
|
return extract_epub_section("manifest", file=file, opf=opf) |
|
|
|
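# Minimal regex-based element extractor: returns one dict per <element ...>
# occurrence, mapping attribute names to values plus 'tag' and 'value'
# (inner text, or None for self-closing tags).  XML comments are stripped
# first.  Good enough for OPF markup; not a general XML parser.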
def xmlnode(element, text): |
|
|
|
text = re.sub(re.compile(r'<!--.*?-->', re.DOTALL), '', text) |
|
rx = r'<(?P<tag>{element})\s?(?P<props>.*?)(?:/>|>(?P<value>.*?)</{element}>)'.format(element=element) |
|
rx = re.compile(rx, re.DOTALL) |
|
items = re.finditer(rx, text) |
|
items = [item.groupdict() for item in items] |
|
for item in items: |
|
props = dict(re.findall(r'([^\s]*?)="(.*?)"', item['props'])) |
|
|
|
del item['props'] |
|
item.update(props) |
|
return items |
|
|
|
def extract_epub_items(file, opf=None): |
|
manifest = extract_epub_manifest(file, opf=opf) |
|
return xmlnode('item', manifest) |
|
|
|
def extract_epub_spine(file, opf=None): |
|
spine = extract_epub_section("spine", file=file, opf=opf) |
|
return xmlnode('itemref', spine) |
|
|
|
|
|
|
|
|
|
|
|
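# Resolve a manifest href (possibly percent-encoded, possibly carrying a
# '#fragment') to an actual path inside the zip archive; returns None (and
# logs to stderr) when nothing matches.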
def href2filename(file, href, filelist): |
|
href = href.split('#', 1)[0] |
|
href = unquote(href) |
|
for name in filelist: |
|
if name == href or name.endswith('/' + href): |
|
return name |
|
sys.stderr.write('href2filename: failed to find href {href!r} in epub {epub!r} with filelist {filelist!r}\n'.format(epub=file.filename, href=href, filelist=filelist)) |
|
if args.debug: |
|
import pdb; pdb.set_trace() |
|
|
|
|
|
def htmlfiles(filelist): |
|
return [filename for filename in filelist if filename.endswith('htm') or filename.endswith('html')] |
|
|
|
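# Reading order for the whole archive: spine itemrefs first (resolved to
# zip paths), followed by any leftover files in sort_epub_files() order.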
def extract_epub_order(file, opf=None): |
|
filelist = sort_epub_files([x.filename for x in file.filelist]) |
|
items = extract_epub_items(file, opf=opf) |
|
spine = extract_epub_spine(file, opf=opf) |
|
ids = {x['id']: x['href'] for x in items} |
|
try: |
|
found = uniq([href2filename(file, ids[ref['idref']], filelist) for ref in spine]) |
|
except KeyError as e: |
|
sys.stderr.write('error: KeyError for {!r}: ids is {!r}, filelist is {!r}\n'.format(file.filename, ids, filelist)) |
|
if args.debug: |
|
import pdb; pdb.set_trace() |
|
raise e |
|
found = [x for x in found if x is not None] |
|
for filename in found: |
|
if filename not in filelist: |
|
sys.stderr.write('Unknown found filename for {!r}: {!r}, filelist is {!r}\n'.format(file.filename, filename, filelist)) |
|
else: |
|
filelist.remove(filename) |
|
|
|
if 'META-INF/nav.xhtml' in filelist: |
|
filelist.remove('META-INF/nav.xhtml') |
|
hfiles = htmlfiles(filelist) |
|
if len(hfiles) > 0: |
|
sys.stderr.write('Leftover HTML files for {!r}: {!r}\n'.format(file.filename, hfiles)) |
|
if args.debug: |
|
import pdb; pdb.set_trace() |
|
return found + filelist |
|
|
|
|
|
def extract_epub_toc(file): |
|
filelist = [x.filename for x in file.filelist] |
|
meta = extract_epub_rootfile(file) |
|
if meta is not None: |
|
opf = extract_epub_opf(file, meta) |
|
guide = extract_epub_guide(file, opf) |
|
        links = html_links(guide)
|
leftover = sort_epub_files(filelist) |
|
result = [] |
|
for link in links: |
|
link = link.split('#', 1)[0] |
|
for name in leftover: |
|
if name.endswith('/' + link): |
|
result.append(name) |
|
leftover.remove(name) |
|
break |
|
return result, leftover |
|
else: |
|
return None, None |
|
|
|
|
|
def epub_html_files(file): |
|
files, leftover = extract_epub_toc(file) |
|
if files is not None and leftover is not None: |
|
return [x for x in files + leftover if x.endswith('htm') or x.endswith('html')] |
|
|
|
|
|
def uniq(xs): |
|
r = [] |
|
for x in xs: |
|
if x not in r: |
|
r.append(x) |
|
return r |
|
|
|
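# Apply a regex substitution line by line, leaving untouched any line that
# matches `ignore` (used below to protect indented code blocks).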
def subst_1(pattern, replacement, lines, ignore=None): |
|
for line in lines: |
|
if ignore is None or not re.match(ignore, line): |
|
line = re.sub(pattern, replacement, line) |
|
yield line |
|
|
|
|
|
def subst(pattern, replacement, lines, ignore=None): |
|
if isinstance(lines, str): |
|
return '\n'.join(subst_1(pattern, replacement, lines.split('\n'), ignore=ignore)) |
|
else: |
|
return subst_1(pattern, replacement, lines, ignore=ignore) |
|
|
|
|
|
|
|
|
|
|
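# Converter: reads one EPUB (a path, '-' for stdin, or /dev/fd/N) and
# produces the concatenated Markdown text via convert().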
class epub2txt(): |
|
    def __init__(self, epubfile=None):
        # zipfile needs a seekable stream, so buffer stdin/pipe input in memory
        if epubfile == '-' or epubfile.startswith('/dev/fd/'):
            stream = sys.stdin.buffer if epubfile == '-' else open(epubfile, 'rb')
            self.epub = BytesIO(stream.read())
        else:
            self.epub = epubfile
        self.epub_name = epubfile
|
|
|
def convert(self): |
|
|
|
file = zipfile.ZipFile(self.epub, "r") |
|
        meta = extract_epub_rootfile(file)
        if meta is None and args.debug: import pdb; pdb.set_trace()
        opf = extract_epub_opf(file, meta=meta)
        if opf is None and args.debug: import pdb; pdb.set_trace()

        file_order = htmlfiles(extract_epub_order(file, opf=opf))
        files = {x: file.read(x).decode('utf-8') for x in file_order}
|
|
|
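        # Convert each HTML file to Markdown (body_width=0 disables line
        # wrapping) and optionally wrap it in <|file name=...|> markers.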
content = [] |
|
for xmlfile in file_order: |
|
html = files[xmlfile] |
|
if not args.quiet: |
|
sys.stderr.write(self.epub_name+'/'+xmlfile + '\n') |
|
h = html2text.HTML2Text() |
|
h.body_width = 0 |
|
text = h.handle(html) |
|
if not text.endswith('\n'): |
|
text += '\n' |
|
filename = self.epub_name+'/'+xmlfile |
|
|
|
bookname = filename + '.md' |
|
if not args.no_metadata: |
|
content.append('<|file name={}|>'.format(json.dumps(bookname)) + '\n') |
|
content.append(text) |
|
if not args.no_metadata: |
|
content.append('<|/file name={}|>'.format(json.dumps(bookname)) + '\n') |
|
|
|
file.close() |
|
result = ''.join(content) |
|
|
|
|
|
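        # Stitch table rows back together: html2text sometimes leaves a lone
        # '| ' cell separator on its own line.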
result = result.replace('\n\n| \n\n', ' | ') |
|
|
|
if args.ftfy: |
|
import ftfy |
|
result = ftfy.fix_text(result) |
|
|
|
|
|
|
|
|
|
|
|
|
|
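        # Normalize Unicode ellipses to ASCII '...'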
result = result.replace(' …', '...') |
|
|
|
result = result.replace('…', '...') |
|
|
|
result = result.split('\n') |
|
|
|
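        # Treat lines indented 4+ spaces as Markdown code blocks and skip
        # them in the substitutions below -- unless they are list items.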
ignore_ul_item = r'[*]\s' |
|
ignore_ol_item = r'[0-9]+[.]\s' |
|
ignore_li = '(?!(' + ignore_ul_item + ')|(' + ignore_ol_item + '))' |
|
ignore_code='^[ ]{4,}' + ignore_li + r'[^\s]' |
|
|
|
def sub(pattern, replacement, text): |
|
return subst(pattern, replacement, text, ignore=ignore_code) |
|
|
|
        if args.plain_text:
            # Drop images entirely
            result = sub(r'[!]\s*[\[].*?[\]][(].*?[)]', ' ', result)
            # Drop numeric footnote links like [3](...)
            result = sub(r'\[([0-9]+?)\][(].*?[)]', '', result)
            # Unwrap remaining links, keeping only the link text
            result = sub(r'[!]?\[(.*?)\][(].*?[)]', r'\1', result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
        # Unescape list ordinals: html2text emits "1\. " for numbered items
        result = sub(re.compile(r'([0-9]+)[\\][.][ ]', re.DOTALL), r'\1. ', result)
|
|
|
|
|
result = '\n'.join(result) |
|
|
|
if not args.no_collapse_blanks: |
|
|
|
rx = re.compile(r'([\r\t ]*[\n]+){2,}', re.DOTALL) |
|
result = re.sub(rx, r'\n\n', result) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
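        # Put a blank line between body text and a following heading.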
result = re.sub(r'\n([^\n]+)[\n]#', r'\n\1\n\n#', result) |
|
|
|
if not args.no_collapse_blanks: |
|
|
|
rx = re.compile(r'([\r\t ]*[\n]+){3,}', re.DOTALL) |
|
result = re.sub(rx, r'\n\n\n', result) |
|
|
|
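        # --append is unicode-escape decoded, so a literal '\n' argument
        # appends a real newline.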
if args.append is not None: |
|
append = str.encode(args.append).decode('unicode-escape') |
|
result += append |
|
|
|
return result |
|
|
import argparse |
|
|
|
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
    description="""
Convert EPUB ebooks to Markdown-flavored plain text.

Examples (assuming the script is saved as epub2txt.py):
  python epub2txt.py book.epub book.txt
  python epub2txt.py 'books/*.epub' corpus.txt -p -a '<|endoftext|>\\n'
""")
|
|
|
parser.add_argument('infile', default='-', nargs='?') |
|
parser.add_argument('outfile', default='-', nargs='?') |
|
|
|
parser.add_argument('-v', '--verbose', |
|
action="store_true", |
|
help="verbose output" ) |
|
|
|
parser.add_argument('-n', '--no-metadata', |
|
action="store_true", |
|
help="Don't output <|file name=...|>" ) |
|
|
|
parser.add_argument('-f', '--ftfy', |
|
action="store_true", |
|
help="Run text through ftfy.fix_text()" ) |
|
|
|
parser.add_argument('-a', '--append', |
|
default=None, |
|
help="Append this string to the end of the text (useful for adding <|endoftext|>)") |
|
|
|
parser.add_argument('-p', '--plain-text', |
|
action="store_true", |
|
help="Convert markdown to plain text") |
|
|
|
parser.add_argument('-q', '--quiet', |
|
action="store_true", |
|
help="Don't output ToC info to stderr") |
|
|
|
parser.add_argument('-nc', '--no-collapse-blanks', |
|
action="store_true", |
|
help="Don't collapse long runs of blank lines into three blank lines" ) |
|
|
|
parser.add_argument('--debug', |
|
action="store_true", |
|
help="pdb.set_trace() on error conditions" ) |
|
|
|
args = None |
|
|
|
|
|
|
|
|
def main(): |
|
global args |
|
if not args: |
|
args, leftovers = parser.parse_known_args() |
|
args.args = leftovers |
|
filenames = glob(args.infile) if '*' in args.infile else [args.infile] |
|
out = None |
|
for filename in filenames: |
|
try: |
|
txt = epub2txt(filename).convert() |
|
except: |
|
sys.stderr.write('Error converting {!r}:\n'.format(filename)) |
|
raise |
|
if len(txt.strip()) > 0: |
|
if out is None: |
|
out = open(args.outfile, "w", encoding="utf-8") if args.outfile != '-' else sys.stdout |
|
out.write(txt) |
|
out.flush() |
|
|
|
if __name__ == "__main__": |
|
main() |
|
|
|
|
|
|
|
|