Spaces:
Runtime error
Runtime error
#! /usr/bin/env python | |
import argparse | |
import re | |
import xml.etree.ElementTree as ET | |
import zipfile | |
import os | |
import sys | |
nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} | |
def process_args(): | |
parser = argparse.ArgumentParser(description='A pure python-based utility ' | |
'to extract text and images ' | |
'from docx files.') | |
parser.add_argument("docx", help="path of the docx file") | |
parser.add_argument('-i', '--img_dir', help='path of directory ' | |
'to extract images') | |
args = parser.parse_args() | |
if not os.path.exists(args.docx): | |
print('File {} does not exist.'.format(args.docx)) | |
sys.exit(1) | |
if args.img_dir is not None: | |
if not os.path.exists(args.img_dir): | |
try: | |
os.makedirs(args.img_dir) | |
except OSError: | |
print("Unable to create img_dir {}".format(args.img_dir)) | |
sys.exit(1) | |
return args | |
def qn(tag): | |
""" | |
Stands for 'qualified name', a utility function to turn a namespace | |
prefixed tag name into a Clark-notation qualified tag name for lxml. For | |
example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. | |
Source: https://github.com/python-openxml/python-docx/ | |
""" | |
prefix, tagroot = tag.split(':') | |
uri = nsmap[prefix] | |
return '{{{}}}{}'.format(uri, tagroot) | |
def xml2text(xml): | |
""" | |
A string representing the textual content of this run, with content | |
child elements like ``<w:tab/>`` translated to their Python | |
equivalent. | |
Adapted from: https://github.com/python-openxml/python-docx/ | |
""" | |
text = u'' | |
root = ET.fromstring(xml) | |
for child in root.iter(): | |
if child.tag == qn('w:t'): | |
t_text = child.text | |
text += t_text if t_text is not None else '' | |
elif child.tag == qn('w:tab'): | |
text += '\t' | |
elif child.tag in (qn('w:br'), qn('w:cr')): | |
text += '\n' | |
elif child.tag == qn("w:p"): | |
text += '\n\n' | |
return text | |
def process(docx, img_dir=None): | |
text = u'' | |
# unzip the docx in memory | |
zipf = zipfile.ZipFile(docx) | |
filelist = zipf.namelist() | |
# get header text | |
# there can be 3 header files in the zip | |
header_xmls = 'word/header[0-9]*.xml' | |
for fname in filelist: | |
if re.match(header_xmls, fname): | |
text += xml2text(zipf.read(fname)) | |
# get main text | |
doc_xml = 'word/document.xml' | |
text += xml2text(zipf.read(doc_xml)) | |
# get footer text | |
# there can be 3 footer files in the zip | |
footer_xmls = 'word/footer[0-9]*.xml' | |
for fname in filelist: | |
if re.match(footer_xmls, fname): | |
text += xml2text(zipf.read(fname)) | |
if img_dir is not None: | |
# extract images | |
for fname in filelist: | |
_, extension = os.path.splitext(fname) | |
if extension in [".jpg", ".jpeg", ".png", ".bmp"]: | |
dst_fname = os.path.join(img_dir, os.path.basename(fname)) | |
with open(dst_fname, "wb") as dst_f: | |
dst_f.write(zipf.read(fname)) | |
zipf.close() | |
return text.strip() | |
if __name__ == '__main__': | |
args = process_args() | |
text = process(args.docx, args.img_dir) | |
sys.stdout.write(text.encode('utf-8')) | |