Spaces:

AbeerTrial
/

SOAPAssist

Runtime error

App Files Files Community

SOAPAssist / Lib /site-packages /docx2txt /docx2txt.py

AbeerTrial

Upload folder using huggingface_hub

8a58cf3 over 1 year ago

raw

history blame contribute delete

3.43 kB

	#! /usr/bin/env python

	import argparse
	import re
	import xml.etree.ElementTree as ET
	import zipfile
	import os
	import sys


	nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}


	def process_args():
	    """Parse and validate the command-line arguments.

	    Arguments are read from ``sys.argv`` through ``argparse``: a positional
	    ``docx`` path and an optional ``-i/--img_dir`` directory. The directory
	    is created (including missing parents) when it does not exist yet.

	    Returns:
	        The ``argparse`` namespace with attributes ``docx`` and ``img_dir``
	        (``img_dir`` is ``None`` when the option was not given).

	    Raises:
	        SystemExit: with status 1 when ``docx`` does not exist, or when
	            ``img_dir`` is not a directory and cannot be created.
	    """
	    parser = argparse.ArgumentParser(description='A pure python-based utility '
	                                                 'to extract text and images '
	                                                 'from docx files.')
	    parser.add_argument("docx", help="path of the docx file")
	    parser.add_argument('-i', '--img_dir', help='path of directory '
	                                                'to extract images')

	    args = parser.parse_args()

	    # Diagnostics go to stderr: stdout is reserved for the extracted text.
	    if not os.path.exists(args.docx):
	        sys.stderr.write('File {} does not exist.\n'.format(args.docx))
	        sys.exit(1)

	    # Test with isdir() rather than exists(): a path that exists but is a
	    # regular file must be rejected here, not later when images are written.
	    # makedirs() raises OSError for that case, so one handler covers both
	    # "cannot create" and "exists but is not a directory".
	    if args.img_dir is not None and not os.path.isdir(args.img_dir):
	        try:
	            os.makedirs(args.img_dir)
	        except OSError:
	            sys.stderr.write(
	                "Unable to create img_dir {}\n".format(args.img_dir))
	            sys.exit(1)
	    return args


	def qn(tag):
	    """
	    Expand a namespace-prefixed tag into its Clark-notation qualified name.

	    The prefix is looked up in the module-level ``nsmap``; for example
	    ``qn('w:t')`` returns
	    ``'{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t'``.
	    ``tag`` must contain exactly one ``:`` (otherwise unpacking raises
	    ``ValueError``) and its prefix must be a key of ``nsmap`` (otherwise
	    ``KeyError``).
	    Source: https://github.com/python-openxml/python-docx/
	    """
	    pieces = tag.split(':')
	    prefix, local_name = pieces
	    namespace_uri = nsmap[prefix]
	    return '{{{}}}{}'.format(namespace_uri, local_name)


	def xml2text(xml):
	    """
	    Return the text contained in one WordprocessingML XML part.

	    Every element of the parsed tree is visited in document order and
	    translated as follows; all other elements contribute nothing:

	    * ``w:t``            -> the element's text (skipped when empty)
	    * ``w:tab``          -> ``'\\t'``
	    * ``w:br``, ``w:cr`` -> ``'\\n'``
	    * ``w:p``            -> ``'\\n\\n'`` (emitted when the paragraph element
	      itself is reached, i.e. before the text of its children)

	    Adapted from: https://github.com/python-openxml/python-docx/

	    Args:
	        xml: the XML document, in a form accepted by ``ET.fromstring``.

	    Returns:
	        The concatenated text; leading/trailing whitespace is not stripped.
	    """
	    root = ET.fromstring(xml)

	    # Resolve the qualified names once instead of re-formatting them for
	    # every element of the tree.
	    text_tag = qn('w:t')
	    literal_for_tag = {
	        qn('w:tab'): '\t',
	        qn('w:br'): '\n',
	        qn('w:cr'): '\n',
	        qn('w:p'): '\n\n',
	    }

	    # Collect fragments and join once rather than growing a string with +=.
	    pieces = []
	    for child in root.iter():
	        if child.tag == text_tag:
	            if child.text is not None:
	                pieces.append(child.text)
	        else:
	            literal = literal_for_tag.get(child.tag)
	            if literal is not None:
	                pieces.append(literal)
	    return u''.join(pieces)


	def process(docx, img_dir=None):
	    """
	    Extract the text, and optionally the images, of a docx file.

	    Text is collected from the header parts, then ``word/document.xml``,
	    then the footer parts, each converted with ``xml2text``. Header and
	    footer parts are taken in the order the archive lists them.

	    Args:
	        docx: the docx file, in a form accepted by ``zipfile.ZipFile``.
	        img_dir: when not ``None``, a directory into which every archive
	            member ending in .jpg, .jpeg, .png or .bmp (any letter case)
	            is written under its base name. Members sharing a base name
	            overwrite each other.

	    Returns:
	        The combined text with leading and trailing whitespace stripped.

	    Raises:
	        KeyError: if the archive has no ``word/document.xml`` member.
	        Whatever ``zipfile.ZipFile`` raises when ``docx`` is not a readable
	        zip archive.
	    """
	    # The dot is escaped and the pattern is end-anchored so only names of
	    # the exact form header<N>.xml / footer<N>.xml are accepted.
	    header_re = re.compile(r'word/header[0-9]*\.xml$')
	    footer_re = re.compile(r'word/footer[0-9]*\.xml$')
	    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

	    # The with-block closes the archive even if reading or parsing fails.
	    with zipfile.ZipFile(docx) as zipf:
	        filelist = zipf.namelist()

	        # get header text
	        # there can be 3 header files in the zip
	        parts = [xml2text(zipf.read(fname))
	                 for fname in filelist if header_re.match(fname)]

	        # get main text
	        parts.append(xml2text(zipf.read('word/document.xml')))

	        # get footer text
	        # there can be 3 footer files in the zip
	        parts.extend(xml2text(zipf.read(fname))
	                     for fname in filelist if footer_re.match(fname))

	        if img_dir is not None:
	            # extract images
	            for fname in filelist:
	                # Compare case-insensitively so e.g. "image1.PNG" is kept.
	                extension = os.path.splitext(fname)[1].lower()
	                if extension in image_extensions:
	                    # basename() drops any directory component of the
	                    # member name, so files always land inside img_dir.
	                    dst_fname = os.path.join(img_dir,
	                                             os.path.basename(fname))
	                    with open(dst_fname, "wb") as dst_f:
	                        dst_f.write(zipf.read(fname))

	    return u''.join(parts).strip()


	if __name__ == '__main__':
	    # Command-line entry point: print the extracted text as UTF-8.
	    args = process_args()
	    text = process(args.docx, args.img_dir)
	    # On Python 3 sys.stdout is a text stream and rejects bytes, so write
	    # the encoded text to its underlying binary buffer. Where stdout has no
	    # ``buffer`` attribute (Python 2), write the bytes to stdout directly.
	    output = getattr(sys.stdout, 'buffer', sys.stdout)
	    output.write(text.encode('utf-8'))