Allen Park committed on
Commit
6efea88
·
1 Parent(s): 901a87e

feat(docx text extraction): extract all the text from the uploaded docx file

Browse files

* feat: add python-docx text extraction from docx helper function
---------
Co-authored-by: Allen Park <[email protected]>

Files changed (2) hide show
  1. app.py +10 -2
  2. requirements.txt +2 -1
app.py CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
6
  import gradio as gr
7
  import openai
8
  import pymupdf
 
9
 
10
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
11
  LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
@@ -143,6 +144,13 @@ def extract_text_pymupdf(file):
143
  text += page.get_text()
144
  return text
145
 
 
 
 
 
 
 
 
146
  def upload_file(filepath):
147
  extracted_file_text = ""
148
  if filepath is not None:
@@ -153,8 +161,8 @@ def upload_file(filepath):
153
  # conditionals for filetype and function call
154
  if filetype == "pdf" or filetype == "txt":
155
  extracted_file_text = extract_text_pymupdf(filepath)
156
- elif filetype == "docx" or filetype == "doc":
157
- extracted_file_text = filepath.read().decode("utf-8")
158
  return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
159
  else:
160
  return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]
 
6
  import gradio as gr
7
  import openai
8
  import pymupdf
9
+ from docx import Document
10
 
11
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
12
  LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
 
144
  text += page.get_text()
145
  return text
146
 
147
+ def extract_text_python_docx(file):
148
+ doc = Document(io.BytesIO(file))
149
+ text = ""
150
+ for paragraph in doc.paragraphs:
151
+ text += paragraph.text + '\n'
152
+ return text.strip()
153
+
154
  def upload_file(filepath):
155
  extracted_file_text = ""
156
  if filepath is not None:
 
161
  # conditionals for filetype and function call
162
  if filetype == "pdf" or filetype == "txt":
163
  extracted_file_text = extract_text_pymupdf(filepath)
164
+ elif filetype == "docx":
165
+ extracted_file_text = extract_text_python_docx(filepath)
166
  return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
167
  else:
168
  return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  openai
2
- PyMuPDF
 
 
1
  openai
2
+ PyMuPDF
3
+ python-docx