Spaces:
Sleeping
Sleeping
Allen Park
committed on
Commit
·
6efea88
1
Parent(s):
901a87e
feat(docx text extraction): extract all the text from the uploaded docx file
Browse files* feat: add python-docx text extraction from docx helper function
---------
Co-authored-by: Allen Park <[email protected]>
- app.py +10 -2
- requirements.txt +2 -1
app.py
CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
|
|
6 |
import gradio as gr
|
7 |
import openai
|
8 |
import pymupdf
|
|
|
9 |
|
10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
11 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
@@ -143,6 +144,13 @@ def extract_text_pymupdf(file):
|
|
143 |
text += page.get_text()
|
144 |
return text
|
145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
def upload_file(filepath):
|
147 |
extracted_file_text = ""
|
148 |
if filepath is not None:
|
@@ -153,8 +161,8 @@ def upload_file(filepath):
|
|
153 |
# conditionals for filetype and function call
|
154 |
if filetype == "pdf" or filetype == "txt":
|
155 |
extracted_file_text = extract_text_pymupdf(filepath)
|
156 |
-
elif filetype == "docx"
|
157 |
-
extracted_file_text = filepath
|
158 |
return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
|
159 |
else:
|
160 |
return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]
|
|
|
6 |
import gradio as gr
|
7 |
import openai
|
8 |
import pymupdf
|
9 |
+
from docx import Document
|
10 |
|
11 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
12 |
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
|
|
|
144 |
text += page.get_text()
|
145 |
return text
|
146 |
|
147 |
+
def extract_text_python_docx(file):
    """Extract the plain text of a .docx document using python-docx.

    Args:
        file: The document to read. May be a filesystem path, a binary
            file-like object, or the raw bytes of the document.

    Returns:
        The text of every paragraph in ``doc.paragraphs`` joined with
        newlines, with leading and trailing whitespace stripped. Only
        ``doc.paragraphs`` is read here.
    """
    # Imported locally so this helper does not rely on a module-level
    # ``import io`` being present elsewhere in the file.
    import io

    # ``Document`` opens paths and file-like objects directly. Only raw bytes
    # need wrapping; passing a path string to ``io.BytesIO`` raises TypeError.
    if isinstance(file, (bytes, bytearray, memoryview)):
        source = io.BytesIO(file)
    else:
        source = file

    doc = Document(source)
    return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
|
153 |
+
|
154 |
def upload_file(filepath):
|
155 |
extracted_file_text = ""
|
156 |
if filepath is not None:
|
|
|
161 |
# conditionals for filetype and function call
|
162 |
if filetype == "pdf" or filetype == "txt":
|
163 |
extracted_file_text = extract_text_pymupdf(filepath)
|
164 |
+
elif filetype == "docx":
|
165 |
+
extracted_file_text = extract_text_python_docx(filepath)
|
166 |
return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown("**Uploaded file:** {name}".format(name=name)), extracted_file_text]
|
167 |
else:
|
168 |
return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), extracted_file_text]
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
openai
|
2 |
-
PyMuPDF
|
|
|
|
1 |
openai
|
2 |
+
PyMuPDF
|
3 |
+
python-docx
|