pszemraj committed on
Commit
1d3a103
β€’
1 Parent(s): b8e1b99

πŸ“ docstrings

Browse files

Signed-off-by: peter szemraj <[email protected]>

Files changed (1) hide show
  1. app.py +41 -27
app.py CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  import contextlib
2
  import logging
3
  import os
@@ -19,7 +25,6 @@ import gradio as gr
19
  import nltk
20
  import torch
21
  from cleantext import clean
22
- from doctr.io import DocumentFile
23
  from doctr.models import ocr_predictor
24
 
25
  from pdf2text import convert_PDF_to_Text
@@ -28,7 +33,7 @@ from utils import load_example_filenames, saves_summary, truncate_word_count
28
 
29
  _here = Path(__file__).parent
30
 
31
- nltk.download("stopwords") # TODO=find where this requirement originates from
32
 
33
 
34
  MODEL_OPTIONS = [
@@ -37,7 +42,7 @@ MODEL_OPTIONS = [
37
  "pszemraj/long-t5-tglobal-base-sci-simplify-elife",
38
  "pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
39
  "pszemraj/pegasus-x-large-book-summary",
40
- ]
41
 
42
 
43
  def predict(
@@ -46,8 +51,16 @@ def predict(
46
  token_batch_length: int = 1024,
47
  empty_cache: bool = True,
48
  **settings,
49
- ):
50
- """helper fn to support multiple models at once"""
 
 
 
 
 
 
 
 
51
  if torch.cuda.is_available() and empty_cache:
52
  torch.cuda.empty_cache()
53
 
@@ -143,9 +156,11 @@ def proc_submission(
143
  token_batch_length=token_batch_length,
144
  **settings,
145
  )
146
- sum_text = [f"Section {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries)]
 
 
147
  sum_scores = [
148
- f" - Section {i}: {round(s['summary_score'],4)}"
149
  for i, s in enumerate(_summaries)
150
  ]
151
 
@@ -153,9 +168,9 @@ def proc_submission(
153
  history["Summary Scores"] = "<br><br>"
154
  scores_out = "\n".join(sum_scores)
155
  rt = round((time.perf_counter() - st) / 60, 2)
156
- print(f"Runtime: {rt} minutes")
157
  html = ""
158
- html += f"<p>Runtime: {rt} minutes on CPU</p>"
159
  if msg is not None:
160
  html += msg
161
 
@@ -170,11 +185,13 @@ def proc_submission(
170
  def load_single_example_text(
171
  example_path: str or Path,
172
  max_pages=20,
173
- ):
174
  """
175
- load_single_example - a helper function for the gradio module to load examples
176
- Returns:
177
- list of str, the examples
 
 
178
  """
179
  global name_to_path
180
  full_ex_path = name_to_path[example_path]
@@ -198,30 +215,27 @@ def load_single_example_text(
198
  return text
199
 
200
 
201
- def load_uploaded_file(file_obj, max_pages=20):
202
  """
203
- load_uploaded_file - process an uploaded file
204
-
205
- Args:
206
- file_obj (POTENTIALLY list): Gradio file object inside a list
207
 
208
- Returns:
209
- str, the uploaded file contents
 
 
210
  """
211
-
212
- # file_path = Path(file_obj[0].name)
213
-
214
  # check if mysterious file object is a list
215
  if isinstance(file_obj, list):
216
  file_obj = file_obj[0]
217
  file_path = Path(file_obj.name)
218
  try:
 
219
  if file_path.suffix == ".txt":
220
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
221
  raw_text = f.read()
222
- text = clean(raw_text, lower=False)
223
  elif file_path.suffix == ".pdf":
224
- logging.info(f"Loading PDF file {file_path}")
225
  conversion_stats = convert_PDF_to_Text(
226
  file_path,
227
  ocr_model=ocr_model,
@@ -230,11 +244,11 @@ def load_uploaded_file(file_obj, max_pages=20):
230
  text = conversion_stats["converted_text"]
231
  else:
232
  logging.error(f"Unknown file type {file_path.suffix}")
233
- text = "ERROR - check example path"
234
 
235
  return text
236
  except Exception as e:
237
- logging.info(f"Trying to load file with path {file_path}, error: {e}")
238
  return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
239
 
240
 
 
1
+ """
2
+ app.py - the main module for the gradio app
3
+
4
+ Usage:
5
+ python app.py
6
+ """
7
  import contextlib
8
  import logging
9
  import os
 
25
  import nltk
26
  import torch
27
  from cleantext import clean
 
28
  from doctr.models import ocr_predictor
29
 
30
  from pdf2text import convert_PDF_to_Text
 
33
 
34
  _here = Path(__file__).parent
35
 
36
+ nltk.download("stopwords", quiet=True)
37
 
38
 
39
  MODEL_OPTIONS = [
 
42
  "pszemraj/long-t5-tglobal-base-sci-simplify-elife",
43
  "pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
44
  "pszemraj/pegasus-x-large-book-summary",
45
+ ] # models users can choose from
46
 
47
 
48
  def predict(
 
51
  token_batch_length: int = 1024,
52
  empty_cache: bool = True,
53
  **settings,
54
+ ) -> list:
55
+ """
56
+ predict - helper fn to support multiple models for summarization at once
57
+
58
+ :param str input_text: the input text to summarize
59
+ :param str model_name: model name to use
60
+ :param int token_batch_length: the length of the token batches to use
61
 + :param bool empty_cache: whether to empty the cache before loading a new model
62
+ :return: list of dicts with keys "summary" and "score"
63
+ """
64
  if torch.cuda.is_available() and empty_cache:
65
  torch.cuda.empty_cache()
66
 
 
156
  token_batch_length=token_batch_length,
157
  **settings,
158
  )
159
+ sum_text = [
160
+ f"Batch {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries, start=1)
161
+ ]
162
  sum_scores = [
163
+ f" - Batch Summary {i}: {round(s['summary_score'],4)}"
164
  for i, s in enumerate(_summaries)
165
  ]
166
 
 
168
  history["Summary Scores"] = "<br><br>"
169
  scores_out = "\n".join(sum_scores)
170
  rt = round((time.perf_counter() - st) / 60, 2)
171
+ logging.info(f"Runtime: {rt} minutes")
172
  html = ""
173
+ html += f"<p>Runtime: {rt} minutes with model: {model_name}</p>"
174
  if msg is not None:
175
  html += msg
176
 
 
185
  def load_single_example_text(
186
  example_path: str or Path,
187
  max_pages=20,
188
+ ) -> str:
189
  """
190
+ load_single_example_text - loads a single example text file
191
+
192
 + :param str or Path example_path: name of the example to load
193
+ :param int max_pages: the maximum number of pages to load from a PDF
194
+ :return str: the text of the example
195
  """
196
  global name_to_path
197
  full_ex_path = name_to_path[example_path]
 
215
  return text
216
 
217
 
218
+ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
219
  """
220
+ load_uploaded_file - loads a file uploaded by the user
 
 
 
221
 
222
+ :param file_obj (POTENTIALLY list): Gradio file object inside a list
223
+ :param int max_pages: the maximum number of pages to load from a PDF
224
+ :param bool lower: whether to lowercase the text
225
+ :return str: the text of the file
226
  """
 
 
 
227
  # check if mysterious file object is a list
228
  if isinstance(file_obj, list):
229
  file_obj = file_obj[0]
230
  file_path = Path(file_obj.name)
231
  try:
232
+ logging.info(f"Loading file:\t{file_path}")
233
  if file_path.suffix == ".txt":
234
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
235
  raw_text = f.read()
236
+ text = clean(raw_text, lower=lower)
237
  elif file_path.suffix == ".pdf":
238
+ logging.info(f"loading as PDF file {file_path}")
239
  conversion_stats = convert_PDF_to_Text(
240
  file_path,
241
  ocr_model=ocr_model,
 
244
  text = conversion_stats["converted_text"]
245
  else:
246
  logging.error(f"Unknown file type {file_path.suffix}")
247
+ text = "ERROR - check file - unknown file type"
248
 
249
  return text
250
  except Exception as e:
251
+ logging.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
252
  return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
253
 
254