Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,496 Bytes
87e5c9c 34de38e 87e5c9c 925dd67 87e5c9c 079d1ca 34de38e 079d1ca 87e5c9c 3ea8fe3 87e5c9c 925dd67 7452863 925dd67 3ea8fe3 87e5c9c 3ea8fe3 87e5c9c 34de38e 079d1ca 34de38e 079d1ca 34de38e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
"""
utils.py - Utility functions for the project.
"""
import re
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Optional, Union

from natsort import natsorted
def get_timestamp() -> str:
    """
    get_timestamp - produce a filesystem-safe timestamp for the current time.

    Returns:
        str, current local time formatted as YYYYMMDD_HHMMSS
    """
    current_time = datetime.now()
    return current_time.strftime("%Y%m%d_%H%M%S")
def truncate_word_count(text, max_words=512):
    """
    truncate_word_count - a helper function for the gradio module

    Parameters
    ----------
    text : str, required, the text to be processed
    max_words : int, optional, the maximum number of words, default=512

    Returns
    -------
    dict, the text and whether it was truncated
    """
    # whitespace-delimited tokens, matching the original regex split
    tokens = re.split(r"\s+", text)
    if len(tokens) <= max_words:
        # short enough: return the input untouched
        return {"was_truncated": False, "truncated_text": text}
    return {
        "was_truncated": True,
        "truncated_text": " ".join(tokens[:max_words]),
    }
def load_examples(src, filetypes=None):
    """
    load_examples - a helper function for the gradio module to load examples

    Parameters
    ----------
    src : str or Path, required, directory to load examples from (created if missing)
    filetypes : list of str, optional, file suffixes to keep, default [".txt", ".pdf"]

    Returns
    -------
    list of list, one [text, *settings] row per example file
    """
    # fix: avoid a mutable default argument shared across calls
    if filetypes is None:
        filetypes = [".txt", ".pdf"]
    src = Path(src)
    src.mkdir(parents=True, exist_ok=True)  # parents=True so nested src paths work
    pdf_url = (
        "https://www.dropbox.com/s/y92xy7o5qb88yij/all_you_need_is_attention.pdf?dl=1"
    )
    # NOTE(review): return code is not checked — a failed download is silently
    # ignored, matching the original best-effort behavior
    subprocess.run(["wget", pdf_url, "-O", src / "all_you_need_is_attention.pdf"])
    examples = [f for f in src.iterdir() if f.suffix in filetypes]
    examples = natsorted(examples)
    # load each file's text; the trailing values are demo settings consumed by
    # the gradio caller (presumably model/beam/batch knobs — defined there)
    text_examples = []
    for example in examples:
        with open(example, "r", encoding="utf-8") as f:
            text = f.read()
        text_examples.append([text, "base", 2, 1024, 0.7, 3.5, 3])
    return text_examples
def load_example_filenames(example_path: Union[str, Path]):
    """
    load_example_filenames - a helper function for the gradio module to load examples

    Parameters
    ----------
    example_path : str or Path, required, directory containing .txt example files

    Returns
    -------
    dict, the examples (filename:full path)
    """
    # fix: the original annotation `str or Path` is a boolean expression that
    # evaluates to just `str`; Union expresses the actual contract
    example_path = Path(example_path)
    # map each .txt file's name to its full Path
    return {f.name: f for f in example_path.glob("*.txt")}
def saves_summary(
    summarize_output, outpath: Optional[Union[str, Path]] = None, add_signature=True
):
    """
    saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file

        _summaries = summarize_via_tokenbatches(
            text,
            batch_length=token_batch_length,
            batch_stride=batch_stride,
            **settings,
        )

    Parameters
    ----------
    summarize_output : list of dict, each with a "summary" list of str and a
        "summary_score" float (as produced by summarize_via_tokenbatches)
    outpath : str or Path, optional, destination file; defaults to
        document_summary_<timestamp>.txt in the current working directory
        (fix: the original annotation `str or Path` evaluates to just `str`)
    add_signature : bool, optional, prepend a link to the generating space

    Returns
    -------
    Path, the file the summary was written to
    """
    outpath = (
        Path.cwd() / f"document_summary_{get_timestamp()}.txt"
        if outpath is None
        else Path(outpath)
    )
    sum_text = [s["summary"][0] for s in summarize_output]
    sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
    scores_text = "\n".join(sum_scores)
    full_summary = "\n\t".join(sum_text)

    # fix: write everything through one handle instead of the original
    # open-"w"-then-reopen-"a" sequence; explicit encoding for portability
    with open(outpath, "w", encoding="utf-8") as fo:
        if add_signature:
            fo.write(
                "Generated with the Document Summarization space :) https://hf.co/spaces/pszemraj/document-summarization\n\n"
            )
        fo.write(full_summary)
        fo.write("\n" * 3)
        fo.write("\n\nSection Scores:\n")  # fix: was an f-string with no placeholders
        fo.write(scores_text)
        fo.write("\n\n---\n")
    return outpath
|