File size: 3,259 Bytes
6bc94ac
436ce71
 
 
abca9bf
6bc94ac
db5ef00
aafa95b
6bc94ac
 
 
 
 
 
 
 
436ce71
 
 
abca9bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436ce71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aafa95b
436ce71
 
aafa95b
 
 
 
 
db5ef00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5beab45
db5ef00
 
 
 
 
 
 
aafa95b
db5ef00
 
436ce71
db5ef00
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
import spacy
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel
from unlimiformer import Unlimiformer, UnlimiformerArguments
import streamlit as st
from urllib.request import Request, urlopen, HTTPError
from bs4 import BeautifulSoup


def hide_footer():
    """Inject a CSS rule into the page that makes the ``footer`` element invisible."""
    # unsafe_allow_html is needed so the <style> tag is rendered as markup
    # rather than shown as literal text.
    st.markdown(
        """
            <style>
            footer {visibility: hidden;}
            </style>
            """,
        unsafe_allow_html=True,
    )

@st.cache_resource
def get_seq2seq_model(model_id, use_unlimiformer=True, _tokenizer=None):
    """Load a sequence-to-sequence model, optionally converted by Unlimiformer.

    Args:
        model_id: Identifier handed to ``AutoModelForSeq2SeqLM.from_pretrained``.
        use_unlimiformer: When truthy, the loaded model is passed through
            ``Unlimiformer.convert_model`` configured from the default
            ``UnlimiformerArguments`` values; otherwise the plain model is
            returned.
        _tokenizer: Tokenizer forwarded to the conversion as ``tokenizer``.
            The leading underscore follows Streamlit's convention for
            arguments that should not take part in the cache key.

    Returns:
        The plain model, or whatever ``Unlimiformer.convert_model`` returns.
    """
    base_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    if not use_unlimiformer:
        return base_model

    # Every option comes from the library defaults, except the tokenizer
    # which is supplied by the caller.
    args = UnlimiformerArguments()
    conversion_options = dict(
        layer_begin=args.layer_begin,
        layer_end=args.layer_end,
        unlimiformer_head_num=args.unlimiformer_head_num,
        exclude_attention=args.unlimiformer_exclude,
        chunk_overlap=args.unlimiformer_chunk_overlap,
        # The encoder window length is taken from the default chunk size.
        model_encoder_max_len=args.unlimiformer_chunk_size,
        verbose=args.unlimiformer_verbose,
        tokenizer=_tokenizer,
        unlimiformer_training=args.unlimiformer_training,
        use_datastore=args.use_datastore,
        flat_index=args.flat_index,
        test_datastore=args.test_datastore,
        reconstruct_embeddings=args.reconstruct_embeddings,
        gpu_datastore=args.gpu_datastore,
        gpu_index=args.gpu_index,
    )
    return Unlimiformer.convert_model(base_model, **conversion_options)

@st.cache_resource
def get_causal_model(model_id):
    """Load the causal language model identified by ``model_id``.

    Decorated with ``st.cache_resource`` so the loaded model can be reused
    across calls with the same ``model_id``.
    """
    # NOTE(review): trust_remote_code=True permits code shipped with the
    # checkpoint to run while loading; only use model ids from trusted sources.
    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
    )
    return causal_lm

@st.cache_resource
def get_auto_model(model_id):
    """Load the model identified by ``model_id`` through ``AutoModel``.

    Decorated with ``st.cache_resource`` so the loaded model can be reused
    across calls with the same ``model_id``.
    """
    auto_model = AutoModel.from_pretrained(model_id)
    return auto_model

@st.cache_resource
def get_tokenizer(model_id):
    """Load the tokenizer identified by ``model_id`` through ``AutoTokenizer``.

    Decorated with ``st.cache_resource`` so the loaded tokenizer can be reused
    across calls with the same ``model_id``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return tokenizer

@st.cache_data
def get_celeb_data(fpath):
    """Read the JSON file at ``fpath`` (decoded as UTF-8) and return its parsed content."""
    with open(fpath, encoding='UTF-8') as handle:
        parsed = json.load(handle)
    return parsed

def get_article(url):
    """Download a web page and return the text of its ``topic-paragraph`` paragraphs.

    The page is requested with a browser-like ``User-Agent`` header, parsed
    with BeautifulSoup's ``html.parser`` and reduced to the text of every
    ``<p class="topic-paragraph">`` element, joined by single spaces.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text. An empty string is returned when the page could
        not be fetched or parsed (a notice is then shown via ``st.markdown``),
        and also when the page contains no matching paragraphs.
    """
    try:
        # Built inside the ``try`` because ``Request`` raises ``ValueError``
        # for an empty or malformed URL; that case should be reported like
        # any other failed fetch instead of crashing the app.
        req = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})

        # The context manager closes the HTTP response even if reading fails.
        with urlopen(req) as response:
            html = response.read()
        soup = BeautifulSoup(html, features="html.parser")

        # Remove script and style elements so their content can never leak
        # into the extracted text.
        for tag in soup(["script", "style"]):
            tag.extract()

        lines = [
            para.get_text().strip()
            for para in soup.find_all("p", class_='topic-paragraph')
        ]

        # Split each paragraph on double spaces, trim the pieces, drop the
        # empty ones and join what is left with single spaces.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return ' '.join(chunk for chunk in chunks if chunk)
    except Exception:
        # Deliberately best-effort: any fetch or parse failure degrades to an
        # empty article. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        st.markdown("The internet is not stable.")
        return ""
    
@st.cache_resource
def get_spacy_model(model_id):
    """Load the spaCy pipeline identified by ``model_id``.

    Decorated with ``st.cache_resource`` so the loaded pipeline can be reused
    across calls with the same ``model_id``.
    """
    nlp = spacy.load(model_id)
    return nlp

def preprocess_text(name, text:str, model_id):
    """Split ``text`` into sentences with the spaCy pipeline for ``model_id``.

    Args:
        name: Accepted for the caller's benefit; not used in this function.
        text: Raw text to segment.
        model_id: Identifier passed to ``get_spacy_model``.

    Returns:
        A ``(pipeline, sentences)`` tuple: the spaCy pipeline that was used
        and the list of sentence strings with surrounding whitespace removed.
    """
    nlp = get_spacy_model(model_id)
    doc = nlp(text)
    sentences = [span.text.strip() for span in doc.sents]
    return nlp, sentences