import json
import re
from urllib.request import Request, urlopen

import spacy
import streamlit as st
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel

# Pronoun patterns for rewriting third-person text into first person:
# subject/object/reflexive forms map to "I", possessive forms map to "my".
# "her" is always treated as a possessive here, a deliberate simplification,
# since it also serves as an object pronoun.
he_regex = re.compile(r'\b(he|him|himself)\b', flags=re.IGNORECASE)
his_regex = re.compile(r'\b(his)\b', flags=re.IGNORECASE)
she_regex = re.compile(r'\b(she|herself)\b', flags=re.IGNORECASE)
her_regex = re.compile(r'\b(her)\b', flags=re.IGNORECASE)


def hide_footer():
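    """Hide Streamlit's default footer by injecting CSS into the page."""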
    hide_st_style = """
            <style>
            footer {visibility: hidden;}
            </style>
            """
    st.markdown(hide_st_style, unsafe_allow_html=True)
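
# The loaders below are cached: st.cache_resource for unserializable objects
# (models, tokenizers, spaCy pipelines), st.cache_data for plain data
# (JSON contents, scraped text).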

@st.cache_resource
def get_seq2seq_model(model_id):
    """Load and cache an encoder-decoder LM by Hub model id."""
    return AutoModelForSeq2SeqLM.from_pretrained(model_id)

@st.cache_resource
def get_causal_model(model_id):
    """Load and cache a decoder-only LM; trust_remote_code permits custom
    modeling code shipped with the Hub repo."""
    return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

@st.cache_resource
def get_auto_model(model_id):
    """Load and cache a bare encoder model (no task head)."""
    return AutoModel.from_pretrained(model_id)

@st.cache_resource
def get_tokenizer(model_id):
    """Load and cache the tokenizer matching a Hub model id."""
    return AutoTokenizer.from_pretrained(model_id)
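
# Example usage (the Hub id below is illustrative, not fixed by this file):
#   tokenizer = get_tokenizer("google/flan-t5-base")
#   model = get_seq2seq_model("google/flan-t5-base")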

@st.cache_data
def get_celeb_data(fpath):
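    """Read and cache the JSON data file at `fpath`."""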
    with open(fpath, encoding='UTF-8') as json_file:
        return json.load(json_file)

@st.cache_data
def get_article(url):
    """Fetch `url` and return the concatenated text of its
    <p class="topic-paragraph"> elements."""
    req = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read()
    soup = BeautifulSoup(html, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    lines = []

    # get text
    for para in soup.find_all("p", class_='topic-paragraph'):
        lines.append(para.get_text().strip())

    # split on double spaces left over from the markup
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop empty chunks and join everything into a single string
    text = ' '.join(chunk for chunk in chunks if chunk)
    return text


@st.cache_resource
def preprocess_text(name, gender, text, model_id):
    """Rewrite a third-person biography of `name` into first person and
    split it into sentences with the spaCy pipeline named by `model_id`."""
    lname = name.split(" ")[-1]
    # Escape the names so regex metacharacters in them cannot break the
    # patterns; the curly apostrophe matches the possessive form as it
    # appears in the scraped text.
    lnames_regex = re.compile(rf'\b({re.escape(lname)}’s)\b')
    names_regex = re.compile(rf'\b({re.escape(name)}’s)\b')
    if gender == "M":
        text = he_regex.sub("I", text)
        text = his_regex.sub("my", text)
    elif gender == "F":
        text = she_regex.sub("I", text)
        text = her_regex.sub("my", text)
    # Replace the possessive full name before the possessive last name.
    text = names_regex.sub("my", text)
    text = lnames_regex.sub("my", text)
    spacy_model = spacy.load(model_id)
    texts = [sent.text.strip() for sent in spacy_model(text).sents]
    return spacy_model, texts
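
# Minimal end-to-end sketch (the name, URL, and spaCy model id are
# illustrative assumptions, not values fixed by this file):
#   article = get_article("https://www.britannica.com/biography/Ada-Lovelace")
#   nlp, sentences = preprocess_text("Ada Lovelace", "F", article, "en_core_web_sm")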