# CelebChat / utils.py
# (Repository-viewer residue from the page this file was copied from: "new commits", "history blame", "2.6 kB".)
import re
import spacy
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel
import streamlit as st
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
# Case-insensitive, whole-word patterns for third-person pronouns.
# Subject/object/reflexive forms and possessive forms are compiled separately,
# in the order: masculine forms, masculine possessive, feminine forms,
# feminine possessive.
he_regex, his_regex, she_regex, her_regex = (
    re.compile(pronoun_pattern, re.IGNORECASE)
    for pronoun_pattern in (
        r'\b(he|him|himself)\b',
        r'\b(his)\b',
        r'\b(she|herself)\b',
        r'\b(her)\b',
    )
)
def hide_footer():
    """Hide the page's ``<footer>`` element by injecting a CSS rule.

    The rule is sent through ``st.markdown`` with ``unsafe_allow_html=True``.
    """
    # The CSS has to be wrapped in a <style> element and the string literal
    # has to be closed; without either, the rule is not applied as a
    # stylesheet.
    hide_st_style = """
        <style>
        footer {visibility: hidden;}
        </style>
    """
    st.markdown(hide_st_style, unsafe_allow_html=True)
def get_seq2seq_model(model_id):
    """Load a pretrained sequence-to-sequence model.

    Args:
        model_id: Identifier handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.

    Returns:
        The object returned by ``from_pretrained`` for ``model_id``.
    """
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    return seq2seq_model
def get_causal_model(model_id):
    """Load a pretrained causal language model.

    Args:
        model_id: Identifier handed to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The object returned by ``from_pretrained`` for ``model_id``.
    """
    # NOTE(review): trust_remote_code=True is forwarded unconditionally;
    # presumably this allows code shipped with the model to run, so confirm
    # that callers only pass trusted model ids.
    load_options = {"trust_remote_code": True}
    causal_model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)
    return causal_model
def get_auto_model(model_id):
    """Load a pretrained model through the generic ``AutoModel`` class.

    Args:
        model_id: Identifier handed to ``AutoModel.from_pretrained``.

    Returns:
        The object returned by ``from_pretrained`` for ``model_id``.
    """
    auto_model = AutoModel.from_pretrained(model_id)
    return auto_model
def get_tokenizer(model_id):
    """Load the pretrained tokenizer for a model.

    Args:
        model_id: Identifier handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The object returned by ``from_pretrained`` for ``model_id``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return tokenizer
def get_celeb_data(fpath):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        fpath: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON document.
    """
    with open(fpath, encoding='UTF-8') as source:
        raw_json = source.read()
    return json.loads(raw_json)
def get_article(url):
    """Download a web page and return the text of its topic paragraphs.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text of every ``<p class="topic-paragraph">`` element, with the
        pieces joined by single spaces. An empty string when the page
        contains no such paragraph.
    """
    # The URL must be handed to Request; a browser-like User-Agent is sent
    # with it (presumably because the target site rejects urllib's default
    # one -- TODO confirm).
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(req) as response:
        html = response.read()
    soup = BeautifulSoup(html, features="html.parser")

    # Remove all script and style elements from the tree.
    for script in soup(["script", "style"]):
        script.extract()

    # Collect the text of each topic paragraph; without this step `lines`
    # stays empty and the function can only ever return "".
    lines = [
        para.get_text()
        for para in soup.find_all("p", class_='topic-paragraph')
    ]

    # Split every paragraph on spaces, trim each piece and drop the empty
    # ones, then re-join everything with single spaces.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    return ' '.join(chunk for chunk in chunks if chunk)
def preprocess_text(name, gender, text, model_id):
    """Rewrite third-person text about a person into first person and split it into sentences.

    Args:
        name: Full name of the person the text is about. The last
            space-separated word is treated as the surname.
        gender: ``"M"`` or ``"F"`` selects which pronoun set is rewritten;
            any other value leaves pronouns untouched.
        text: The text to rewrite.
        model_id: Name or path handed to ``spacy.load``.

    Returns:
        A ``(spacy_model, texts)`` tuple: the loaded spaCy pipeline and the
        list of stripped sentence strings it produced from the rewritten
        text.
    """
    # Pronouns: subject/object/reflexive forms become "I", possessives "my".
    if gender == "M":
        text = he_regex.sub("I", text)
        text = his_regex.sub("my", text)
    elif gender == "F":
        text = she_regex.sub("I", text)
        text = her_regex.sub("my", text)

    # Possessive forms of the name ("<name>’s") become "my". The full name is
    # handled before the bare surname so the longer form is replaced as a unit.
    full_name = name.strip()
    surname = full_name.split(" ")[-1]
    for owner in (full_name, surname):
        if not owner:
            # An empty owner would produce the pattern \b(’s)\b, which
            # rewrites every possessive in the text.
            continue
        # Escape the name so characters such as "." or "(" are matched
        # literally instead of being interpreted as regex syntax.
        escaped = re.escape(owner + "’s")
        text = re.sub(rf'\b({escaped})\b', "my", text)

    # NOTE: the pipeline is loaded on every call and handed back to the
    # caller together with the sentences.
    spacy_model = spacy.load(model_id)
    texts = [sentence.text.strip() for sentence in spacy_model(text).sents]
    return spacy_model, texts