import json
import re
from urllib.request import Request, urlopen

import spacy
import streamlit as st
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoModel

# Pronoun patterns for rewriting third-person text into first person:
# subject/object/reflexive forms map to "I", possessive forms map to "my".
# "her" is always treated as a possessive here, a deliberate simplification,
# since it also serves as an object pronoun.
he_regex = re.compile(r'\b(he|him|himself)\b', flags=re.IGNORECASE)
his_regex = re.compile(r'\b(his)\b', flags=re.IGNORECASE)
she_regex = re.compile(r'\b(she|herself)\b', flags=re.IGNORECASE)
her_regex = re.compile(r'\b(her)\b', flags=re.IGNORECASE)


def hide_footer():
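    """Hide Streamlit's default footer by injecting CSS into the page."""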
    hide_st_style = """
            <style>
            footer {visibility: hidden;}
            </style>
            """
    st.markdown(hide_st_style, unsafe_allow_html=True)
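
# The loaders below are cached: st.cache_resource for unserializable objects
# (models, tokenizers, spaCy pipelines), st.cache_data for plain data
# (JSON contents, scraped text).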

@st.cache_resource
def get_seq2seq_model(model_id):
    """Load and cache an encoder-decoder LM by Hub model id."""
    return AutoModelForSeq2SeqLM.from_pretrained(model_id)

@st.cache_resource
def get_causal_model(model_id):
    """Load and cache a decoder-only LM; trust_remote_code permits custom
    modeling code shipped with the Hub repo."""
    return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

@st.cache_resource
def get_auto_model(model_id):
    """Load and cache a bare encoder model (no task head)."""
    return AutoModel.from_pretrained(model_id)

@st.cache_resource
def get_tokenizer(model_id):
    """Load and cache the tokenizer matching a Hub model id."""
    return AutoTokenizer.from_pretrained(model_id)
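
# Example usage (the Hub id below is illustrative, not fixed by this file):
#   tokenizer = get_tokenizer("google/flan-t5-base")
#   model = get_seq2seq_model("google/flan-t5-base")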

@st.cache_data
def get_celeb_data(fpath):
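    """Read and cache the JSON data file at `fpath`."""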
    with open(fpath, encoding='UTF-8') as json_file:
        return json.load(json_file)

@st.cache_data
def get_article(url):
    """Fetch `url` and return the concatenated text of its
    <p class="topic-paragraph"> elements."""
    req = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read()
    soup = BeautifulSoup(html, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    lines = []

    # get text
    for para in soup.find_all("p", class_='topic-paragraph'):
        lines.append(para.get_text().strip())

    # split on double spaces left over from the markup
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop empty chunks and join everything into a single string
    text = ' '.join(chunk for chunk in chunks if chunk)
    return text


@st.cache_resource
def preprocess_text(name, gender, text, model_id):
    """Rewrite a third-person biography of `name` into first person and
    split it into sentences with the spaCy pipeline named by `model_id`."""
    lname = name.split(" ")[-1]
    # Escape the names so regex metacharacters in them cannot break the
    # patterns; the curly apostrophe matches the possessive form as it
    # appears in the scraped text.
    lnames_regex = re.compile(rf'\b({re.escape(lname)}’s)\b')
    names_regex = re.compile(rf'\b({re.escape(name)}’s)\b')
    if gender == "M":
        text = he_regex.sub("I", text)
        text = his_regex.sub("my", text)
    elif gender == "F":
        text = she_regex.sub("I", text)
        text = her_regex.sub("my", text)
    # Replace the possessive full name before the possessive last name.
    text = names_regex.sub("my", text)
    text = lnames_regex.sub("my", text)
    spacy_model = spacy.load(model_id)
    texts = [sent.text.strip() for sent in spacy_model(text).sents]
    return spacy_model, texts
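
# Minimal end-to-end sketch (the name, URL, and spaCy model id are
# illustrative assumptions, not values fixed by this file):
#   article = get_article("https://www.britannica.com/biography/Ada-Lovelace")
#   nlp, sentences = preprocess_text("Ada Lovelace", "F", article, "en_core_web_sm")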