# NOTE(review): the original capture began with scraped Hugging Face Spaces page
# chrome ("Spaces:" / "Build error" x2) that is not part of the program source;
# replaced with this comment so the module parses.
"""Streamlit app: classify ESG-report sentences with ESGBERT/ClimateBERT models."""
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import spacy
from tika import parser
import requests
import pandas as pd

# spaCy sentence splitter, loaded once at module import time
# (deliberately outside any Streamlit cache, per the original comment).
nlp = spacy.load("en_core_web_sm")
def load_environmental_model():
    """Return a text-classification pipeline for ESGBERT's environmental model."""
    checkpoint = "ESGBERT/EnvironmentalBERT-environmental"
    return pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
def load_social_model():
    """Return a text-classification pipeline for ESGBERT's social model."""
    checkpoint = "ESGBERT/SocialBERT-social"
    return pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
def load_governance_model():
    """Return a text-classification pipeline for ESGBERT's governance model."""
    checkpoint = "ESGBERT/GovernanceBERT-governance"
    return pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
def load_sentiment_model():
    """Return a text-classification pipeline for ClimateBERT's climate-sentiment model.

    Returns:
        A transformers text-classification pipeline that labels text as
        climate risk / opportunity / neutral.
    """
    model_name = "climatebert/distilroberta-base-climate-sentiment"
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # BUG FIX: the original passed `max_len=512`, which is the long-removed
    # pre-v3 kwarg name and is silently ignored by transformers v4; the
    # documented parameter for capping tokenized sequence length is
    # `model_max_length`.
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)
# ---- Streamlit UI ----
st.title("ESG Report Classification using Natural Language Processing")

# Report location supplied by the user.
url = st.text_input("Enter the URL of the report (PDF):")

# Explain what each model option produces, then offer the choice.
st.write("Environmental Model, Social Model, Governance Model would give the percentage denoting the parameter chosen.")
st.write("Sentiment Model shows if the company is a risk or opportunity based on all 3 parameters.")
model_options = ["Environmental Model", "Social Model", "Governance Model", "Sentiment Model"]
selected_model = st.selectbox("Select Model", model_options)
if url:
    # Download the PDF from the user-supplied URL.
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        # Extract raw text from the PDF via Apache Tika.
        # BUG FIX: Tika's parsed dict maps 'content' to None when no text can
        # be extracted; the original then crashed inside nlp(None). Guard it
        # and surface a readable error instead.
        raw_text = parser.from_buffer(response.content).get("content")
        if not raw_text:
            st.error("Could not extract any text from the PDF at the provided URL.")
        else:
            # Sentence-split with spaCy, then clean up: strip embedded
            # newlines and keep only non-empty sentences that start with an
            # uppercase letter (filters headers/fragments from PDF extraction).
            doc = nlp(raw_text)
            sentences = [sent.text.replace("\n", "") for sent in doc.sents]
            sentences = [s for s in sentences if s and s[0].isupper()]
            # Cap the workload at the first 100 sentences.
            sub_sentences = sentences[:100]

            # Pick the classifier matching the user's selection.
            if selected_model == "Environmental Model":
                pipe_model = load_environmental_model()
            elif selected_model == "Social Model":
                pipe_model = load_social_model()
            elif selected_model == "Governance Model":
                pipe_model = load_governance_model()
            else:
                pipe_model = load_sentiment_model()

            # Classify the sentences and display a per-label count table.
            model_results = pipe_model(sub_sentences, padding=True, truncation=True)
            model_labels = [x["label"] for x in model_results]
            st.subheader(f"{selected_model} Sentences Count")
            st.write(
                pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels})
                .groupby(selected_model)
                .count()
            )
    else:
        st.error("Error fetching PDF content from the provided URL. Please check the URL and try again.")