# Spaces: Sleeping (page-header text captured with the source; not part of the program)
import streamlit as st | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
import torch | |
import re | |
# Hub identifier of the fine-tuned sequence-classification checkpoint.
MODEL_NAME = "nebiyu29/fintunned-v2-roberta_GA"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


@st.cache_resource
def _load_tokenizer_and_model(model_name):
    """Load the tokenizer and classifier for ``model_name`` once.

    Streamlit re-executes this script on every widget interaction. Caching
    the loaded objects keeps them alive across reruns so the checkpoint is
    not loaded again each time the user presses a button.
    NOTE(review): ``st.cache_resource`` needs a reasonably recent Streamlit
    release -- confirm the pinned version provides it.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved to ``device``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return loaded_tokenizer, loaded_model.to(device)


tokenizer, model = _load_tokenizer_and_model(MODEL_NAME)
# Split a text into segments that fit the model's input budget
def split_text(text, max_tokens=512):
    """Clean ``text`` and cut it into consecutive token-bounded segments.

    Every character that is not an ASCII letter or whitespace is removed
    first. The cleaned text is tokenized with the module-level ``tokenizer``
    and the token list is sliced by position into consecutive chunks, each
    converted back to a string.

    Chunks are cut by index rather than by comparing each token with the
    last one: a token value can occur many times in a text, and a
    value-based end-of-text check would close a segment at each occurrence.

    Args:
        text: Raw input string.
        max_tokens: Total token budget per segment, including the special
            tokens the tokenizer adds when the segment is encoded later.

    Returns:
        A list of segment strings; empty when nothing remains after cleaning.
    """
    # Remove unnecessary characters (anything but ASCII letters/whitespace).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = tokenizer.tokenize(text)

    # Leave room for the special tokens added at encoding time so that an
    # encoded segment does not exceed ``max_tokens``.
    chunk_size = max(1, max_tokens - tokenizer.num_special_tokens_to_add())

    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
# Turn raw model output into class probabilities and predicted class indices
def extract_predictions(outputs):
    """Derive probabilities and predicted class indices from ``outputs``.

    Args:
        outputs: Object exposing a ``logits`` tensor; softmax and argmax are
            both taken along dimension 1.

    Returns:
        A ``(probs, preds)`` tuple: the softmax of the logits along
        dimension 1, and the index of the largest probability along that
        same dimension.
    """
    class_probabilities = torch.softmax(outputs.logits, dim=1)
    predicted_indices = class_probabilities.argmax(dim=1)
    return class_probabilities, predicted_indices
# Classify a text segment by segment
def classify_text(text, max_length=512):
    """Classify ``text`` one segment at a time.

    The text is split with ``split_text``. Each segment is encoded, run
    through the module-level ``model`` without gradient tracking, and
    reduced to its most probable label.

    Encoding truncates to ``max_length``: the tokenizer may add special
    tokens on top of a segment's own tokens, and re-tokenizing a decoded
    segment is not guaranteed to give the same token count, so a segment
    could otherwise exceed the length the model accepts.

    Args:
        text: Raw input string.
        max_length: Upper bound on the encoded length of a single segment.

    Returns:
        A list with one dict per segment, holding ``"segment_text"``,
        ``"label"`` (looked up in ``model.config.id2label``) and
        ``"probability"`` (the probability of that label). The list is
        empty when ``split_text`` yields no segments.
    """
    predictions = []
    for segment in split_text(text):
        inputs = tokenizer(
            [segment],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        probs, preds = extract_predictions(outputs)
        # The batch holds exactly one segment, so row 0 is its prediction.
        label_id = preds[0].item()
        predictions.append({
            "segment_text": segment,
            "label": model.config.id2label[label_id],
            "probability": probs[0][label_id].item(),
        })
    return predictions
# Streamlit app: read a text, classify it on demand, and render one result
# group per segment.
st.title("Text Classification Demo")
st.write("Enter some text, and the model will classify it.")
text_input = st.text_input("Text Input")
if st.button("Classify"):
    # Skip the model entirely for blank input.
    predictions = classify_text(text_input) if text_input.strip() else []
    if not predictions:
        # Covers blank input as well as input that the cleaning step in
        # split_text reduces to nothing; without this the page would stay
        # silently empty after the button press.
        st.warning("Nothing to classify: please enter some text containing letters.")
    for prediction in predictions:
        st.write(f"Segment Text: {prediction['segment_text']}")
        st.write(f"Label: {prediction['label']}")
        st.write(f"Probability: {prediction['probability']}")