"""Streamlit demo: classify free text with a fine-tuned RoBERTa checkpoint.

Long inputs are split into token-bounded segments and every segment is
classified independently.
"""

import re

import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# NOTE(review): Streamlit re-executes this script on every interaction, so the
# checkpoint is reloaded each time; consider wrapping the loading in
# `st.cache_resource` if the deployed Streamlit version provides it.
tokenizer = AutoTokenizer.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")
model = AutoModelForSequenceClassification.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Upper bound on the encoded sequence length (special tokens included).
# `model_max_length` can be a huge sentinel when the checkpoint does not set
# it, hence the cap at 512.
MAX_MODEL_TOKENS = min(tokenizer.model_max_length, 512)

# Content tokens allowed per segment: the tokenizer adds special tokens when a
# segment is re-encoded, so room must be left for them.
MAX_SEGMENT_TOKENS = MAX_MODEL_TOKENS - tokenizer.num_special_tokens_to_add(pair=False)

# Everything that is not an ASCII letter or whitespace is dropped before
# tokenization.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def split_text(text, max_tokens=MAX_SEGMENT_TOKENS):
    """Split *text* into segments of at most *max_tokens* tokens each.

    The text is first stripped of every character that is not an ASCII letter
    or whitespace, then tokenized, and the token list is cut into consecutive
    chunks which are converted back to strings.

    Args:
        text: Raw input text.
        max_tokens: Maximum number of tokens per segment (special tokens
            excluded).

    Returns:
        A list of segment strings in original order; empty when the cleaned
        text yields no tokens.

    Raises:
        ValueError: If *max_tokens* is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    cleaned = _NON_LETTER_RE.sub('', text)
    tokens = tokenizer.tokenize(cleaned)

    # Slice by position. The previous implementation detected the end of the
    # text with `token == tokens[-1]`, which also matched any earlier token
    # equal to the last one and flushed segments prematurely.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


def extract_predictions(outputs):
    """Turn model outputs into class probabilities and predicted class ids.

    Args:
        outputs: Model output object exposing a `logits` tensor; softmax and
            argmax are taken over dimension 1.

    Returns:
        A `(probs, preds)` tuple: the softmax of the logits and the index of
        the highest-probability class for each row.
    """
    logits = outputs.logits
    probs = logits.softmax(dim=1)
    preds = torch.argmax(probs, dim=1)
    return probs, preds


def classify_text(text):
    """Classify *text* segment by segment.

    Args:
        text: Raw input text of any length.

    Returns:
        A list with one dict per segment, holding the keys `"segment_text"`,
        `"label"` (looked up in `model.config.id2label`) and `"probability"`
        (probability of the predicted label). Empty when the text yields no
        segments.
    """
    predictions = []

    for segment in split_text(text):
        # Truncation is a safety net: re-tokenizing the decoded segment is not
        # guaranteed to reproduce the exact token count it was built from.
        inputs = tokenizer(
            [segment],
            padding=True,
            truncation=True,
            max_length=MAX_MODEL_TOKENS,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        probs, preds = extract_predictions(outputs)
        pred_index = preds[0].item()

        predictions.append({
            "segment_text": segment,
            "label": model.config.id2label[pred_index],
            "probability": probs[0][pred_index].item(),
        })

    return predictions


# Streamlit app
st.title("Text Classification Demo")
st.write("Enter some text, and the model will classify it.")

text_input = st.text_input("Text Input")

if st.button("Classify"):
    predictions = classify_text(text_input)

    # Empty input, or input made only of characters removed during cleaning,
    # produces no segments; say so instead of rendering nothing.
    if not predictions:
        st.warning("Please enter some text containing letters to classify.")

    for prediction in predictions:
        st.write(f"Segment Text: {prediction['segment_text']}")
        st.write(f"Label: {prediction['label']}")
        st.write(f"Probability: {prediction['probability']}")