Spaces:

nebiyu29
/

good_acc_v2

Sleeping

App Files Files Community

good_acc_v2 / app.py

nebiyu29

added regex expression

6042393 verified 9 months ago

raw

history blame contribute delete

3.44 kB

	import streamlit as st
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch
	import re

	# Hugging Face checkpoint used for both the tokenizer and the classifier.
	MODEL_NAME = "nebiyu29/fintunned-v2-roberta_GA"


	@st.cache_resource
	def _load_model_and_tokenizer(name):
	    """Load the tokenizer and classifier for checkpoint *name* once per process.

	    Streamlit re-executes this script on every user interaction; caching the
	    loader keeps the checkpoint from being re-read (and re-moved to the
	    device) on each button click.

	    Returns:
	        A ``(tokenizer, model, device)`` tuple, with the model already moved
	        to ``device`` (CUDA when available, otherwise CPU).
	    """
	    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
	    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
	    loaded_model = loaded_model.to(target_device)
	    return loaded_tokenizer, loaded_model, target_device


	tokenizer, model, device = _load_model_and_tokenizer(MODEL_NAME)

	# Split a text into segments that fit the model's 512-token input window
	def split_text(text, max_tokens=512):
	    """Strip non-letter characters from *text* and cut it into segments.

	    The text is tokenized with the module-level ``tokenizer`` and the token
	    list is sliced into consecutive chunks; each chunk is converted back to
	    a string.

	    Args:
	        text: The raw input string.
	        max_tokens: Upper bound on the encoded length of one segment,
	            counting the special tokens the tokenizer adds when the segment
	            string is encoded again for the model.

	    Returns:
	        A list of segment strings in original order; empty when *text*
	        yields no tokens.

	    Raises:
	        ValueError: If *max_tokens* leaves no room for any content token.
	    """
	    # Remove unnecessary characters: keep only ASCII letters and whitespace.
	    text = re.sub(r'[^a-zA-Z\s]', '', text)
	    tokens = tokenizer.tokenize(text)

	    # Reserve space for the special tokens added on re-encoding so the
	    # encoded segment does not exceed max_tokens.
	    budget = max_tokens - tokenizer.num_special_tokens_to_add()
	    if budget < 1:
	        raise ValueError(
	            f"max_tokens={max_tokens} leaves no room for content tokens"
	        )

	    # Slice by position. Comparing a token's value with the last token to
	    # detect the end would cut a segment at every repeated occurrence.
	    return [
	        tokenizer.convert_tokens_to_string(tokens[start:start + budget])
	        for start in range(0, len(tokens), budget)
	    ]

	# Turn raw model output into class probabilities and predicted class ids
	def extract_predictions(outputs):
	    """Return ``(probs, preds)`` derived from ``outputs.logits``.

	    ``probs`` is the softmax of the logits along dimension 1 and ``preds``
	    holds, for each row, the index of the highest probability.
	    """
	    class_probs = torch.softmax(outputs.logits, dim=1)
	    predicted_ids = class_probs.argmax(dim=1)
	    return class_probs, predicted_ids

	# Classify a text segment by segment
	def classify_text(text):
	    """Classify *text* one segment at a time.

	    The text is split with ``split_text``; each segment is encoded, run
	    through the module-level ``model`` without gradient tracking, and
	    reduced to its most probable label.

	    Returns:
	        A list with one dict per segment, in order, with the keys
	        ``"segment_text"``, ``"label"`` (looked up in
	        ``model.config.id2label``) and ``"probability"`` (the probability of
	        that label). The list is empty when no segments are produced.
	    """
	    predictions = []

	    for segment in split_text(text):
	        # Truncate as a safety net: a segment that encodes to more than 512
	        # tokens (special tokens included) would otherwise be fed to the
	        # model as-is. NOTE(review): 512 is presumably the input limit of
	        # the RoBERTa-style checkpoint named above; confirm against its config.
	        inputs = tokenizer(
	            [segment],
	            padding=True,
	            truncation=True,
	            max_length=512,
	            return_tensors="pt",
	        )
	        input_ids = inputs["input_ids"].to(device)
	        attention_mask = inputs["attention_mask"].to(device)

	        with torch.no_grad():
	            outputs = model(input_ids, attention_mask=attention_mask)

	        probs, preds = extract_predictions(outputs)
	        # The batch holds a single segment, so row 0 is the only prediction.
	        top_class = preds[0].item()

	        predictions.append({
	            "segment_text": segment,
	            "label": model.config.id2label[top_class],
	            "probability": probs[0][top_class].item(),
	        })

	    return predictions

	# Streamlit app: read a text, classify it on demand, and show one result
	# per segment.
	st.title("Text Classification Demo")
	st.write("Enter some text, and the model will classify it.")

	text_input = st.text_input("Text Input")
	if st.button("Classify"):
	    if not text_input.strip():
	        # Give feedback instead of silently rendering nothing.
	        st.warning("Please enter some text to classify.")
	    else:
	        predictions = classify_text(text_input)
	        if not predictions:
	            # classify_text returns an empty list when the input yields no
	            # segments (e.g. nothing is left after non-letters are stripped).
	            st.warning("No classifiable text was found in the input.")
	        for prediction in predictions:
	            st.write(f"Segment Text: {prediction['segment_text']}")
	            st.write(f"Label: {prediction['label']}")
	            st.write(f"Probability: {prediction['probability']}")