# good_acc_v3 / app.py
import re

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")
model = AutoModelForSequenceClassification.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")

# Run on GPU when available, and switch to inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()
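# Note: classify() below hard-codes its label list; an alternative sketch
# (assuming the fine-tuned config carries a matching label map) would be:
#   labels = [model.config.id2label[i] for i in range(model.config.num_labels)]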
# Split a text into segments of at most 510 tokens each
# (leaving room for the special tokens added when the segment is re-encoded)
def split_text(text):
    # Keep only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    # Tokenize the full text once
    tokens = tokenizer.tokenize(text)
    # Slice the token list into fixed-size chunks and turn each back into a string
    segments = []
    for start in range(0, len(tokens), 510):
        chunk = tokens[start:start + 510]
        segments.append(tokenizer.convert_tokens_to_string(chunk))
    return segments
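# Illustrative sketch of split_text (hypothetical input, not executed at import):
#   >>> split_text("I can't sleep, and I worry all day!")
#   ['I cant sleep and I worry all day']
# Punctuation is stripped by the regex, and a short text comes back as a single
# segment (modulo tokenizer round-tripping).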
def classify(text):
    # Candidate labels, in the order the model's output classes are assumed to follow
    labels = ["depression", "anxiety", "bipolar disorder", "schizophrenia", "PTSD", "OCD", "ADHD", "autism", "eating disorder", "personality disorder", "phobia"]

    # Split the text into segments that fit the model's context window
    segments = split_text(text)
    if not segments:
        # Nothing to classify (e.g. empty or all-punctuation input)
        return {"sequence": text, "top_labels": [], "top_probabilities": []}

    # Collect the logits for each segment
    logits_list = []
    for segment in segments:
        # Encode the segment (truncation is a safety net for edge cases)
        inputs = tokenizer(segment, truncation=True, max_length=512, return_tensors="pt")
        # Move the encoded inputs to the same device as the model
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        # Run the model without tracking gradients
        with torch.no_grad():
            outputs = model(**inputs)
        logits_list.append(outputs.logits)

    # Average the logits across the segments
    avg_logits = torch.mean(torch.stack(logits_list), dim=0)
    # Apply softmax to convert the averaged logits to probabilities
    probabilities = torch.softmax(avg_logits, dim=1)
    # Keep only the probabilities for the labels listed above
    label_probabilities = probabilities[0, :len(labels)].tolist()

    # Pick the three most likely labels and their probabilities
    ranked = sorted(zip(labels, label_probabilities), key=lambda pair: pair[1], reverse=True)
    top_labels = [label for label, _ in ranked[:3]]
    top_probabilities = [prob for _, prob in ranked[:3]]

    # Package the results
    return {
        "sequence": text,
        "top_labels": top_labels,
        "top_probabilities": top_probabilities,
    }
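# The returned dict has this shape (values here are hypothetical):
#   {'sequence': 'I feel hopeless ...',
#    'top_labels': ['depression', 'anxiety', 'PTSD'],
#    'top_probabilities': [0.62, 0.21, 0.08]}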
# Streamlit app
st.title("Text Classification")
st.write("Enter some text, and the model will classify it.")

text_input = st.text_input("Text Input")

# Only classify once the user has entered some text
if text_input:
    predictions = classify(text_input)
    labels_str = ", ".join(predictions["top_labels"])
    probs_str = ", ".join(f"{p:.2f}" for p in predictions["top_probabilities"])
    st.write(f"Labels: {labels_str}")
    st.write(f"Probabilities: {probs_str}")