File size: 4,411 Bytes
bd1016c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12a9624
bd1016c
 
 
 
 
 
 
12a9624
bd1016c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re
# Load the model and tokenizer
MODEL_NAME = "nebiyu29/fintunned-v2-roberta_GA"


@st.cache_resource
def _load_model_and_tokenizer():
    """Load the tokenizer and classifier once and place the model on a device.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded objects avoids re-reading the checkpoint on each rerun.

    Returns:
        A ``(tokenizer, model, device)`` tuple, where ``device`` is CUDA when
        ``torch.cuda.is_available()`` and CPU otherwise, and ``model`` has
        already been moved to that device.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
    loaded_model = loaded_model.to(target_device)
    return loaded_tokenizer, loaded_model, target_device


# Module-level names used by the rest of the file.
tokenizer, model, device = _load_model_and_tokenizer()



def split_text(text, max_tokens=512):
    """Split ``text`` into segments that each fit in one model input.

    The text is first stripped of every character that is not an ASCII letter
    or whitespace, then tokenized with the module-level ``tokenizer`` and cut
    into consecutive runs of tokens. Each run is converted back to a string.

    Args:
        text: The input to split. Non-string values are coerced with ``str``;
            ``None`` is treated as empty text.
        max_tokens: Upper bound on the encoded length of one segment,
            including the special tokens the tokenizer adds when the segment
            is later encoded. Defaults to 512.

    Returns:
        A list of segment strings in original order; empty when the cleaned
        text produces no tokens.
    """
    # Coerce before the regex: re.sub raises TypeError on non-string input.
    raw = "" if text is None else str(text)
    cleaned = re.sub(r'[^a-zA-Z\s]', '', raw)
    tokens = tokenizer.tokenize(cleaned)

    # Leave room for the special tokens added at encoding time so a full
    # segment does not exceed max_tokens once it is re-encoded.
    chunk_size = max(1, max_tokens - tokenizer.num_special_tokens_to_add())

    # Slice by position. Comparing each token's value with the last token's
    # value would cut a segment early whenever that token is repeated.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

def classify(text):
    """Classify ``text`` and report the three highest-scoring labels.

    The text is split with ``split_text``; every segment is run through the
    module-level ``model``, the per-segment logits are averaged, and a softmax
    turns the average into probabilities.

    Args:
        text: The text to classify.

    Returns:
        A dict with keys ``"sequence"`` (the original ``text``),
        ``"top_labels"`` (up to three label names, best first) and
        ``"top_probabilities"`` (the matching probabilities). Both lists are
        empty when ``split_text`` yields no segments.
    """
    # NOTE(review): this assumes output index i of the model corresponds to
    # labels[i]; model.config.id2label is the authoritative mapping -- confirm.
    labels = ["depression", "anxiety", "bipolar disorder", "schizophrenia", "PTSD", "OCD", "ADHD", "autism", "eating disorder", "personality disorder", "phobia"]

    segments = split_text(text)
    if not segments:
        # Nothing to score; torch.stack() on an empty list would raise.
        return {"sequence": text, "top_labels": [], "top_probabilities": []}

    # Cap the encoded length. model_max_length can be a huge sentinel value
    # when the tokenizer config does not set it, hence the explicit 512 bound.
    max_length = min(tokenizer.model_max_length, 512)

    logits_list = []
    for segment in segments:
        # Encode only the segment: the label strings are not model input, and
        # only the segment's row of logits is ever used.
        inputs = tokenizer(
            segment,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits_list.append(outputs.logits)

    # Average the logits across segments, then convert to probabilities.
    avg_logits = torch.mean(torch.stack(logits_list), dim=0)
    probabilities = torch.softmax(avg_logits, dim=1)
    # Row 0 is the only example; keep at most one score per known label.
    label_probabilities = probabilities[0, :len(labels)].tolist()

    # Rank indices by probability. Stable sorting keeps the lower index first
    # on ties, and slicing never repeats a label when fewer than 3 exist.
    ranked = sorted(
        range(len(label_probabilities)),
        key=label_probabilities.__getitem__,
        reverse=True,
    )[:3]

    return {
        "sequence": text,
        "top_labels": [labels[i] for i in ranked],
        "top_probabilities": [label_probabilities[i] for i in ranked],
    }

# Streamlit app: read one text input and show the top predictions on demand.
st.title("Text Classification.")
st.write("Enter some text, and the model will classify it.")

text_input = st.text_input("Text Input")
if st.button("Classify"):
    if not text_input.strip():
        # Blank input has nothing to tokenize, so skip the model call.
        st.warning("Please enter some text to classify.")
    else:
        # classify() returns a single dict. Iterating it would yield its
        # string keys, and indexing those with 'top_labels' raises TypeError.
        predictions = classify(text_input)
        st.write(f"Label: {list(predictions['top_labels'])}")
        st.write(f"Probability: {predictions['top_probabilities']}")