File size: 5,026 Bytes
aea9d52
c4dd8e7
aea9d52
 
 
 
2f6ade8
 
 
aea9d52
 
 
 
2f6ade8
 
 
 
 
 
 
49ce6a9
 
2f6ade8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49ce6a9
 
 
 
 
 
 
 
e328eaa
 
74f91bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e328eaa
74f91bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e328eaa
74f91bd
 
 
 
 
e328eaa
74f91bd
 
2f6ade8
74f91bd
2f6ade8
74f91bd
 
 
 
 
e328eaa
74f91bd
e328eaa
74f91bd
e328eaa
2f6ade8
 
 
 
 
 
 
 
 
aea9d52
 
 
 
 
 
 
dcab95f
aea9d52
 
501f411
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import gradio as gr

# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
import transformers

tokenizer = AutoTokenizer.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")
model = AutoModelForSequenceClassification.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")


# Load the model and tokenizer
# model = transformers.AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

# tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/bart-large-mnli")

# Define a function to split a text into segments of 512 tokens
def split_text(text):
    #this prints progress
    print("going to split the text")
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Initialize an empty list for segments
    segments = []
    # Initialize an empty list for current segment
    current_segment = []
    # Initialize a counter for tokens
    token_count = 0
    # Loop through the tokens
    for token in tokens:
        # Add the token to the current segment
        current_segment.append(token)
        # Increment the token count
        token_count += 1
        # If the token count reaches 512 or the end of the text, add the current segment to the segments list
        if token_count == 512 or token == tokens[-1]:
            # Convert the current segment to a string and add it to the segments list
            segments.append(tokenizer.convert_tokens_to_string(current_segment))
            # Reset the current segment and the token count
            current_segment = []
            token_count = 0
    # Return the segments list
    return segments

# Define a function to extract predictions from model output (adjust as needed)
def extract_predictions(outputs):
    # Assuming outputs contain logits and labels (adapt based on your model's output format)
    logits = outputs.logits
    probs = logits.softmax(dim=1)
    preds = torch.argmax(probs, dim=1)
    return probs, preds  # Return all probabilities and predicted labels
    
# a function that classifies text

# def classify_text(text):
#     # Define labels
#     labels = ["depression", "anxiety", "bipolar disorder", "schizophrenia", "PTSD", "OCD", "ADHD", "autism", "eating disorder", "personality disorder", "phobia"]

#     # Split text into segments using split_text
#     segments = split_text(text)

#     # Initialize empty list for predictions
#     predictions = []

#     # Move device to GPU if available
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model = model.to(device)

#     # Loop through segments, process, and store predictions
#     for segment in segments:
#         inputs = tokenizer([segment], padding=True, return_tensors="pt")
#         input_ids = inputs["input_ids"].to(device)
#         attention_mask = inputs["attention_mask"].to(device)

#         with torch.no_grad():
#             outputs = model(input_ids, attention_mask=attention_mask)

#         # Extract predictions for each segment
#         probs, preds = extract_predictions(outputs)  # Define this function based on your model's output

#         # Append predictions for this segment
#         predictions.append({
#             "segment_text": segment,
#             "label": preds[0],  # Assuming single label prediction
#             "probability": probs[preds[0]]  # Access probability for the predicted label
#         })

def classify_text(text):
    
 
  segments=split_text(text)

  predictions = []
  for segment in segments:
    inputs = tokenizer([segment], padding=True, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
      outputs = model(input_ids, attention_mask=attention_mask)

    probs, preds = extract_predictions(outputs)

    predictions.append({
      "segment_text": segment,
      "label": model.config.id2label[preds[0]],  # assuming single label prediction
      "probability": probs[preds[0]]
    })

  return predictions

        

# def classify_text(text):
#   """
#   This function preprocesses, feeds text to the model, and outputs the predicted class.
#   """
#   inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
#   outputs = model(**inputs)
#   logits = outputs.logits  # Access logits instead of pipeline output
#   predictions = torch.argmax(logits, dim=-1)  # Apply argmax for prediction
#   return model.config.id2label[predictions.item()]  # Map index to class label

interface = gr.Interface(
    fn=classify_text,
    inputs="text",
    outputs="text",
    title="Text Classification Demo",
    description="Enter some text, and the model will classify it.",
    #choices=["depression", "anxiety", "bipolar disorder", "schizophrenia", "PTSD", "OCD", "ADHD", "autism", "eating disorder", "personality disorder", "phobia"]  # Adjust class names
)

#interface.launch()