# Load the model and tokenizer
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")
model = AutoModelForSequenceClassification.from_pretrained("nebiyu29/fintunned-v2-roberta_GA")

# Alternative zero-shot baseline:
# model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
# Define a function to split a text into segments of at most 512 tokens
def split_text(text):
    # This prints progress
    print("going to split the text")
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Initialize an empty list for segments
    segments = []
    # Initialize an empty list for the current segment
    current_segment = []
    # Initialize a counter for tokens
    token_count = 0
    # Loop through the tokens
    for token in tokens:
        # Add the token to the current segment
        current_segment.append(token)
        # Increment the token count
        token_count += 1
        # Once the segment reaches 512 tokens, flush it to the segments list
        if token_count == 512:
            # Convert the current segment to a string and add it to the segments list
            segments.append(tokenizer.convert_tokens_to_string(current_segment))
            # Reset the current segment and the token count
            current_segment = []
            token_count = 0
    # Flush any remaining tokens as the final segment
    if current_segment:
        segments.append(tokenizer.convert_tokens_to_string(current_segment))
    # Return the segments list
    return segments
# Define a function to extract predictions from model output (adjust as needed)
def extract_predictions(outputs):
    # Assuming outputs contain logits (adapt based on your model's output format)
    logits = outputs.logits
    probs = logits.softmax(dim=1)
    preds = torch.argmax(probs, dim=1)
    return probs, preds  # Return all probabilities and predicted labels
# A function that classifies text segment by segment
def classify_text(text):
    # Split text into segments using split_text
    segments = split_text(text)
    # Initialize empty list for predictions
    predictions = []
    # Move the model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Loop through segments, process, and store predictions
    for segment in segments:
        inputs = tokenizer([segment], padding=True, truncation=True, max_length=512, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        # Extract probabilities and predicted labels for this segment
        probs, preds = extract_predictions(outputs)
        pred_id = preds[0].item()
        # Append the prediction for this segment
        predictions.append({
            "segment_text": segment,
            "label": model.config.id2label[pred_id],  # Assuming single-label prediction
            "probability": probs[0][pred_id].item(),  # Probability of the predicted label
        })
    # Return the per-segment predictions
    return predictions
# Build the Gradio interface around classify_text
interface = gr.Interface(
    fn=classify_text,
    inputs="text",
    outputs="json",
    title="Text Classification Demo",
    description="Enter some text, and the model will classify it.",
)

# interface.launch(server_port=8080)
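# Usage sketch (an assumption, not part of the original script): launch the app when
# the file is run directly, or call classify_text on a short illustrative sentence
# for a quick sanity check. The sample text below is hypothetical.
if __name__ == "__main__":
    # print(classify_text("This is a short example sentence to classify."))
    interface.launch()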