"""Gradio demo that runs text through the Alibaba-NLP/gte-multilingual-base model.

The script installs its third-party dependencies on import (Colab-style),
loads a dataset from an Excel file (falling back to a tiny in-memory sample),
runs placeholder preprocessing/training steps, and serves a Gradio text box
whose input is passed through the pre-trained model.
"""

# Install necessary libraries
import functools
import importlib
import importlib.util
import os
import subprocess
import sys


def _is_importable(module_name):
    """Return True if *module_name* can be located by the import system."""
    try:
        return importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # find_spec raises for missing parent packages or modules whose
        # __spec__ is unset; treat both as "not importable".
        return False


# Function to install a package if it is not already installed
def install(package, import_name=None):
    """Install *package* with pip unless it can already be imported.

    Args:
        package: Distribution name passed to ``pip install``.
        import_name: Module name used to detect an existing installation.
            Defaults to *package*; pass it explicitly when the two differ
            (for example ``scikit-learn`` is imported as ``sklearn``).

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    if _is_importable(import_name or package):
        return
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    # Let the running interpreter see packages installed after start-up.
    importlib.invalidate_caches()


# Ensure the necessary packages are installed
install("transformers")
install("torch")
install("pandas")
install("scikit-learn", import_name="sklearn")
install("gradio")

# These imports must come after the install() calls above, which is why they
# are not grouped with the standard-library imports at the top of the file.
import gradio as gr  # noqa: E402
import pandas as pd  # noqa: E402
import torch  # noqa: E402
from sklearn.model_selection import train_test_split  # noqa: E402
from transformers import AutoModel, AutoTokenizer  # noqa: E402

MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"

# Default path if the file is uploaded manually to Colab
DEFAULT_DATASET_PATH = '/content/Valid-part-2.xlsx'


@functools.lru_cache(maxsize=1)
def _load_model_and_tokenizer():
    """Load the pre-trained model and tokenizer once and reuse them.

    Returns:
        A ``(model, tokenizer)`` tuple for ``MODEL_NAME``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # NOTE(review): the published usage for this model passes
    # trust_remote_code=True, which allows code from the Hub repository to
    # run locally. Confirm this is acceptable for your environment.
    model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
    return model, tokenizer


# Function to convert a list to a DataFrame
def list_to_dataframe(data_list):
    """Convert *data_list* to a ``pandas.DataFrame``.

    Args:
        data_list: Any input accepted by ``pd.DataFrame`` (the fallback data
            in this file is a list of dicts).

    Returns:
        The resulting DataFrame.
    """
    return pd.DataFrame(data_list)


# Load your dataset from a file
def load_dataset(file_path=None):
    """Load the dataset from an Excel file, or fall back to sample data.

    Args:
        file_path: Path to the Excel file. ``None`` selects
            ``DEFAULT_DATASET_PATH``.

    Returns:
        A DataFrame read from *file_path*; a two-row example DataFrame with
        ``text`` and ``label`` columns when the path does not exist; or
        ``None`` when reading the file fails.
    """
    if file_path is None:
        file_path = DEFAULT_DATASET_PATH

    # Check if the file exists
    if file_path and not os.path.exists(file_path):
        print(f"File not found at '{file_path}', using default list data...")
        # Fallback to a default list if file is not found
        default_data = [
            {'text': 'Example sentence 1', 'label': 'label1'},
            {'text': 'Example sentence 2', 'label': 'label2'},
            # Add more example data as needed
        ]
        return list_to_dataframe(default_data)

    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        # Deliberately broad: any read failure is reported and signalled to
        # the caller as None instead of aborting the script.
        print(f"Error loading dataset: {e}")
        return None

    print("Columns in the dataset:", df.columns.tolist())
    return df


# Preprocess the data
def preprocess_data(df):
    """Return *df* unchanged; placeholder for preprocessing steps.

    Add your preprocessing steps here (for example: cleaning, tokenization).
    """
    return df


# Train your model
def train_model(df):
    """Split *df* and return the pre-trained model (training is a placeholder).

    Args:
        df: Dataset as a DataFrame.

    Returns:
        The model loaded for ``MODEL_NAME``. No fine-tuning happens yet.
    """
    # Split the dataset into training and testing sets. train_test_split
    # raises on fewer than two rows, so tiny datasets are kept whole.
    if len(df) < 2:
        train_df, test_df = df, df.iloc[0:0]
    else:
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Load your pre-trained model and tokenizer from Hugging Face
    model, tokenizer = _load_model_and_tokenizer()

    # TODO: Add your training code here. This may involve tokenizing
    # train_df / test_df with `tokenizer` and feeding them into the model.

    return model


# Define the Gradio interface function
def predict(input_text):
    """Run *input_text* through the model and return its last hidden state.

    Args:
        input_text: Text entered by the user.

    Returns:
        ``outputs.last_hidden_state`` from the model, computed without
        gradient tracking.
    """
    # Reuse the cached model and tokenizer instead of reloading per request.
    model, tokenizer = _load_model_and_tokenizer()

    # Tokenize input and make predictions
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Process the outputs as needed (e.g., extracting relevant information)
    return outputs.last_hidden_state


# Build the Gradio interface
def build_interface(file_path=None):
    """Load the data, run the placeholder pipeline and build the Gradio UI.

    Args:
        file_path: Optional dataset path forwarded to ``load_dataset``.

    Returns:
        A ``gr.Interface`` wired to ``predict``, or ``None`` when the dataset
        could not be loaded.
    """
    df = load_dataset(file_path)  # Load your dataset
    if df is None:
        return None

    df = preprocess_data(df)  # Preprocess the dataset
    # Train your model. The returned model is the cached instance that
    # predict() also uses, so it does not need to be passed along here.
    train_model(df)

    return gr.Interface(
        fn=predict,
        # gr.inputs.* was removed from Gradio; components live at top level.
        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
        outputs="text",
    )


# Run the Gradio interface
if __name__ == "__main__":
    # You can specify a file_path here if you have a specific file to use
    file_path = None  # Change this to your specific file path if needed

    iface = build_interface(file_path=file_path)
    if iface is not None:
        iface.launch()
    else:
        print("Failed to build the Gradio interface. Please check the dataset and model.")