marksverdhei's picture
Update func
4b61117
raw
history blame
2.04 kB
import streamlit as st
import pandas as pd
import torch
import vec2text
from transformers import AutoModel, AutoTokenizer
from sklearn.decomposition import PCA
from utils import file_cache
from transformers import PreTrainedModel, PreTrainedTokenizer
@st.cache_resource
def load_corrector():
    """Load the vec2text corrector for "gtr-base", cached across reruns."""
    corrector = vec2text.load_pretrained_corrector("gtr-base")
    return corrector
# Cached because fetching the CSV from the Hub on every rerun would be slow.
@st.cache_data
def load_data():
    """Download the reddit-syac-urls train split from the Hub as a DataFrame."""
    dataset_url = "https://huggingface.co/datasets/marksverdhei/reddit-syac-urls/resolve/main/train.csv"
    return pd.read_csv(dataset_url)
@st.cache_resource
def vector_compressor_from_config():
    """Build the 2-D dimensionality reducer used for plotting embeddings.

    Currently PCA; previous experiments used UMAP with the same
    2-component setting.
    """
    return PCA(n_components=2)
# NOTE(review): stacking st.cache_data over the pickle file_cache means two
# cache layers guard the same computation — presumably intentional so the
# reduction survives process restarts; confirm before simplifying.
@st.cache_data
@file_cache(".cache/reducer_embeddings.pickle")
def reduce_embeddings(embeddings):
    """Fit the configured reducer on *embeddings* and project them to 2-D.

    Returns a tuple of (projected coordinates, fitted reducer).
    """
    projector = vector_compressor_from_config()
    projected = projector.fit_transform(embeddings)
    return projected, projector
# Cached so the T5 weights are loaded from disk only once per session.
@st.cache_resource
def load_model_and_tokenizer(device="cpu"):
    """Load the GTR-T5-base encoder (moved to *device*) and its tokenizer."""
    model_name = "sentence-transformers/gtr-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name).encoder.to(device)
    return encoder, tokenizer
def get_gtr_embeddings(text_list: list[str],
                       encoder: PreTrainedModel,
                       tokenizer: PreTrainedTokenizer,
                       ) -> torch.Tensor:
    """Embed a batch of texts with a GTR T5 encoder using mean pooling.

    Args:
        text_list: Texts to embed.
        encoder: GTR T5 encoder stack (e.g. from load_model_and_tokenizer).
        tokenizer: Tokenizer matching the encoder.

    Returns:
        A (len(text_list), hidden_dim) tensor of mean-pooled embeddings,
        padding tokens excluded via the attention mask.
    """
    # BUG FIX: inputs were hard-coded to .to("cuda"), which crashes on
    # CPU-only hosts and contradicts load_model_and_tokenizer's default
    # device of "cpu". Move inputs to wherever the encoder's weights live.
    device = next(encoder.parameters()).device
    inputs = tokenizer(text_list,
                       return_tensors="pt",
                       max_length=128,
                       truncation=True,
                       padding="max_length",
                       ).to(device)
    # Inference only — no gradient tracking needed.
    with torch.no_grad():
        model_output = encoder(input_ids=inputs["input_ids"],
                               attention_mask=inputs["attention_mask"])
        hidden_state = model_output.last_hidden_state
        # Average token states over non-padding positions.
        embeddings = vec2text.models.model_utils.mean_pool(
            hidden_state, inputs["attention_mask"]
        )
    return embeddings