import streamlit as st
import pandas as pd
import vec2text
from transformers import AutoModel, AutoTokenizer
from sklearn.decomposition import PCA
from utils import file_cache

# Caching the vec2text corrector
@st.cache_resource
def load_corrector():
    return vec2text.load_pretrained_corrector("gtr-base")
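
# Illustrative usage (a sketch, not part of the original file): inverting
# embeddings back into text with the cached corrector. `embeddings` stands in
# for a hypothetical (n, 768) GTR embedding tensor computed elsewhere:
#   corrector = load_corrector()
#   texts = vec2text.invert_embeddings(embeddings, corrector=corrector, num_steps=20)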

# Caching the dataframe since loading from an external source can be time-consuming
@st.cache_data
def load_data():
    return pd.read_csv("https://huggingface.co/datasets/marksverdhei/reddit-syac-urls/resolve/main/train.csv")

@st.cache_resource
def vector_compressor_from_config():
    # PCA with 2 components for dimensionality reduction;
    # UMAP(n_components=2) is a drop-in alternative
    # return UMAP(n_components=2)
    return PCA(n_components=2)

@st.cache_data
@file_cache(".cache/reducer_embeddings.pickle")
def reduce_embeddings(embeddings):
    reducer = vector_compressor_from_config()
    # Fit the reducer once and return both the 2D points and the fitted reducer
    return reducer.fit_transform(embeddings), reducer
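
# Illustrative usage (a sketch; `embeddings` is a hypothetical (n, 768) array,
# and st.scatter_chart assumes a recent Streamlit version):
#   points_2d, reducer = reduce_embeddings(embeddings)
#   st.scatter_chart(pd.DataFrame(points_2d, columns=["x", "y"]), x="x", y="y")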

# Caching the model and tokenizer to avoid reloading
@st.cache_resource
def load_model_and_tokenizer(device="cpu"):
    encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
    return encoder, tokenizer
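
# Illustrative sketch (not part of the original file): embedding texts with the
# cached encoder and tokenizer. GTR produces sentence embeddings by mean-pooling
# the encoder's token states; the helper name and batching details are assumptions.
import torch

def get_gtr_embeddings(texts, encoder, tokenizer, device="cpu"):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        hidden = encoder(**inputs).last_hidden_state
    # Mean-pool over tokens, masking out padding positions
    mask = inputs["attention_mask"].unsqueeze(-1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)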