import streamlit as st
import pandas as pd
import vec2text
from transformers import AutoModel, AutoTokenizer
from sklearn.decomposition import PCA
from utils import file_cache


# Caching the vec2text corrector
@st.cache_resource
def load_corrector():
    return vec2text.load_pretrained_corrector("gtr-base")
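# A hedged usage sketch: the corrector can invert GTR embeddings back into
# text with vec2text.invert_embeddings; num_steps here is an illustrative
# choice, not a value taken from this app:
#
#   texts = vec2text.invert_embeddings(
#       embeddings=embeddings, corrector=load_corrector(), num_steps=20
#   )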

# Caching the dataframe since loading from an external source can be time-consuming
@st.cache_data
def load_data():
    return pd.read_csv("https://huggingface.co/datasets/marksverdhei/reddit-syac-urls/resolve/main/train.csv")


@st.cache_resource
def vector_compressor_from_config():
    # PCA with 2 components for dimensionality reduction; UMAP(n_components=2)
    # is a drop-in alternative (requires umap-learn and its import).
    # return UMAP(n_components=2)
    return PCA(n_components=2)


# Cache the reduction in memory for the session and persist the 2D points
# plus the fitted reducer to disk via file_cache, so reruns skip refitting
@st.cache_data
@file_cache(".cache/reducer_embeddings.pickle")
def reduce_embeddings(embeddings):
    reducer = vector_compressor_from_config()
    return reducer.fit_transform(embeddings), reducer

# Caching the model and tokenizer to avoid reloading
@st.cache_resource
def load_model_and_tokenizer(device="cpu"):
    encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
    return encoder, tokenizer
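

# --- Usage sketch ---
# A minimal sketch of how the helpers above might fit together: embed a
# batch of texts with the GTR encoder (mean pooling over token states,
# as GTR does), then project to 2D with the cached reducer. torch is
# required; the "title" column name below is an assumption about the
# dataset, not something this file defines.
import torch


def get_gtr_embeddings(text_list, encoder, tokenizer, device="cpu"):
    inputs = tokenizer(
        text_list,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True,
    ).to(device)
    with torch.no_grad():
        hidden_state = encoder(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        ).last_hidden_state
        # Mean-pool token states, ignoring padding, to get one vector per text
        mask = inputs["attention_mask"].unsqueeze(-1)
        embeddings = (hidden_state * mask).sum(dim=1) / mask.sum(dim=1)
    return embeddings


# encoder, tokenizer = load_model_and_tokenizer()
# df = load_data()
# embeddings = get_gtr_embeddings(df["title"].head(100).tolist(), encoder, tokenizer)
# points_2d, reducer = reduce_embeddings(embeddings.numpy())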