import streamlit as st
import pandas as pd
import torch
import vec2text
from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
from sklearn.decomposition import PCA
from utils import file_cache

# Caching the vec2text corrector
@st.cache_resource
def load_corrector():
    return vec2text.load_pretrained_corrector("gtr-base")

# Caching the dataframe since loading from an external source can be time-consuming
@st.cache_data
def load_data():
    return pd.read_csv("https://huggingface.co/datasets/marksverdhei/reddit-syac-urls/resolve/main/train.csv")


@st.cache_resource
def vector_compressor_from_config():
    # Dimensionality reducer for the 2D projection; PCA with two components
    # (a UMAP(n_components=2) reducer could be swapped in here instead)
    # return UMAP(n_components=2)
    return PCA(n_components=2)


# Cache both in Streamlit's cache and on disk so the reducer is not refit on every rerun
@st.cache_data
@file_cache(".cache/reducer_embeddings.pickle")
def reduce_embeddings(embeddings):
    reducer = vector_compressor_from_config()
    return reducer.fit_transform(embeddings), reducer

# Caching the model and tokenizer to avoid reloading
@st.cache_resource
def load_model_and_tokenizer(device="cpu"):
    encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
    return encoder, tokenizer


def get_gtr_embeddings(text_list: list[str],
                       encoder: PreTrainedModel,
                       tokenizer: PreTrainedTokenizer,
                       ) -> torch.Tensor:

    # Tokenize and move the inputs to the same device as the encoder
    # (the encoder defaults to CPU in load_model_and_tokenizer above)
    inputs = tokenizer(text_list,
                       return_tensors="pt",
                       max_length=128,
                       truncation=True,
                       padding="max_length").to(encoder.device)

    with torch.no_grad():
        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        hidden_state = model_output.last_hidden_state
        # Mean-pool the token states over the attention mask: one embedding vector per input text
        embeddings = vec2text.models.model_utils.mean_pool(hidden_state, inputs['attention_mask'])

    return embeddings
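

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original helpers: one way the
# cached loaders above might be wired together in the app. The "title" column
# name and the 100-row sample size are assumptions made for this example.
# ---------------------------------------------------------------------------
def example_pipeline():
    df = load_data()
    encoder, tokenizer = load_model_and_tokenizer(device="cpu")

    # Embed a small sample of titles with GTR, then project them to 2D for plotting
    titles = df["title"].head(100).tolist()
    embeddings = get_gtr_embeddings(titles, encoder, tokenizer)
    reduced, reducer = reduce_embeddings(embeddings.cpu().numpy())

    # The same embeddings can also be passed to vec2text.invert_embeddings together
    # with load_corrector() to reconstruct text from a vector.
    return reduced, reducer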