File size: 1,749 Bytes
8aa44e7
44fce2f
 
78022ff
74e9a4d
9c88742
f5fb53f
6b30d5d
 
9c88742
44fce2f
 
e8bfa89
44fce2f
 
 
 
 
 
 
 
 
74e9a4d
44fce2f
3eacaec
 
 
 
 
 
 
 
6b30d5d
 
 
3eacaec
 
 
78022ff
44fce2f
78022ff
74e9a4d
 
 
 
 
 
05dd656
74e9a4d
44fce2f
78022ff
bdef5c4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import streamlit as st
import torch
import numpy as np
import views
from resources import load_corrector, load_data, load_model_and_tokenizer, reduce_embeddings

# Device selection: CUDA availability is probed, but the app is currently
# pinned to CPU (the commented-out line is the intended GPU-aware path).
use_cpu = not torch.cuda.is_available()
# device = "cpu" if use_cpu else "cuda"
device = "cpu"  # forced to CPU; switch to the line above to enable GPU

# Dataset of clickbait article titles (Reddit SYAC — see sidebar text).
df = load_data()

# Embedding model and its tokenizer, placed on the selected device.
encoder, tokenizer = load_model_and_tokenizer(device)

# Corrector model used for embedding inversion (see sidebar description).
corrector = load_corrector()

@st.cache_data
def load_embeddings():
    """Load the precomputed title embeddings from local disk.

    Cached with st.cache_data because the array is large and static,
    so it should only be read from disk once per session.
    """
    embedding_matrix = np.load("syac-title-embeddings.npy")
    return embedding_matrix

# Load the (cached) embedding matrix and project it to 2-D for plotting;
# `reducer` is kept so new points can be transformed into the same space.
embeddings = load_embeddings()
vectors_2d, reducer = reduce_embeddings(embeddings)

def sidebar():
    """Render the sidebar: app description and dataset attribution."""
    st.sidebar.title("About this app")
    st.sidebar.markdown(
        "This app is intended to give a more intuitive and interactive understanding of sequence embeddings (e.g. sentence), \n"
        "through interactive plots and operations with these embeddings, with a focus on embedding inversion.\n"
        "We explore both sequence embedding inversion using the method described in [Morris et al., 2023](https://arxiv.org/abs/2310.06816), as well as"
        # Fix: user-facing typo "rediction" -> "reduction".
        " dimensionality reduction transforms and inverse transforms, and its effect on embedding inversion."
    )
    st.sidebar.markdown(
        "### The Dataset\nThe dataset in use is the Reddit SYAC dataset train split ([Heiervang, 2022](https://www.duo.uio.no/handle/10852/96578)), which contains the title of different clickbait articles."
    )

sidebar()

# Two tabs: an interactive plot view and an embedding-diff view,
# both implemented in the project-local `views` module.
tab1, tab2 = st.tabs(["plot", "diffs"])

with tab1:
    # Plot view: scatter of the 2-D-reduced embeddings; the reducer and
    # corrector are passed through so the view can invert selections.
    views.plot(
        df=df,
        embeddings=embeddings,
        vectors_2d=vectors_2d, 
        reducer=reducer,
        corrector=corrector,
        device=device,
    )

with tab2:
    # Diff view: compares embeddings via the corrector and encoder/tokenizer.
    views.diffs(embeddings, corrector, encoder, tokenizer)