Commit 6b30d5d
Parent(s): 4b61117

Add explanation

Files changed:
- app.py +5 -1
- resources.py +2 -1
- views.py +3 -2
app.py CHANGED

@@ -5,7 +5,8 @@ import views
 from resources import load_corrector, load_data, load_model_and_tokenizer, reduce_embeddings
 
 use_cpu = not torch.cuda.is_available()
-device = "cpu" if use_cpu else "cuda"
+# device = "cpu" if use_cpu else "cuda"
+device = "cpu"
 
 df = load_data()
 
@@ -29,6 +30,9 @@ def sidebar():
         "We explore both sequence embedding inversion using the method described in [Morris et al., 2023](https://arxiv.org/abs/2310.06816), as well as"
         " dimensionality reduction transforms and inverse transforms, and their effect on embedding inversion."
     )
+    st.sidebar.markdown(
+        "### The Dataset\nThe dataset in use is the Reddit SYAC dataset train split ([Heiervang, 2022](https://www.duo.uio.no/handle/10852/96578)), which contains the titles of different clickbait articles."
+    )
 
 sidebar()
 
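The first hunk pins the app to CPU by commenting out the CUDA check. If the CUDA fallback is ever wanted again, one option is to gate it behind an environment variable rather than editing source; a minimal sketch, assuming a hypothetical APP_DEVICE variable that this repo does not define:

import os
import torch

# Hypothetical alternative to the hard-coded device = "cpu":
# honour an APP_DEVICE override, otherwise fall back to CUDA detection.
device = os.environ.get("APP_DEVICE") or ("cuda" if torch.cuda.is_available() else "cpu")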
resources.py CHANGED

@@ -42,13 +42,14 @@ def load_model_and_tokenizer(device="cpu"):
 def get_gtr_embeddings(text_list: list[str],
                        encoder: PreTrainedModel,
                        tokenizer: PreTrainedTokenizer,
+                       device: str,
                        ) -> torch.Tensor:
 
     inputs = tokenizer(text_list,
                        return_tensors="pt",
                        max_length=128,
                        truncation=True,
-                       padding="max_length",).to(
+                       padding="max_length",).to(device)
 
     with torch.no_grad():
         model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
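get_gtr_embeddings now takes its target device explicitly instead of relying on a hard-coded value. A minimal usage sketch of the new signature; the GTR loading lines follow the vec2text README and are assumptions, only the get_gtr_embeddings call reflects this repo's code:

from transformers import AutoModel, AutoTokenizer
from resources import get_gtr_embeddings

# Load the GTR-T5 encoder and tokenizer as in the vec2text README (assumed setup).
encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to("cpu")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")

# The tokenized batch is moved to `device` before the forward pass,
# so the caller now decides where the encoder inputs live.
embeddings = get_gtr_embeddings(["You won't believe what happened next"],
                                encoder, tokenizer, device="cpu")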
views.py CHANGED

@@ -9,9 +9,10 @@ import utils
 import pandas as pd
 from scipy.spatial import distance
 from resources import get_gtr_embeddings
+from transformers import PreTrainedModel, PreTrainedTokenizer
 dimensionality_reduction_model_name = "PCA"
 
-def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
+def diffs(embeddings: np.ndarray, corrector, encoder: PreTrainedModel, tokenizer: PreTrainedTokenizer):
     st.title('"A man is to king, what woman is to queen"')
     st.markdown("A well-known phenomenon in semantic vectors is the way we can do vector operations like addition and subtraction to find spatial relations in the vector space.")
     st.markdown(
@@ -34,7 +35,7 @@ def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
     st.latex("=")
 
     if submit_button:
-        v1, v2, v3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer).to("cpu")
+        v1, v2, v3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer, device=encoder.device).to("cpu")
         v4 = v1 - v2 + v3
         generated_sentence, = vec2text.invert_embeddings(
             embeddings=v4.unsqueeze(0).cuda(),
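The last hunk threads the encoder's own device into the embedding call before doing word-analogy arithmetic on the vectors and inverting the result back to text. A sketch of that round trip, continuing from the encoder/tokenizer setup above; the corrector load follows the vec2text README, and the sketch stays on CPU, whereas the view itself still moves v4 to CUDA:

import vec2text

# Corrector model for GTR embeddings, loaded as in the vec2text README.
corrector = vec2text.load_pretrained_corrector("gtr-base")

v1, v2, v3 = get_gtr_embeddings(["king", "man", "woman"],
                                encoder, tokenizer, device="cpu")
v4 = v1 - v2 + v3  # "king" - "man" + "woman"

# invert_embeddings expects a batch dimension, hence the unsqueeze(0).
generated_sentence, = vec2text.invert_embeddings(
    embeddings=v4.unsqueeze(0),
    corrector=corrector,
)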