marksverdhei committed
Commit 6b30d5d · 1 Parent(s): 4b61117

Add explanation

Files changed (3):
  1. app.py +5 -1
  2. resources.py +2 -1
  3. views.py +3 -2
app.py CHANGED

@@ -5,7 +5,8 @@ import views
 from resources import load_corrector, load_data, load_model_and_tokenizer, reduce_embeddings
 
 use_cpu = not torch.cuda.is_available()
-device = "cpu" if use_cpu else "cuda"
+# device = "cpu" if use_cpu else "cuda"
+device = "cpu"
 
 df = load_data()
 
@@ -29,6 +30,9 @@ def sidebar():
         "We explore both sequence embedding inversion using the method described in [Morris et al., 2023](https://arxiv.org/abs/2310.06816), as well as"
         " dimensionality rediction transforms and inverse transforms, and its effect on embedding inversion."
     )
+    st.sidebar.markdown(
+        "### The Dataset\nThe dataset in use is the Reddit SYAC dataset train split ([Heiervang, 2022](https://www.duo.uio.no/handle/10852/96578)), which contains the title of different clickbait articles."
+    )
 
 sidebar()
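The first hunk pins the app to CPU instead of auto-selecting CUDA, presumably so the demo runs on hardware without a GPU. A minimal sketch of the fallback pattern the commented-out line implies (the helper name and flag below are illustrative, not from the repo):

```python
import torch

def select_device(force_cpu: bool = True) -> str:
    """Return "cpu" when forced or when no GPU is present, otherwise "cuda".

    force_cpu=True mirrors this commit's hard-coded device = "cpu".
    """
    if force_cpu or not torch.cuda.is_available():
        return "cpu"
    return "cuda"

device = select_device()          # "cpu" with the default, as in app.py after this commit
# device = select_device(False)   # would pick "cuda" only if torch sees a GPU
```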
 
resources.py CHANGED

@@ -42,13 +42,14 @@ def load_model_and_tokenizer(device="cpu"):
 def get_gtr_embeddings(text_list: list[str],
                        encoder: PreTrainedModel,
                        tokenizer: PreTrainedTokenizer,
+                       device: str,
                        ) -> torch.Tensor:
 
     inputs = tokenizer(text_list,
                        return_tensors="pt",
                        max_length=128,
                        truncation=True,
-                       padding="max_length",).to("cuda")
+                       padding="max_length",).to(device)
 
     with torch.no_grad():
         model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
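With the new device parameter, get_gtr_embeddings no longer assumes the tokenized batch can be moved to CUDA. A hedged sketch of the embedding step on an explicit device, following the GTR example in the vec2text README (the model name and mean-pooling helper come from that README, not from this diff):

```python
import torch
import vec2text
from transformers import AutoModel, AutoTokenizer

device = "cpu"  # matches the hard-coded choice in app.py after this commit

# Assumption: the app uses the GTR-base encoder, as in the vec2text README example.
encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")

# Tokenize onto the chosen device, exactly like get_gtr_embeddings(..., device) above.
inputs = tokenizer(["You won't believe what happened next"],
                   return_tensors="pt", max_length=128,
                   truncation=True, padding="max_length").to(device)

with torch.no_grad():
    model_output = encoder(input_ids=inputs["input_ids"],
                           attention_mask=inputs["attention_mask"])
    # Mean-pool token states into one sentence embedding (vec2text README helper).
    embeddings = vec2text.models.model_utils.mean_pool(
        model_output.last_hidden_state, inputs["attention_mask"]
    )

print(embeddings.shape)  # torch.Size([1, 768]) for gtr-t5-base
```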
views.py CHANGED

@@ -9,9 +9,10 @@ import utils
 import pandas as pd
 from scipy.spatial import distance
 from resources import get_gtr_embeddings
+from transformers import PreTrainedModel, PreTrainedTokenizer
 dimensionality_reduction_model_name = "PCA"
 
-def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
+def diffs(embeddings: np.ndarray, corrector, encoder: PreTrainedModel, tokenizer: PreTrainedTokenizer):
     st.title('"A man is to king, what woman is to queen"')
     st.markdown("A well known pehnomenon in semantic vectors is the way we can do vector operations like addition and subtraction to find spacial relations in the vector space.")
     st.markdown(
@@ -34,7 +35,7 @@ def diffs(embeddings: np.ndarray, corrector, encoder, tokenizer):
     st.latex("=")
 
     if submit_button:
-        v1, v2, v3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer).to("cpu")
+        v1, v2, v3 = get_gtr_embeddings([sent1, sent2, sent3], encoder, tokenizer, device=encoder.device).to("cpu")
         v4 = v1 - v2 + v3
         generated_sentence, = vec2text.invert_embeddings(
             embeddings=v4.unsqueeze(0).cuda(),
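The call site now passes device=encoder.device, so the analogy view embeds its three input sentences wherever the encoder happens to live. A minimal end-to-end sketch of that view's logic, assuming load_model_and_tokenizer returns (encoder, tokenizer), load_corrector takes no arguments and loads the GTR corrector, and the example sentences and num_steps are illustrative:

```python
import vec2text
from resources import get_gtr_embeddings, load_corrector, load_model_and_tokenizer

device = "cpu"                                         # as in app.py after this commit
encoder, tokenizer = load_model_and_tokenizer(device)  # assumed return order
corrector = load_corrector()                           # assumed no-argument loader

# Embed three sentences, do "king - man + woman"-style arithmetic, invert the result.
v1, v2, v3 = get_gtr_embeddings(
    ["The king rules the country", "a man", "a woman"],
    encoder, tokenizer, device=encoder.device,
).to("cpu")
v4 = v1 - v2 + v3

generated_sentence, = vec2text.invert_embeddings(
    embeddings=v4.unsqueeze(0),  # invert_embeddings expects a batch dimension
    corrector=corrector,
    num_steps=20,                # illustrative; the repo's setting isn't visible in this diff
)
print(generated_sentence)
```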