Feliks Zaslavskiy committed on
Commit
71667b3
·
1 Parent(s): 0766f0d

minor updates

Browse files
Files changed (6) hide show
  1. .gitignore +1 -0
  2. app.py +30 -25
  3. data.py +8 -8
  4. data_set_training.csv +2 -1
  5. dev_set_training.csv +9 -1
  6. train.py +1 -1
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ output/
app.py CHANGED
@@ -9,37 +9,42 @@ from sklearn.metrics.pairwise import cosine_similarity
9
  from io import BytesIO
10
 
11
  # base is smaller, vs large
12
- model_size='base'
13
- tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
14
- model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
 
 
 
 
 
15
 
16
- model_sbert = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')
17
  # for regular BERT 0.98
18
 
19
- similarity_threshold = 0.8
 
20
 
21
  def get_sbert_embedding(input_text):
22
  embedding = model_sbert.encode(input_text)
23
  return embedding.tolist()
24
 
25
- def get_embedding(input_text):
26
- encoded_input = tokenizer(input_text, return_tensors='pt')
27
- input_ids = encoded_input.input_ids
28
- #input_num_tokens = input_ids.shape[1]
29
-
30
- #print( "Number of input tokens: " + str(input_num_tokens))
31
- #print("Length of input: " + str(len(input_text)))
32
-
33
- list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
34
-
35
- #print( "Tokens : " + ' '.join(list_of_tokens))
36
- with torch.no_grad():
37
-
38
- outputs = model(**encoded_input)
39
- last_hidden_states = outputs[0]
40
- sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
41
- #sentence_embedding = output.last_hidden_state[0][0]
42
- return sentence_embedding.tolist()
43
 
44
  st.set_page_config(layout="wide")
45
  st.title('Upload the Address Dataset')
@@ -58,7 +63,7 @@ if uploaded_file is not None:
58
  data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
59
  data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
60
  + np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
61
- + data_caqh['city'].astype(str) + ' '\
62
  + data_caqh['state'].astype(str) + ' ' \
63
  + data_caqh['postalcode'].astype(str)
64
 
@@ -75,7 +80,7 @@ if uploaded_file is not None:
75
  + data_ndb['zip_pls_4_cd'].astype(str))
76
 
77
  data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
78
- + data_ndb['cty_nm'].astype(str).str.strip() + ' ' \
79
  + data_ndb['st_cd'].astype(str) + ' ' + data_ndb['zip_cd_zip_pls_4_cd']
80
 
81
  # Calculate similarity For CAQH
 
9
  from io import BytesIO
10
 
11
  # base is smaller, vs large
12
+ #model_size='base'
13
+ #tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
14
+ #model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
15
+
16
+ # For baseline 'sentence-transformers/paraphrase-albert-base-v2'
17
+ model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
18
+
19
+ similarity_threshold = 0.9
20
 
 
21
  # for regular BERT 0.98
22
 
23
+ model_sbert = SentenceTransformer(model_name)
24
+
25
 
26
  def get_sbert_embedding(input_text):
27
  embedding = model_sbert.encode(input_text)
28
  return embedding.tolist()
29
 
30
+ #def get_embedding(input_text):
31
+ # encoded_input = tokenizer(input_text, return_tensors='pt')
32
+ # input_ids = encoded_input.input_ids
33
+ # #input_num_tokens = input_ids.shape[1]
34
+ #
35
+ # #print( "Number of input tokens: " + str(input_num_tokens))
36
+ # #print("Length of input: " + str(len(input_text)))
37
+ #
38
+ # list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
39
+ #
40
+ # #print( "Tokens : " + ' '.join(list_of_tokens))
41
+ # with torch.no_grad():
42
+ #
43
+ # outputs = model(**encoded_input)
44
+ # last_hidden_states = outputs[0]
45
+ # sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
46
+ # #sentence_embedding = output.last_hidden_state[0][0]
47
+ # return sentence_embedding.tolist()
48
 
49
  st.set_page_config(layout="wide")
50
  st.title('Upload the Address Dataset')
 
63
  data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and not '-' in x else x)
64
  data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
65
  + np.where(data_caqh['address2'].isnull(), '' , data_caqh['address2'].astype(str)+ ', ') \
66
+ + data_caqh['city'].astype(str) + ', '\
67
  + data_caqh['state'].astype(str) + ' ' \
68
  + data_caqh['postalcode'].astype(str)
69
 
 
80
  + data_ndb['zip_pls_4_cd'].astype(str))
81
 
82
  data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
83
+ + data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
84
  + data_ndb['st_cd'].astype(str) + ' ' + data_ndb['zip_cd_zip_pls_4_cd']
85
 
86
  # Calculate similarity For CAQH
data.py CHANGED
@@ -9,7 +9,7 @@ from sentence_transformers import SentenceTransformer
9
  #tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
10
  #model = AlbertModel.from_pretrained("albert-base-v2")
11
  #'sentence-transformers/paraphrase-albert-base-v2'
12
- model_name = 'output/training_OnlineConstrativeLoss-2023-03-09_23-55-34'
13
  model_sbert = SentenceTransformer(model_name)
14
 
15
  def get_sbert_embedding(input_text):
@@ -58,22 +58,22 @@ e8 = get_sbert_embedding(a8)
58
  e8x = get_sbert_embedding(a8x)
59
  e9 = get_sbert_embedding(a9)
60
  e10 = get_sbert_embedding(a10)
61
- print(f"a1 \"{a1}\" to \"{a2}\" a2")
62
  print(cosine_similarity([e1], [e2]))
63
- print(f"a1 \"{a1}\" to \"{a4}\" a4")
64
  print(cosine_similarity([e1], [e4]))
65
- print(f"a1 \"{a1}\" to \"{a5}\" a5")
66
  print(cosine_similarity([e1], [e5]))
67
 
68
- print(f"a7 \"{a7}\" to \"{a8}\" a8")
69
  print(cosine_similarity([e7], [e8]))
70
- print(f"a7 \"{a7}\" to \"{a8x}\" a8x")
71
  print(cosine_similarity([e7], [e8x]))
72
 
73
- print(f"a7 \"{a7}\" to \"{a9}\" a9")
74
  print(cosine_similarity([e7], [e9]))
75
 
76
- print(f"a7 \"{a7}\" to \"{a10}\" a10")
77
  print(cosine_similarity([e7], [e10]))
78
  # with base
79
  #a1 to a2
 
9
  #tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
10
  #model = AlbertModel.from_pretrained("albert-base-v2")
11
  #'sentence-transformers/paraphrase-albert-base-v2'
12
+ model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
13
  model_sbert = SentenceTransformer(model_name)
14
 
15
  def get_sbert_embedding(input_text):
 
58
  e8x = get_sbert_embedding(a8x)
59
  e9 = get_sbert_embedding(a9)
60
  e10 = get_sbert_embedding(a10)
61
+ print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
62
  print(cosine_similarity([e1], [e2]))
63
+ print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
64
  print(cosine_similarity([e1], [e4]))
65
+ print(f"a1 \"{a1}\" to \"{a5}\" a5 - expected Same")
66
  print(cosine_similarity([e1], [e5]))
67
 
68
+ print(f"a7 \"{a7}\" to \"{a8}\" a8 - expected Different")
69
  print(cosine_similarity([e7], [e8]))
70
+ print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
71
  print(cosine_similarity([e7], [e8x]))
72
 
73
+ print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
74
  print(cosine_similarity([e7], [e9]))
75
 
76
+ print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
77
  print(cosine_similarity([e7], [e10]))
78
  # with base
79
  #a1 to a2
data_set_training.csv CHANGED
@@ -106,7 +106,8 @@ Valley Healthcare System 1600 Fort Benning Rd, Columbus, GA 31903|1600 Fort Benn
106
  Valley Healthcare System 1600 Fort Benning Rd, Columbus, GA 31903|1600 Fort Benning Rd, Valley Healthcare System, Columbus, GA 31903|1
107
  Memorial Satilla Health, 1900 Tebeau St, Waycross, GA 31501|1900 Tebeau St, Waycross, GA 31501|1
108
  VA Medical Center 2002 Holcombe Blvd, Houston, TX 77030|VA Medical Center 2002 Holcombe Boulevard, Houston, TX 77030|1
109
-
 
110
 
111
 
112
 
 
106
  Valley Healthcare System 1600 Fort Benning Rd, Columbus, GA 31903|1600 Fort Benning Rd, Valley Healthcare System, Columbus, GA 31903|1
107
  Memorial Satilla Health, 1900 Tebeau St, Waycross, GA 31501|1900 Tebeau St, Waycross, GA 31501|1
108
  VA Medical Center 2002 Holcombe Blvd, Houston, TX 77030|VA Medical Center 2002 Holcombe Boulevard, Houston, TX 77030|1
109
+ 1839 E Capitol Ave, Bismarck, ND 58501|1839 East Capitol Avenue, Bismarck, ND 58501|1
110
+ 1839 E Capitol Ave, Bismarck, ND 58501|1912 East Capitol Avenue, Bismarck, ND 58501|0
111
 
112
 
113
 
dev_set_training.csv CHANGED
@@ -4,4 +4,12 @@ address1|address2|are_same
4
  1061 Schmidt Ln, North Brunswick Township, NJ 08902|1061 Schmidt Lane, North Brunswick Township, NJ 08902|1
5
  1061 Schmidt Ln, North Brunswick Township, NJ 08902|934 Schmidt Ln, North Brunswick Township, NJ 08902|0
6
  5844 N Orange Blossom Trail, Orlando, FL 32810|5844 North Orange Blossom Trail, Orlando, FL 32810-9635|1
7
- 6701 Fannin St #1400, Houston, TX 77030|6701 Fannin Ste #1400, Houston, TX 77030|1
 
 
 
 
 
 
 
 
 
4
  1061 Schmidt Ln, North Brunswick Township, NJ 08902|1061 Schmidt Lane, North Brunswick Township, NJ 08902|1
5
  1061 Schmidt Ln, North Brunswick Township, NJ 08902|934 Schmidt Ln, North Brunswick Township, NJ 08902|0
6
  5844 N Orange Blossom Trail, Orlando, FL 32810|5844 North Orange Blossom Trail, Orlando, FL 32810-9635|1
7
+ 6701 Fannin St #1400, Houston, TX 77030|6701 Fannin Ste #1400, Houston, TX 77030|1
8
+ 14143 Winecup Ln, Houston, TX 77047|14121 Winecup Lane, Houston, TX 77047|0
9
+ 440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034|440 Technology Center Dr., Boston, MA 10034|1
10
+ 440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034|440 Technology Center Dr., Boston, MA 10034-0345|1
11
+ 440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034|87 Technology Center Drive, Boston, MA 10034|0
12
+ 440 TECHNOLOGY CENTER DRIVE, Boston, MA 10034|200 Technology Center Drive, Boston, MA 10034|0
13
+ 65 Mountain Blvd Ext, Warren, NJ 07059|65 Mountain Boulevard Ext, Warren, NJ 07059|1
14
+ 65 Mountain Blvd Ext, Warren, NJ 07059|5078 S Maryland Pkwy, Las Vegas, NV 89119|0
15
+ 65 Mountain Blvd Ext, Warren, NJ 07059|112 Mountain Blvd Ext, Warren, NJ 07059|0
train.py CHANGED
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
24
 
25
 
26
  #As base model, we use an ALBERT base-v2 checkpoint (see the model name on the next line)
27
- model = SentenceTransformer('albert-base-v2')
28
  num_epochs = 10
29
  train_batch_size = 8
30
 
 
24
 
25
 
26
  #As base model, we use an ALBERT base-v2 checkpoint (see the model name on the next line)
27
+ model = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')
28
  num_epochs = 10
29
  train_batch_size = 8
30