Spaces:
Runtime error
Runtime error
Feliks Zaslavskiy
committed on
Commit
·
ecdea0f
1
Parent(s):
cf9bb91
wip
Browse files
- app.py +1 -0
- data_set_training.csv +30 -1
- dev_set_training.csv +9 -1
- eval.py +6 -0
- quick_evaluate.py +15 -3
app.py
CHANGED
@@ -15,6 +15,7 @@ from io import BytesIO
|
|
15 |
|
16 |
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
17 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
|
|
|
18 |
|
19 |
similarity_threshold = 0.9
|
20 |
|
|
|
15 |
|
16 |
# For baseline 'sentence-transformers/paraphrase-albert-base-v2'
|
17 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
|
18 |
+
model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
|
19 |
|
20 |
similarity_threshold = 0.9
|
21 |
|
data_set_training.csv
CHANGED
@@ -239,4 +239,33 @@ VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENN
|
|
239 |
165 10 VILLAGE DR W, UPPER MARLBORO, MD 20772|165 12 VILLAGE DR W, UPPER MARLBORO, MD 20772|0
|
240 |
345 12 OLD WASHINGTON RD, WALDORF, MD 20602|345-12 OLD WASHINGTON RD, WALDORF, MD 20602|1
|
241 |
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-11 ONYX CT, FREDERICKSBURG, VA 22407|0
|
242 |
-
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-12 ONYX CT, FREDERICKSBURG, VA 22407|1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
165 10 VILLAGE DR W, UPPER MARLBORO, MD 20772|165 12 VILLAGE DR W, UPPER MARLBORO, MD 20772|0
|
240 |
345 12 OLD WASHINGTON RD, WALDORF, MD 20602|345-12 OLD WASHINGTON RD, WALDORF, MD 20602|1
|
241 |
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-11 ONYX CT, FREDERICKSBURG, VA 22407|0
|
242 |
+
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-12 ONYX CT, FREDERICKSBURG, VA 22407|1
|
243 |
+
14453 UNION ST, Mc Coll, SC 29570|144-53 UNION ST, Mc Coll, SC 29570|1
|
244 |
+
14453 UNION ST, Mc Coll, SC 29570|144 53 UNION ST, Mc Coll, SC 29570|0
|
245 |
+
14453 UNION ST, Mc Coll, SC 29570|14 453 UNION STREET, Mc Coll, SC 29570|1
|
246 |
+
14453 UNION ST APT 343, Mc Coll, SC 29570|144 53 UNION ST APT 343, Mc Coll, SC 29570|1
|
247 |
+
14453 UNION ST, Mc Coll, SC 29570|144-53A UNION STREET, Mc Coll, SC 29570|0
|
248 |
+
14453 UNION ST, Mc Coll, SC 29570|14443 UNION ST, Mc Coll, SC 29570|0
|
249 |
+
14453 UNION ST, Mc Coll, SC 29570|144-53 UNION ST APT 343, Mc Coll, SC 29570|0
|
250 |
+
20334 PARK AVE, PARK CITY, UT 84060|20234 PARK AVE, PARK CITY, UT 84060|0
|
251 |
+
20334 PARK AVE, PARK CITY, UT 84060|20-334 PARK AVE, PARK CITY, UT 84060|0
|
252 |
+
20334 PARK AVE, PARK CITY, UT 84060|202-34 PARK AVENUE, PARK CITY, UT 84060|1
|
253 |
+
20334 PARK AVE, PARK CITY SUITE 2, UT 84060|202 34 PARK AVENUE STE 2, PARK CITY, UT 84060|1
|
254 |
+
203 MAPLE AVE FL 2, ENGLEWOOD, NJ 07631|203 MAPLE AVE, ENGLEWOOD, NJ 07631|1
|
255 |
+
203 MAPLE AVE FL 2, ENGLEWOOD, NJ 07631|203 MAPLE AVENUE, ENGLEWOOD, NJ 07631|1
|
256 |
+
203 MAPLE AVE FL 2 STE 3, ENGLEWOOD, NJ 07631|203 MAPLE AVE, ENGLEWOOD, NJ 07631|0
|
257 |
+
203 MAPLE AVE, ENGLEWOOD, NJ 07631|205 MAPLE AVE, ENGLEWOOD, NJ 07631|0
|
258 |
+
2032 MAPLE AVE, ENGLEWOOD, NJ 07631|2031 MAPLE AVE, ENGLEWOOD, NJ 07631|0
|
259 |
+
1427 MARVIN GRIFFIN RD, AUGUSTA, GA 30906|1417 MARVIN GRIFFIN RD, AUGUSTA, GA 30906|0
|
260 |
+
32 GRAND ST, NEDERLAND, TX 77627|33 GRAND ST, NEDERLAND, TX 77627|0
|
261 |
+
32 GRAND ST, NEDERLAND, TX 77627|32 GRAND ST #4, NEDERLAND, TX 77627|0
|
262 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DRIVE SUITE 6, BARBOURVILLE, KY 40906|1
|
263 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DR. STE. 6, BARBOURVILLE KY, 40906|1
|
264 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|SUITE #6, 80 HOSPITAL DRIVE, BARBOURVILLE, KY 40906|1
|
265 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|STE #6 - 80 HOSPITAL DR., BARBOURVILLE, KY 40906|1
|
266 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|#6-80 HOSPITAL DRIVE, BARBOURVILLE, KY 40906|1
|
267 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80-2 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|0
|
268 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|80 HOSPITAL DR SUITE 6A, BARBOURVILLE, KY 40906|0
|
269 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|81 HOSPITAL DRIVE STE 6, BARBOURVILLE, KY 40906|0
|
270 |
+
80 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|82 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|0
|
271 |
+
80 22 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|8022 HOSPITAL DR SUITE 6, BARBOURVILLE, KY 40906|1
|
dev_set_training.csv
CHANGED
@@ -26,4 +26,12 @@ ADDRESS1|ADDRESS2|ARE_SAME
|
|
26 |
8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
|
27 |
HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
|
28 |
115 34 SHOREWAY DR, QUEENSTOWN, MD 21658|115-43 SHOREWAY DR, QUEENSTOWN, MD 21658|0
|
29 |
-
112 24 SHOREWAY DR, QUEENSTOWN, MD 21658|112-24 SHOREWAY DR, QUEENSTOWN, MD 21658|1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
|
27 |
HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
|
28 |
115 34 SHOREWAY DR, QUEENSTOWN, MD 21658|115-43 SHOREWAY DR, QUEENSTOWN, MD 21658|0
|
29 |
+
112 24 SHOREWAY DR, QUEENSTOWN, MD 21658|112-24 SHOREWAY DR, QUEENSTOWN, MD 21658|1
|
30 |
+
3619 S 22ND DR, YUMA, AZ 85364|3636 S 22ND DR, YUMA, AZ 85364|0
|
31 |
+
7325 FRANKLIN BLVD, SACRAMENTO, CA 95823|73235 FRANKLIN BLVD, SACRAMENTO, CA 95823|0
|
32 |
+
3660 MAIN ST, TUCSON, AZ 85721|3701 MAIN ST, TUCSON, AZ 85721|0
|
33 |
+
3910 MAGNET RD, MALVERN, AR 72104|3910 MAGNET RD, STE 206 MALVERN, AR 72104|0
|
34 |
+
15702 OBERLIN RD, RALEIGH, NC 27605|15702 OBERLIN RD FL 1, RALEIGH, NC 27605|1
|
35 |
+
14425 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|14325 ROOSOVELT AVE, LA JOLLA, CA 92092|0
|
36 |
+
14425 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|144-25 ROOSOVELT AVE APT 322, LA JOLLA, CA 92092|1
|
37 |
+
14425 ROOSOVELT AVE, LA JOLLA, CA 92092|144-25A ROOSOVELT AVENUE, LA JOLLA, CA 92092|0
|
eval.py
CHANGED
@@ -13,6 +13,12 @@ logger = logging.getLogger(__name__)
|
|
13 |
|
14 |
model_name = 'sentence-transformers/paraphrase-albert-base-v2'
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
model_sbert = SentenceTransformer(model_name)
|
18 |
|
|
|
13 |
|
14 |
model_name = 'sentence-transformers/paraphrase-albert-base-v2'
|
15 |
|
16 |
+
#model_name='output/training_OnlineConstrativeLoss-2023-03-11_23-47-34'
|
17 |
+
#model_name= 'output/training_OnlineConstrativeLoss-2023-03-14_01-24-44'
|
18 |
+
|
19 |
+
#86% so far
|
20 |
+
model_name = 'output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
|
21 |
+
|
22 |
|
23 |
model_sbert = SentenceTransformer(model_name)
|
24 |
|
quick_evaluate.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
|
2 |
from sklearn.metrics.pairwise import cosine_similarity
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
|
@@ -6,15 +6,16 @@ from sentence_transformers import SentenceTransformer
|
|
6 |
|
7 |
# base
|
8 |
# large
|
9 |
-
|
10 |
#model = AlbertModel.from_pretrained("albert-base-v2")
|
11 |
#'sentence-transformers/paraphrase-albert-base-v2'
|
12 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
|
13 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
|
14 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
|
15 |
-
model_name='output/training_OnlineConstrativeLoss-2023-03-
|
16 |
model_sbert = SentenceTransformer(model_name)
|
17 |
|
|
|
18 |
def get_sbert_embedding(input_text):
|
19 |
embedding = model_sbert.encode(input_text)
|
20 |
return embedding.tolist()
|
@@ -40,6 +41,17 @@ a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
|
40 |
a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
41 |
a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
#def get_embedding(input_text):
|
44 |
# encoded_input = tokenizer(input_text, return_tensors='pt')
|
45 |
# input_ids = encoded_input.input_ids
|
|
|
1 |
+
from transformers import AlbertTokenizer, AlbertModel
|
2 |
from sklearn.metrics.pairwise import cosine_similarity
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
|
|
|
6 |
|
7 |
# base
|
8 |
# large
|
9 |
+
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
10 |
#model = AlbertModel.from_pretrained("albert-base-v2")
|
11 |
#'sentence-transformers/paraphrase-albert-base-v2'
|
12 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-10_11-17-15'
|
13 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_00-24-35'
|
14 |
model_name = 'output/training_OnlineConstrativeLoss-2023-03-11_01-00-19'
|
15 |
+
model_name='output/training_OnlineConstrativeLoss-2023-03-17_16-10-39'
|
16 |
model_sbert = SentenceTransformer(model_name)
|
17 |
|
18 |
+
|
19 |
def get_sbert_embedding(input_text):
|
20 |
embedding = model_sbert.encode(input_text)
|
21 |
return embedding.tolist()
|
|
|
41 |
a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
42 |
a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
43 |
|
44 |
+
a19 = "THE PAVILION AT QUEENS FOR REHABILITAION AND NURSING 36-17 PARSONS BOULEVARD, FLUSHING, NY 11354"
|
45 |
+
a20 = "136-17 39TH AVENUE, 4TH FLOOR, SUITE CF-E, FLUSHING, NY 11354"
|
46 |
+
a21="WISDOM MEDICAL P.C., 136-20 38 TH AVE 6E, FLUSHING, NY 11354"
|
47 |
+
|
48 |
+
encoded_input = tokenizer(a21, return_tensors='pt')
|
49 |
+
input_ids = encoded_input.input_ids
|
50 |
+
input_num_tokens = input_ids.shape[1]
|
51 |
+
print(input_num_tokens)
|
52 |
+
list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
|
53 |
+
#
|
54 |
+
print( "Tokens : " + ' '.join(list_of_tokens))
|
55 |
#def get_embedding(input_text):
|
56 |
# encoded_input = tokenizer(input_text, return_tensors='pt')
|
57 |
# input_ids = encoded_input.input_ids
|