Spaces:
Runtime error
Runtime error
Feliks Zaslavskiy
committed on
Commit
·
ce71282
1
Parent(s):
2627f58
training data
Browse files
- data_set_training.csv +5 -0
- dev_set_training.csv +3 -1
- quick_evaluate.py +35 -19
data_set_training.csv
CHANGED
@@ -232,3 +232,8 @@ MEMORIAL SATILLA HEALTH, 1900 TEBEAU ST, WAYCROSS, GA 31501|1900 TEBEAU STREET,
|
|
232 |
VA MEDICAL CENTER 2002 HOLCOMBE BLVD, HOUSTON, TX 77030|VA MEDICAL CENTER 2002 HOLCOMBE BOULEVARD, HOUSTON, TX 77030|1
|
233 |
VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENNING RD, COLUMBUS, GA 31903|1
|
234 |
VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENNING RD, VALLEY HEALTHCARE SYSTEM, COLUMBUS, GA 31903|1
|
|
|
|
|
|
|
|
|
|
|
|
232 |
VA MEDICAL CENTER 2002 HOLCOMBE BLVD, HOUSTON, TX 77030|VA MEDICAL CENTER 2002 HOLCOMBE BOULEVARD, HOUSTON, TX 77030|1
|
233 |
VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENNING RD, COLUMBUS, GA 31903|1
|
234 |
VALLEY HEALTHCARE SYSTEM 1600 FORT BENNING RD, COLUMBUS, GA 31903|1600 FORT BENNING RD, VALLEY HEALTHCARE SYSTEM, COLUMBUS, GA 31903|1
|
235 |
+
315 22 BRAVERTON ST #110, EDGEWATER, MD 21037|315-22 BRAVERTON ST #110, EDGEWATER, MD 21037|1
|
236 |
+
165 10 VILLAGE DR W, UPPER MARLBORO, MD 20772|165 12 VILLAGE DR W, UPPER MARLBORO, MD 20772|0
|
237 |
+
345 12 OLD WASHINGTON RD, WALDORF, MD 20602|345-12 OLD WASHINGTON RD, WALDORF, MD 20602|1
|
238 |
+
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-11 ONYX CT, FREDERICKSBURG, VA 22407|0
|
239 |
+
144 12 ONYX CT, FREDERICKSBURG, VA 22407|144-12 ONYX CT, FREDERICKSBURG, VA 22407|1
|
dev_set_training.csv
CHANGED
@@ -24,4 +24,6 @@ ADDRESS1|ADDRESS2|ARE_SAME
|
|
24 |
87-44 ROUTE 13, CORTLANDVILLE, NY 13045|87 24 ROUTE 13, CORTLANDVILLE, NY 13045|0
|
25 |
872 ROUTE 13, CORTLANDVILLE, NY 13045|87-2 ROUTE 13, CORTLANDVILLE ,NY 13045|1
|
26 |
8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
|
27 |
-
HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
|
|
|
|
|
|
24 |
87-44 ROUTE 13, CORTLANDVILLE, NY 13045|87 24 ROUTE 13, CORTLANDVILLE, NY 13045|0
|
25 |
872 ROUTE 13, CORTLANDVILLE, NY 13045|87-2 ROUTE 13, CORTLANDVILLE ,NY 13045|1
|
26 |
8724 ROUTE 13, CORTLANDVILLE, NY 13045|87-24 ROUTE 13, CORTLANDVILLE, NY 13045|1
|
27 |
+
HEART HEALTH, 90 N COLUMBUS AVE, LOUISVILLE, MS 39339|90 N COLUMBUS AVE, LOUISVILLE, MS 39339|1
|
28 |
+
115 34 SHOREWAY DR, QUEENSTOWN, MD 21658|115-43 SHOREWAY DR, QUEENSTOWN, MD 21658|0
|
29 |
+
112 24 SHOREWAY DR, QUEENSTOWN, MD 21658|112-24 SHOREWAY DR, QUEENSTOWN, MD 21658|1
|
quick_evaluate.py
CHANGED
@@ -19,22 +19,27 @@ def get_sbert_embedding(input_text):
|
|
19 |
embedding = model_sbert.encode(input_text)
|
20 |
return embedding.tolist()
|
21 |
|
22 |
-
a1 = "65
|
23 |
-
a2 = "112
|
24 |
-
a3 = "1677 NJ-27 #2,
|
25 |
-
a4 = "5078 S
|
26 |
-
a5 = "65
|
27 |
-
a6 = "123
|
28 |
-
a7 = "440 TECHNOLOGY CENTER DRIVE,
|
29 |
-
a8 = "200
|
30 |
-
a8x= "87
|
31 |
-
a9 = "440
|
32 |
-
a10
|
33 |
-
a11="
|
34 |
-
a12="87
|
35 |
-
a13="87-
|
36 |
-
a14="257 37 US
|
37 |
-
a15="257-37 US
|
|
|
|
|
|
|
|
|
|
|
38 |
#def get_embedding(input_text):
|
39 |
# encoded_input = tokenizer(input_text, return_tensors='pt')
|
40 |
# input_ids = encoded_input.input_ids
|
@@ -63,7 +68,7 @@ e6 = get_sbert_embedding(a6)
|
|
63 |
e7 = get_sbert_embedding(a7)
|
64 |
e8 = get_sbert_embedding(a8)
|
65 |
e8x = get_sbert_embedding(a8x)
|
66 |
-
e9 = get_sbert_embedding(a9)
|
67 |
e10 = get_sbert_embedding(a10)
|
68 |
e11 = get_sbert_embedding(a11)
|
69 |
e12 = get_sbert_embedding(a12)
|
@@ -71,6 +76,10 @@ e13 = get_sbert_embedding(a13)
|
|
71 |
e14 = get_sbert_embedding(a14)
|
72 |
e15 = get_sbert_embedding(a15)
|
73 |
|
|
|
|
|
|
|
|
|
74 |
print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
|
75 |
print(cosine_similarity([e1], [e2]))
|
76 |
print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
|
@@ -83,8 +92,8 @@ print(cosine_similarity([e7], [e8]))
|
|
83 |
print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
|
84 |
print(cosine_similarity([e7], [e8x]))
|
85 |
|
86 |
-
print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
|
87 |
-
print(cosine_similarity([e7], [e9]))
|
88 |
|
89 |
print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
|
90 |
print(cosine_similarity([e7], [e10]))
|
@@ -97,6 +106,13 @@ print(cosine_similarity([e11], [e13]))
|
|
97 |
|
98 |
print(f"a14 \"{a14}\" to \"{a15}\" a15 - expected Same")
|
99 |
print(cosine_similarity([e14], [e15]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
# with base
|
101 |
#a1 to a2
|
102 |
#[[0.99512167]]
|
|
|
19 |
embedding = model_sbert.encode(input_text)
|
20 |
return embedding.tolist()
|
21 |
|
22 |
+
a1 = "65 MOUNTAIN BLVD EXT, WARREN, NJ 07059"
|
23 |
+
a2 = "112 MOUNTAIN BLVD EXT, WARREN, NJ 07059"
|
24 |
+
a3 = "1677 NJ-27 #2, EDISON, NJ 08817"
|
25 |
+
a4 = "5078 S MARYLAND PKWY, LAS VEGAS, NV 89119"
|
26 |
+
a5 = "65 MOUNTAIN BOULEVARD EXT, WARREN, NJ 07059"
|
27 |
+
a6 = "123 BROAD ST, NEW YORK, NY, 10304-2345"
|
28 |
+
a7 = "440 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
|
29 |
+
a8 = "200 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
|
30 |
+
a8x= "87 TECHNOLOGY CENTER DRIVE, BOSTON, MA 10034"
|
31 |
+
#a9 = "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
|
32 |
+
a10= "440 TECHNOLOGY CENTER DR., BOSTON, MA 10034"
|
33 |
+
a11="87-22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
34 |
+
a12="87 22 ROUTE 13, CORTLANDVILLE, NY 13045"
|
35 |
+
a13="87-55 ROUTE 13, CORTLANDVILLE, NY 13045"
|
36 |
+
a14="257 37 US RT 11, EVANS MILLS, NY 13637"
|
37 |
+
a15="257-37 US ROUTE 11, EVANS MILLS, NY 13637"
|
38 |
+
|
39 |
+
a16="15645 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
40 |
+
a17="156-45 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
41 |
+
a18="156-46 S MAIN ST SUITE D, PENNINGTON, NJ 08534"
|
42 |
+
|
43 |
#def get_embedding(input_text):
|
44 |
# encoded_input = tokenizer(input_text, return_tensors='pt')
|
45 |
# input_ids = encoded_input.input_ids
|
|
|
68 |
e7 = get_sbert_embedding(a7)
|
69 |
e8 = get_sbert_embedding(a8)
|
70 |
e8x = get_sbert_embedding(a8x)
|
71 |
+
#e9 = get_sbert_embedding(a9)
|
72 |
e10 = get_sbert_embedding(a10)
|
73 |
e11 = get_sbert_embedding(a11)
|
74 |
e12 = get_sbert_embedding(a12)
|
|
|
76 |
e14 = get_sbert_embedding(a14)
|
77 |
e15 = get_sbert_embedding(a15)
|
78 |
|
79 |
+
e16 = get_sbert_embedding(a16)
|
80 |
+
e17 = get_sbert_embedding(a17)
|
81 |
+
e18 = get_sbert_embedding(a18)
|
82 |
+
|
83 |
print(f"a1 \"{a1}\" to \"{a2}\" a2 - expected Different")
|
84 |
print(cosine_similarity([e1], [e2]))
|
85 |
print(f"a1 \"{a1}\" to \"{a4}\" a4 - expected Different")
|
|
|
92 |
print(f"a7 \"{a7}\" to \"{a8x}\" a8x - expected Different")
|
93 |
print(cosine_similarity([e7], [e8x]))
|
94 |
|
95 |
+
#print(f"a7 \"{a7}\" to \"{a9}\" a9 - expected Same")
|
96 |
+
#print(cosine_similarity([e7], [e9]))
|
97 |
|
98 |
print(f"a7 \"{a7}\" to \"{a10}\" a10 - expected Same")
|
99 |
print(cosine_similarity([e7], [e10]))
|
|
|
106 |
|
107 |
print(f"a14 \"{a14}\" to \"{a15}\" a15 - expected Same")
|
108 |
print(cosine_similarity([e14], [e15]))
|
109 |
+
|
110 |
+
print(f"a16 \"{a16}\" to \"{a17}\" a17 - expected Same")
|
111 |
+
print(cosine_similarity([e16], [e17]))
|
112 |
+
|
113 |
+
print(f"a16 \"{a16}\" to \"{a18}\" a18 - expected Different")
|
114 |
+
print(cosine_similarity([e16], [e18]))
|
115 |
+
|
116 |
# with base
|
117 |
#a1 to a2
|
118 |
#[[0.99512167]]
|