Switched default dataset; only show example duplicates that have near-duplicate records
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ from semhash.datamodels import DeduplicationResult
|
|
8 |
from model2vec import StaticModel
|
9 |
|
10 |
# Default parameters
|
11 |
-
default_dataset_name = "
|
12 |
default_dataset1_split = "train"
|
13 |
default_dataset2_split = "test"
|
14 |
default_text_column = "text"
|
@@ -96,9 +96,12 @@ def perform_deduplication(
|
|
96 |
# Show example duplicates
|
97 |
if num_duplicates > 0:
|
98 |
result_text += "**Example duplicates:**\n\n"
|
99 |
-
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
102 |
orig_text, score = duprec.duplicates[0]
|
103 |
differences = display_word_differences(orig_text, dup_text)
|
104 |
result_text += (
|
@@ -108,13 +111,8 @@ def perform_deduplication(
|
|
108 |
f"**Differences:**\n{differences}\n"
|
109 |
+ "-" * 50 + "\n\n"
|
110 |
)
|
111 |
-
|
112 |
-
|
113 |
-
result_text += (
|
114 |
-
f"**Duplicate:**\n{dup_text}\n\n"
|
115 |
-
"No near-duplicate details available.\n"
|
116 |
-
+ "-" * 50 + "\n\n"
|
117 |
-
)
|
118 |
else:
|
119 |
result_text += "No duplicates found."
|
120 |
|
@@ -145,9 +143,12 @@ def perform_deduplication(
|
|
145 |
|
146 |
if num_duplicates > 0:
|
147 |
result_text += "**Example duplicates from Dataset 2:**\n\n"
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
151 |
orig_text, score = duprec.duplicates[0]
|
152 |
differences = display_word_differences(orig_text, dup_text)
|
153 |
result_text += (
|
@@ -157,12 +158,8 @@ def perform_deduplication(
|
|
157 |
f"**Differences:**\n{differences}\n"
|
158 |
+ "-" * 50 + "\n\n"
|
159 |
)
|
160 |
-
|
161 |
-
|
162 |
-
f"**Potential Duplicate (Dataset 2):**\n{dup_text}\n\n"
|
163 |
-
"No near-duplicate details available.\n"
|
164 |
-
+ "-" * 50 + "\n\n"
|
165 |
-
)
|
166 |
else:
|
167 |
result_text += "No duplicates found."
|
168 |
|
@@ -232,4 +229,3 @@ with gr.Blocks(theme=gr.themes.Ocean(), css="#status_output { height: 50px; over
|
|
232 |
)
|
233 |
|
234 |
demo.launch()
|
235 |
-
|
|
|
8 |
from model2vec import StaticModel
|
9 |
|
10 |
# Default parameters
|
11 |
+
default_dataset_name = "SetFit/amazon_massive_scenario_en-US"
|
12 |
default_dataset1_split = "train"
|
13 |
default_dataset2_split = "test"
|
14 |
default_text_column = "text"
|
|
|
96 |
# Show example duplicates
|
97 |
if num_duplicates > 0:
|
98 |
result_text += "**Example duplicates:**\n\n"
|
99 |
+
|
100 |
+
# Only show duplicates that actually have near-duplicate records
|
101 |
+
duplicates_with_data = [duprec for duprec in result.duplicates if duprec.duplicates]
|
102 |
+
if duplicates_with_data:
|
103 |
+
for duprec in duplicates_with_data[:5]:
|
104 |
+
dup_text = duprec.record
|
105 |
orig_text, score = duprec.duplicates[0]
|
106 |
differences = display_word_differences(orig_text, dup_text)
|
107 |
result_text += (
|
|
|
111 |
f"**Differences:**\n{differences}\n"
|
112 |
+ "-" * 50 + "\n\n"
|
113 |
)
|
114 |
+
else:
|
115 |
+
result_text += "No near-duplicate details available.\n\n"
|
|
|
|
|
|
|
|
|
|
|
116 |
else:
|
117 |
result_text += "No duplicates found."
|
118 |
|
|
|
143 |
|
144 |
if num_duplicates > 0:
|
145 |
result_text += "**Example duplicates from Dataset 2:**\n\n"
|
146 |
+
|
147 |
+
# Again, only show duplicates that actually have near-duplicate records
|
148 |
+
duplicates_with_data = [duprec for duprec in result.duplicates if duprec.duplicates]
|
149 |
+
if duplicates_with_data:
|
150 |
+
for duprec in duplicates_with_data[:5]:
|
151 |
+
dup_text = duprec.record # The "duplicate" text from dataset2
|
152 |
orig_text, score = duprec.duplicates[0]
|
153 |
differences = display_word_differences(orig_text, dup_text)
|
154 |
result_text += (
|
|
|
158 |
f"**Differences:**\n{differences}\n"
|
159 |
+ "-" * 50 + "\n\n"
|
160 |
)
|
161 |
+
else:
|
162 |
+
result_text += "No near-duplicate details available.\n\n"
|
|
|
|
|
|
|
|
|
163 |
else:
|
164 |
result_text += "No duplicates found."
|
165 |
|
|
|
229 |
)
|
230 |
|
231 |
demo.launch()
|
|