Pringled committed on
Commit
c8fad0f
·
unverified ·
1 Parent(s): b54da62

Switched default dataset

Browse files
Files changed (1) hide show
  1. app.py +17 -21
app.py CHANGED
@@ -8,7 +8,7 @@ from semhash.datamodels import DeduplicationResult
8
  from model2vec import StaticModel
9
 
10
  # Default parameters
11
- default_dataset_name = "ag_news"
12
  default_dataset1_split = "train"
13
  default_dataset2_split = "test"
14
  default_text_column = "text"
@@ -96,9 +96,12 @@ def perform_deduplication(
96
  # Show example duplicates
97
  if num_duplicates > 0:
98
  result_text += "**Example duplicates:**\n\n"
99
- for duprec in result.duplicates[:5]:
100
- dup_text = duprec.record
101
- if duprec.duplicates:
 
 
 
102
  orig_text, score = duprec.duplicates[0]
103
  differences = display_word_differences(orig_text, dup_text)
104
  result_text += (
@@ -108,13 +111,8 @@ def perform_deduplication(
108
  f"**Differences:**\n{differences}\n"
109
  + "-" * 50 + "\n\n"
110
  )
111
- else:
112
- # Possibly an exact duplicate cluster
113
- result_text += (
114
- f"**Duplicate:**\n{dup_text}\n\n"
115
- "No near-duplicate details available.\n"
116
- + "-" * 50 + "\n\n"
117
- )
118
  else:
119
  result_text += "No duplicates found."
120
 
@@ -145,9 +143,12 @@ def perform_deduplication(
145
 
146
  if num_duplicates > 0:
147
  result_text += "**Example duplicates from Dataset 2:**\n\n"
148
- for duprec in result.duplicates[:5]:
149
- dup_text = duprec.record # The "duplicate" text from dataset2
150
- if duprec.duplicates:
 
 
 
151
  orig_text, score = duprec.duplicates[0]
152
  differences = display_word_differences(orig_text, dup_text)
153
  result_text += (
@@ -157,12 +158,8 @@ def perform_deduplication(
157
  f"**Differences:**\n{differences}\n"
158
  + "-" * 50 + "\n\n"
159
  )
160
- else:
161
- result_text += (
162
- f"**Potential Duplicate (Dataset 2):**\n{dup_text}\n\n"
163
- "No near-duplicate details available.\n"
164
- + "-" * 50 + "\n\n"
165
- )
166
  else:
167
  result_text += "No duplicates found."
168
 
@@ -232,4 +229,3 @@ with gr.Blocks(theme=gr.themes.Ocean(), css="#status_output { height: 50px; over
232
  )
233
 
234
  demo.launch()
235
-
 
8
  from model2vec import StaticModel
9
 
10
  # Default parameters
11
+ default_dataset_name = "SetFit/amazon_massive_scenario_en-US"
12
  default_dataset1_split = "train"
13
  default_dataset2_split = "test"
14
  default_text_column = "text"
 
96
  # Show example duplicates
97
  if num_duplicates > 0:
98
  result_text += "**Example duplicates:**\n\n"
99
+
100
+ # Only show duplicates that actually have near-duplicate records
101
+ duplicates_with_data = [duprec for duprec in result.duplicates if duprec.duplicates]
102
+ if duplicates_with_data:
103
+ for duprec in duplicates_with_data[:5]:
104
+ dup_text = duprec.record
105
  orig_text, score = duprec.duplicates[0]
106
  differences = display_word_differences(orig_text, dup_text)
107
  result_text += (
 
111
  f"**Differences:**\n{differences}\n"
112
  + "-" * 50 + "\n\n"
113
  )
114
+ else:
115
+ result_text += "No near-duplicate details available.\n\n"
 
 
 
 
 
116
  else:
117
  result_text += "No duplicates found."
118
 
 
143
 
144
  if num_duplicates > 0:
145
  result_text += "**Example duplicates from Dataset 2:**\n\n"
146
+
147
+ # Again, only show duplicates that actually have near-duplicate records
148
+ duplicates_with_data = [duprec for duprec in result.duplicates if duprec.duplicates]
149
+ if duplicates_with_data:
150
+ for duprec in duplicates_with_data[:5]:
151
+ dup_text = duprec.record # The "duplicate" text from dataset2
152
  orig_text, score = duprec.duplicates[0]
153
  differences = display_word_differences(orig_text, dup_text)
154
  result_text += (
 
158
  f"**Differences:**\n{differences}\n"
159
  + "-" * 50 + "\n\n"
160
  )
161
+ else:
162
+ result_text += "No near-duplicate details available.\n\n"
 
 
 
 
163
  else:
164
  result_text += "No duplicates found."
165
 
 
229
  )
230
 
231
  demo.launch()