liujch1998 commited on
Commit
3649303
1 Parent(s): 555cd42

Sync changes

Browse files
Files changed (2) hide show
  1. app.py +137 -65
  2. constants.py +9 -4
app.py CHANGED
@@ -49,18 +49,21 @@ def format_tokenization_info(result):
49
  ttt.append(tt)
50
  output = '\n\n'.join(ttt)
51
  return output
52
- def format_doc(doc):
53
- formatted = []
54
  if doc['doc_len'] == doc['disp_len']:
55
- header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens]\n\n'
56
  else:
57
- header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)]\n\n'
58
- formatted.append((header, None))
59
- formatted += doc['spans']
60
  return formatted
61
 
62
- def count(index_desc, query):
63
- result = process('count', index_desc, query=query)
 
 
 
64
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
65
  tokenization_info = format_tokenization_info(result)
66
  if 'error' in result:
@@ -81,8 +84,8 @@ def prob(index_desc, query):
81
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
82
  return latency, tokenization_info, prob
83
 
84
- def ntd(index_desc, query):
85
- result = process('ntd', index_desc, query=query)
86
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
87
  tokenization_info = format_tokenization_info(result)
88
  if 'error' in result:
@@ -108,8 +111,8 @@ def infgram_prob(index_desc, query):
108
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
109
  return latency, tokenization_info, longest_suffix, prob
110
 
111
- def infgram_ntd(index_desc, query):
112
- result = process('infgram_ntd', index_desc, query=query)
113
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
114
  tokenization_info = format_tokenization_info(result)
115
  if 'error' in result:
@@ -123,21 +126,28 @@ def infgram_ntd(index_desc, query):
123
  ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
124
  return latency, tokenization_info, longest_suffix, ntd
125
 
126
- def search_docs(index_desc, query, maxnum):
127
- result = process('search_docs', index_desc, query=query, maxnum=maxnum)
 
 
 
128
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
129
  tokenization_info = format_tokenization_info(result)
130
  if 'error' in result:
131
  message = result['error']
 
132
  docs = [[] for _ in range(MAXNUM)]
133
  else:
134
  message = result['message']
135
- docs = result['documents']
136
- docs = [format_doc(doc) for doc in docs]
 
137
  docs = docs[:maxnum]
 
 
138
  while len(docs) < MAXNUM:
139
  docs.append([])
140
- return tuple([latency, tokenization_info, message] + docs)
141
 
142
  with gr.Blocks() as demo:
143
  with gr.Column():
@@ -151,34 +161,62 @@ with gr.Blocks() as demo:
151
  '''
152
  )
153
  with gr.Row():
154
- with gr.Column(scale=1):
155
  index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
156
 
157
  with gr.Column(scale=7):
158
  with gr.Tab('1. Count an n-gram'):
159
  with gr.Column():
160
  gr.HTML('<h2>1. Count an n-gram</h2>')
161
- gr.HTML('<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus.</p>')
162
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is Cnt(natural language processing))</p>')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  with gr.Row():
164
  with gr.Column(scale=1):
165
  count_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
 
 
 
 
166
  with gr.Row():
167
  count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
168
  count_submit = gr.Button(value='Submit', variant='primary', visible=True)
169
  count_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
170
- count_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
171
  with gr.Column(scale=1):
172
  count_count = gr.Label(label='Count', num_top_classes=0)
173
  count_clear.add([count_query, count_latency, count_tokenized, count_count])
174
- count_submit.click(count, inputs=[index_desc, count_query], outputs=[count_latency, count_tokenized, count_count], api_name=False)
175
 
176
  with gr.Tab('2. Prob of the last token'):
177
  with gr.Column():
178
  gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
179
- gr.HTML('<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. the (n-1)-gram).</p>')
180
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and taking the division between the two)</p>')
181
- gr.HTML('<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</p>')
 
 
 
 
 
 
 
182
  with gr.Row():
183
  with gr.Column(scale=1):
184
  prob_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
@@ -186,7 +224,7 @@ with gr.Blocks() as demo:
186
  prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
187
  prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
188
  prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
189
- prob_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
190
  with gr.Column(scale=1):
191
  prob_probability = gr.Label(label='Probability', num_top_classes=0)
192
  prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
@@ -195,28 +233,46 @@ with gr.Blocks() as demo:
195
  with gr.Tab('3. Next-token distribution'):
196
  with gr.Column():
197
  gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
198
- gr.HTML('<p style="font-size: 16px;">This is an extension of the Query 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>')
199
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
200
- gr.HTML(f'<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear. If the (n-1)-gram appears more than {MAX_SUPPORT} times in the corpus, the result will be approximate.</p>')
 
 
 
 
 
 
 
 
 
201
  with gr.Row():
202
  with gr.Column(scale=1):
203
  ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
 
 
204
  with gr.Row():
205
  ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
206
  ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
207
  ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
208
- ntd_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
209
  with gr.Column(scale=1):
210
  ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
211
  ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
212
- ntd_submit.click(ntd, inputs=[index_desc, ntd_query], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)
213
 
214
  with gr.Tab('4. ∞-gram prob'):
215
  with gr.Column():
216
  gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
217
- gr.HTML('<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>')
218
- gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (the output is P(processing | natural language), because "natural language" appears in the corpus but "love natural language" doesn\'t; in this case the effective n = 3)</p>')
219
- gr.HTML('<p style="font-size: 16px;">Note: It may be possible that the effective n = 1, in which case it reduces to the uni-gram probability of the last token.</p>')
 
 
 
 
 
 
 
220
  with gr.Row():
221
  with gr.Column(scale=1):
222
  infgram_prob_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
@@ -224,7 +280,7 @@ with gr.Blocks() as demo:
224
  infgram_prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
225
  infgram_prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
226
  infgram_prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
227
- infgram_prob_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
228
  infgram_prob_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
229
  with gr.Column(scale=1):
230
  infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
@@ -234,61 +290,77 @@ with gr.Blocks() as demo:
234
  with gr.Tab('5. ∞-gram next-token distribution'):
235
  with gr.Column():
236
  gr.HTML('<h2>5. Compute the ∞-gram next-token distribution</h2>')
237
- gr.HTML('<p style="font-size: 16px;">This is similar to Query 3, but with ∞-gram instead of n-gram.</p>')
238
- gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
 
 
 
239
  with gr.Row():
240
  with gr.Column(scale=1):
241
  infgram_ntd_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
 
 
242
  with gr.Row():
243
  infgram_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
244
  infgram_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
245
  infgram_ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
246
- infgram_ntd_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
247
  infgram_ntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
248
  with gr.Column(scale=1):
249
  infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
250
  infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
251
- infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
252
 
253
  with gr.Tab('6. Search documents'):
254
  with gr.Column():
255
- gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>
256
- <p style="font-size: 16px;">This displays a few random documents in the corpus that satisfy your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
257
- <p style="font-size: 16px;">Example queries:</p>
258
- <ul style="font-size: 16px;">
259
- <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
260
- <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
261
- <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
262
- </ul>
263
- <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
264
- <p style="font-size: 16px;">A few notes:</p>
265
- <ul style="font-size: 16px;">
266
- <li>If the document is too long, it will be truncated to {MAX_DISP_LEN} tokens.</li>
267
- <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
268
- <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} terms.</li>
269
- <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
270
- <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
271
- <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ} matches, we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjunction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
272
- </ul>
273
- <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
274
- ''')
 
 
 
 
275
  with gr.Row():
276
- with gr.Column(scale=2):
277
  search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
278
- search_docs_maxnum = gr.Slider(minimum=1, maximum=MAXNUM, value=1, step=1, label='Number of documents to display')
 
 
 
 
 
279
  with gr.Row():
280
  search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
281
  search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True)
282
  search_docs_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
283
- search_docs_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
284
- with gr.Column(scale=3):
285
  search_docs_message = gr.Label(label='Message', num_top_classes=0)
 
286
  search_docs_outputs = []
287
  for i in range(MAXNUM):
288
  with gr.Tab(label=str(i+1)):
 
289
  search_docs_outputs.append(gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}))
290
- search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_outputs)
291
- search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_outputs, api_name=False)
292
 
293
  with gr.Row():
294
  gr.Markdown('''
 
49
  ttt.append(tt)
50
  output = '\n\n'.join(ttt)
51
  return output
52
+ def format_doc_metadata(doc):
53
+ formatted = f'Document #{doc["doc_ix"]}\n'
54
  if doc['doc_len'] == doc['disp_len']:
55
+ formatted += f'Length: {doc["doc_len"]} tokens\n'
56
  else:
57
+ formatted += f'Length: {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)\n'
58
+ metadata = doc['metadata'].strip("\n")
59
+ formatted += f'Metadata: {metadata}'
60
  return formatted
61
 
62
+ def count(index_desc, query, max_clause_freq, max_diff_tokens):
63
+ if ' AND ' in query or ' OR ' in query: # CNF query
64
+ result = process('count', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
65
+ else: # simple query
66
+ result = process('count', index_desc, query=query)
67
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
68
  tokenization_info = format_tokenization_info(result)
69
  if 'error' in result:
 
84
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
85
  return latency, tokenization_info, prob
86
 
87
+ def ntd(index_desc, query, max_support):
88
+ result = process('ntd', index_desc, query=query, max_support=max_support)
89
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
90
  tokenization_info = format_tokenization_info(result)
91
  if 'error' in result:
 
111
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
112
  return latency, tokenization_info, longest_suffix, prob
113
 
114
+ def infgram_ntd(index_desc, query, max_support):
115
+ result = process('infgram_ntd', index_desc, query=query, max_support=max_support)
116
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
117
  tokenization_info = format_tokenization_info(result)
118
  if 'error' in result:
 
126
  ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
127
  return latency, tokenization_info, longest_suffix, ntd
128
 
129
+ def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_diff_tokens):
130
+ if ' AND ' in query or ' OR ' in query: # CNF query
131
+ result = process('search_docs', index_desc, query=query, maxnum=maxnum, max_disp_len=max_disp_len, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
132
+ else: # simple query
133
+ result = process('search_docs', index_desc, query=query, maxnum=maxnum, max_disp_len=max_disp_len)
134
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
135
  tokenization_info = format_tokenization_info(result)
136
  if 'error' in result:
137
  message = result['error']
138
+ metadatas = ['' for _ in range(MAXNUM)]
139
  docs = [[] for _ in range(MAXNUM)]
140
  else:
141
  message = result['message']
142
+ metadatas = [format_doc_metadata(doc) for doc in result['documents']]
143
+ docs = [doc['spans'] for doc in result['documents']]
144
+ metadatas = metadatas[:maxnum]
145
  docs = docs[:maxnum]
146
+ while len(metadatas) < MAXNUM:
147
+ metadatas.append('')
148
  while len(docs) < MAXNUM:
149
  docs.append([])
150
+ return tuple([latency, tokenization_info, message] + metadatas + docs)
151
 
152
  with gr.Blocks() as demo:
153
  with gr.Column():
 
161
  '''
162
  )
163
  with gr.Row():
164
+ with gr.Column(scale=1, min_width=240):
165
  index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
166
 
167
  with gr.Column(scale=7):
168
  with gr.Tab('1. Count an n-gram'):
169
  with gr.Column():
170
  gr.HTML('<h2>1. Count an n-gram</h2>')
171
+ with gr.Accordion(label='Click to view instructions', open=False):
172
+ gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
173
+ <br>
174
+ <p style="font-size: 16px;">Example queries:</p>
175
+ <ul style="font-size: 16px;">
176
+ <li><b>natural language processing</b> (the output is number of occurrences of "natural language processing")</li>
177
+ <li><b>natural language processing AND deep learning</b> (the output is the number of co-occurrences of "natural language processing" and "deep learning")</li>
178
+ <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the output is the number of co-occurrences of [one of "natural language processing" / "artificial intelligence"] and [one of "deep learning" / "machine learning"])</li>
179
+ </ul>
180
+ <br>
181
+ <p style="font-size: 16px;">Notes on CNF queries:</p>
182
+ <ul style="font-size: 16px;">
183
+ <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
184
+ <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
185
+ <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
186
+ <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} documents out of all documents containing that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
187
+ <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
188
+ </ul>
189
+ ''')
190
  with gr.Row():
191
  with gr.Column(scale=1):
192
  count_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
193
+ with gr.Accordion(label='Advanced options', open=False):
194
+ with gr.Row():
195
+ count_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
196
+ count_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
197
  with gr.Row():
198
  count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
199
  count_submit = gr.Button(value='Submit', variant='primary', visible=True)
200
  count_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
201
+ count_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
202
  with gr.Column(scale=1):
203
  count_count = gr.Label(label='Count', num_top_classes=0)
204
  count_clear.add([count_query, count_latency, count_tokenized, count_count])
205
+ count_submit.click(count, inputs=[index_desc, count_query, count_max_clause_freq, count_max_diff_tokens], outputs=[count_latency, count_tokenized, count_count], api_name=False)
206
 
207
  with gr.Tab('2. Prob of the last token'):
208
  with gr.Column():
209
  gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
210
+ with gr.Accordion(label='Click to view instructions', open=False):
211
+ gr.HTML(f'''<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. the (n-1)-gram).</p>
212
+ <br>
213
+ <p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and taking the division between the two)</p>
214
+ <br>
215
+ <p style="font-size: 16px;">Notes:</p>
216
+ <ul style="font-size: 16px;">
217
+ <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
218
+ </ul>
219
+ ''')
220
  with gr.Row():
221
  with gr.Column(scale=1):
222
  prob_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
 
224
  prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
225
  prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
226
  prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
227
+ prob_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
228
  with gr.Column(scale=1):
229
  prob_probability = gr.Label(label='Probability', num_top_classes=0)
230
  prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
 
233
  with gr.Tab('3. Next-token distribution'):
234
  with gr.Column():
235
  gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
236
+ with gr.Accordion(label='Click to view instructions', open=False):
237
+ gr.HTML(f'''<p style="font-size: 16px;">This is an extension of the Query Type 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>
238
+ <br>
239
+ <p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>
240
+ <br>
241
+ <p style="font-size: 16px;">Notes:</p>
242
+ <ul style="font-size: 16px;">
243
+ <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
244
+ <li>If the (n-1)-gram appears more than {max_support} times in the corpus, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
245
+ </ul>
246
+ ''')
247
+
248
  with gr.Row():
249
  with gr.Column(scale=1):
250
  ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
251
+ with gr.Accordion(label='Advanced options', open=False):
252
+ ntd_max_support = gr.Slider(minimum=1, maximum=MAX_SUPPORT, value=MAX_SUPPORT, step=1, label='max_support')
253
  with gr.Row():
254
  ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
255
  ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
256
  ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
257
+ ntd_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
258
  with gr.Column(scale=1):
259
  ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
260
  ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
261
+ ntd_submit.click(ntd, inputs=[index_desc, ntd_query, ntd_max_support], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)
262
 
263
  with gr.Tab('4. ∞-gram prob'):
264
  with gr.Column():
265
  gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
266
+ with gr.Accordion(label='Click to view instructions', open=False):
267
+ gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>
268
+ <br>
269
+ <p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
270
+ <br>
271
+ <p style="font-size: 16px;">Notes:</p>
272
+ <ul style="font-size: 16px;">
273
+ <li>It may be possible that the effective n = 1, i.e. longest found suffix is empty, in which case it reduces to the uni-gram probability of the last token.</li>
274
+ </ul>
275
+ ''')
276
  with gr.Row():
277
  with gr.Column(scale=1):
278
  infgram_prob_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
 
280
  infgram_prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
281
  infgram_prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
282
  infgram_prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
283
+ infgram_prob_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
284
  infgram_prob_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
285
  with gr.Column(scale=1):
286
  infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
 
290
  with gr.Tab('5. ∞-gram next-token distribution'):
291
  with gr.Column():
292
  gr.HTML('<h2>5. Compute the ∞-gram next-token distribution</h2>')
293
+ with gr.Accordion(label='Click to view instructions', open=False):
294
+ gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
295
+ <br>
296
+ <p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
297
+ ''')
298
  with gr.Row():
299
  with gr.Column(scale=1):
300
  infgram_ntd_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
301
+ with gr.Accordion(label='Advanced options', open=False):
302
+ infgram_ntd_max_support = gr.Slider(minimum=1, maximum=MAX_SUPPORT, value=MAX_SUPPORT, step=1, label='max_support')
303
  with gr.Row():
304
  infgram_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
305
  infgram_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
306
  infgram_ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
307
+ infgram_ntd_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
308
  infgram_ntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
309
  with gr.Column(scale=1):
310
  infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
311
  infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
312
+ infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
313
 
314
  with gr.Tab('6. Search documents'):
315
  with gr.Column():
316
+ gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
317
+ with gr.Accordion(label='Click to view instructions', open=False):
318
+ gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the corpus that satisfy your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
319
+ <br>
320
+ <p style="font-size: 16px;">Example queries:</p>
321
+ <ul style="font-size: 16px;">
322
+ <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
323
+ <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
324
+ <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
325
+ </ul>
326
+ <br>
327
+ <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
328
+ <br>
329
+ <p style="font-size: 16px;">Notes on CNF queries:</p>
330
+ <ul style="font-size: 16px;">
331
+ <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
332
+ <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
333
+ <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
334
+ <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} documents out of all documents containing that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
335
+ <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
336
+ </ul>
337
+ <br>
338
+ <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
339
+ ''')
340
  with gr.Row():
341
+ with gr.Column(scale=1):
342
  search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
343
+ search_docs_maxnum = gr.Slider(minimum=1, maximum=MAXNUM, value=maxnum, step=1, label='Number of documents to display')
344
+ search_docs_max_disp_len = gr.Slider(minimum=1, maximum=MAX_DISP_LEN, value=max_disp_len, step=1, label='Number of tokens to display')
345
+ with gr.Accordion(label='Advanced options', open=False):
346
+ with gr.Row():
347
+ search_docs_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
348
+ search_docs_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
349
  with gr.Row():
350
  search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
351
  search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True)
352
  search_docs_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
353
+ search_docs_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
354
+ with gr.Column(scale=2):
355
  search_docs_message = gr.Label(label='Message', num_top_classes=0)
356
+ search_docs_metadatas = []
357
  search_docs_outputs = []
358
  for i in range(MAXNUM):
359
  with gr.Tab(label=str(i+1)):
360
+ search_docs_metadatas.append(gr.Textbox(label='Metadata', lines=3, interactive=False))
361
  search_docs_outputs.append(gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}))
362
+ search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
363
+ search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
364
 
365
  with gr.Row():
366
  gr.Markdown('''
constants.py CHANGED
@@ -22,15 +22,20 @@ INDEX_BY_DESC = {
22
  }
23
  INDEX_DESCS = list(INDEX_BY_DESC.keys())
24
 
25
- # API limits
26
  MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
27
  MAX_CLAUSES_PER_CNF = int(os.environ.get('MAX_CLAUSES_PER_CNF', 4))
28
  MAX_TERMS_PER_CLAUSE = int(os.environ.get('MAX_TERMS_PER_CLAUSE', 4))
 
29
  MAX_SUPPORT = int(os.environ.get('MAX_SUPPORT', 1000))
30
- MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 50000))
31
- MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
 
 
 
32
  MAXNUM = int(os.environ.get('MAXNUM', 10))
33
- MAX_DISP_LEN = int(os.environ.get('MAX_DISP_LEN', 5000))
 
34
 
35
  # HF demo
36
  API_URL = os.environ.get('API_URL', None)
 
22
  }
23
  INDEX_DESCS = list(INDEX_BY_DESC.keys())
24
 
25
+ # API limits and defaults
26
  MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
27
  MAX_CLAUSES_PER_CNF = int(os.environ.get('MAX_CLAUSES_PER_CNF', 4))
28
  MAX_TERMS_PER_CLAUSE = int(os.environ.get('MAX_TERMS_PER_CLAUSE', 4))
29
+ max_support = int(os.environ.get('max_support', 1000))
30
  MAX_SUPPORT = int(os.environ.get('MAX_SUPPORT', 1000))
31
+ max_clause_freq = int(os.environ.get('max_clause_freq', 50000))
32
+ MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 500000))
33
+ max_diff_tokens = int(os.environ.get('max_diff_tokens', 100))
34
+ MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 1000))
35
+ maxnum = int(os.environ.get('maxnum', 1))
36
  MAXNUM = int(os.environ.get('MAXNUM', 10))
37
+ max_disp_len = int(os.environ.get('max_disp_len', 1000))
38
+ MAX_DISP_LEN = int(os.environ.get('MAX_DISP_LEN', 10000))
39
 
40
  # HF demo
41
  API_URL = os.environ.get('API_URL', None)