Spaces:
Running
Running
liujch1998
committed on
Commit
·
8c4a00c
1
Parent(s):
3649303
Sync changes
Browse files- app.py +127 -2
- constants.py +1 -0
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
import datetime
|
3 |
import json
|
|
|
4 |
import requests
|
5 |
from constants import *
|
6 |
|
@@ -149,6 +150,83 @@ def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_di
|
|
149 |
docs.append([])
|
150 |
return tuple([latency, tokenization_info, message] + metadatas + docs)
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
with gr.Blocks() as demo:
|
153 |
with gr.Column():
|
154 |
gr.HTML(
|
@@ -183,7 +261,7 @@ with gr.Blocks() as demo:
|
|
183 |
<li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
|
184 |
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
|
185 |
<li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
|
186 |
-
<li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq}
|
187 |
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
188 |
</ul>
|
189 |
''')
|
@@ -311,7 +389,7 @@ with gr.Blocks() as demo:
|
|
311 |
infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
|
312 |
infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
|
313 |
|
314 |
-
with gr.Tab('6. Search documents'):
|
315 |
with gr.Column():
|
316 |
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
|
317 |
with gr.Accordion(label='Click to view instructions', open=False):
|
@@ -362,6 +440,53 @@ with gr.Blocks() as demo:
|
|
362 |
search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
|
363 |
search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
|
364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
365 |
with gr.Row():
|
366 |
gr.Markdown('''
|
367 |
If you find this tool useful, please kindly cite our paper:
|
|
|
1 |
import gradio as gr
|
2 |
import datetime
|
3 |
import json
|
4 |
+
import random
|
5 |
import requests
|
6 |
from constants import *
|
7 |
|
|
|
150 |
docs.append([])
|
151 |
return tuple([latency, tokenization_info, message] + metadatas + docs)
|
152 |
|
153 |
+
find_result = None
|
154 |
+
|
155 |
+
def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens):
    """Run a document search and return one randomly chosen matching document.

    A query containing ' AND ' or ' OR ' is sent to the 'find_cnf' endpoint;
    anything else is sent to 'find'. The raw response is stored in the
    module-level ``find_result`` (tagged with a 'type' of 'cnf' or 'simple')
    so that ``get_another_doc`` can later fetch other documents from it.

    Returns a 6-tuple ``(latency, tokenization_info, message, idx, metadata,
    doc)``. ``idx`` is a ``gr.Number`` describing the valid index range; it is
    non-interactive when the response carries an error or nothing is
    retrievable.
    """
    global find_result

    def _no_document(text):
        # Shared exit for "error" and "zero hits": index control locked at 0,
        # empty metadata and empty document.
        locked_idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
        return latency, tokenization_info, text, locked_idx, '', []

    is_cnf = ' AND ' in query or ' OR ' in query
    if is_cnf:
        find_result = process(
            'find_cnf',
            index_desc,
            query=query,
            max_clause_freq=max_clause_freq,
            max_diff_tokens=max_diff_tokens,
        )
    else:
        find_result = process('find', index_desc, query=query)
    find_result['type'] = 'cnf' if is_cnf else 'simple'

    latency = f'{find_result["latency"]:.3f}' if 'latency' in find_result else ''
    tokenization_info = format_tokenization_info(find_result)

    if 'error' in find_result:
        return _no_document(find_result['error'])

    if is_cnf:
        # Only occurrences with a stored pointer can actually be displayed.
        cnt_retrievable = sum(len(ptrs) for ptrs in find_result['ptrs_by_shard'])
        if find_result["approx"]:
            message = f'Approximately {find_result["cnt"]} occurrences found, of which {cnt_retrievable} are retrievable'
        else:
            message = f'{find_result["cnt"]} occurrences found'
    else:
        cnt_retrievable = find_result['cnt']
        message = f'{cnt_retrievable} occurrences found'

    if cnt_retrievable == 0:
        return _no_document(message)

    last = cnt_retrievable - 1
    picked = random.randint(0, last)
    metadata, doc = get_another_doc(index_desc, picked, max_disp_len)
    idx = gr.Number(minimum=0, maximum=last, step=1, value=picked, interactive=True)
    return latency, tokenization_info, message, idx, metadata, doc
|
191 |
+
|
192 |
+
def clear_search_docs_new():
    """Forget the cached search result and return a locked index control.

    Sets the module-level ``find_result`` back to ``None`` and returns a
    non-interactive ``gr.Number`` whose range and value are all 0.
    """
    global find_result
    find_result = None
    return gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
197 |
+
|
198 |
+
def get_another_doc(index_desc, idx, max_disp_len):
    """Fetch the ``idx``-th retrievable document of the cached search result.

    Reads the module-level ``find_result`` written by ``search_docs_new``.
    ``idx`` is a flat position across all shards; it is translated to a
    (shard, pointer) pair for CNF results or a (shard, rank) pair for simple
    results, and the document is then requested through ``process``.

    Args:
        index_desc: Index description, forwarded unchanged to ``process``.
        idx: Zero-based position among the retrievable occurrences. Ints and
            integral floats are accepted; anything else yields an empty result.
        max_disp_len: Forwarded unchanged to ``process``.

    Returns:
        ``(metadata, doc)``. On success ``metadata`` is
        ``format_doc_metadata(result)`` and ``doc`` is ``result['spans']``.
        If the backend reports an error, ``metadata`` is the error text and
        ``doc`` is ``[]``. If there is no usable cached result or ``idx`` is
        out of range, ``('', [])`` is returned.
    """
    # NOTE(review): find_result is process-wide state, so concurrent sessions
    # appear to share it -- confirm whether per-session state is needed.
    global find_result

    # Nothing searched yet, state cleared, or the last search failed.
    if not isinstance(find_result, dict) or 'error' in find_result or 'cnt' not in find_result:
        return '', []

    # UI widgets may deliver whole numbers as floats; normalise those, and
    # reject everything that is not a genuine integer (including bools).
    if isinstance(idx, float) and idx.is_integer():
        idx = int(idx)
    if isinstance(idx, bool) or not isinstance(idx, int):
        return '', []

    is_cnf = find_result['type'] == 'cnf'
    if is_cnf:
        ptrs_by_shard = find_result['ptrs_by_shard']
        cnt_by_shard = [len(ptrs) for ptrs in ptrs_by_shard]
    else:  # simple query
        segment_by_shard = find_result['segment_by_shard']
        cnt_by_shard = [end - start for (start, end) in segment_by_shard]

    # Bound by what the shards can actually serve: find_result['cnt'] may be
    # larger than the number of stored pointers, and walking past the last
    # shard would raise IndexError.
    if not 0 <= idx < sum(cnt_by_shard):
        return '', []

    # Locate the shard holding the idx-th item; idx becomes the offset in it.
    s = 0
    for s, shard_cnt in enumerate(cnt_by_shard):
        if idx < shard_cnt:
            break
        idx -= shard_cnt

    if is_cnf:
        ptr = ptrs_by_shard[s][idx]
        result = process('get_doc_by_ptr', index_desc, s=s, ptr=ptr, max_disp_len=max_disp_len, query_ids=find_result['token_ids'])
    else:
        rank = segment_by_shard[s][0] + idx
        result = process('get_doc_by_rank', index_desc, s=s, rank=rank, max_disp_len=max_disp_len, query_ids=find_result['token_ids'])

    if 'error' in result:
        return result['error'], []
    return format_doc_metadata(result), result['spans']
|
229 |
+
|
230 |
with gr.Blocks() as demo:
|
231 |
with gr.Column():
|
232 |
gr.HTML(
|
|
|
261 |
<li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
|
262 |
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
|
263 |
<li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
|
264 |
+
<li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
|
265 |
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
266 |
</ul>
|
267 |
''')
|
|
|
389 |
infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
|
390 |
infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
|
391 |
|
392 |
+
with gr.Tab('6. Search documents', visible=False):
|
393 |
with gr.Column():
|
394 |
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
|
395 |
with gr.Accordion(label='Click to view instructions', open=False):
|
|
|
440 |
search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
|
441 |
search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
|
442 |
|
443 |
+
with gr.Tab('6. Search documents'):
|
444 |
+
with gr.Column():
|
445 |
+
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
|
446 |
+
with gr.Accordion(label='Click to view instructions', open=False):
|
447 |
+
gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
|
448 |
+
<br>
|
449 |
+
<p style="font-size: 16px;">Example queries:</p>
|
450 |
+
<ul style="font-size: 16px;">
|
451 |
+
<li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
|
452 |
+
<li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
|
453 |
+
<li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
|
454 |
+
</ul>
|
455 |
+
<br>
|
456 |
+
<p style="font-size: 16px;">Notes on CNF queries:</p>
|
457 |
+
<ul style="font-size: 16px;">
|
458 |
+
<li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
|
459 |
+
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
|
460 |
+
<li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
|
461 |
+
<li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
|
462 |
+
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
463 |
+
</ul>
|
464 |
+
<br>
|
465 |
+
<p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
|
466 |
+
''')
|
467 |
+
with gr.Row():
|
468 |
+
with gr.Column(scale=1):
|
469 |
+
search_docs_new_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
|
470 |
+
search_docs_new_max_disp_len = gr.Slider(minimum=1, maximum=MAX_DISP_LEN, value=max_disp_len, step=1, label='Number of tokens to display')
|
471 |
+
with gr.Accordion(label='Advanced options', open=False):
|
472 |
+
with gr.Row():
|
473 |
+
search_docs_new_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
|
474 |
+
search_docs_new_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
|
475 |
+
with gr.Row():
|
476 |
+
search_docs_new_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
|
477 |
+
search_docs_new_submit = gr.Button(value='Submit', variant='primary', visible=True)
|
478 |
+
search_docs_new_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
|
479 |
+
search_docs_new_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
|
480 |
+
with gr.Column(scale=2):
|
481 |
+
search_docs_new_message = gr.Label(label='Message', num_top_classes=0)
|
482 |
+
search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
|
483 |
+
search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
|
484 |
+
search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
|
485 |
+
search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
|
486 |
+
search_docs_new_clear.click(clear_search_docs_new, inputs=[], outputs=[search_docs_new_idx], api_name=False)
|
487 |
+
search_docs_new_submit.click(search_docs_new, inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len, search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens], outputs=[search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output], api_name=False)
|
488 |
+
search_docs_new_idx.input(get_another_doc, inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len], outputs=[search_docs_new_metadata, search_docs_new_output], api_name=False)
|
489 |
+
|
490 |
with gr.Row():
|
491 |
gr.Markdown('''
|
492 |
If you find this tool useful, please kindly cite our paper:
|
constants.py
CHANGED
@@ -2,6 +2,7 @@ import os
|
|
2 |
|
3 |
# options
|
4 |
INDEX_BY_DESC = {
|
|
|
5 |
'Dolma-v1.6 (3.1T tokens)': 'v4_dolma-v1_6_llama',
|
6 |
'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
|
7 |
'Pile-train (380B tokens)': 'v4_piletrain_llama',
|
|
|
2 |
|
3 |
# options
|
4 |
INDEX_BY_DESC = {
|
5 |
+
'Dolma-v1.7 (2.6T tokens)': 'v4_dolma-v1_7_llama',
|
6 |
'Dolma-v1.6 (3.1T tokens)': 'v4_dolma-v1_6_llama',
|
7 |
'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
|
8 |
'Pile-train (380B tokens)': 'v4_piletrain_llama',
|