liujch1998 commited on
Commit
3649303
1 Parent(s): 555cd42

Sync changes

Browse files
Files changed (2) hide show
  1. app.py +137 -65
  2. constants.py +9 -4
app.py CHANGED
@@ -49,18 +49,21 @@ def format_tokenization_info(result):
49
  ttt.append(tt)
50
  output = '\n\n'.join(ttt)
51
  return output
52
- def format_doc(doc):
53
- formatted = []
54
  if doc['doc_len'] == doc['disp_len']:
55
- header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens]\n\n'
56
  else:
57
- header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)]\n\n'
58
- formatted.append((header, None))
59
- formatted += doc['spans']
60
  return formatted
61
 
62
- def count(index_desc, query):
63
- result = process('count', index_desc, query=query)
 
 
 
64
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
65
  tokenization_info = format_tokenization_info(result)
66
  if 'error' in result:
@@ -81,8 +84,8 @@ def prob(index_desc, query):
81
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
82
  return latency, tokenization_info, prob
83
 
84
- def ntd(index_desc, query):
85
- result = process('ntd', index_desc, query=query)
86
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
87
  tokenization_info = format_tokenization_info(result)
88
  if 'error' in result:
@@ -108,8 +111,8 @@ def infgram_prob(index_desc, query):
108
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
109
  return latency, tokenization_info, longest_suffix, prob
110
 
111
- def infgram_ntd(index_desc, query):
112
- result = process('infgram_ntd', index_desc, query=query)
113
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
114
  tokenization_info = format_tokenization_info(result)
115
  if 'error' in result:
@@ -123,21 +126,28 @@ def infgram_ntd(index_desc, query):
123
  ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
124
  return latency, tokenization_info, longest_suffix, ntd
125
 
126
- def search_docs(index_desc, query, maxnum):
127
- result = process('search_docs', index_desc, query=query, maxnum=maxnum)
 
 
 
128
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
129
  tokenization_info = format_tokenization_info(result)
130
  if 'error' in result:
131
  message = result['error']
 
132
  docs = [[] for _ in range(MAXNUM)]
133
  else:
134
  message = result['message']
135
- docs = result['documents']
136
- docs = [format_doc(doc) for doc in docs]
 
137
  docs = docs[:maxnum]
 
 
138
  while len(docs) < MAXNUM:
139
  docs.append([])
140
- return tuple([latency, tokenization_info, message] + docs)
141
 
142
  with gr.Blocks() as demo:
143
  with gr.Column():
@@ -151,34 +161,62 @@ with gr.Blocks() as demo:
151
  '''
152
  )
153
  with gr.Row():
154
- with gr.Column(scale=1):
155
  index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
156
 
157
  with gr.Column(scale=7):
158
  with gr.Tab('1. Count an n-gram'):
159
  with gr.Column():
160
  gr.HTML('<h2>1. Count an n-gram</h2>')
161
- gr.HTML('<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus.</p>')
162
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is Cnt(natural language processing))</p>')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  with gr.Row():
164
  with gr.Column(scale=1):
165
  count_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
 
 
 
 
166
  with gr.Row():
167
  count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
168
  count_submit = gr.Button(value='Submit', variant='primary', visible=True)
169
  count_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
170
- count_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
171
  with gr.Column(scale=1):
172
  count_count = gr.Label(label='Count', num_top_classes=0)
173
  count_clear.add([count_query, count_latency, count_tokenized, count_count])
174
- count_submit.click(count, inputs=[index_desc, count_query], outputs=[count_latency, count_tokenized, count_count], api_name=False)
175
 
176
  with gr.Tab('2. Prob of the last token'):
177
  with gr.Column():
178
  gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
179
- gr.HTML('<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. the (n-1)-gram).</p>')
180
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and taking the division between the two)</p>')
181
- gr.HTML('<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</p>')
 
 
 
 
 
 
 
182
  with gr.Row():
183
  with gr.Column(scale=1):
184
  prob_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
@@ -186,7 +224,7 @@ with gr.Blocks() as demo:
186
  prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
187
  prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
188
  prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
189
- prob_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
190
  with gr.Column(scale=1):
191
  prob_probability = gr.Label(label='Probability', num_top_classes=0)
192
  prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
@@ -195,28 +233,46 @@ with gr.Blocks() as demo:
195
  with gr.Tab('3. Next-token distribution'):
196
  with gr.Column():
197
  gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
198
- gr.HTML('<p style="font-size: 16px;">This is an extension of the Query 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>')
199
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
200
- gr.HTML(f'<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear. If the (n-1)-gram appears more than {MAX_SUPPORT} times in the corpus, the result will be approximate.</p>')
 
 
 
 
 
 
 
 
 
201
  with gr.Row():
202
  with gr.Column(scale=1):
203
  ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
 
 
204
  with gr.Row():
205
  ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
206
  ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
207
  ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
208
- ntd_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
209
  with gr.Column(scale=1):
210
  ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
211
  ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
212
- ntd_submit.click(ntd, inputs=[index_desc, ntd_query], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)
213
 
214
  with gr.Tab('4. ∞-gram prob'):
215
  with gr.Column():
216
  gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
217
- gr.HTML('<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>')
218
- gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (the output is P(processing | natural language), because "natural language" appears in the corpus but "love natural language" doesn\'t; in this case the effective n = 3)</p>')
219
- gr.HTML('<p style="font-size: 16px;">Note: It may be possible that the effective n = 1, in which case it reduces to the uni-gram probability of the last token.</p>')
 
 
 
 
 
 
 
220
  with gr.Row():
221
  with gr.Column(scale=1):
222
  infgram_prob_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
@@ -224,7 +280,7 @@ with gr.Blocks() as demo:
224
  infgram_prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
225
  infgram_prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
226
  infgram_prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
227
- infgram_prob_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
228
  infgram_prob_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
229
  with gr.Column(scale=1):
230
  infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
@@ -234,61 +290,77 @@ with gr.Blocks() as demo:
234
  with gr.Tab('5. ∞-gram next-token distribution'):
235
  with gr.Column():
236
  gr.HTML('<h2>5. Compute the ∞-gram next-token distribution</h2>')
237
- gr.HTML('<p style="font-size: 16px;">This is similar to Query 3, but with ∞-gram instead of n-gram.</p>')
238
- gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
 
 
 
239
  with gr.Row():
240
  with gr.Column(scale=1):
241
  infgram_ntd_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
 
 
242
  with gr.Row():
243
  infgram_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
244
  infgram_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
245
  infgram_ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
246
- infgram_ntd_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
247
  infgram_ntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
248
  with gr.Column(scale=1):
249
  infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
250
  infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
251
- infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
252
 
253
  with gr.Tab('6. Search documents'):
254
  with gr.Column():
255
- gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>
256
- <p style="font-size: 16px;">This displays a few random documents in the corpus that satisfy your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
257
- <p style="font-size: 16px;">Example queries:</p>
258
- <ul style="font-size: 16px;">
259
- <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
260
- <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
261
- <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
262
- </ul>
263
- <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
264
- <p style="font-size: 16px;">A few notes:</p>
265
- <ul style="font-size: 16px;">
266
- <li>If the document is too long, it will be truncated to {MAX_DISP_LEN} tokens.</li>
267
- <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
268
- <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} terms.</li>
269
- <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
270
- <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
271
- <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ} matches, we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjunction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
272
- </ul>
273
- <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
274
- ''')
 
 
 
 
275
  with gr.Row():
276
- with gr.Column(scale=2):
277
  search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
278
- search_docs_maxnum = gr.Slider(minimum=1, maximum=MAXNUM, value=1, step=1, label='Number of documents to display')
 
 
 
 
 
279
  with gr.Row():
280
  search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
281
  search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True)
282
  search_docs_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
283
- search_docs_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
284
- with gr.Column(scale=3):
285
  search_docs_message = gr.Label(label='Message', num_top_classes=0)
 
286
  search_docs_outputs = []
287
  for i in range(MAXNUM):
288
  with gr.Tab(label=str(i+1)):
 
289
  search_docs_outputs.append(gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}))
290
- search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_outputs)
291
- search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_outputs, api_name=False)
292
 
293
  with gr.Row():
294
  gr.Markdown('''
 
49
  ttt.append(tt)
50
  output = '\n\n'.join(ttt)
51
  return output
52
+ def format_doc_metadata(doc):
53
+ formatted = f'Document #{doc["doc_ix"]}\n'
54
  if doc['doc_len'] == doc['disp_len']:
55
+ formatted += f'Length: {doc["doc_len"]} tokens\n'
56
  else:
57
+ formatted += f'Length: {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)\n'
58
+ metadata = doc['metadata'].strip("\n")
59
+ formatted += f'Metadata: {metadata}'
60
  return formatted
61
 
62
+ def count(index_desc, query, max_clause_freq, max_diff_tokens):
63
+ if ' AND ' in query or ' OR ' in query: # CNF query
64
+ result = process('count', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
65
+ else: # simple query
66
+ result = process('count', index_desc, query=query)
67
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
68
  tokenization_info = format_tokenization_info(result)
69
  if 'error' in result:
 
84
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
85
  return latency, tokenization_info, prob
86
 
87
+ def ntd(index_desc, query, max_support):
88
+ result = process('ntd', index_desc, query=query, max_support=max_support)
89
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
90
  tokenization_info = format_tokenization_info(result)
91
  if 'error' in result:
 
111
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
112
  return latency, tokenization_info, longest_suffix, prob
113
 
114
+ def infgram_ntd(index_desc, query, max_support):
115
+ result = process('infgram_ntd', index_desc, query=query, max_support=max_support)
116
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
117
  tokenization_info = format_tokenization_info(result)
118
  if 'error' in result:
 
126
  ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
127
  return latency, tokenization_info, longest_suffix, ntd
128
 
129
+ def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_diff_tokens):
130
+ if ' AND ' in query or ' OR ' in query: # CNF query
131
+ result = process('search_docs', index_desc, query=query, maxnum=maxnum, max_disp_len=max_disp_len, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
132
+ else: # simple query
133
+ result = process('search_docs', index_desc, query=query, maxnum=maxnum, max_disp_len=max_disp_len)
134
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
135
  tokenization_info = format_tokenization_info(result)
136
  if 'error' in result:
137
  message = result['error']
138
+ metadatas = ['' for _ in range(MAXNUM)]
139
  docs = [[] for _ in range(MAXNUM)]
140
  else:
141
  message = result['message']
142
+ metadatas = [format_doc_metadata(doc) for doc in result['documents']]
143
+ docs = [doc['spans'] for doc in result['documents']]
144
+ metadatas = metadatas[:maxnum]
145
  docs = docs[:maxnum]
146
+ while len(metadatas) < MAXNUM:
147
+ metadatas.append('')
148
  while len(docs) < MAXNUM:
149
  docs.append([])
150
+ return tuple([latency, tokenization_info, message] + metadatas + docs)
151
 
152
  with gr.Blocks() as demo:
153
  with gr.Column():
 
161
  '''
162
  )
163
  with gr.Row():
164
+ with gr.Column(scale=1, min_width=240):
165
  index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
166
 
167
  with gr.Column(scale=7):
168
  with gr.Tab('1. Count an n-gram'):
169
  with gr.Column():
170
  gr.HTML('<h2>1. Count an n-gram</h2>')
171
+ with gr.Accordion(label='Click to view instructions', open=False):
172
+ gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
173
+ <br>
174
+ <p style="font-size: 16px;">Example queries:</p>
175
+ <ul style="font-size: 16px;">
176
+ <li><b>natural language processing</b> (the output is number of occurrences of "natural language processing")</li>
177
+ <li><b>natural language processing AND deep learning</b> (the output is the number of co-occurrences of "natural language processing" and "deep learning")</li>
178
+ <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the output is the number of co-occurrences of [one of "natural language processing" / "artificial intelligence"] and [one of "deep learning" / "machine learning"])</li>
179
+ </ul>
180
+ <br>
181
+ <p style="font-size: 16px;">Notes on CNF queries:</p>
182
+ <ul style="font-size: 16px;">
183
+ <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
184
+ <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
185
+ <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
186
+ <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} documents out of all documents containing that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
187
+ <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
188
+ </ul>
189
+ ''')
190
  with gr.Row():
191
  with gr.Column(scale=1):
192
  count_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
193
+ with gr.Accordion(label='Advanced options', open=False):
194
+ with gr.Row():
195
+ count_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
196
+ count_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
197
  with gr.Row():
198
  count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
199
  count_submit = gr.Button(value='Submit', variant='primary', visible=True)
200
  count_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
201
+ count_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
202
  with gr.Column(scale=1):
203
  count_count = gr.Label(label='Count', num_top_classes=0)
204
  count_clear.add([count_query, count_latency, count_tokenized, count_count])
205
+ count_submit.click(count, inputs=[index_desc, count_query, count_max_clause_freq, count_max_diff_tokens], outputs=[count_latency, count_tokenized, count_count], api_name=False)
206
 
207
  with gr.Tab('2. Prob of the last token'):
208
  with gr.Column():
209
  gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
210
+ with gr.Accordion(label='Click to view instructions', open=False):
211
+ gr.HTML(f'''<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. the (n-1)-gram).</p>
212
+ <br>
213
+ <p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and taking the division between the two)</p>
214
+ <br>
215
+ <p style="font-size: 16px;">Notes:</p>
216
+ <ul style="font-size: 16px;">
217
+ <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
218
+ </ul>
219
+ ''')
220
  with gr.Row():
221
  with gr.Column(scale=1):
222
  prob_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
 
224
  prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
225
  prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
226
  prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
227
+ prob_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
228
  with gr.Column(scale=1):
229
  prob_probability = gr.Label(label='Probability', num_top_classes=0)
230
  prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
 
233
  with gr.Tab('3. Next-token distribution'):
234
  with gr.Column():
235
  gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
236
+ with gr.Accordion(label='Click to view instructions', open=False):
237
+ gr.HTML(f'''<p style="font-size: 16px;">This is an extension of the Query Type 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>
238
+ <br>
239
+ <p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>
240
+ <br>
241
+ <p style="font-size: 16px;">Notes:</p>
242
+ <ul style="font-size: 16px;">
243
+ <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
244
+ <li>If the (n-1)-gram appears more than {max_support} times in the corpus, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
245
+ </ul>
246
+ ''')
247
+
248
  with gr.Row():
249
  with gr.Column(scale=1):
250
  ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
251
+ with gr.Accordion(label='Advanced options', open=False):
252
+ ntd_max_support = gr.Slider(minimum=1, maximum=MAX_SUPPORT, value=MAX_SUPPORT, step=1, label='max_support')
253
  with gr.Row():
254
  ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
255
  ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
256
  ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
257
+ ntd_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
258
  with gr.Column(scale=1):
259
  ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
260
  ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
261
+ ntd_submit.click(ntd, inputs=[index_desc, ntd_query, ntd_max_support], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)
262
 
263
  with gr.Tab('4. ∞-gram prob'):
264
  with gr.Column():
265
  gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
266
+ with gr.Accordion(label='Click to view instructions', open=False):
267
+ gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>
268
+ <br>
269
+ <p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
270
+ <br>
271
+ <p style="font-size: 16px;">Notes:</p>
272
+ <ul style="font-size: 16px;">
273
+ <li>It may be possible that the effective n = 1, i.e. longest found suffix is empty, in which case it reduces to the uni-gram probability of the last token.</li>
274
+ </ul>
275
+ ''')
276
  with gr.Row():
277
  with gr.Column(scale=1):
278
  infgram_prob_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
 
280
  infgram_prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
281
  infgram_prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
282
  infgram_prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
283
+ infgram_prob_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
284
  infgram_prob_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
285
  with gr.Column(scale=1):
286
  infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
 
290
  with gr.Tab('5. ∞-gram next-token distribution'):
291
  with gr.Column():
292
  gr.HTML('<h2>5. Compute the ∞-gram next-token distribution</h2>')
293
+ with gr.Accordion(label='Click to view instructions', open=False):
294
+ gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
295
+ <br>
296
+ <p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
297
+ ''')
298
  with gr.Row():
299
  with gr.Column(scale=1):
300
  infgram_ntd_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
301
+ with gr.Accordion(label='Advanced options', open=False):
302
+ infgram_ntd_max_support = gr.Slider(minimum=1, maximum=MAX_SUPPORT, value=MAX_SUPPORT, step=1, label='max_support')
303
  with gr.Row():
304
  infgram_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
305
  infgram_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
306
  infgram_ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
307
+ infgram_ntd_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
308
  infgram_ntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
309
  with gr.Column(scale=1):
310
  infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
311
  infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
312
+ infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
313
 
314
  with gr.Tab('6. Search documents'):
315
  with gr.Column():
316
+ gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
317
+ with gr.Accordion(label='Click to view instructions', open=False):
318
+ gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the corpus that satisfy your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
319
+ <br>
320
+ <p style="font-size: 16px;">Example queries:</p>
321
+ <ul style="font-size: 16px;">
322
+ <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
323
+ <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
324
+ <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
325
+ </ul>
326
+ <br>
327
+ <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
328
+ <br>
329
+ <p style="font-size: 16px;">Notes on CNF queries:</p>
330
+ <ul style="font-size: 16px;">
331
+ <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
332
+ <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
333
+ <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
334
+ <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} documents out of all documents containing that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
335
+ <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
336
+ </ul>
337
+ <br>
338
+ <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
339
+ ''')
340
  with gr.Row():
341
+ with gr.Column(scale=1):
342
  search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
343
+ search_docs_maxnum = gr.Slider(minimum=1, maximum=MAXNUM, value=maxnum, step=1, label='Number of documents to display')
344
+ search_docs_max_disp_len = gr.Slider(minimum=1, maximum=MAX_DISP_LEN, value=max_disp_len, step=1, label='Number of tokens to display')
345
+ with gr.Accordion(label='Advanced options', open=False):
346
+ with gr.Row():
347
+ search_docs_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
348
+ search_docs_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
349
  with gr.Row():
350
  search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
351
  search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True)
352
  search_docs_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
353
+ search_docs_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
354
+ with gr.Column(scale=2):
355
  search_docs_message = gr.Label(label='Message', num_top_classes=0)
356
+ search_docs_metadatas = []
357
  search_docs_outputs = []
358
  for i in range(MAXNUM):
359
  with gr.Tab(label=str(i+1)):
360
+ search_docs_metadatas.append(gr.Textbox(label='Metadata', lines=3, interactive=False))
361
  search_docs_outputs.append(gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}))
362
+ search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
363
+ search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
364
 
365
  with gr.Row():
366
  gr.Markdown('''
constants.py CHANGED
@@ -22,15 +22,20 @@ INDEX_BY_DESC = {
22
  }
23
  INDEX_DESCS = list(INDEX_BY_DESC.keys())
24
 
25
- # API limits
26
  MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
27
  MAX_CLAUSES_PER_CNF = int(os.environ.get('MAX_CLAUSES_PER_CNF', 4))
28
  MAX_TERMS_PER_CLAUSE = int(os.environ.get('MAX_TERMS_PER_CLAUSE', 4))
 
29
  MAX_SUPPORT = int(os.environ.get('MAX_SUPPORT', 1000))
30
- MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 50000))
31
- MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
 
 
 
32
  MAXNUM = int(os.environ.get('MAXNUM', 10))
33
- MAX_DISP_LEN = int(os.environ.get('MAX_DISP_LEN', 5000))
 
34
 
35
  # HF demo
36
  API_URL = os.environ.get('API_URL', None)
 
22
  }
23
  INDEX_DESCS = list(INDEX_BY_DESC.keys())
24
 
25
+ # API limits and defaults
26
  MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
27
  MAX_CLAUSES_PER_CNF = int(os.environ.get('MAX_CLAUSES_PER_CNF', 4))
28
  MAX_TERMS_PER_CLAUSE = int(os.environ.get('MAX_TERMS_PER_CLAUSE', 4))
29
+ max_support = int(os.environ.get('max_support', 1000))
30
  MAX_SUPPORT = int(os.environ.get('MAX_SUPPORT', 1000))
31
+ max_clause_freq = int(os.environ.get('max_clause_freq', 50000))
32
+ MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 500000))
33
+ max_diff_tokens = int(os.environ.get('max_diff_tokens', 100))
34
+ MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 1000))
35
+ maxnum = int(os.environ.get('maxnum', 1))
36
  MAXNUM = int(os.environ.get('MAXNUM', 10))
37
+ max_disp_len = int(os.environ.get('max_disp_len', 1000))
38
+ MAX_DISP_LEN = int(os.environ.get('MAX_DISP_LEN', 10000))
39
 
40
  # HF demo
41
  API_URL = os.environ.get('API_URL', None)