liujch1998 committed • Commit 619c9ac • 1 Parent(s): 2e63f1e

Adapt to API updates
app.py CHANGED
@@ -30,78 +30,117 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Request):
     print(result)
     return result
 
+def format_tokenization_info(result):
+    if not ('token_ids' in result and 'tokens' in result):
+        return ''
+    token_ids = result['token_ids']
+    tokens = result['tokens']
+    t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
+    return t
+def format_tokenization_info_nested(result):
+    if not ('token_idsss' in result and 'tokensss' in result):
+        return ''
+    token_idsss = result['token_idsss']
+    tokensss = result['tokensss']
+    ttt = []
+    for token_idss, tokenss in zip(token_idsss, tokensss):
+        tt = []
+        for token_ids, tokens in zip(token_idss, tokenss):
+            t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
+            tt.append(t)
+        tt = '\n'.join(tt)
+        ttt.append(tt)
+    ttt = '\n\n'.join(ttt)
+    return ttt
+def format_doc(doc):
+    formatted = []
+    if doc['doc_len'] == doc['disp_len']:
+        header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens]\n\n'
+    else:
+        header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)]\n\n'
+    formatted.append((header, None))
+    formatted += doc['spans']
+    return formatted
+
 def count(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('count', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         count = result['error']
     else:
         count = f'{result["count"]:,}'
-    return latency, count
+    return latency, tokenization_info, count
 
 def prob(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('prob', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         prob = result['error']
     elif result['prompt_cnt'] == 0:
         prob = '(n-1)-gram is not found in the corpus'
     else:
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
-    return latency, prob
+    return latency, tokenization_info, prob
 
 def ntd(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('ntd', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         ntd = result['error']
     else:
-
+        result_by_token_id = result['result_by_token_id']
+        ntd = {}
+        for token_id, r in result_by_token_id.items():
+            ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
         if ntd == {}:
             ntd = '(n-1)-gram is not found in the corpus'
-    return latency, ntd
+    return latency, tokenization_info, ntd
 
 def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('infgram_prob', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         longest_suffix = ''
         prob = result['error']
     else:
         longest_suffix = result['longest_suffix']
         prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
-    return latency, longest_suffix, prob
+    return latency, tokenization_info, longest_suffix, prob
 
 def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('infgram_ntd', corpus_desc, engine_desc, query, None, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
+    tokenization_info = format_tokenization_info(result)
     if 'error' in result:
         longest_suffix = ''
         ntd = result['error']
     else:
         longest_suffix = result['longest_suffix']
-
-    return latency, longest_suffix, ntd
+        result_by_token_id = result['result_by_token_id']
+        ntd = {}
+        for token_id, r in result_by_token_id.items():
+            ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
+    return latency, tokenization_info, longest_suffix, ntd
 
 def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
     result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
     latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
-
+    tokenization_info = format_tokenization_info_nested(result)
     if 'error' in result:
         message = result['error']
         docs = [[] for _ in range(10)]
     else:
         message = result['message']
-        docs = result['
+        docs = result['documents']
+        docs = [format_doc(doc) for doc in docs]
     docs = docs[:maxnum]
     while len(docs) < 10:
         docs.append([])
-    return latency, message, docs[0], docs[1], docs[2], docs[3], docs[4], docs[5], docs[6], docs[7], docs[8], docs[9]
+    return latency, tokenization_info, message, docs[0], docs[1], docs[2], docs[3], docs[4], docs[5], docs[6], docs[7], docs[8], docs[9]
 
 def analyze_document(corpus_desc, engine_desc, query, request: gr.Request):
     result = process('analyze_document', corpus_desc, engine_desc, query, None, request)
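For reference, here is a minimal, runnable sketch of what the two new formatting paths produce. The helper bodies are copied from the diff above; the `result` and `doc` payloads are hypothetical stand-ins for the updated API's responses, and every field value in them is made up for illustration:

# Sketch only: helpers copied from the diff above, run on made-up payloads.

def format_tokenization_info(result):
    # Renders the tokenized query as: ["tok1" "tok2" ...] [id1, id2, ...]
    if not ('token_ids' in result and 'tokens' in result):
        return ''
    token_ids = result['token_ids']
    tokens = result['tokens']
    # 'Ġ' is the GPT-2-style BPE marker for a leading space.
    t = '[' + " ".join(['"' + token.replace('Ġ', ' ') + '"' for token in tokens]) + '] ' + str(token_ids)
    return t

def format_doc(doc):
    # Prepends an unlabeled header span, then the document's highlighted spans.
    formatted = []
    if doc['doc_len'] == doc['disp_len']:
        header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens]\n\n'
    else:
        header = f'[Document #{doc["doc_ix"]}, length = {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)]\n\n'
    formatted.append((header, None))
    formatted += doc['spans']
    return formatted

# Hypothetical response for a two-token query (illustrative values only):
result = {'tokens': ['natural', 'Ġlanguage'], 'token_ids': [8612, 3303]}
print(format_tokenization_info(result))
# -> ["natural" " language"] [8612, 3303]

# Hypothetical document payload; 'spans' is assumed to hold (text, label)
# pairs suitable for a gr.HighlightedText output:
doc = {'doc_ix': 7, 'doc_len': 1024, 'disp_len': 800,
       'spans': [('... retrieved text ...', None), ('natural language', 'match')]}
print(format_doc(doc)[0])
# -> ('[Document #7, length = 1024 tokens (800 tokens displayed)]\n\n', None)

Since each handler now returns the tokenization string as an extra output, the Gradio UI presumably gained a matching output component per tab; that wiring is outside this hunk.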