Spaces:
Sleeping
Sleeping
matdmiller
commited on
Commit
·
4db1bc1
1
Parent(s):
2b9e17f
optional ai cleaning and add md link regex cleaning by default
Browse files
app.ipynb
CHANGED
@@ -142,7 +142,7 @@
|
|
142 |
},
|
143 |
{
|
144 |
"cell_type": "code",
|
145 |
-
"execution_count":
|
146 |
"id": "4f486d3a",
|
147 |
"metadata": {},
|
148 |
"outputs": [],
|
@@ -168,7 +168,8 @@
|
|
168 |
"# from cartesia.tts import CartesiaTTS\n",
|
169 |
"import cartesia\n",
|
170 |
"import requests\n",
|
171 |
-
"import urllib"
|
|
|
172 |
]
|
173 |
},
|
174 |
{
|
@@ -810,7 +811,19 @@
|
|
810 |
},
|
811 |
{
|
812 |
"cell_type": "code",
|
813 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
814 |
"id": "db54a6a6-4bdc-430a-b1ea-444c249b77fb",
|
815 |
"metadata": {},
|
816 |
"outputs": [],
|
@@ -820,12 +833,12 @@
|
|
820 |
" # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))\n",
|
821 |
" result = requests.get('https://r.jina.ai/'+url)\n",
|
822 |
" result.raise_for_status()\n",
|
823 |
-
" return result.text"
|
824 |
]
|
825 |
},
|
826 |
{
|
827 |
"cell_type": "code",
|
828 |
-
"execution_count":
|
829 |
"id": "75891855-6c08-4a42-9ad5-a02e0b43bb3d",
|
830 |
"metadata": {},
|
831 |
"outputs": [],
|
@@ -858,6 +871,8 @@
|
|
858 |
"\n",
|
859 |
" idx = 0\n",
|
860 |
" while complete == False and idx < max_iters:\n",
|
|
|
|
|
861 |
" idx += 1\n",
|
862 |
" response = client.chat.completions.create(\n",
|
863 |
" model=\"gpt-4o\",\n",
|
@@ -895,19 +910,22 @@
|
|
895 |
},
|
896 |
{
|
897 |
"cell_type": "code",
|
898 |
-
"execution_count":
|
899 |
"id": "7899e7b2-beeb-40a4-a571-a2ccfc7c9618",
|
900 |
"metadata": {},
|
901 |
"outputs": [],
|
902 |
"source": [
|
903 |
"#| export\n",
|
904 |
-
"def get_page_text(url):\n",
|
905 |
-
"
|
|
|
|
|
|
|
906 |
]
|
907 |
},
|
908 |
{
|
909 |
"cell_type": "code",
|
910 |
-
"execution_count":
|
911 |
"id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
|
912 |
"metadata": {},
|
913 |
"outputs": [],
|
@@ -918,9 +936,11 @@
|
|
918 |
" ### Define UI ###\n",
|
919 |
" gr.Markdown(\"# TTS\")\n",
|
920 |
" gr.Markdown(\"\"\"Start typing below and then click **Go** to create the speech from your text.\n",
|
921 |
-
"For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href=\"https://matdmiller-tts-openai.hf.space/\" target=\"_blank\">Spaces Link HTML</a>\
|
|
|
922 |
" with gr.Row():\n",
|
923 |
" input_url = gr.Textbox(max_lines=1, label=\"Optional - Enter a URL\")\n",
|
|
|
924 |
" get_url_content_btn = gr.Button(\"Get URL Contents\")\n",
|
925 |
" with gr.Row():\n",
|
926 |
" input_text = gr.Textbox(max_lines=100, label=\"Enter text here\")\n",
|
@@ -946,7 +966,7 @@
|
|
946 |
"\n",
|
947 |
" ### Define UI Actions ###\n",
|
948 |
"\n",
|
949 |
-
" get_url_content_btn.click(fn=get_page_text, inputs=input_url, outputs=input_text)\n",
|
950 |
" \n",
|
951 |
" # input_text \n",
|
952 |
" input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)\n",
|
@@ -982,7 +1002,7 @@
|
|
982 |
},
|
983 |
{
|
984 |
"cell_type": "code",
|
985 |
-
"execution_count":
|
986 |
"id": "a00648a1-891b-470b-9959-f5d502055713",
|
987 |
"metadata": {},
|
988 |
"outputs": [],
|
@@ -996,7 +1016,7 @@
|
|
996 |
},
|
997 |
{
|
998 |
"cell_type": "code",
|
999 |
-
"execution_count":
|
1000 |
"id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
|
1001 |
"metadata": {},
|
1002 |
"outputs": [
|
@@ -1025,16 +1045,9 @@
|
|
1025 |
"data": {
|
1026 |
"text/plain": []
|
1027 |
},
|
1028 |
-
"execution_count":
|
1029 |
"metadata": {},
|
1030 |
"output_type": "execute_result"
|
1031 |
-
},
|
1032 |
-
{
|
1033 |
-
"name": "stdout",
|
1034 |
-
"output_type": "stream",
|
1035 |
-
"text": [
|
1036 |
-
"TOKENS CLEANUP: 970\n"
|
1037 |
-
]
|
1038 |
}
|
1039 |
],
|
1040 |
"source": [
|
@@ -1060,7 +1073,7 @@
|
|
1060 |
},
|
1061 |
{
|
1062 |
"cell_type": "code",
|
1063 |
-
"execution_count":
|
1064 |
"id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
|
1065 |
"metadata": {},
|
1066 |
"outputs": [
|
@@ -1090,7 +1103,7 @@
|
|
1090 |
},
|
1091 |
{
|
1092 |
"cell_type": "code",
|
1093 |
-
"execution_count":
|
1094 |
"id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
|
1095 |
"metadata": {},
|
1096 |
"outputs": [],
|
|
|
142 |
},
|
143 |
{
|
144 |
"cell_type": "code",
|
145 |
+
"execution_count": 41,
|
146 |
"id": "4f486d3a",
|
147 |
"metadata": {},
|
148 |
"outputs": [],
|
|
|
168 |
"# from cartesia.tts import CartesiaTTS\n",
|
169 |
"import cartesia\n",
|
170 |
"import requests\n",
|
171 |
+
"import urllib\n",
|
172 |
+
"import re"
|
173 |
]
|
174 |
},
|
175 |
{
|
|
|
811 |
},
|
812 |
{
|
813 |
"cell_type": "code",
|
814 |
+
"execution_count": 42,
|
815 |
+
"id": "c5b0156a-f6d4-480a-b7b5-b0899e7520b9",
|
816 |
+
"metadata": {},
|
817 |
+
"outputs": [],
|
818 |
+
"source": [
|
819 |
+
"#| export\n",
|
820 |
+
"def remove_urls_from_markdown(text):\n",
|
821 |
+
" return re.sub(r'\\[([^\\]]+)\\]\\([^\\)]+\\)', r'\\1', text)"
|
822 |
+
]
|
823 |
+
},
|
824 |
+
{
|
825 |
+
"cell_type": "code",
|
826 |
+
"execution_count": 43,
|
827 |
"id": "db54a6a6-4bdc-430a-b1ea-444c249b77fb",
|
828 |
"metadata": {},
|
829 |
"outputs": [],
|
|
|
833 |
" # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))\n",
|
834 |
" result = requests.get('https://r.jina.ai/'+url)\n",
|
835 |
" result.raise_for_status()\n",
|
836 |
+
" return remove_urls_from_markdown(result.text)"
|
837 |
]
|
838 |
},
|
839 |
{
|
840 |
"cell_type": "code",
|
841 |
+
"execution_count": 47,
|
842 |
"id": "75891855-6c08-4a42-9ad5-a02e0b43bb3d",
|
843 |
"metadata": {},
|
844 |
"outputs": [],
|
|
|
871 |
"\n",
|
872 |
" idx = 0\n",
|
873 |
" while complete == False and idx < max_iters:\n",
|
874 |
+
" print('Page Cleaning Iter:',idx)\n",
|
875 |
+
" assert idx < max_iters\n",
|
876 |
" idx += 1\n",
|
877 |
" response = client.chat.completions.create(\n",
|
878 |
" model=\"gpt-4o\",\n",
|
|
|
910 |
},
|
911 |
{
|
912 |
"cell_type": "code",
|
913 |
+
"execution_count": 48,
|
914 |
"id": "7899e7b2-beeb-40a4-a571-a2ccfc7c9618",
|
915 |
"metadata": {},
|
916 |
"outputs": [],
|
917 |
"source": [
|
918 |
"#| export\n",
|
919 |
+
"def get_page_text(url:str, ai_clean:bool):\n",
|
920 |
+
" text = get_page_md(url)\n",
|
921 |
+
" if ai_clean:\n",
|
922 |
+
" text = clean_page_md(text)\n",
|
923 |
+
" return text"
|
924 |
]
|
925 |
},
|
926 |
{
|
927 |
"cell_type": "code",
|
928 |
+
"execution_count": 50,
|
929 |
"id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
|
930 |
"metadata": {},
|
931 |
"outputs": [],
|
|
|
936 |
" ### Define UI ###\n",
|
937 |
" gr.Markdown(\"# TTS\")\n",
|
938 |
" gr.Markdown(\"\"\"Start typing below and then click **Go** to create the speech from your text.\n",
|
939 |
+
"For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href=\"https://matdmiller-tts-openai.hf.space/\" target=\"_blank\">Spaces Link HTML</a>\n",
|
940 |
+
"[https://r.jina.ai/](https://r.jina.ai/)\"\"\")\n",
|
941 |
" with gr.Row():\n",
|
942 |
" input_url = gr.Textbox(max_lines=1, label=\"Optional - Enter a URL\")\n",
|
943 |
+
" input_clean_cb = gr.Checkbox(value=False, label='AI Clean Text')\n",
|
944 |
" get_url_content_btn = gr.Button(\"Get URL Contents\")\n",
|
945 |
" with gr.Row():\n",
|
946 |
" input_text = gr.Textbox(max_lines=100, label=\"Enter text here\")\n",
|
|
|
966 |
"\n",
|
967 |
" ### Define UI Actions ###\n",
|
968 |
"\n",
|
969 |
+
" get_url_content_btn.click(fn=get_page_text, inputs=[input_url,input_clean_cb], outputs=input_text)\n",
|
970 |
" \n",
|
971 |
" # input_text \n",
|
972 |
" input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)\n",
|
|
|
1002 |
},
|
1003 |
{
|
1004 |
"cell_type": "code",
|
1005 |
+
"execution_count": 51,
|
1006 |
"id": "a00648a1-891b-470b-9959-f5d502055713",
|
1007 |
"metadata": {},
|
1008 |
"outputs": [],
|
|
|
1016 |
},
|
1017 |
{
|
1018 |
"cell_type": "code",
|
1019 |
+
"execution_count": 52,
|
1020 |
"id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
|
1021 |
"metadata": {},
|
1022 |
"outputs": [
|
|
|
1045 |
"data": {
|
1046 |
"text/plain": []
|
1047 |
},
|
1048 |
+
"execution_count": 52,
|
1049 |
"metadata": {},
|
1050 |
"output_type": "execute_result"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1051 |
}
|
1052 |
],
|
1053 |
"source": [
|
|
|
1073 |
},
|
1074 |
{
|
1075 |
"cell_type": "code",
|
1076 |
+
"execution_count": 45,
|
1077 |
"id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
|
1078 |
"metadata": {},
|
1079 |
"outputs": [
|
|
|
1103 |
},
|
1104 |
{
|
1105 |
"cell_type": "code",
|
1106 |
+
"execution_count": 53,
|
1107 |
"id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
|
1108 |
"metadata": {},
|
1109 |
"outputs": [],
|
app.py
CHANGED
@@ -6,7 +6,8 @@ __all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'OPENAI_CLIENT_TTS_THREAD
|
|
6 |
'launch_kwargs', 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio',
|
7 |
'create_speech_openai', 'create_speech_cartesiaai', 'create_speech', 'get_input_text_len',
|
8 |
'get_generation_cost', 'get_model_choices', 'update_model_choices', 'get_voice_choices',
|
9 |
-
'update_voice_choices', 'split_text_as_md', '
|
|
|
10 |
|
11 |
# %% app.ipynb 4
|
12 |
import os
|
@@ -72,6 +73,7 @@ import traceback
|
|
72 |
import cartesia
|
73 |
import requests
|
74 |
import urllib
|
|
|
75 |
|
76 |
# %% app.ipynb 11
|
77 |
TEMP = os.environ.get('GRADIO_TEMP_DIR','/tmp/')
|
@@ -333,13 +335,17 @@ def split_text_as_md(*args, **kwargs):
|
|
333 |
return '# Text Splits:\n' + '<br>----------<br>'.join(output)
|
334 |
|
335 |
# %% app.ipynb 38
|
|
|
|
|
|
|
|
|
336 |
def get_page_md(url):
|
337 |
# result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))
|
338 |
result = requests.get('https://r.jina.ai/'+url)
|
339 |
result.raise_for_status()
|
340 |
-
return result.text
|
341 |
|
342 |
-
# %% app.ipynb
|
343 |
# import json
|
344 |
def clean_page_md(text):
|
345 |
max_iters = 15
|
@@ -356,6 +362,8 @@ def clean_page_md(text):
|
|
356 |
|
357 |
idx = 0
|
358 |
while complete == False and idx < max_iters:
|
|
|
|
|
359 |
idx += 1
|
360 |
response = client.chat.completions.create(
|
361 |
model="gpt-4o",
|
@@ -380,19 +388,24 @@ def clean_page_md(text):
|
|
380 |
# res = clean_page_md(test_page_md)
|
381 |
# res
|
382 |
|
383 |
-
# %% app.ipynb 42
|
384 |
-
def get_page_text(url):
|
385 |
-
return clean_page_md(get_page_md(url))
|
386 |
-
|
387 |
# %% app.ipynb 43
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
388 |
with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
|
389 |
|
390 |
### Define UI ###
|
391 |
gr.Markdown("# TTS")
|
392 |
gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
|
393 |
-
For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href="https://matdmiller-tts-openai.hf.space/" target="_blank">Spaces Link HTML</a>
|
|
|
394 |
with gr.Row():
|
395 |
input_url = gr.Textbox(max_lines=1, label="Optional - Enter a URL")
|
|
|
396 |
get_url_content_btn = gr.Button("Get URL Contents")
|
397 |
with gr.Row():
|
398 |
input_text = gr.Textbox(max_lines=100, label="Enter text here")
|
@@ -418,7 +431,7 @@ For requests longer than allowed by the API they will be broken into chunks auto
|
|
418 |
|
419 |
### Define UI Actions ###
|
420 |
|
421 |
-
get_url_content_btn.click(fn=get_page_text, inputs=input_url, outputs=input_text)
|
422 |
|
423 |
# input_text
|
424 |
input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
|
@@ -451,13 +464,13 @@ For requests longer than allowed by the API they will be broken into chunks auto
|
|
451 |
|
452 |
|
453 |
|
454 |
-
# %% app.ipynb
|
455 |
# launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
|
456 |
# 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
|
457 |
launch_kwargs = {}
|
458 |
queue_kwargs = {'default_concurrency_limit':10}
|
459 |
|
460 |
-
# %% app.ipynb
|
461 |
#.py launch
|
462 |
if __name__ == "__main__":
|
463 |
app.queue(**queue_kwargs)
|
|
|
6 |
'launch_kwargs', 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio',
|
7 |
'create_speech_openai', 'create_speech_cartesiaai', 'create_speech', 'get_input_text_len',
|
8 |
'get_generation_cost', 'get_model_choices', 'update_model_choices', 'get_voice_choices',
|
9 |
+
'update_voice_choices', 'split_text_as_md', 'remove_urls_from_markdown', 'get_page_md', 'clean_page_md',
|
10 |
+
'get_page_text']
|
11 |
|
12 |
# %% app.ipynb 4
|
13 |
import os
|
|
|
73 |
import cartesia
|
74 |
import requests
|
75 |
import urllib
|
76 |
+
import re
|
77 |
|
78 |
# %% app.ipynb 11
|
79 |
TEMP = os.environ.get('GRADIO_TEMP_DIR','/tmp/')
|
|
|
335 |
return '# Text Splits:\n' + '<br>----------<br>'.join(output)
|
336 |
|
337 |
# %% app.ipynb 38
|
338 |
+
def remove_urls_from_markdown(text):
|
339 |
+
return re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
340 |
+
|
341 |
+
# %% app.ipynb 39
|
342 |
def get_page_md(url):
|
343 |
# result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))
|
344 |
result = requests.get('https://r.jina.ai/'+url)
|
345 |
result.raise_for_status()
|
346 |
+
return remove_urls_from_markdown(result.text)
|
347 |
|
348 |
+
# %% app.ipynb 41
|
349 |
# import json
|
350 |
def clean_page_md(text):
|
351 |
max_iters = 15
|
|
|
362 |
|
363 |
idx = 0
|
364 |
while complete == False and idx < max_iters:
|
365 |
+
print('Page Cleaning Iter:',idx)
|
366 |
+
assert idx < max_iters
|
367 |
idx += 1
|
368 |
response = client.chat.completions.create(
|
369 |
model="gpt-4o",
|
|
|
388 |
# res = clean_page_md(test_page_md)
|
389 |
# res
|
390 |
|
|
|
|
|
|
|
|
|
391 |
# %% app.ipynb 43
|
392 |
+
def get_page_text(url:str, ai_clean:bool):
|
393 |
+
text = get_page_md(url)
|
394 |
+
if ai_clean:
|
395 |
+
text = clean_page_md(text)
|
396 |
+
return text
|
397 |
+
|
398 |
+
# %% app.ipynb 44
|
399 |
with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
|
400 |
|
401 |
### Define UI ###
|
402 |
gr.Markdown("# TTS")
|
403 |
gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
|
404 |
+
For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href="https://matdmiller-tts-openai.hf.space/" target="_blank">Spaces Link HTML</a>
|
405 |
+
[https://r.jina.ai/](https://r.jina.ai/)""")
|
406 |
with gr.Row():
|
407 |
input_url = gr.Textbox(max_lines=1, label="Optional - Enter a URL")
|
408 |
+
input_clean_cb = gr.Checkbox(value=False, label='AI Clean Text')
|
409 |
get_url_content_btn = gr.Button("Get URL Contents")
|
410 |
with gr.Row():
|
411 |
input_text = gr.Textbox(max_lines=100, label="Enter text here")
|
|
|
431 |
|
432 |
### Define UI Actions ###
|
433 |
|
434 |
+
get_url_content_btn.click(fn=get_page_text, inputs=[input_url,input_clean_cb], outputs=input_text)
|
435 |
|
436 |
# input_text
|
437 |
input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
|
|
|
464 |
|
465 |
|
466 |
|
467 |
+
# %% app.ipynb 45
|
468 |
# launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
|
469 |
# 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
|
470 |
launch_kwargs = {}
|
471 |
queue_kwargs = {'default_concurrency_limit':10}
|
472 |
|
473 |
+
# %% app.ipynb 47
|
474 |
#.py launch
|
475 |
if __name__ == "__main__":
|
476 |
app.queue(**queue_kwargs)
|