matdmiller committed on
Commit
4db1bc1
·
1 Parent(s): 2b9e17f

optional ai cleaning and add md link regex cleaning by default

Browse files
Files changed (2) hide show
  1. app.ipynb +36 -23
  2. app.py +24 -11
app.ipynb CHANGED
@@ -142,7 +142,7 @@
142
  },
143
  {
144
  "cell_type": "code",
145
- "execution_count": 5,
146
  "id": "4f486d3a",
147
  "metadata": {},
148
  "outputs": [],
@@ -168,7 +168,8 @@
168
  "# from cartesia.tts import CartesiaTTS\n",
169
  "import cartesia\n",
170
  "import requests\n",
171
- "import urllib"
 
172
  ]
173
  },
174
  {
@@ -810,7 +811,19 @@
810
  },
811
  {
812
  "cell_type": "code",
813
- "execution_count": 29,
 
 
 
 
 
 
 
 
 
 
 
 
814
  "id": "db54a6a6-4bdc-430a-b1ea-444c249b77fb",
815
  "metadata": {},
816
  "outputs": [],
@@ -820,12 +833,12 @@
820
  " # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))\n",
821
  " result = requests.get('https://r.jina.ai/'+url)\n",
822
  " result.raise_for_status()\n",
823
- " return result.text"
824
  ]
825
  },
826
  {
827
  "cell_type": "code",
828
- "execution_count": 30,
829
  "id": "75891855-6c08-4a42-9ad5-a02e0b43bb3d",
830
  "metadata": {},
831
  "outputs": [],
@@ -858,6 +871,8 @@
858
  "\n",
859
  " idx = 0\n",
860
  " while complete == False and idx < max_iters:\n",
 
 
861
  " idx += 1\n",
862
  " response = client.chat.completions.create(\n",
863
  " model=\"gpt-4o\",\n",
@@ -895,19 +910,22 @@
895
  },
896
  {
897
  "cell_type": "code",
898
- "execution_count": 33,
899
  "id": "7899e7b2-beeb-40a4-a571-a2ccfc7c9618",
900
  "metadata": {},
901
  "outputs": [],
902
  "source": [
903
  "#| export\n",
904
- "def get_page_text(url):\n",
905
- " return clean_page_md(get_page_md(url))"
 
 
 
906
  ]
907
  },
908
  {
909
  "cell_type": "code",
910
- "execution_count": 35,
911
  "id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
912
  "metadata": {},
913
  "outputs": [],
@@ -918,9 +936,11 @@
918
  " ### Define UI ###\n",
919
  " gr.Markdown(\"# TTS\")\n",
920
  " gr.Markdown(\"\"\"Start typing below and then click **Go** to create the speech from your text.\n",
921
- "For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href=\"https://matdmiller-tts-openai.hf.space/\" target=\"_blank\">Spaces Link HTML</a>\"\"\")\n",
 
922
  " with gr.Row():\n",
923
  " input_url = gr.Textbox(max_lines=1, label=\"Optional - Enter a URL\")\n",
 
924
  " get_url_content_btn = gr.Button(\"Get URL Contents\")\n",
925
  " with gr.Row():\n",
926
  " input_text = gr.Textbox(max_lines=100, label=\"Enter text here\")\n",
@@ -946,7 +966,7 @@
946
  "\n",
947
  " ### Define UI Actions ###\n",
948
  "\n",
949
- " get_url_content_btn.click(fn=get_page_text, inputs=input_url, outputs=input_text)\n",
950
  " \n",
951
  " # input_text \n",
952
  " input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)\n",
@@ -982,7 +1002,7 @@
982
  },
983
  {
984
  "cell_type": "code",
985
- "execution_count": 36,
986
  "id": "a00648a1-891b-470b-9959-f5d502055713",
987
  "metadata": {},
988
  "outputs": [],
@@ -996,7 +1016,7 @@
996
  },
997
  {
998
  "cell_type": "code",
999
- "execution_count": 37,
1000
  "id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
1001
  "metadata": {},
1002
  "outputs": [
@@ -1025,16 +1045,9 @@
1025
  "data": {
1026
  "text/plain": []
1027
  },
1028
- "execution_count": 37,
1029
  "metadata": {},
1030
  "output_type": "execute_result"
1031
- },
1032
- {
1033
- "name": "stdout",
1034
- "output_type": "stream",
1035
- "text": [
1036
- "TOKENS CLEANUP: 970\n"
1037
- ]
1038
  }
1039
  ],
1040
  "source": [
@@ -1060,7 +1073,7 @@
1060
  },
1061
  {
1062
  "cell_type": "code",
1063
- "execution_count": 38,
1064
  "id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
1065
  "metadata": {},
1066
  "outputs": [
@@ -1090,7 +1103,7 @@
1090
  },
1091
  {
1092
  "cell_type": "code",
1093
- "execution_count": 40,
1094
  "id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
1095
  "metadata": {},
1096
  "outputs": [],
 
142
  },
143
  {
144
  "cell_type": "code",
145
+ "execution_count": 41,
146
  "id": "4f486d3a",
147
  "metadata": {},
148
  "outputs": [],
 
168
  "# from cartesia.tts import CartesiaTTS\n",
169
  "import cartesia\n",
170
  "import requests\n",
171
+ "import urllib\n",
172
+ "import re"
173
  ]
174
  },
175
  {
 
811
  },
812
  {
813
  "cell_type": "code",
814
+ "execution_count": 42,
815
+ "id": "c5b0156a-f6d4-480a-b7b5-b0899e7520b9",
816
+ "metadata": {},
817
+ "outputs": [],
818
+ "source": [
819
+ "#| export\n",
820
+ "def remove_urls_from_markdown(text):\n",
821
+ " return re.sub(r'\\[([^\\]]+)\\]\\([^\\)]+\\)', r'\\1', text)"
822
+ ]
823
+ },
824
+ {
825
+ "cell_type": "code",
826
+ "execution_count": 43,
827
  "id": "db54a6a6-4bdc-430a-b1ea-444c249b77fb",
828
  "metadata": {},
829
  "outputs": [],
 
833
  " # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))\n",
834
  " result = requests.get('https://r.jina.ai/'+url)\n",
835
  " result.raise_for_status()\n",
836
+ " return remove_urls_from_markdown(result.text)"
837
  ]
838
  },
839
  {
840
  "cell_type": "code",
841
+ "execution_count": 47,
842
  "id": "75891855-6c08-4a42-9ad5-a02e0b43bb3d",
843
  "metadata": {},
844
  "outputs": [],
 
871
  "\n",
872
  " idx = 0\n",
873
  " while complete == False and idx < max_iters:\n",
874
+ " print('Page Cleaning Iter:',idx)\n",
875
+ " assert idx < max_iters\n",
876
  " idx += 1\n",
877
  " response = client.chat.completions.create(\n",
878
  " model=\"gpt-4o\",\n",
 
910
  },
911
  {
912
  "cell_type": "code",
913
+ "execution_count": 48,
914
  "id": "7899e7b2-beeb-40a4-a571-a2ccfc7c9618",
915
  "metadata": {},
916
  "outputs": [],
917
  "source": [
918
  "#| export\n",
919
+ "def get_page_text(url:str, ai_clean:bool):\n",
920
+ " text = get_page_md(url)\n",
921
+ " if ai_clean:\n",
922
+ " text = clean_page_md(text)\n",
923
+ " return text"
924
  ]
925
  },
926
  {
927
  "cell_type": "code",
928
+ "execution_count": 50,
929
  "id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
930
  "metadata": {},
931
  "outputs": [],
 
936
  " ### Define UI ###\n",
937
  " gr.Markdown(\"# TTS\")\n",
938
  " gr.Markdown(\"\"\"Start typing below and then click **Go** to create the speech from your text.\n",
939
+ "For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href=\"https://matdmiller-tts-openai.hf.space/\" target=\"_blank\">Spaces Link HTML</a>\n",
940
+ "[https://r.jina.ai/](https://r.jina.ai/)\"\"\")\n",
941
  " with gr.Row():\n",
942
  " input_url = gr.Textbox(max_lines=1, label=\"Optional - Enter a URL\")\n",
943
+ " input_clean_cb = gr.Checkbox(value=False, label='AI Clean Text')\n",
944
  " get_url_content_btn = gr.Button(\"Get URL Contents\")\n",
945
  " with gr.Row():\n",
946
  " input_text = gr.Textbox(max_lines=100, label=\"Enter text here\")\n",
 
966
  "\n",
967
  " ### Define UI Actions ###\n",
968
  "\n",
969
+ " get_url_content_btn.click(fn=get_page_text, inputs=[input_url,input_clean_cb], outputs=input_text)\n",
970
  " \n",
971
  " # input_text \n",
972
  " input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)\n",
 
1002
  },
1003
  {
1004
  "cell_type": "code",
1005
+ "execution_count": 51,
1006
  "id": "a00648a1-891b-470b-9959-f5d502055713",
1007
  "metadata": {},
1008
  "outputs": [],
 
1016
  },
1017
  {
1018
  "cell_type": "code",
1019
+ "execution_count": 52,
1020
  "id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
1021
  "metadata": {},
1022
  "outputs": [
 
1045
  "data": {
1046
  "text/plain": []
1047
  },
1048
+ "execution_count": 52,
1049
  "metadata": {},
1050
  "output_type": "execute_result"
 
 
 
 
 
 
 
1051
  }
1052
  ],
1053
  "source": [
 
1073
  },
1074
  {
1075
  "cell_type": "code",
1076
+ "execution_count": 45,
1077
  "id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
1078
  "metadata": {},
1079
  "outputs": [
 
1103
  },
1104
  {
1105
  "cell_type": "code",
1106
+ "execution_count": 53,
1107
  "id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
1108
  "metadata": {},
1109
  "outputs": [],
app.py CHANGED
@@ -6,7 +6,8 @@ __all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'OPENAI_CLIENT_TTS_THREAD
6
  'launch_kwargs', 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio',
7
  'create_speech_openai', 'create_speech_cartesiaai', 'create_speech', 'get_input_text_len',
8
  'get_generation_cost', 'get_model_choices', 'update_model_choices', 'get_voice_choices',
9
- 'update_voice_choices', 'split_text_as_md', 'get_page_md', 'clean_page_md', 'get_page_text']
 
10
 
11
  # %% app.ipynb 4
12
  import os
@@ -72,6 +73,7 @@ import traceback
72
  import cartesia
73
  import requests
74
  import urllib
 
75
 
76
  # %% app.ipynb 11
77
  TEMP = os.environ.get('GRADIO_TEMP_DIR','/tmp/')
@@ -333,13 +335,17 @@ def split_text_as_md(*args, **kwargs):
333
  return '# Text Splits:\n' + '<br>----------<br>'.join(output)
334
 
335
  # %% app.ipynb 38
 
 
 
 
336
  def get_page_md(url):
337
  # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))
338
  result = requests.get('https://r.jina.ai/'+url)
339
  result.raise_for_status()
340
- return result.text
341
 
342
- # %% app.ipynb 40
343
  # import json
344
  def clean_page_md(text):
345
  max_iters = 15
@@ -356,6 +362,8 @@ def clean_page_md(text):
356
 
357
  idx = 0
358
  while complete == False and idx < max_iters:
 
 
359
  idx += 1
360
  response = client.chat.completions.create(
361
  model="gpt-4o",
@@ -380,19 +388,24 @@ def clean_page_md(text):
380
  # res = clean_page_md(test_page_md)
381
  # res
382
 
383
- # %% app.ipynb 42
384
- def get_page_text(url):
385
- return clean_page_md(get_page_md(url))
386
-
387
  # %% app.ipynb 43
 
 
 
 
 
 
 
388
  with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
389
 
390
  ### Define UI ###
391
  gr.Markdown("# TTS")
392
  gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
393
- For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href="https://matdmiller-tts-openai.hf.space/" target="_blank">Spaces Link HTML</a>""")
 
394
  with gr.Row():
395
  input_url = gr.Textbox(max_lines=1, label="Optional - Enter a URL")
 
396
  get_url_content_btn = gr.Button("Get URL Contents")
397
  with gr.Row():
398
  input_text = gr.Textbox(max_lines=100, label="Enter text here")
@@ -418,7 +431,7 @@ For requests longer than allowed by the API they will be broken into chunks auto
418
 
419
  ### Define UI Actions ###
420
 
421
- get_url_content_btn.click(fn=get_page_text, inputs=input_url, outputs=input_text)
422
 
423
  # input_text
424
  input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
@@ -451,13 +464,13 @@ For requests longer than allowed by the API they will be broken into chunks auto
451
 
452
 
453
 
454
- # %% app.ipynb 44
455
  # launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
456
  # 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
457
  launch_kwargs = {}
458
  queue_kwargs = {'default_concurrency_limit':10}
459
 
460
- # %% app.ipynb 46
461
  #.py launch
462
  if __name__ == "__main__":
463
  app.queue(**queue_kwargs)
 
6
  'launch_kwargs', 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio',
7
  'create_speech_openai', 'create_speech_cartesiaai', 'create_speech', 'get_input_text_len',
8
  'get_generation_cost', 'get_model_choices', 'update_model_choices', 'get_voice_choices',
9
+ 'update_voice_choices', 'split_text_as_md', 'remove_urls_from_markdown', 'get_page_md', 'clean_page_md',
10
+ 'get_page_text']
11
 
12
  # %% app.ipynb 4
13
  import os
 
73
  import cartesia
74
  import requests
75
  import urllib
76
+ import re
77
 
78
  # %% app.ipynb 11
79
  TEMP = os.environ.get('GRADIO_TEMP_DIR','/tmp/')
 
335
  return '# Text Splits:\n' + '<br>----------<br>'.join(output)
336
 
337
  # %% app.ipynb 38
338
+ def remove_urls_from_markdown(text):
339
+ return re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
340
+
341
+ # %% app.ipynb 39
342
  def get_page_md(url):
343
  # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))
344
  result = requests.get('https://r.jina.ai/'+url)
345
  result.raise_for_status()
346
+ return remove_urls_from_markdown(result.text)
347
 
348
+ # %% app.ipynb 41
349
  # import json
350
  def clean_page_md(text):
351
  max_iters = 15
 
362
 
363
  idx = 0
364
  while complete == False and idx < max_iters:
365
+ print('Page Cleaning Iter:',idx)
366
+ assert idx < max_iters
367
  idx += 1
368
  response = client.chat.completions.create(
369
  model="gpt-4o",
 
388
  # res = clean_page_md(test_page_md)
389
  # res
390
 
 
 
 
 
391
  # %% app.ipynb 43
392
+ def get_page_text(url:str, ai_clean:bool):
393
+ text = get_page_md(url)
394
+ if ai_clean:
395
+ text = clean_page_md(text)
396
+ return text
397
+
398
+ # %% app.ipynb 44
399
  with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
400
 
401
  ### Define UI ###
402
  gr.Markdown("# TTS")
403
  gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
404
+ For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href="https://matdmiller-tts-openai.hf.space/" target="_blank">Spaces Link HTML</a>
405
+ [https://r.jina.ai/](https://r.jina.ai/)""")
406
  with gr.Row():
407
  input_url = gr.Textbox(max_lines=1, label="Optional - Enter a URL")
408
+ input_clean_cb = gr.Checkbox(value=False, label='AI Clean Text')
409
  get_url_content_btn = gr.Button("Get URL Contents")
410
  with gr.Row():
411
  input_text = gr.Textbox(max_lines=100, label="Enter text here")
 
431
 
432
  ### Define UI Actions ###
433
 
434
+ get_url_content_btn.click(fn=get_page_text, inputs=[input_url,input_clean_cb], outputs=input_text)
435
 
436
  # input_text
437
  input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
 
464
 
465
 
466
 
467
+ # %% app.ipynb 45
468
  # launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
469
  # 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
470
  launch_kwargs = {}
471
  queue_kwargs = {'default_concurrency_limit':10}
472
 
473
+ # %% app.ipynb 47
474
  #.py launch
475
  if __name__ == "__main__":
476
  app.queue(**queue_kwargs)