Spaces:
Sleeping
Sleeping
ishworrsubedii
committed on
Commit
•
b368e21
1
Parent(s):
2c6b8d9
Integrated speech transcription
Browse files- app.py +45 -27
- requirements.txt +2 -1
- src/__init__.py +19 -0
- src/api/__init__.py +4 -0
- src/api/speech_api.py +195 -0
- src/components/__init__.py +4 -0
- src/components/speech_to_text.py +48 -0
- src/components/text_to_speech_gtts.py +33 -0
- src/models/__init__.py +4 -0
- src/models/models.py +16 -0
- src/pipeline/__init__.py +4 -0
- src/pipeline/speech_transcription_pipeline.py +32 -0
- src/utils/__init__.py +4 -0
- src/utils/utils.py +11 -0
app.py
CHANGED
@@ -6,10 +6,10 @@ from fastapi import FastAPI, File, UploadFile
|
|
6 |
from pydantic import BaseModel
|
7 |
from fastapi.middleware.cors import CORSMiddleware
|
8 |
from langchain_community.document_loaders import UnstructuredURLLoader
|
|
|
9 |
|
|
|
10 |
|
11 |
-
|
12 |
-
app = FastAPI(title = "ConversAI", root_path = "/api/v1")
|
13 |
app.add_middleware(
|
14 |
CORSMiddleware,
|
15 |
allow_origins=["*"],
|
@@ -18,29 +18,33 @@ app.add_middleware(
|
|
18 |
allow_headers=["*"],
|
19 |
)
|
20 |
|
|
|
|
|
|
|
21 |
@app.post("/signup")
|
22 |
async def signup(username: str, password: str):
|
23 |
-
response = createUser(username
|
24 |
return response
|
25 |
|
26 |
|
27 |
@app.post("/login")
|
28 |
async def login(username: str, password: str):
|
29 |
-
response = matchPassword(username
|
30 |
return response
|
31 |
|
32 |
|
33 |
@app.post("/newChatbot")
|
34 |
async def newChatbot(chatbotName: str, username: str):
|
35 |
-
currentBotCount = len(listTables(username
|
36 |
-
limit = client.table("ConversAI_UserConfig").select("chatbotLimit").eq("username", username).execute().data[0][
|
|
|
37 |
if currentBotCount >= int(limit):
|
38 |
return {
|
39 |
"output": "CHATBOT LIMIT EXCEEDED"
|
40 |
}
|
41 |
client.table("ConversAI_ChatbotInfo").insert({"username": username, "chatbotname": chatbotName}).execute()
|
42 |
chatbotName = f"convai-{username}-{chatbotName}"
|
43 |
-
return createTable(tablename
|
44 |
|
45 |
|
46 |
@app.post("/addPDF")
|
@@ -53,11 +57,13 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
53 |
username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
|
54 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
55 |
currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
56 |
-
limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
|
|
|
57 |
newCount = currentCount + len(text)
|
58 |
if newCount < int(limit):
|
59 |
-
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
|
60 |
-
|
|
|
61 |
else:
|
62 |
return {
|
63 |
"output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
@@ -67,7 +73,7 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
67 |
@app.post("/scanAndReturnText")
|
68 |
async def returnText(pdf: UploadFile = File(...)):
|
69 |
pdf = await pdf.read()
|
70 |
-
text = getTextFromImagePDF(pdfBytes
|
71 |
return text
|
72 |
|
73 |
|
@@ -77,10 +83,12 @@ async def addText(vectorstore: str, text: str):
|
|
77 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
78 |
currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
79 |
newCount = currentCount + len(text)
|
80 |
-
limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
|
|
|
81 |
if newCount < int(limit):
|
82 |
-
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
|
83 |
-
|
|
|
84 |
else:
|
85 |
return {
|
86 |
"output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
@@ -100,10 +108,12 @@ async def addText(addQaPair: AddQAPair):
|
|
100 |
currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
101 |
qa = f"QUESTION: {addQaPair.question}\tANSWER: {addQaPair.answer}"
|
102 |
newCount = currentCount + len(qa)
|
103 |
-
limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
|
|
|
104 |
if newCount < int(limit):
|
105 |
-
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
|
106 |
-
|
|
|
107 |
else:
|
108 |
return {
|
109 |
"output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
@@ -115,20 +125,24 @@ async def addWebsite(vectorstore: str, websiteUrls: list[str]):
|
|
115 |
urls = websiteUrls
|
116 |
loader = UnstructuredURLLoader(urls=urls)
|
117 |
docs = loader.load()
|
118 |
-
text = "\n\n".join(
|
|
|
119 |
username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
|
120 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
121 |
currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
122 |
newCount = currentCount + len(text)
|
123 |
-
limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
|
|
|
124 |
if newCount < int(limit):
|
125 |
-
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
|
126 |
-
|
|
|
127 |
else:
|
128 |
return {
|
129 |
"output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
130 |
}
|
131 |
|
|
|
132 |
@app.post("/answerQuery")
|
133 |
async def answerQuestion(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192"):
|
134 |
return answerQuery(query=query, vectorstore=vectorstore, llmModel=llmModel)
|
@@ -140,15 +154,18 @@ async def delete(chatbotName: str):
|
|
140 |
client.table('ConversAI_ChatbotInfo').delete().eq('username', username).eq('chatbotname', chatbotName).execute()
|
141 |
return deleteTable(tableName=chatbotName)
|
142 |
|
|
|
143 |
@app.post("/listChatbots")
|
144 |
async def delete(username: str):
|
145 |
return listTables(username=username)
|
146 |
|
|
|
147 |
@app.post("/getLinks")
|
148 |
async def crawlUrl(baseUrl: str):
|
149 |
return {
|
150 |
"urls": getLinks(url=baseUrl, timeout=30)
|
151 |
-
|
|
|
152 |
|
153 |
@app.post("/getCurrentCount")
|
154 |
async def getCount(vectorstore: str):
|
@@ -156,11 +173,12 @@ async def getCount(vectorstore: str):
|
|
156 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
157 |
return {
|
158 |
"currentCount": df[(df['username'] == username) & (df['chatbotname'] == chatbotName)]['charactercount'].iloc[0]
|
159 |
-
|
|
|
160 |
|
161 |
@app.post("/getYoutubeTranscript")
|
162 |
async def getYTTranscript(urls: str):
|
163 |
-
return getTranscript(urls
|
164 |
|
165 |
|
166 |
@app.post("/analyzeData")
|
@@ -169,10 +187,10 @@ async def analyzeAndAnswer(query: str, file: UploadFile = File(...)):
|
|
169 |
try:
|
170 |
if extension in ["xls", "xlsx", "xlsm", "xlsb"]:
|
171 |
df = pd.read_excel(io.BytesIO(await file.read()))
|
172 |
-
response = analyzeData(query
|
173 |
elif extension == "csv":
|
174 |
df = pd.read_csv(io.BytesIO(await file.read()))
|
175 |
-
response = analyzeData(query
|
176 |
else:
|
177 |
response = "INVALID FILE TYPE"
|
178 |
return {
|
@@ -181,4 +199,4 @@ async def analyzeAndAnswer(query: str, file: UploadFile = File(...)):
|
|
181 |
except:
|
182 |
return {
|
183 |
"output": "UNABLE TO ANSWER QUERY"
|
184 |
-
}
|
|
|
6 |
from pydantic import BaseModel
|
7 |
from fastapi.middleware.cors import CORSMiddleware
|
8 |
from langchain_community.document_loaders import UnstructuredURLLoader
|
9 |
+
from src.api.speech_api import speech_translator_router
|
10 |
|
11 |
+
app = FastAPI(title="ConversAI", root_path="/api/v1")
|
12 |
|
|
|
|
|
13 |
app.add_middleware(
|
14 |
CORSMiddleware,
|
15 |
allow_origins=["*"],
|
|
|
18 |
allow_headers=["*"],
|
19 |
)
|
20 |
|
21 |
+
app.include_router(speech_translator_router, prefix="/speech")
|
22 |
+
|
23 |
+
|
24 |
@app.post("/signup")
|
25 |
async def signup(username: str, password: str):
|
26 |
+
response = createUser(username=username, password=password)
|
27 |
return response
|
28 |
|
29 |
|
30 |
@app.post("/login")
|
31 |
async def login(username: str, password: str):
|
32 |
+
response = matchPassword(username=username, password=password)
|
33 |
return response
|
34 |
|
35 |
|
36 |
@app.post("/newChatbot")
|
37 |
async def newChatbot(chatbotName: str, username: str):
|
38 |
+
currentBotCount = len(listTables(username=username)["output"])
|
39 |
+
limit = client.table("ConversAI_UserConfig").select("chatbotLimit").eq("username", username).execute().data[0][
|
40 |
+
"chatbotLimit"]
|
41 |
if currentBotCount >= int(limit):
|
42 |
return {
|
43 |
"output": "CHATBOT LIMIT EXCEEDED"
|
44 |
}
|
45 |
client.table("ConversAI_ChatbotInfo").insert({"username": username, "chatbotname": chatbotName}).execute()
|
46 |
chatbotName = f"convai-{username}-{chatbotName}"
|
47 |
+
return createTable(tablename=chatbotName)
|
48 |
|
49 |
|
50 |
@app.post("/addPDF")
|
|
|
57 |
username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
|
58 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
59 |
currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
60 |
+
limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
|
61 |
+
"tokenLimit"]
|
62 |
newCount = currentCount + len(text)
|
63 |
if newCount < int(limit):
|
64 |
+
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
|
65 |
+
"chatbotname", chatbotname).execute()
|
66 |
+
return addDocuments(text=text, vectorstore=vectorstore)
|
67 |
else:
|
68 |
return {
|
69 |
"output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
|
|
73 |
@app.post("/scanAndReturnText")
|
74 |
async def returnText(pdf: UploadFile = File(...)):
|
75 |
pdf = await pdf.read()
|
76 |
+
text = getTextFromImagePDF(pdfBytes=pdf)
|
77 |
return text
|
78 |
|
79 |
|
|
|
83 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
84 |
currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
85 |
newCount = currentCount + len(text)
|
86 |
+
limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
|
87 |
+
"tokenLimit"]
|
88 |
if newCount < int(limit):
|
89 |
+
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
|
90 |
+
"chatbotname", chatbotname).execute()
|
91 |
+
return addDocuments(text=text, vectorstore=vectorstore)
|
92 |
else:
|
93 |
return {
|
94 |
"output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
|
|
108 |
currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
109 |
qa = f"QUESTION: {addQaPair.question}\tANSWER: {addQaPair.answer}"
|
110 |
newCount = currentCount + len(qa)
|
111 |
+
limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
|
112 |
+
"tokenLimit"]
|
113 |
if newCount < int(limit):
|
114 |
+
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
|
115 |
+
"chatbotname", chatbotname).execute()
|
116 |
+
return addDocuments(text=qa, vectorstore=addQaPair.vectorstore)
|
117 |
else:
|
118 |
return {
|
119 |
"output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
|
|
125 |
urls = websiteUrls
|
126 |
loader = UnstructuredURLLoader(urls=urls)
|
127 |
docs = loader.load()
|
128 |
+
text = "\n\n".join(
|
129 |
+
[f"Metadata:\n{docs[doc].metadata} \nPage Content:\n {docs[doc].page_content}" for doc in range(len(docs))])
|
130 |
username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
|
131 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
132 |
currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
133 |
newCount = currentCount + len(text)
|
134 |
+
limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0][
|
135 |
+
"tokenLimit"]
|
136 |
if newCount < int(limit):
|
137 |
+
client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq(
|
138 |
+
"chatbotname", chatbotname).execute()
|
139 |
+
return addDocuments(text=text, vectorstore=vectorstore)
|
140 |
else:
|
141 |
return {
|
142 |
"output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
|
143 |
}
|
144 |
|
145 |
+
|
146 |
@app.post("/answerQuery")
|
147 |
async def answerQuestion(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192"):
|
148 |
return answerQuery(query=query, vectorstore=vectorstore, llmModel=llmModel)
|
|
|
154 |
client.table('ConversAI_ChatbotInfo').delete().eq('username', username).eq('chatbotname', chatbotName).execute()
|
155 |
return deleteTable(tableName=chatbotName)
|
156 |
|
157 |
+
|
158 |
@app.post("/listChatbots")
|
159 |
async def delete(username: str):
|
160 |
return listTables(username=username)
|
161 |
|
162 |
+
|
163 |
@app.post("/getLinks")
|
164 |
async def crawlUrl(baseUrl: str):
|
165 |
return {
|
166 |
"urls": getLinks(url=baseUrl, timeout=30)
|
167 |
+
}
|
168 |
+
|
169 |
|
170 |
@app.post("/getCurrentCount")
|
171 |
async def getCount(vectorstore: str):
|
|
|
173 |
df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
174 |
return {
|
175 |
"currentCount": df[(df['username'] == username) & (df['chatbotname'] == chatbotName)]['charactercount'].iloc[0]
|
176 |
+
}
|
177 |
+
|
178 |
|
179 |
@app.post("/getYoutubeTranscript")
|
180 |
async def getYTTranscript(urls: str):
|
181 |
+
return getTranscript(urls=urls)
|
182 |
|
183 |
|
184 |
@app.post("/analyzeData")
|
|
|
187 |
try:
|
188 |
if extension in ["xls", "xlsx", "xlsm", "xlsb"]:
|
189 |
df = pd.read_excel(io.BytesIO(await file.read()))
|
190 |
+
response = analyzeData(query=query, dataframe=df)
|
191 |
elif extension == "csv":
|
192 |
df = pd.read_csv(io.BytesIO(await file.read()))
|
193 |
+
response = analyzeData(query=query, dataframe=df)
|
194 |
else:
|
195 |
response = "INVALID FILE TYPE"
|
196 |
return {
|
|
|
199 |
except:
|
200 |
return {
|
201 |
"output": "UNABLE TO ANSWER QUERY"
|
202 |
+
}
|
requirements.txt
CHANGED
@@ -24,4 +24,5 @@ pdf2image
|
|
24 |
sentence-transformers
|
25 |
supabase
|
26 |
unstructured
|
27 |
-
urllib3
|
|
|
|
24 |
sentence-transformers
|
25 |
supabase
|
26 |
unstructured
|
27 |
+
urllib3
|
28 |
+
gtts
|
src/__init__.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
5 |
+
|
6 |
+
import logging.config
|
7 |
+
import yaml
|
8 |
+
import os
|
9 |
+
|
10 |
+
if os.path.exists("logs"):
|
11 |
+
pass
|
12 |
+
else:
|
13 |
+
os.makedirs("logs")
|
14 |
+
|
15 |
+
log_config_path = os.path.join(os.getcwd(), "logging_config.yaml")
|
16 |
+
with open(log_config_path, 'r') as file:
|
17 |
+
config = yaml.safe_load(file.read())
|
18 |
+
|
19 |
+
logging.config.dictConfig(config)
|
src/api/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
src/api/speech_api.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
5 |
+
import os
|
6 |
+
import tempfile
|
7 |
+
from fastapi.responses import JSONResponse
|
8 |
+
from fastapi import Form
|
9 |
+
from fastapi import UploadFile, HTTPException, status
|
10 |
+
from src.models.models import TextToSpeechRequest
|
11 |
+
from fastapi.routing import APIRouter
|
12 |
+
from src.pipeline.speech_transcription_pipeline import SpeechTranscriptionPipeline
|
13 |
+
from src import logging
|
14 |
+
|
15 |
+
speech_translator_router = APIRouter(tags=["SpeechTranscription"])
|
16 |
+
pipeline = SpeechTranscriptionPipeline()
|
17 |
+
|
18 |
+
|
19 |
+
@speech_translator_router.post(
    "/text_to_speech",
    description="""
** For language refer below points**
**Supported Locales:**

- **English:**
    - **Australia:**
        - **Language:** en
        - **TLD:** com.au
    - **United Kingdom:**
        - **Language:** en
        - **TLD:** co.uk
    - **United States:**
        - **Language:** en
        - **TLD:** us
    - **Canada:**
        - **Language:** en
        - **TLD:** ca
    - **India:**
        - **Language:** en
        - **TLD:** co.in
    - **Ireland:**
        - **Language:** en
        - **TLD:** ie
    - **South Africa:**
        - **Language:** en
        - **TLD:** co.za
    - **Nigeria:**
        - **Language:** en
        - **TLD:** com.ng

- **French:**
    - **Canada:**
        - **Language:** fr
        - **TLD:** ca
    - **France:**
        - **Language:** fr
        - **TLD:** fr

- **Mandarin:**
    - **China Mainland:**
        - **Language:** zh-CN
        - **TLD:** any
    - **Taiwan:**
        - **Language:** zh-TW
        - **TLD:** any

- **Portuguese:**
    - **Brazil:**
        - **Language:** pt
        - **TLD:** com.br
    - **Portugal:**
        - **Language:** pt
        - **TLD:** pt

- **Spanish:**
    - **Mexico:**
        - **Language:** es
        - **TLD:** com.mx
    - **Spain:**
        - **Language:** es
        - **TLD:** es
    - **United States:**
        - **Language:** es
        - **TLD:** us
"""
)
async def text_to_speech(request: TextToSpeechRequest):
    """Convert text to speech and return the audio as Base64-encoded MP3.

    :param request: payload carrying ``text``, ``lang``, and ``tld``.
    :return: JSON response with the Base64 MP3 under the "audio" key.
    :raises HTTPException: 400 when audio generation fails, 500 otherwise.
    """
    # NOTE: constant log messages no longer carry a useless f-prefix.
    logging.info("Text to speech request received")
    try:
        # Despite the name, this is a Base64-encoded MP3 *string* produced
        # by the gTTS-backed pipeline, not raw bytes.
        audio_bytes = pipeline.text_to_speech(request.text, request.lang, request.tld)
        if not audio_bytes:
            logging.error("Audio generation failed.")
            raise ValueError("Audio generation failed.")
        logging.info("Text to speech request processed successfully")
        return JSONResponse(content={"audio": audio_bytes, "status_code": status.HTTP_200_OK}, status_code=200)
    except ValueError as ve:
        logging.error(f"Error processing text to speech request: {str(ve)}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        logging.error(f"Internal Server Error: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal Server Error")
|
102 |
+
|
103 |
+
|
104 |
+
@speech_translator_router.post(
    "/speech_to_text",
    description="""
** Specify the language used in the audio **
**Supported Languages:**

**Major Languages:**
- **English:** en
- **Mandarin Chinese:** zh
- **Spanish:** es
- **French:** fr
- **German:** de
- **Italian:** it
- **Japanese:** ja
- **Korean:** ko
- **Russian:** ru
- **Portuguese:** pt
- **Arabic:** ar

**Additional Languages:**

- **Indic Languages:**
    - **Hindi:** hi
    - **Bengali:** bn
    - **Tamil:** ta
    - **Telugu:** te

- **Southeast Asian Languages:**
    - **Vietnamese:** vi
    - **Thai:** th
    - **Indonesian:** id
    - **Malay:** ms

- **African Languages:**
    - **Swahili:** sw
    - **Yoruba:** yo
    - **Hausa:** ha

- **European Languages:**
    - **Polish:** pl
    - **Dutch:** nl
    - **Swedish:** sv
    - **Norwegian:** no
"""
)
async def speech_to_text(audio: UploadFile, lang: str = Form(...)):
    """Transcribe an uploaded audio file to text.

    The upload is buffered to a temporary ``.wav`` file, handed to the
    transcription pipeline, and the temporary file is removed afterwards.

    :param audio: uploaded audio file to transcribe.
    :param lang: language spoken in the audio (see supported list above).
    :return: JSON response carrying the transcript.
    :raises HTTPException: 400 for an empty/invalid upload, 404 if the
        temporary file disappeared, 500 for processing failures.
    """
    logging.info("Speech to text request received")
    try:
        audio_bytes = await audio.read()
        if not audio_bytes:
            logging.error("Empty audio file")
            raise ValueError("Empty audio file")
    except Exception as e:
        logging.error(f"Invalid audio file {e}")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid audio file"
        )

    try:
        # delete=False so the pipeline can reopen the file by path; we clean
        # it up ourselves in the finally block below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
            temp_audio_file.write(audio_bytes)
            temp_audio_file_path = temp_audio_file.name
            logging.info(f"Temporary audio file created at {temp_audio_file_path}")
    except Exception as e:
        logging.error(f"Could not process audio file{e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Could not process audio file"
        )

    try:
        logging.info("Transcribing audio to text")
        transcript = pipeline.speech_to_text(temp_audio_file_path, lang)
    except FileNotFoundError as fnfe:
        # BUG FIX: the original logged `{fnfel}` — an undefined name — which
        # would raise NameError inside this handler instead of logging.
        logging.error(f"Temporary file not found{fnfe}")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Temporary file not found"
        )
    except Exception as e:
        logging.error(f"Error processing speech-to-text: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Error processing speech-to-text"
        )
    finally:
        logging.info("Cleaning up temporary audio file")
        if os.path.exists(temp_audio_file_path):
            os.remove(temp_audio_file_path)

    return JSONResponse(content={"transcript": transcript, "status_code": status.HTTP_200_OK}, status_code=200)
|
src/components/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
src/components/speech_to_text.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
5 |
+
import torch
|
6 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
7 |
+
|
8 |
+
|
9 |
+
class SpeechToText:
    """Whisper-based wrapper that turns audio into timestamped text."""

    def __init__(self):
        """Load openai/whisper-large-v3, preferring GPU (fp16) over CPU (fp32)."""
        use_cuda = torch.cuda.is_available()
        self.device = "cuda:0" if use_cuda else "cpu"
        self.torch_dtype = torch.float16 if use_cuda else torch.float32

        model_id = "openai/whisper-large-v3"

        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.speech_to_text_pipeline = self.pipeline()

    def pipeline(self):
        """Build the Hugging Face automatic-speech-recognition pipeline."""
        return pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            max_new_tokens=128,  # max number of tokens to generate at a time
            chunk_length_s=30,  # length of audio chunks to process at a time
            batch_size=16,  # number of chunks to process at a time
            return_timestamps=True,
            torch_dtype=self.torch_dtype,
            device=self.device,
        )

    def transcribe_audio(self, audio, language: str = "en"):
        """
        Transcribe audio to text.

        :param audio: audio input accepted by the HF ASR pipeline (e.g. a file path)
        :param language: language of the audio file
        :return: tuple of (timestamped chunks, full transcript text)
        """
        result = self.speech_to_text_pipeline(
            audio,
            return_timestamps=True,
            generate_kwargs={"language": language, "task": "translate"},
        )
        return result["chunks"], result["text"]
|
src/components/text_to_speech_gtts.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
5 |
+
import base64
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
from gtts import gTTS
|
9 |
+
from gtts.tokenizer import pre_processors
|
10 |
+
|
11 |
+
|
12 |
+
class TextToSpeech:
    """gTTS-backed text-to-speech producing Base64-encoded MP3 data."""

    def __init__(self):
        # Text pre-processors gTTS applies before synthesis.
        self.preprocessing = [
            pre_processors.tone_marks,
            pre_processors.end_of_line,
            pre_processors.word_sub,
            pre_processors.abbreviations,
        ]

    def conversion(self, text: str, lang: str, tld: str) -> str:
        """
        Convert text to speech and return the Base64-encoded MP3 data.

        :param text: The text to convert to speech.
        :param lang: The language in which to convert the text.
        :param tld: Google TLD selecting the regional accent (e.g. "co.uk").
        :return: Base64-encoded MP3 data as a string.
        """
        tts = gTTS(text=text, lang=lang, slow=False, tld=tld, pre_processor_funcs=self.preprocessing)
        buffer = BytesIO()
        tts.write_to_fp(buffer)
        # getvalue() reads the whole buffer regardless of position, so no
        # explicit seek(0) is needed.
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
src/models/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
src/models/models.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
5 |
+
from fastapi import UploadFile
|
6 |
+
from pydantic import BaseModel
|
7 |
+
|
8 |
+
|
9 |
+
class TextToSpeechRequest(BaseModel):
    """Request body for the /speech/text_to_speech endpoint."""

    text: str  # text to synthesize
    lang: str  # language code, e.g. "en"
    tld: str  # regional TLD selecting the accent, e.g. "co.uk"


class SpeechToTextRequest(BaseModel):
    """Language descriptor accompanying an uploaded audio clip."""

    lang: str  # language spoken in the audio
|
src/pipeline/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
src/pipeline/speech_transcription_pipeline.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
5 |
+
from src.components.speech_to_text import SpeechToText
|
6 |
+
from src.components.text_to_speech_gtts import TextToSpeech
|
7 |
+
|
8 |
+
|
9 |
+
class SpeechTranscriptionPipeline:
    """Facade bundling the speech-to-text and text-to-speech components."""

    def __init__(self):
        # Heavy: constructing SpeechToText loads the Whisper model.
        self.speech_to_text_ = SpeechToText()
        self.text_to_speech_ = TextToSpeech()

    def text_to_speech(self, text: str, lang: str, tld: str) -> str:
        """
        Convert text to speech.
        :param text: The text to convert to speech.
        :param lang: The language in which to convert the text.
        :param tld: Google TLD selecting the regional accent (was undocumented).
        :return: Base64-encoded MP3 representation of the text.
        """
        speech = self.text_to_speech_.conversion(text, lang, tld)
        return speech

    def speech_to_text(self, audio, lang: str) -> str:
        """
        Convert speech to text.
        :param audio: The audio data to convert to text.
        :param lang: The language in which the audio is spoken.
        :return: The text representation of the audio (timestamps discarded).
        """
        transcript_with_timestamp, transcript = self.speech_to_text_.transcribe_audio(audio=audio, language=lang)
        return transcript
|
src/utils/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-07-31
|
4 |
+
"""
|
src/utils/utils.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Created By: ishwor subedi
|
3 |
+
Date: 2024-08-02
|
4 |
+
"""
|
5 |
+
import yaml
|
6 |
+
|
7 |
+
|
8 |
+
def load_config(file_path):
    """Parse a YAML configuration file and return its contents."""
    with open(file_path, 'r') as fh:
        return yaml.safe_load(fh)
|