UPDATE: New Endpoints
Changed files:
- Dockerfile (+3 -1)
- app.py (+18 -1)
- functions.py (+20 -1)
- requirements.txt (+4 -0)
Dockerfile (CHANGED)

@@ -10,7 +10,9 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     cmake \
     && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/*
+
+RUN apt-get install poppler-utils -y
 
 RUN mkdir -p /app/nltk_data && chmod -R 777 /app/nltk_data
 
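Note: poppler-utils provides the pdftoppm/pdfinfo binaries that pdf2image shells out to when rasterizing PDF bytes, so the OCR endpoint added below depends on this layer. A minimal sanity-check sketch, assuming a hypothetical test file named sample.pdf inside the container (not part of this commit):

# Verify that poppler-utils is reachable by pdf2image inside the image.
# "sample.pdf" is a hypothetical test file used only for this check.
from pdf2image import pdfinfo_from_path, convert_from_path

info = pdfinfo_from_path("sample.pdf")    # raises if pdfinfo/pdftoppm are missing
pages = convert_from_path("sample.pdf")   # returns one PIL.Image per page
print(info.get("Pages"), "pages rasterized:", len(pages))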
app.py (CHANGED)

@@ -8,7 +8,6 @@ from fastapi.middleware.cors import CORSMiddleware
 from langchain_community.document_loaders import UnstructuredURLLoader
 
 
-
 app = FastAPI(title = "ConversAI", root_path = "/api/v1")
 app.add_middleware(
     CORSMiddleware,
@@ -64,6 +63,24 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     }
 
 
+@app.post("/addImagePDF")
+async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
+    pdf = await pdf.read()
+    text = getTextFromImagePDF(pdfBytes = pdf)
+    username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
+    df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
+    currentCount = df[(df["username"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
+    limit = client.table("ConversAI_UserConfig").select("tokenLimit").eq("username", username).execute().data[0]["tokenLimit"]
+    newCount = currentCount + len(text)
+    if newCount < int(limit):
+        client.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("username", username).eq("chatbotname", chatbotname).execute()
+        return addDocuments(text = text, vectorstore = vectorstore)
+    else:
+        return {
+            "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
+        }
+
+
 @app.post("/addText")
 async def addText(vectorstore: str, text: str):
     username, chatbotname = vectorstore.split("-")[1], vectorstore.split("-")[2]
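The new /addImagePDF route accepts a scanned (image-only) PDF upload plus a vectorstore query parameter, OCRs it, and enforces the same per-user character limit as the other ingestion routes. A hedged client-side sketch; the base URL, vectorstore name, and file name below are assumptions for illustration, not values from this commit:

import requests

BASE_URL = "https://<your-space-host>/api/v1"   # assumed deployment URL (root_path is /api/v1)
VECTORSTORE = "ConversAI-someuser-somebot"      # assumed "<prefix>-<username>-<chatbotname>" naming

with open("scanned.pdf", "rb") as f:            # hypothetical scanned PDF
    response = requests.post(
        f"{BASE_URL}/addImagePDF",
        params={"vectorstore": VECTORSTORE},
        files={"pdf": ("scanned.pdf", f, "application/pdf")},
    )
print(response.json())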
functions.py (CHANGED)

@@ -18,6 +18,9 @@ from langchain.retrievers.document_compressors import FlashrankRerank
 from supabase.client import create_client
 from qdrant_client import QdrantClient
 from langchain_groq import ChatGroq
+from pdf2image import convert_from_bytes
+import numpy as np
+from paddleocr import PaddleOCR
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
 from supabase import create_client
@@ -37,6 +40,7 @@ vectorEmbeddings = HuggingFaceEmbeddings(
     model_kwargs = model_kwargs,
     encode_kwargs = encode_kwargs
 )
+ocr = PaddleOCR(use_angle_cls=True, lang='en')
 sparseEmbeddings = FastEmbedSparse(model = "Qdrant/BM25")
 prompt = """
 INSTRUCTIONS:
@@ -282,4 +286,19 @@ def getLinks(url: str, timeout = 30):
             break
         else:
             uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
-    return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
+    return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
+
+
+def getTextFromImagePDF(pdfBytes):
+    global ocr
+    allImages = convert_from_bytes(pdfBytes)
+    allImages = [np.array(image) for image in allImages]
+    pageWiseText = []
+    for page in allImages:
+        result = ocr.ocr(page)
+        if result[0]:
+            retrievedText = "\n".join([result[0][x][1][0] for x in range(len(result[0]))])
+        else:
+            retrievedText = ""
+        pageWiseText.append(retrievedText)
+    return "\n\n\n".join(pageWiseText)
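getTextFromImagePDF rasterizes each page with pdf2image, runs the shared PaddleOCR instance over the page arrays, and joins the recognized lines (result[0][x][1][0] is the text string of detection x in PaddleOCR's nested result format). A minimal local sketch of calling the helper directly, assuming functions.py is importable and a scanned file named sample_scan.pdf exists (both are assumptions):

# Local smoke test for the new OCR helper; file name and import path are assumptions.
from functions import getTextFromImagePDF

with open("sample_scan.pdf", "rb") as f:
    text = getTextFromImagePDF(pdfBytes=f.read())

print(text[:500])   # preview the first 500 recognized characters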
requirements.txt (CHANGED)

@@ -12,10 +12,14 @@ langchain-qdrant
 langchain-groq
 langsmith
 lxml
+numpy
 PyPDF2
 python-dotenv
 pydantic
 pandas
+paddlepaddle-gpu
+paddleocr
+pdf2image
 sentence-transformers
 supabase
 unstructured