DEBUG: base64 -> plain text
Changed files:
- app.py (+1 -1)
- functions.py (+3 -6)
app.py
@@ -329,7 +329,7 @@ async def loadText(addTextConfig: AddText):
         "output": text,
         "source": "Text"
     }
-    numTokens = len(" ".join([
+    numTokens = len(" ".join([text[x] for x in text]).translate(str.maketrans('', '', string.punctuation)).split(" "))
     dct = json.dumps(dct, indent=1).encode("utf-8")
     fileName = createDataSourceName(sourceName="Text")
     response = supabase.storage.from_("ConversAI").upload(file=dct, path=f"{fileName}_data.json")
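The replacement line in loadText joins every value in the `text` dict, strips punctuation, and counts the space-separated pieces; since it uses string.punctuation, it relies on `import string` being present in app.py. A minimal sketch of that counting logic, assuming `text` maps page or section keys to strings (the sample data here is illustrative):

import string

# Sample shaped like the `text` dict in loadText: key -> extracted string.
text = {1: "Hello, world!", 2: "Plain text; no base64."}

# Join all values, drop punctuation, then count space-separated tokens,
# mirroring the new numTokens line in the diff.
joined = " ".join([text[x] for x in text])
stripped = joined.translate(str.maketrans('', '', string.punctuation))
numTokens = len(stripped.split(" "))
print(numTokens)  # 6

Note that splitting on a literal space keeps empty strings produced by runs of whitespace, so this is an approximate word count rather than a tokenizer-accurate one.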
functions.py
@@ -297,7 +297,7 @@ def getTextFromImagePDF(pdfBytes):
         return cleanText(text = text)
 
     allImages = convert_from_bytes(pdfBytes)
-    texts = [
+    texts = [getText(image) for image in allImages]
     return {x + 1: y for x, y in enumerate(texts)}
 
 
@@ -313,7 +313,6 @@ def getTranscript(urls: str):
         except:
             doc = ""
         texts.append(doc)
-    texts = [base64.b64encode(text.encode("utf-8")).decode("utf-8") for text in texts]
     return {x: y for x, y in zip(urls, texts)}
 
 
@@ -331,8 +330,7 @@ def analyzeData(query, dataframe):
 
 
 def extractTextFromPage(page):
-    text = cleanText(text = page.get_text())
-    return base64.b64encode(text.encode("utf-8")).decode("utf-8")
+    return cleanText(text = page.get_text())
 
 
 def extractTextFromPdf(pdf_path):
@@ -349,8 +347,7 @@ def extractTextFromUrl(url):
     response.raise_for_status()
     html = response.text
     soup = BeautifulSoup(html, 'lxml')
-    text = cleanText(text = soup.get_text(separator=' ', strip=True))
-    return base64.b64encode(text.encode("utf-8")).decode("utf-8")
+    return cleanText(text = soup.get_text(separator=' ', strip=True))
 
 
 def extractTextFromUrlList(urls):
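Every functions.py hunk removes a base64 encoding step and returns the cleaned text directly, matching the commit title. A hedged before/after sketch of that pattern, with a stand-in cleanText that just normalizes whitespace (the real helper is not shown in this diff, and extract_old/extract_new are illustrative names):

import base64

def cleanText(text):
    # Stand-in for the repo's cleanText helper (not shown in this diff);
    # assume it normalizes whitespace.
    return " ".join(text.split())

def extract_old(raw):
    # Old behavior: extractors base64-encoded the cleaned text...
    text = cleanText(text=raw)
    return base64.b64encode(text.encode("utf-8")).decode("utf-8")

def extract_new(raw):
    # ...new behavior: return the cleaned text as-is.
    return cleanText(text=raw)

raw = "  some   extracted page text  "
print(extract_old(raw))  # c29tZSBleHRyYWN0ZWQgcGFnZSB0ZXh0
print(extract_new(raw))  # some extracted page text

Any consumer that previously base64-decoded these values (for example, callers of getTranscript or extractTextFromUrlList) will now receive plain strings.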