DEBUG: base64 -> plain text
Changed files:
- app.py (+1 -1)
- functions.py (+3 -6)
app.py
@@ -329,7 +329,7 @@ async def loadText(addTextConfig: AddText):
         "output": text,
         "source": "Text"
     }
-    numTokens = len(" ".join([
+    numTokens = len(" ".join([text[x] for x in text]).translate(str.maketrans('', '', string.punctuation)).split(" "))
     dct = json.dumps(dct, indent=1).encode("utf-8")
     fileName = createDataSourceName(sourceName="Text")
     response = supabase.storage.from_("ConversAI").upload(file=dct, path=f"{fileName}_data.json")
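The replacement line in loadText joins every value in the `text` dict, strips punctuation, and counts the space-separated pieces; since it uses string.punctuation, it relies on `import string` being present in app.py. A minimal sketch of that counting logic, assuming `text` maps page or section keys to strings (the sample data here is illustrative):

import string

# Sample shaped like the `text` dict in loadText: key -> extracted string.
text = {1: "Hello, world!", 2: "Plain text; no base64."}

# Join all values, drop punctuation, then count space-separated tokens,
# mirroring the new numTokens line in the diff.
joined = " ".join([text[x] for x in text])
stripped = joined.translate(str.maketrans('', '', string.punctuation))
numTokens = len(stripped.split(" "))
print(numTokens)  # 6

Note that splitting on a literal space keeps empty strings produced by runs of whitespace, so this is an approximate word count rather than a tokenizer-accurate one.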
functions.py
@@ -297,7 +297,7 @@ def getTextFromImagePDF(pdfBytes):
         return cleanText(text = text)
 
     allImages = convert_from_bytes(pdfBytes)
-    texts = [
+    texts = [getText(image) for image in allImages]
     return {x + 1: y for x, y in enumerate(texts)}
 
 
@@ -313,7 +313,6 @@ def getTranscript(urls: str):
         except:
             doc = ""
         texts.append(doc)
-    texts = [base64.b64encode(text.encode("utf-8")).decode("utf-8") for text in texts]
     return {x: y for x, y in zip(urls, texts)}
 
 
@@ -331,8 +330,7 @@ def analyzeData(query, dataframe):
 
 
 def extractTextFromPage(page):
-    text = cleanText(text = page.get_text())
-    return base64.b64encode(text.encode("utf-8")).decode("utf-8")
+    return cleanText(text = page.get_text())
 
 
 def extractTextFromPdf(pdf_path):
@@ -349,8 +347,7 @@ def extractTextFromUrl(url):
     response.raise_for_status()
     html = response.text
     soup = BeautifulSoup(html, 'lxml')
-    text = cleanText(text = soup.get_text(separator=' ', strip=True))
-    return base64.b64encode(text.encode("utf-8")).decode("utf-8")
+    return cleanText(text = soup.get_text(separator=' ', strip=True))
 
 
 def extractTextFromUrlList(urls):
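Every functions.py hunk removes a base64 encoding step and returns the cleaned text directly, matching the commit title. A hedged before/after sketch of that pattern, with a stand-in cleanText that just normalizes whitespace (the real helper is not shown in this diff, and extract_old/extract_new are illustrative names):

import base64

def cleanText(text):
    # Stand-in for the repo's cleanText helper (not shown in this diff);
    # assume it normalizes whitespace.
    return " ".join(text.split())

def extract_old(raw):
    # Old behavior: extractors base64-encoded the cleaned text...
    text = cleanText(text=raw)
    return base64.b64encode(text.encode("utf-8")).decode("utf-8")

def extract_new(raw):
    # ...new behavior: return the cleaned text as-is.
    return cleanText(text=raw)

raw = "  some   extracted page text  "
print(extract_old(raw))  # c29tZSBleHRyYWN0ZWQgcGFnZSB0ZXh0
print(extract_new(raw))  # some extracted page text

Any consumer that previously base64-decoded these values (for example, callers of getTranscript or extractTextFromUrlList) will now receive plain strings.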