Spaces: Luciferalive / goosev9
Luciferalive committed: Update app.py

app.py CHANGED
@@ -5,7 +5,6 @@ import numpy as np
 import pytesseract
 from PIL import Image
 from typing import List
-from docx import Document
 from sentence_transformers import SentenceTransformer
 from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -13,25 +12,12 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
 from groq import Groq
 import gradio as gr
 import requests
-from zipfile import ZipFile
 
 # Ensure the Tesseract OCR path is set correctly
 pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
 
 GROQ_API_KEY = "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"
 
-def extract_text_from_doc(doc_content):
-    """Extract text from DOC file content."""
-    try:
-        with ZipFile(io.BytesIO(doc_content)) as zip_file:
-            xml_content = zip_file.read('word/document.xml')
-            doc = Document(io.BytesIO(xml_content))
-        extracted_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
-        return extracted_text
-    except Exception as e:
-        print("Failed to extract text from DOC:", e)
-        return ""
-
 def preprocess_text(text):
     try:
         text = text.replace('\n', ' ').replace('\r', ' ')
@@ -44,14 +30,6 @@ def preprocess_text(text):
         print("Failed to preprocess text:", e)
         return ""
 
-def process_files(file_contents: List[bytes]):
-    all_text = ""
-    for file_content in file_contents:
-        extracted_text = extract_text_from_doc(file_content)
-        preprocessed_text = preprocess_text(extracted_text)
-        all_text += preprocessed_text + " "
-    return all_text
-
 def compute_cosine_similarity_scores(query, retrieved_docs):
     model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
     query_embedding = model.encode(query, convert_to_tensor=True)
@@ -60,26 +38,22 @@ def compute_cosine_similarity_scores(query, retrieved_docs):
     readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
     return readable_scores
 
-def ...
-    ...
-            print(f"Failed to download {file_name}: {e}")
-    return file_contents
-
-def create_vector_store(all_text):
+def fetch_text_file_from_huggingface_space():
+    url = "https://huggingface.co/spaces/Luciferalive/goosev9/blob/main/extracted_text.txt"
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        text_content = response.text
+        print("Successfully downloaded the text file")
+        return text_content
+    except Exception as e:
+        print(f"Failed to download the text file: {e}")
+        return ""
+
+def create_vector_store(text_content):
     embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    texts = text_splitter.split_text(all_text)
+    texts = text_splitter.split_text(text_content)
     if not texts:
         print("No text chunks created.")
         return None
@@ -102,17 +76,12 @@ def answer_query_with_similarity(query):
     try:
         vector_store = load_vector_store()
         if not vector_store:
-            ...
-            if not ...
-                print("No ...
-                return None
-
-            all_text = process_files(file_contents)
-            if not all_text.strip():
-                print("No text extracted from documents.")
+            text_content = fetch_text_file_from_huggingface_space()
+            if not text_content.strip():
+                print("No text content fetched.")
                 return None
 
-            vector_store = create_vector_store(all_text)
+            vector_store = create_vector_store(text_content)
             if not vector_store:
                 print("Failed to create Vector DB.")
                 return None