Luciferalive committed on
Commit
694ee68
·
verified ·
1 Parent(s): fbc8219

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -49
app.py CHANGED
@@ -5,7 +5,6 @@ import numpy as np
5
  import pytesseract
6
  from PIL import Image
7
  from typing import List
8
- from docx import Document
9
  from sentence_transformers import SentenceTransformer
10
  from langchain_community.vectorstores import Chroma
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -13,25 +12,12 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
13
  from groq import Groq
14
  import gradio as gr
15
  import requests
16
- from zipfile import ZipFile
17
 
18
  # Ensure the Tesseract OCR path is set correctly
19
  pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
20
 
21
# SECURITY(review): a live Groq API key is hardcoded and committed to a public
# repo — it must be treated as compromised. Rotate the key and load it from an
# environment variable / Space secret instead of embedding it in source.
GROQ_API_KEY = "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"
22
 
23
def extract_text_from_doc(doc_content):
    """Extract all paragraph text from a .docx file given as raw bytes.

    Args:
        doc_content: Complete binary content of a .docx file.

    Returns:
        Paragraph texts joined with newlines, or "" on any failure.
    """
    try:
        # BUG FIX: python-docx's Document() expects the whole .docx package
        # (which is itself a zip archive), not the inner word/document.xml
        # part. The original code unzipped the XML and fed it to Document(),
        # which fails because the XML is not a valid docx package.
        doc = Document(io.BytesIO(doc_content))
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as e:
        # Best-effort extraction: log and return "" so one corrupt file does
        # not abort processing of the remaining files.
        print("Failed to extract text from DOC:", e)
        return ""
-
35
  def preprocess_text(text):
36
  try:
37
  text = text.replace('\n', ' ').replace('\r', ' ')
@@ -44,14 +30,6 @@ def preprocess_text(text):
44
  print("Failed to preprocess text:", e)
45
  return ""
46
 
47
def process_files(file_contents: List[bytes]) -> str:
    """Extract and preprocess text from every document, concatenated together.

    Args:
        file_contents: Raw bytes of each .docx file to process.

    Returns:
        One string containing each file's preprocessed text, each piece
        followed by a single space (matching the original trailing-space
        behavior).
    """
    # Build pieces once and join, instead of repeated string `+=` in a loop
    # (which is worst-case quadratic in total text size).
    pieces = [
        preprocess_text(extract_text_from_doc(file_content))
        for file_content in file_contents
    ]
    return "".join(piece + " " for piece in pieces)
55
  def compute_cosine_similarity_scores(query, retrieved_docs):
56
  model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
57
  query_embedding = model.encode(query, convert_to_tensor=True)
@@ -60,26 +38,22 @@ def compute_cosine_similarity_scores(query, retrieved_docs):
60
  readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
61
  return readable_scores
62
 
63
def fetch_files_from_huggingface_space():
    """Download the corpus .docx files (2.docx .. 21.docx) from the Space.

    Returns:
        A list of the raw bytes of each file that downloaded successfully;
        failed downloads are logged and skipped.
    """
    # BUG FIX: use /resolve/ to get the raw file bytes. The old /blob/ URL
    # serves the Hugging Face HTML viewer page, so every "document" would
    # actually be HTML, not a .docx archive.
    base_url = "https://huggingface.co/spaces/Luciferalive/goosev9/resolve/main/"
    file_names = [f"{i}.docx" for i in range(2, 22)]

    file_contents = []
    for file_name in file_names:
        file_url = f"{base_url}{file_name}"
        try:
            # A timeout prevents one stalled download from hanging the app.
            response = requests.get(file_url, timeout=30)
            response.raise_for_status()
            file_contents.append(response.content)
            print(f"Successfully downloaded {file_name}")
        except Exception as e:
            # Best effort: keep going so one missing file doesn't lose all.
            print(f"Failed to download {file_name}: {e}")
    return file_contents
79
- def create_vector_store(all_text):
80
  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
81
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
82
- texts = text_splitter.split_text(all_text)
83
  if not texts:
84
  print("No text chunks created.")
85
  return None
@@ -102,17 +76,12 @@ def answer_query_with_similarity(query):
102
  try:
103
  vector_store = load_vector_store()
104
  if not vector_store:
105
- file_contents = fetch_files_from_huggingface_space()
106
- if not file_contents:
107
- print("No files fetched from Hugging Face Space.")
108
- return None
109
-
110
- all_text = process_files(file_contents)
111
- if not all_text.strip():
112
- print("No text extracted from documents.")
113
  return None
114
 
115
- vector_store = create_vector_store(all_text)
116
  if not vector_store:
117
  print("Failed to create Vector DB.")
118
  return None
 
5
  import pytesseract
6
  from PIL import Image
7
  from typing import List
 
8
  from sentence_transformers import SentenceTransformer
9
  from langchain_community.vectorstores import Chroma
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
12
  from groq import Groq
13
  import gradio as gr
14
  import requests
 
15
 
16
  # Ensure the Tesseract OCR path is set correctly
17
  pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
18
 
19
  GROQ_API_KEY = "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"
20
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def preprocess_text(text):
22
  try:
23
  text = text.replace('\n', ' ').replace('\r', ' ')
 
30
  print("Failed to preprocess text:", e)
31
  return ""
32
 
 
 
 
 
 
 
 
 
33
  def compute_cosine_similarity_scores(query, retrieved_docs):
34
  model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
35
  query_embedding = model.encode(query, convert_to_tensor=True)
 
38
  readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
39
  return readable_scores
40
 
41
def fetch_text_file_from_huggingface_space():
    """Download the pre-extracted corpus text file from the Space.

    Returns:
        The file's text content, or "" if the download fails.
    """
    # BUG FIX: use /resolve/ to fetch the raw file contents. The /blob/ URL
    # returns the Hugging Face HTML viewer page, so the old code would feed
    # HTML markup into the vector store instead of the extracted text.
    url = "https://huggingface.co/spaces/Luciferalive/goosev9/resolve/main/extracted_text.txt"
    try:
        # A timeout keeps one hung request from blocking the whole app.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        print("Successfully downloaded the text file")
        return response.text
    except Exception as e:
        # Callers treat "" as "nothing fetched" and bail out gracefully.
        print(f"Failed to download the text file: {e}")
        return ""
+
53
+ def create_vector_store(text_content):
 
 
 
 
54
  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
55
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
56
+ texts = text_splitter.split_text(text_content)
57
  if not texts:
58
  print("No text chunks created.")
59
  return None
 
76
  try:
77
  vector_store = load_vector_store()
78
  if not vector_store:
79
+ text_content = fetch_text_file_from_huggingface_space()
80
+ if not text_content.strip():
81
+ print("No text content fetched.")
 
 
 
 
 
82
  return None
83
 
84
+ vector_store = create_vector_store(text_content)
85
  if not vector_store:
86
  print("Failed to create Vector DB.")
87
  return None