8bitnand committed
Commit: 8b6196b
1 Parent(s): 871255a

Multi processing for reading urls

Files changed (5)
  1. README.md +1 -1
  2. __init__.py +1 -1
  3. app.py +3 -3
  4. model.py +5 -5
  5. google.py → search.py +30 -11
README.md CHANGED
@@ -5,4 +5,4 @@ app_file: app.py
 licese: mit
 ---
 
-install nltk.download("punkt")s
+install nltk.download("punkt")
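
The README line above is shorthand: NLTK's "punkt" tokenizer data has to be downloaded once before the sentence tokenization that the repo's nltk import suggests will work. A minimal one-time setup sketch:

    # One-time setup: fetch the "punkt" data that nltk's sentence
    # tokenizer loads at runtime.
    import nltk

    nltk.download("punkt")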
__init__.py CHANGED
@@ -1 +1 @@
-from google import GoogleSearch, Document, SemanticSearch
+import search
app.py CHANGED
@@ -1,6 +1,5 @@
-import sys
+from search import SemanticSearch, GoogleSearch, Document
 import streamlit as st
-from google import SemanticSearch, GoogleSearch, Document
 from model import RAGModel, load_configs
 
 
@@ -38,7 +37,7 @@ if prompt := st.chat_input("Search Here insetad of Google"):
     st.session_state.messages.append({"role": "user", "content": prompt})
 
     search(prompt)
-    s = SemanticSearch(
+    s, u = SemanticSearch(
         prompt,
         st.session_state.doc,
         configs["model"]["embeding_model"],
@@ -51,3 +50,4 @@ if prompt := st.chat_input("Search Here insetad of Google"):
     st.markdown(response)
 
     st.session_state.messages.append({"role": "assistant", "content": response})
+
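
For context on the two-name unpacking above: the companion change in search.py (further down) makes semantic_search return the matched chunks together with their source URLs, instead of (chunk, score) pairs. A minimal sketch of that contract; the sample chunks, URLs, and the "cpu" device are invented for illustration, not repo content:

    # Hedged sketch of the new (top_chunks, urls) contract from search.py.
    from search import SemanticSearch

    doc_chunks = (
        ["LLMs are large language models.", "Attention compares token pairs."],
        ["https://example.com/a", "https://example.com/b"],
    )
    s = SemanticSearch(doc_chunks, "all-mpnet-base-v2", "cpu")  # device assumed
    top_chunks, urls = s.semantic_search(query="what is LLM", k=1)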
model.py CHANGED
@@ -1,4 +1,4 @@
-from google import SemanticSearch, GoogleSearch, Document
+from search import SemanticSearch, GoogleSearch, Document
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers import BitsAndBytesConfig
 from transformers.utils import is_flash_attn_2_available
@@ -71,8 +71,8 @@ if __name__ == "__main__":
     # g = GoogleSearch(query)
     # data = g.all_page_data
     # d = Document(data, 512)
-    # s = SemanticSearch( "all-mpnet-base-v2", "mps")
+    # s, u = SemanticSearch( "all-mpnet-base-v2", "mps")
     # topk = s.semantic_search(query=query, k=32)
-    r = RAGModel(configs)
-    output = r.answer_query(query=query, topk_items=[""])
-    print(output)
+    # r = RAGModel(configs)
+    # output = r.answer_query(query=query, topk_items=[""])
+    # print(output)
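
The lines commented out above were model.py's standalone driver. For reference, running the RAG step by itself looked roughly like the sketch below; the argument to load_configs is a guess, since this diff never shows how configs is built:

    # Hypothetical standalone driver mirroring the now-commented lines.
    from model import RAGModel, load_configs

    configs = load_configs("config.yml")  # hypothetical config path
    r = RAGModel(configs)
    output = r.answer_query(query="what is LLM", topk_items=[""])
    print(output)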
google.py → search.py RENAMED
@@ -5,6 +5,7 @@ import nltk
 import torch
 from typing import Union
 from sentence_transformers import SentenceTransformer, util
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 class GoogleSearch:
@@ -33,10 +34,12 @@ class GoogleSearch:
             for link in sublist
             if len(link) > 0
         ]
+        print(links)
         return links
 
     def read_url_page(self, url: str) -> str:
 
+        print(url)
         response = requests.get(url, headers=self.headers)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, "html.parser")
@@ -55,11 +58,25 @@ class GoogleSearch:
     def all_pages(self) -> list[tuple[str, str]]:
 
         data: list[tuple[str, str]] = []
-        for url in self.links:
-            try:
-                data.append((url, self.read_url_page(url)))
-            except requests.exceptions.HTTPError as e:
-                print(e)
+        with ThreadPoolExecutor(max_workers=4) as executor:
+
+            future_to_url = {
+                executor.submit(self.read_url_page, url): url for url in self.links
+            }
+            for future in as_completed(future_to_url):
+                url = future_to_url[future]
+                try:
+                    output = future.result()
+                    data.append((url, output))
+
+                except requests.exceptions.HTTPError as e:
+                    print(e)
+
+        # for url in self.links:
+        #     try:
+        #         data.append((url, self.read_url_page(url)))
+        #     except requests.exceptions.HTTPError as e:
+        #         print(e)
 
         return data
 
@@ -111,7 +128,7 @@ class SemanticSearch:
     def __init__(
         self, doc_chunks: tuple[list, list], model_path: str, device: str
     ) -> None:
-        query = query
+
         self.doc_chunks, self.urls = doc_chunks
         self.st = SentenceTransformer(
             model_path,
@@ -125,7 +142,7 @@ class SemanticSearch:
         scores = util.dot_score(a=query_embeding, b=doc_embeding)[0]
 
         top_k = torch.topk(scores, k=k)[1].cpu().tolist()
-        return [(self.doc_chunks[i], scores[i]) for i in top_k]
+        return [self.doc_chunks[i] for i in top_k], self.urls
 
     def get_embeding(self, text: Union[list[str], str]):
         en = self.st.encode(text)
@@ -137,10 +154,12 @@ if __name__ == "__main__":
     query = "what is LLM"
     g = GoogleSearch(query)
     data = g.all_page_data
-    d = Document(data, 333)
-
-    s = SemanticSearch("all-mpnet-base-v2", "mps")
-    print(len(s.semantic_search(query, k=64)))
+    # d = Document(data, 333)
+    # doc_chunks = d.doc()
+    # s = SemanticSearch(doc_chunks, "all-mpnet-base-v2", "mps")
+    # topk, u = s.semantic_search(query, k=64)
+    # print(len(topk))
+    # print(topk, u)
 
     # g = GoogleSearch("what is LLM")
     # d = Document(g.all_page_data)
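
The core of this commit is the switch in all_pages from a sequential fetch loop to a ThreadPoolExecutor. Below is a self-contained sketch of the same pattern with stand-in URLs and a stand-in fetch_page helper; the repo's version goes through GoogleSearch.read_url_page and catches only HTTPError, so the broader RequestException here is a deliberate widening, not repo behavior:

    # Concurrent page fetching with a 4-worker thread pool, as in the
    # commit: submit every URL up front, then collect results in
    # whatever order they finish, so one slow page blocks nothing.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    import requests

    def fetch_page(url: str) -> str:
        # Stand-in for GoogleSearch.read_url_page: GET the page and
        # raise on HTTP errors so the caller can skip bad links.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text

    urls = ["https://example.com", "https://example.org"]
    data: list[tuple[str, str]] = []

    with ThreadPoolExecutor(max_workers=4) as executor:
        future_to_url = {executor.submit(fetch_page, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data.append((url, future.result()))
            except requests.exceptions.RequestException as e:
                print(f"{url} failed: {e}")

    print(f"fetched {len(data)} pages")

Note that despite the commit title, this is threading rather than multiprocessing; for I/O-bound page downloads threads are the right fit, since the workers spend their time waiting on the network and the GIL is not a bottleneck.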