8bitnand commited on
Commit
ca9a177
1 Parent(s): b229aa1

initial commit - basic chat model UI

Browse files
Files changed (7) hide show
  1. README.md +1 -13
  2. __pycache__/google.cpython-39.pyc +0 -0
  3. app.py +24 -0
  4. google.py +144 -0
  5. model.py +15 -0
  6. rag.configs.yml +8 -0
  7. requirments.txt +1 -0
README.md CHANGED
@@ -1,13 +1 @@
1
- ---
2
- title: GoogleSearchWithLLM
3
- emoji: 📚
4
- colorFrom: yellow
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.33.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ Setup: after installing the requirements, run `nltk.download("punkt")` once — the chunker needs the NLTK Punkt sentence tokenizer.
 
 
 
 
 
 
 
 
 
 
 
 
__pycache__/google.cpython-39.pyc ADDED
Binary file (5.39 kB). View file
 
app.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit chat UI for the LLM-powered Google search app."""

import streamlit as st

st.title("LLM powered Google search")

# Persist the chat history across Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay prior turns so the conversation survives each rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])


if prompt := st.chat_input("Search here instead of Google"):
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # TODO: replace the echo with a real answer from the RAG pipeline.
    response = f"Ans - {prompt}"

    with st.chat_message("assistant"):
        st.markdown(response)

    st.session_state.messages.append({"role": "assistant", "content": response})
google.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import urllib
import urllib.parse
from typing import Union

import nltk
import requests
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
8
+
9
+
class GoogleSearch:
    """Scrape a Google results page for a query and fetch the text of each hit.

    NOTE: constructing an instance performs network I/O immediately
    (the search request plus one GET per result link).
    """

    def __init__(self, query: str) -> None:
        self.query = query
        escaped_query = urllib.parse.quote_plus(query)
        self.URL = f"https://www.google.com/search?q={escaped_query}"

        # Desktop browser UA: Google serves a parseable HTML page to browsers.
        # BUGFIX: dropped the stray chained assignment
        # (`self.headers = headers = {...}` left a useless local).
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36"
        }
        self.links = self.get_initial_links()
        self.all_page_data = self.all_pages()

    def clean_urls(self, anchors: list) -> list[str]:
        """Extract the target URLs from Google redirect anchors.

        Google result hrefs look like ``/url?q=...&url=http...&sa=...``;
        keep only the ``url=http...`` component of each href and strip the
        ``url=`` prefix.  ``anchors`` are bs4 Tag objects (accessed
        mapping-style via ``a["href"]``), not plain strings.
        """
        nested: list[list[str]] = []
        for a in anchors:
            nested.append(
                list(filter(lambda part: part.startswith("url=http"), a["href"].split("&")))
            )

        return [
            link.split("url=")[-1]
            for sublist in nested
            for link in sublist
            if len(link) > 0
        ]

    def read_url_page(self, url: str) -> str:
        """Fetch ``url`` and return its visible text.

        Raises requests.exceptions.HTTPError on a non-2xx response.
        """
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text(strip=True)

    def get_initial_links(self) -> list[str]:
        """Scrape the Google results page for the query and return result URLs."""
        # NOTE(review): no raise_for_status here — a blocked/captcha response
        # would silently yield an empty link list; confirm that is intended.
        response = requests.get(self.URL, headers=self.headers)
        soup = BeautifulSoup(response.text, "html.parser")
        anchors = soup.find_all("a", href=True)
        return self.clean_urls(anchors)

    def all_pages(self) -> list[tuple[str, str]]:
        """Download every result page; return (url, page_text) pairs.

        Pages that answer with an HTTP error are skipped (best-effort).
        """
        data: list[tuple[str, str]] = []
        for url in self.links:
            try:
                data.append((url, self.read_url_page(url)))
            except requests.exceptions.HTTPError as e:
                print(e)

        return data
65
+
66
+
class Document:
    """Split scraped page text into sentence chunks of >= ``min_char_len`` chars."""

    def __init__(self, data: list[tuple[str, str]], min_char_len: int) -> None:
        """
        data : list[tuple[str, str]]
            (url, page_text) pairs, e.g. ``GoogleSearch.all_page_data``.
        min_char_len : int
            Minimum character length a chunk must exceed before it is emitted.
        """
        self.data = data
        self.min_char_len = min_char_len

    def make_min_len_chunk(self):
        # Placeholder for a future chunking strategy.
        raise NotImplementedError

    def chunk_page(
        self,
        page_text: str,
    ) -> list[str]:
        """Greedily pack whole sentences into chunks longer than min_char_len."""
        min_len_chunks: list[str] = []
        sentence: str = ""
        for sent in nltk.tokenize.sent_tokenize(page_text):
            # BUGFIX: accumulate BEFORE testing the length, so the sentence
            # that crosses the threshold is kept instead of being dropped.
            sentence += sent
            if len(sentence) > self.min_char_len:
                min_len_chunks.append(sentence)
                sentence = ""
        # BUGFIX: keep the trailing partial chunk instead of discarding it.
        if sentence:
            min_len_chunks.append(sentence)
        return min_len_chunks

    def doc(self) -> tuple[list[str], list[str]]:
        """Chunk every page; return (flattened chunk list, source url list)."""
        per_page_chunks: list[list[str]] = []
        urls: list[str] = []
        for url, page_text in self.data:
            per_page_chunks.append(self.chunk_page(page_text))
            urls.append(url)

        chunked_data = [chunk for sublist in per_page_chunks for chunk in sublist]
        # BUGFIX: return the collected url list, not the last loop variable.
        return chunked_data, urls
107
+
108
+
class SemanticSearch:
    """Rank document chunks against a query with sentence-transformer embeddings."""

    def __init__(
        self, query: str, d: Document, g: GoogleSearch, model_path: str, device: str
    ) -> None:
        # BUGFIX: the original wrote ``query = query`` (a no-op local
        # rebinding) and never stored the query on the instance.
        self.query = query
        self.doc_chunks, self.urls = d.doc()
        # BUGFIX: ``device`` must be passed by keyword — positionally it
        # lands in SentenceTransformer's ``modules`` parameter.
        self.st = SentenceTransformer(model_path, device=device)

    def semanti_search(self, query: str, k: int = 10):
        """Return the top-``k`` (chunk, score) pairs by dot-product similarity."""
        query_embeding = self.get_embeding(query)
        doc_embeding = self.get_embeding(self.doc_chunks)
        scores = util.dot_score(a=query_embeding, b=doc_embeding)[0]

        top_k = torch.topk(scores, k=k)[1].cpu().tolist()
        return [(self.doc_chunks[i], scores[i]) for i in top_k]

    def get_embeding(self, text: Union[list[str], str]):
        """Encode a single string or a list of strings into embeddings."""
        return self.st.encode(text)
131
+
132
+
if __name__ == "__main__":
    # Smoke test: scrape Google, chunk the pages, then rank chunks
    # semantically and report how many results came back.
    search_query = "what is LLM"
    searcher = GoogleSearch(search_query)
    documents = Document(searcher.all_page_data, 333)
    ranker = SemanticSearch(
        search_query, documents, searcher, "all-mpnet-base-v2", "mps"
    )
    print(len(ranker.semanti_search(search_query, k=64)))
model.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from google import SemanticSearch
2
+ from transformers import AutoTokenizer, AutoModel
3
+
4
+
class RAGModel:
    """Generation side of the RAG pipeline: wraps an HF model + tokenizer."""

    def __init__(self, configs) -> None:
        self.configs = configs
        # NOTE(review): rag.configs.yml nests ``genration_model`` under
        # ``common:``, not ``RAG`` — confirm this key path against the loader.
        model_url = configs["RAG"]["genration_model"]
        self.model = AutoModel.from_pretrained(model_url)
        self.tokenizer = AutoTokenizer.from_pretrained(model_url)

    def create_propmt(self, topk_items: list[str]):
        """Build a generation prompt from the top-k retrieved chunks.

        BUGFIX: the original ``def`` had no body at all, which makes the
        whole module a SyntaxError; stub it out explicitly instead.
        """
        # TODO: implement prompt templating for the generation model.
        raise NotImplementedError

    def answer_query(self, query: str, context: list[str]):
        """Answer ``query`` grounded in the retrieved ``context`` chunks.

        BUGFIX: the original ``def`` had no body at all, which makes the
        whole module a SyntaxError; stub it out explicitly instead.
        """
        # TODO: tokenize the prompt, run self.model, decode the answer.
        raise NotImplementedError
rag.configs.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ document:
2
+ min_char_length: 333
3
+
4
+ common:
5
+ embeding_model: all-mpnet-base-v2
6
+ genration_model: meta-llama/Llama-2-7b
7
+ device: cpu
8
+
requirments.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ beautifulsoup4==4.12.3