8bitnand committed · Commit ca9a177 · 1 Parent: b229aa1
initial commit - basic chat model UI
Files changed:
- README.md +1 -13
- __pycache__/google.cpython-39.pyc +0 -0
- app.py +24 -0
- google.py +144 -0
- model.py +15 -0
- rag.configs.yml +8 -0
- requirments.txt +1 -0
README.md CHANGED
@@ -1,13 +1 @@
----
-title: GoogleSearchWithLLM
-emoji: 📚
-colorFrom: yellow
-colorTo: indigo
-sdk: streamlit
-sdk_version: 1.33.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Before starting the app, run nltk.download("punkt") once to download the NLTK punkt tokenizer data.
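The new README line refers to NLTK's punkt tokenizer data, which google.py relies on. A one-off snippet like this (run once, for example in a Python shell) fetches it:

import nltk

# Fetch the sentence tokenizer data used by nltk.tokenize.sent_tokenize in google.py.
nltk.download("punkt")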
__pycache__/google.cpython-39.pyc ADDED
Binary file (5.39 kB).
app.py ADDED
@@ -0,0 +1,24 @@
import streamlit as st

st.title("LLM powered Google search")

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation history on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])


if prompt := st.chat_input("Search here instead of Google"):
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    response = (
        f"Ans - {prompt}"  # TODO: add answer to the prompt by calling the answer method
    )

    with st.chat_message("assistant"):
        st.markdown(response)

    st.session_state.messages.append({"role": "assistant", "content": response})
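The TODO above is where retrieval would plug in. As a sketch of one possible wiring (hypothetical glue code, not part of this commit: it reuses the GoogleSearch, Document, and SemanticSearch classes added below in google.py and simply returns the top retrieved chunks as the response):

from google import GoogleSearch, Document, SemanticSearch

def answer(prompt: str, k: int = 3) -> str:
    # Scrape results for the prompt, chunk the pages, and return the k best-matching chunks.
    g = GoogleSearch(prompt)
    d = Document(g.all_page_data, min_char_len=333)
    s = SemanticSearch(prompt, d, g, "all-mpnet-base-v2", "cpu")
    hits = s.semantic_search(prompt, k=k)
    return "\n\n".join(chunk for chunk, _score in hits)

# In app.py, the placeholder would then become:
# response = answer(prompt)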
google.py ADDED
@@ -0,0 +1,144 @@
from bs4 import BeautifulSoup
import urllib.parse
import requests
import nltk
import torch
from typing import Union
from sentence_transformers import SentenceTransformer, util


class GoogleSearch:
    def __init__(self, query: str) -> None:
        self.query = query
        escaped_query = urllib.parse.quote_plus(query)
        self.URL = f"https://www.google.com/search?q={escaped_query}"

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36"
        }
        self.links = self.get_initial_links()
        self.all_page_data = self.all_pages()

    def clean_urls(self, anchors: list) -> list[str]:
        # Keep only href fragments that carry a real target ("url=http...") and strip the prefix.
        links: list[list[str]] = []
        for a in anchors:
            links.append(
                list(filter(lambda l: l.startswith("url=http"), a["href"].split("&")))
            )

        return [
            link.split("url=")[-1]
            for sublist in links
            for link in sublist
            if len(link) > 0
        ]

    def read_url_page(self, url: str) -> str:
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text(strip=True)

    def get_initial_links(self) -> list[str]:
        """Scrape the Google results page for the query (keyword-based search)."""
        response = requests.get(self.URL, headers=self.headers)
        soup = BeautifulSoup(response.text, "html.parser")
        anchors = soup.find_all("a", href=True)
        return self.clean_urls(anchors)

    def all_pages(self) -> list[tuple[str, str]]:
        data: list[tuple[str, str]] = []
        for url in self.links:
            try:
                data.append((url, self.read_url_page(url)))
            except requests.exceptions.HTTPError as e:
                print(e)

        return data


class Document:
    def __init__(self, data: list[tuple[str, str]], min_char_len: int) -> None:
        """
        data : list[tuple[str, str]]
            url and page data
        """
        self.data = data
        self.min_char_len = min_char_len

    def make_min_len_chunk(self):
        raise NotImplementedError

    def chunk_page(
        self,
        page_text: str,
    ) -> list[str]:
        # Accumulate sentences until the buffer exceeds min_char_len, then emit it as one chunk.
        # A trailing buffer shorter than min_char_len is discarded.
        min_len_chunks: list[str] = []
        sentences = nltk.tokenize.sent_tokenize(page_text)
        buffer: str = ""
        for sent in sentences:
            buffer = f"{buffer} {sent}".strip()
            if len(buffer) > self.min_char_len:
                min_len_chunks.append(buffer)
                buffer = ""
        return min_len_chunks

    def doc(self) -> tuple[list[str], list[str]]:
        chunked_data: list[list[str]] = []
        urls: list[str] = []
        for url, dataitem in self.data:
            chunks = self.chunk_page(dataitem)
            chunked_data.append(chunks)
            urls.append(url)

        flat_chunks = [chunk for sublist in chunked_data for chunk in sublist]
        return flat_chunks, urls


class SemanticSearch:
    def __init__(
        self, query: str, d: Document, g: GoogleSearch, model_path: str, device: str
    ) -> None:
        self.query = query
        self.doc_chunks, self.urls = d.doc()
        self.st = SentenceTransformer(
            model_path,
            device=device,
        )

    def semantic_search(self, query: str, k: int = 10):
        query_embedding = self.get_embedding(query)
        doc_embedding = self.get_embedding(self.doc_chunks)
        scores = util.dot_score(a=query_embedding, b=doc_embedding)[0]

        top_k = torch.topk(scores, k=k)[1].cpu().tolist()
        return [(self.doc_chunks[i], scores[i]) for i in top_k]

    def get_embedding(self, text: Union[list[str], str]):
        return self.st.encode(text)


if __name__ == "__main__":
    query = "what is LLM"
    g = GoogleSearch(query)
    data = g.all_page_data
    d = Document(data, 333)
    s = SemanticSearch(query, d, g, "all-mpnet-base-v2", "mps")
    print(len(s.semantic_search(query, k=64)))

    # g = GoogleSearch("what is LLM")
    # d = Document(g.all_page_data)
    # print(len(d.doc()[0]))
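To make the chunking rule concrete, here is a tiny illustrative run of Document.chunk_page (the input text and the small min_char_len are made up for the example; it needs the NLTK punkt data mentioned in the README):

from google import Document

d = Document(data=[], min_char_len=40)
chunks = d.chunk_page("First sentence here. A second one follows it. A short tail.")
# The first two sentences together exceed 40 characters and form one chunk;
# the trailing short sentence stays in the buffer and is dropped.
print(chunks)  # ['First sentence here. A second one follows it.']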
model.py ADDED
@@ -0,0 +1,15 @@
from google import SemanticSearch
from transformers import AutoTokenizer, AutoModel


class RAGModel:
    def __init__(self, configs) -> None:
        self.configs = configs
        model_url = configs["common"]["generation_model"]  # key defined in rag.configs.yml
        self.model = AutoModel.from_pretrained(model_url)
        self.tokenizer = AutoTokenizer.from_pretrained(model_url)

    def create_prompt(self, topk_items: list[str]):
        raise NotImplementedError  # TODO: build a prompt from the retrieved chunks

    def answer_query(self, query: str, context: list[str]):
        raise NotImplementedError  # TODO: generate an answer for the query from the context
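Both methods are left as stubs in this commit. As a rough sketch of one possible completion (an assumption, not the author's method: the prompt template is invented here, and a causal-LM head is loaded via AutoModelForCausalLM because the bare AutoModel has no generate()):

from transformers import AutoModelForCausalLM, AutoTokenizer

def create_prompt(query: str, topk_items: list[str]) -> str:
    # Join the retrieved chunks into a context block placed ahead of the question.
    context = "\n".join(f"- {item}" for item in topk_items)
    return (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\nAnswer:"
    )

def answer_query(model, tokenizer, query: str, context: list[str], max_new_tokens: int = 256) -> str:
    prompt = create_prompt(query, context)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Decode only the newly generated tokens, not the prompt.
    return tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

# Usage sketch; "top_chunks" stands for the chunks returned by SemanticSearch, and
# meta-llama/Llama-2-7b is a gated checkpoint on the Hub:
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b")
# print(answer_query(model, tokenizer, "what is LLM", top_chunks))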
rag.configs.yml ADDED
@@ -0,0 +1,8 @@
document:
  min_char_length: 333

common:
  embedding_model: all-mpnet-base-v2
  generation_model: meta-llama/Llama-2-7b
  device: cpu
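A minimal sketch of how this file could be loaded and handed to RAGModel (it assumes PyYAML, which requirments.txt does not list yet):

import yaml

from model import RAGModel

with open("rag.configs.yml") as f:
    configs = yaml.safe_load(f)

# RAGModel reads configs["common"]["generation_model"].
rag = RAGModel(configs)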
requirments.txt ADDED
@@ -0,0 +1 @@
beautifulsoup4==4.12.3
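Note that the new modules also import streamlit, requests, nltk, torch, sentence-transformers, and transformers, none of which are listed here yet; a fuller version of this file might look like the following (versions left unpinned, since the commit does not specify any):

beautifulsoup4==4.12.3
streamlit
requests
nltk
torch
sentence-transformers
transformers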