8bitnand committed · Commit ca9a177 · 1 Parent: b229aa1
initial commit - basic chat model UI
Files changed:
- README.md +1 -13
- __pycache__/google.cpython-39.pyc +0 -0
- app.py +24 -0
- google.py +144 -0
- model.py +15 -0
- rag.configs.yml +8 -0
- requirments.txt +1 -0
README.md CHANGED
@@ -1,13 +1 @@
----
-title: GoogleSearchWithLLM
-emoji: 📚
-colorFrom: yellow
-colorTo: indigo
-sdk: streamlit
-sdk_version: 1.33.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Before starting the app, run nltk.download("punkt") once to download the NLTK punkt tokenizer data.
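The new README line refers to NLTK's punkt tokenizer data, which google.py relies on. A one-off snippet like this (run once, for example in a Python shell) fetches it:

import nltk

# Fetch the sentence tokenizer data used by nltk.tokenize.sent_tokenize in google.py.
nltk.download("punkt")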
__pycache__/google.cpython-39.pyc ADDED
Binary file (5.39 kB).
app.py ADDED
@@ -0,0 +1,24 @@
import streamlit as st

st.title("LLM powered Google search")

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation history on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])


if prompt := st.chat_input("Search here instead of Google"):
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    response = (
        f"Ans - {prompt}"  # TODO: add answer to the prompt by calling the answer method
    )

    with st.chat_message("assistant"):
        st.markdown(response)

    st.session_state.messages.append({"role": "assistant", "content": response})
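The TODO above is where retrieval would plug in. As a sketch of one possible wiring (hypothetical glue code, not part of this commit: it reuses the GoogleSearch, Document, and SemanticSearch classes added below in google.py and simply returns the top retrieved chunks as the response):

from google import GoogleSearch, Document, SemanticSearch

def answer(prompt: str, k: int = 3) -> str:
    # Scrape results for the prompt, chunk the pages, and return the k best-matching chunks.
    g = GoogleSearch(prompt)
    d = Document(g.all_page_data, min_char_len=333)
    s = SemanticSearch(prompt, d, g, "all-mpnet-base-v2", "cpu")
    hits = s.semantic_search(prompt, k=k)
    return "\n\n".join(chunk for chunk, _score in hits)

# In app.py, the placeholder would then become:
# response = answer(prompt)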
google.py ADDED
@@ -0,0 +1,144 @@
from bs4 import BeautifulSoup
import urllib.parse
import requests
import nltk
import torch
from typing import Union
from sentence_transformers import SentenceTransformer, util


class GoogleSearch:
    def __init__(self, query: str) -> None:
        self.query = query
        escaped_query = urllib.parse.quote_plus(query)
        self.URL = f"https://www.google.com/search?q={escaped_query}"

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36"
        }
        self.links = self.get_initial_links()
        self.all_page_data = self.all_pages()

    def clean_urls(self, anchors: list) -> list[str]:
        # Keep only href fragments that carry a real target ("url=http...") and strip the prefix.
        links: list[list[str]] = []
        for a in anchors:
            links.append(
                list(filter(lambda l: l.startswith("url=http"), a["href"].split("&")))
            )

        return [
            link.split("url=")[-1]
            for sublist in links
            for link in sublist
            if len(link) > 0
        ]

    def read_url_page(self, url: str) -> str:
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text(strip=True)

    def get_initial_links(self) -> list[str]:
        """Scrape the Google results page for the query (keyword-based search)."""
        response = requests.get(self.URL, headers=self.headers)
        soup = BeautifulSoup(response.text, "html.parser")
        anchors = soup.find_all("a", href=True)
        return self.clean_urls(anchors)

    def all_pages(self) -> list[tuple[str, str]]:
        data: list[tuple[str, str]] = []
        for url in self.links:
            try:
                data.append((url, self.read_url_page(url)))
            except requests.exceptions.HTTPError as e:
                print(e)

        return data


class Document:
    def __init__(self, data: list[tuple[str, str]], min_char_len: int) -> None:
        """
        data : list[tuple[str, str]]
            url and page data
        """
        self.data = data
        self.min_char_len = min_char_len

    def make_min_len_chunk(self):
        raise NotImplementedError

    def chunk_page(
        self,
        page_text: str,
    ) -> list[str]:
        # Accumulate sentences until the buffer exceeds min_char_len, then emit it as one chunk.
        # A trailing buffer shorter than min_char_len is discarded.
        min_len_chunks: list[str] = []
        sentences = nltk.tokenize.sent_tokenize(page_text)
        buffer: str = ""
        for sent in sentences:
            buffer = f"{buffer} {sent}".strip()
            if len(buffer) > self.min_char_len:
                min_len_chunks.append(buffer)
                buffer = ""
        return min_len_chunks

    def doc(self) -> tuple[list[str], list[str]]:
        chunked_data: list[list[str]] = []
        urls: list[str] = []
        for url, dataitem in self.data:
            chunks = self.chunk_page(dataitem)
            chunked_data.append(chunks)
            urls.append(url)

        flat_chunks = [chunk for sublist in chunked_data for chunk in sublist]
        return flat_chunks, urls


class SemanticSearch:
    def __init__(
        self, query: str, d: Document, g: GoogleSearch, model_path: str, device: str
    ) -> None:
        self.query = query
        self.doc_chunks, self.urls = d.doc()
        self.st = SentenceTransformer(
            model_path,
            device=device,
        )

    def semantic_search(self, query: str, k: int = 10):
        query_embedding = self.get_embedding(query)
        doc_embedding = self.get_embedding(self.doc_chunks)
        scores = util.dot_score(a=query_embedding, b=doc_embedding)[0]

        top_k = torch.topk(scores, k=k)[1].cpu().tolist()
        return [(self.doc_chunks[i], scores[i]) for i in top_k]

    def get_embedding(self, text: Union[list[str], str]):
        return self.st.encode(text)


if __name__ == "__main__":
    query = "what is LLM"
    g = GoogleSearch(query)
    data = g.all_page_data
    d = Document(data, 333)
    s = SemanticSearch(query, d, g, "all-mpnet-base-v2", "mps")
    print(len(s.semantic_search(query, k=64)))

    # g = GoogleSearch("what is LLM")
    # d = Document(g.all_page_data)
    # print(len(d.doc()[0]))
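To make the chunking rule concrete, here is a tiny illustrative run of Document.chunk_page (the input text and the small min_char_len are made up for the example; it needs the NLTK punkt data mentioned in the README):

from google import Document

d = Document(data=[], min_char_len=40)
chunks = d.chunk_page("First sentence here. A second one follows it. A short tail.")
# The first two sentences together exceed 40 characters and form one chunk;
# the trailing short sentence stays in the buffer and is dropped.
print(chunks)  # ['First sentence here. A second one follows it.']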
model.py ADDED
@@ -0,0 +1,15 @@
from google import SemanticSearch
from transformers import AutoTokenizer, AutoModel


class RAGModel:
    def __init__(self, configs) -> None:
        self.configs = configs
        model_url = configs["common"]["generation_model"]  # key defined in rag.configs.yml
        self.model = AutoModel.from_pretrained(model_url)
        self.tokenizer = AutoTokenizer.from_pretrained(model_url)

    def create_prompt(self, topk_items: list[str]):
        raise NotImplementedError  # TODO: build a prompt from the retrieved chunks

    def answer_query(self, query: str, context: list[str]):
        raise NotImplementedError  # TODO: generate an answer for the query from the context
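Both methods are left as stubs in this commit. As a rough sketch of one possible completion (an assumption, not the author's method: the prompt template is invented here, and a causal-LM head is loaded via AutoModelForCausalLM because the bare AutoModel has no generate()):

from transformers import AutoModelForCausalLM, AutoTokenizer

def create_prompt(query: str, topk_items: list[str]) -> str:
    # Join the retrieved chunks into a context block placed ahead of the question.
    context = "\n".join(f"- {item}" for item in topk_items)
    return (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\nAnswer:"
    )

def answer_query(model, tokenizer, query: str, context: list[str], max_new_tokens: int = 256) -> str:
    prompt = create_prompt(query, context)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Decode only the newly generated tokens, not the prompt.
    return tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

# Usage sketch; "top_chunks" stands for the chunks returned by SemanticSearch, and
# meta-llama/Llama-2-7b is a gated checkpoint on the Hub:
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b")
# print(answer_query(model, tokenizer, "what is LLM", top_chunks))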
rag.configs.yml ADDED
@@ -0,0 +1,8 @@
document:
  min_char_length: 333

common:
  embedding_model: all-mpnet-base-v2
  generation_model: meta-llama/Llama-2-7b
  device: cpu
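A minimal sketch of how this file could be loaded and handed to RAGModel (it assumes PyYAML, which requirments.txt does not list yet):

import yaml

from model import RAGModel

with open("rag.configs.yml") as f:
    configs = yaml.safe_load(f)

# RAGModel reads configs["common"]["generation_model"].
rag = RAGModel(configs)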
requirments.txt ADDED
@@ -0,0 +1 @@
beautifulsoup4==4.12.3
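Note that the new modules also import streamlit, requests, nltk, torch, sentence-transformers, and transformers, none of which are listed here yet; a fuller version of this file might look like the following (versions left unpinned, since the commit does not specify any):

beautifulsoup4==4.12.3
streamlit
requests
nltk
torch
sentence-transformers
transformers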