XThomasBU commited on
Commit
b83cc65
·
1 Parent(s): 8591fb3

hf sync commit

Browse files
Dockerfile ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9

WORKDIR /code

# Copy only the requirements file first so the dependency layers below are
# reused when just the application source changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

RUN pip install --no-cache-dir transformers==4.36.2 torch==2.1.2

# requirements.txt lists llama-cpp-python unpinned; reinstall it at a fixed version.
RUN pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python==0.2.32

COPY . /code

# Print the copied tree into the build log.
RUN ls -R

# Change permissions to allow writing to the directory
RUN chmod -R 777 /code

# Create a logs directory and set permissions.
# -p: do not fail the build if COPY already brought in a logs/ directory.
RUN mkdir -p /code/logs && chmod 777 /code/logs

# Create a cache directory at the filesystem root (/.cache) and make it writable
RUN mkdir -p /.cache && chmod -R 777 /.cache

# NOTE(review): these RUN instructions mount the secrets but execute no command;
# confirm they are actually needed at build time.
RUN --mount=type=secret,id=HUGGINGFACEHUB_API_TOKEN,mode=0444,required=true
RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=true

CMD ["chainlit", "run", "code/main.py", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,13 +1,13 @@
1
- # dl4ds_tutor
 
 
 
 
 
 
 
2
 
3
- ## Setup
4
- 1. conda create -n dl4ds_tutor python=3.9
5
- 2. conda activate dl4ds_tutor
6
- 3. pip install -r requirements.txt
7
- 4. Create a .env file and add your openai api key as 'OPENAI_API_KEY=XXX'
8
-
9
- ## Instructions
10
- 1. Add files to `data/`
11
- 2. cd code
12
- 3. chainlit run main.py
13
 
 
 
1
+ ---
2
+ title: Dl4ds Tutor
3
+ emoji: 🏃
4
+ colorFrom: green
5
+ colorTo: red
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
 
10
+ DL4DS Tutor
11
+ ===========
 
 
 
 
 
 
 
 
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
code/chainlit.md → chainlit.md RENAMED
@@ -3,6 +3,8 @@
3
  Hi there, this is an LLM chatbot designed to help answer questions on the course content, built using Langchain and Chainlit.
4
  This is still very much a Work in Progress.
5
 
 
 
6
  ## Useful Links 🔗
7
 
8
  - **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
 
3
  Hi there, this is an LLM chatbot designed to help answer questions on the course content, built using Langchain and Chainlit.
4
  This is still very much a Work in Progress.
5
 
6
+ ### --- Please wait while the Tutor loads... ---
7
+
8
  ## Useful Links 🔗
9
 
10
  - **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
code/config.yml CHANGED
@@ -1,26 +1,29 @@
1
  embedding_options:
2
  embedd_files: True # bool
3
  persist_directory: null # str or None
4
- data_path: '../data' # str
 
 
5
  db_option : 'FAISS' # str
6
  db_path : 'vectorstores' # str
7
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
8
- search_top_k : 5 # int
9
  llm_params:
10
- use_history: True # bool
11
- llm_loader: 'openai' # str [ctransformers, openai]
12
  openai_params:
13
  model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
14
- ctransformers_params:
15
- model: "TheBloke/Llama-2-7B-Chat-GGML"
16
  model_type: "llama"
 
17
  splitter_options:
18
  use_splitter: True # bool
19
  split_by_token : True # bool
20
  remove_leftover_delimiters: True # bool
21
  remove_chunks: False # bool
22
- chunk_size : 800 # int
23
- chunk_overlap : 80 # int
24
  chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
25
  front_chunks_to_remove : null # int or None
26
  last_chunks_to_remove : null # int or None
 
1
  embedding_options:
2
  embedd_files: True # bool
3
  persist_directory: null # str or None
4
+ data_path: 'storage/data' # str
5
+ url_file_path: 'storage/data/urls.txt' # str
6
+ expand_urls: True # bool
7
  db_option : 'FAISS' # str
8
  db_path : 'vectorstores' # str
9
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
10
+ search_top_k : 3 # int
11
  llm_params:
12
+ use_history: False # bool
13
+ llm_loader: 'local_llm' # str [local_llm, openai]
14
  openai_params:
15
  model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
16
+ local_llm_params:
17
+ model: "storage/models/llama-2-7b-chat.Q4_0.gguf"
18
  model_type: "llama"
19
+ temperature: 0.2
20
  splitter_options:
21
  use_splitter: True # bool
22
  split_by_token : True # bool
23
  remove_leftover_delimiters: True # bool
24
  remove_chunks: False # bool
25
+ chunk_size : 300 # int
26
+ chunk_overlap : 30 # int
27
  chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
28
  front_chunks_to_remove : null # int or None
29
  last_chunks_to_remove : null # int or None
code/main.py CHANGED
@@ -12,6 +12,8 @@ import logging
12
  from dotenv import load_dotenv
13
 
14
  from modules.llm_tutor import LLMTutor
 
 
15
 
16
 
17
  logger = logging.getLogger(__name__)
@@ -31,22 +33,70 @@ file_handler.setLevel(logging.INFO)
31
  file_handler.setFormatter(formatter)
32
  logger.addHandler(file_handler)
33
 
34
- with open("config.yml", "r") as f:
35
- config = yaml.safe_load(f)
36
- print(config)
37
- logger.info("Config file loaded")
38
- logger.info(f"Config: {config}")
39
- logger.info("Creating llm_tutor instance")
40
- llm_tutor = LLMTutor(config, logger=logger)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  # chainlit code
44
  @cl.on_chat_start
45
  async def start():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  chain = llm_tutor.qa_bot()
47
- msg = cl.Message(content="Starting the bot...")
 
48
  await msg.send()
49
- msg.content = "Hey, What Can I Help You With?"
50
  await msg.update()
51
 
52
  cl.user_session.set("chain", chain)
@@ -54,56 +104,21 @@ async def start():
54
 
55
  @cl.on_message
56
  async def main(message):
 
57
  chain = cl.user_session.get("chain")
58
- cb = cl.AsyncLangchainCallbackHandler(
59
- stream_final_answer=True, answer_prefix_tokens=["FINAL", "ANSWER"]
60
- )
61
- cb.answer_reached = True
62
  # res=await chain.acall(message, callbacks=[cb])
63
- res = await chain.acall(message.content, callbacks=[cb])
64
- # print(f"response: {res}")
65
  try:
66
  answer = res["answer"]
67
  except:
68
  answer = res["result"]
69
  print(f"answer: {answer}")
70
- source_elements_dict = {}
71
- source_elements = []
72
- found_sources = []
73
-
74
- for idx, source in enumerate(res["source_documents"]):
75
- title = source.metadata["source"]
76
 
77
- if title not in source_elements_dict:
78
- source_elements_dict[title] = {
79
- "page_number": [source.metadata["page"]],
80
- "url": source.metadata["source"],
81
- "content": source.page_content,
82
- }
83
 
84
- else:
85
- source_elements_dict[title]["page_number"].append(source.metadata["page"])
86
- source_elements_dict[title][
87
- "content_" + str(source.metadata["page"])
88
- ] = source.page_content
89
- # sort the page numbers
90
- # source_elements_dict[title]["page_number"].sort()
91
-
92
- for title, source in source_elements_dict.items():
93
- # create a string for the page numbers
94
- page_numbers = ", ".join([str(x) for x in source["page_number"]])
95
- text_for_source = f"Page Number(s): {page_numbers}\nURL: {source['url']}"
96
- source_elements.append(cl.Pdf(name="File", path=title))
97
- found_sources.append("File")
98
- # for pn in source["page_number"]:
99
- # source_elements.append(
100
- # cl.Text(name=str(pn), content=source["content_"+str(pn)])
101
- # )
102
- # found_sources.append(str(pn))
103
-
104
- if found_sources:
105
- answer += f"\nSource:{', '.join(found_sources)}"
106
- else:
107
- answer += f"\nNo source found."
108
-
109
- await cl.Message(content=answer, elements=source_elements).send()
 
12
  from dotenv import load_dotenv
13
 
14
  from modules.llm_tutor import LLMTutor
15
+ from modules.constants import *
16
+ from modules.helpers import get_sources
17
 
18
 
19
  logger = logging.getLogger(__name__)
 
33
  file_handler.setFormatter(formatter)
34
  logger.addHandler(file_handler)
35
 
36
+
37
# Chat profiles offered in the Chainlit UI. The selected profile's name is read
# back in start() to pick the LLM backend. start() also understands a "mistral"
# profile, which is intentionally not offered here.
@cl.set_chat_profiles
async def chat_profile():
    """Return the list of selectable chat profiles (local Llama and two OpenAI models)."""
    local_llama = cl.ChatProfile(
        name="Llama",
        markdown_description="Use the local LLM: **Tiny Llama**.",
    )
    gpt_35 = cl.ChatProfile(
        name="gpt-3.5-turbo-1106",
        markdown_description="Use OpenAI API for **gpt-3.5-turbo-1106**.",
    )
    gpt_4 = cl.ChatProfile(
        name="gpt-4",
        markdown_description="Use OpenAI API for **gpt-4**.",
    )
    return [local_llama, gpt_35, gpt_4]
58
+
59
+
60
@cl.author_rename
def rename(orig_author: str):
    """Return the display name for a message author.

    The author "Chatbot" is shown as "AI Tutor"; every other author name is
    returned unchanged.
    """
    if orig_author == "Chatbot":
        return "AI Tutor"
    return orig_author
64
 
65
 
66
  # chainlit code
67
@cl.on_chat_start
async def start():
    """Build the QA chain for a new chat session and greet the user.

    Loads ``code/config.yml``, overrides the LLM settings according to the
    chat profile selected in the UI (if any), creates the ``LLMTutor`` and
    stores its chain in the user session under the key ``"chain"``.
    """
    with open("code/config.yml", "r") as f:
        config = yaml.safe_load(f)
    print(config)
    logger.info("Config file loaded")
    logger.info(f"Config: {config}")
    logger.info("Creating llm_tutor instance")

    llm_params = config["llm_params"]

    # The profile name chosen in the UI decides which backend/model is used;
    # with no (or an unknown) profile the values from config.yml are kept.
    chat_profile = cl.user_session.get("chat_profile")
    if chat_profile is not None:
        profile = chat_profile.lower()
        if profile in ["gpt-3.5-turbo-1106", "gpt-4"]:
            llm_params["llm_loader"] = "openai"
            llm_params["openai_params"]["model"] = profile
        elif profile == "llama":
            llm_params["llm_loader"] = "local_llm"
            llm_params["local_llm_params"]["model"] = LLAMA_PATH
            llm_params["local_llm_params"]["model_type"] = "llama"
        elif profile == "mistral":
            llm_params["llm_loader"] = "local_llm"
            llm_params["local_llm_params"]["model"] = MISTRAL_PATH
            llm_params["local_llm_params"]["model_type"] = "mistral"

    llm_tutor = LLMTutor(config, logger=logger)

    chain = llm_tutor.qa_bot()

    # Report the model that is actually in use: the OpenAI model name for the
    # OpenAI loader, otherwise the local model path.
    if llm_params["llm_loader"] == "openai":
        model = llm_params["openai_params"]["model"]
    else:
        model = llm_params["local_llm_params"]["model"]

    msg = cl.Message(content=f"Starting the bot {model}...")
    await msg.send()
    msg.content = f"Hey, What Can I Help You With?\n\nYou can ask me questions about the course logistics, course content, about the final project, or anything else! You can find me at {model}"
    await msg.update()

    cl.user_session.set("chain", chain)
 
104
 
105
@cl.on_message
async def main(message):
    """Answer one user message with the session's chain and send the reply.

    The chain stored under ``"chain"`` in the user session is called with the
    message text; the answer is extended with its sources by ``get_sources``
    and sent back together with the source elements.
    """
    chain = cl.user_session.get("chain")
    res = await chain.acall(message.content)
    print(f"response: {res}")

    # The result carries the text under "answer" or, failing that, "result".
    try:
        answer = res["answer"]
    except KeyError:
        answer = res["result"]
    print(f"answer: {answer}")

    answer_with_sources, source_elements = get_sources(res, answer)

    await cl.Message(content=answer_with_sources, elements=source_elements).send()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/modules/chat_model_loader.py CHANGED
@@ -1,24 +1,38 @@
1
  from langchain_community.chat_models import ChatOpenAI
2
  from langchain.llms import CTransformers
 
 
 
 
 
 
 
 
3
 
4
 
5
  class ChatModelLoader:
6
  def __init__(self, config):
7
  self.config = config
 
8
 
9
  def load_chat_model(self):
10
  if self.config["llm_params"]["llm_loader"] == "openai":
11
  llm = ChatOpenAI(
12
  model_name=self.config["llm_params"]["openai_params"]["model"]
13
  )
14
- elif self.config["llm_params"]["llm_loader"] == "Ctransformers":
15
- llm = CTransformers(
16
- model=self.config["llm_params"]["ctransformers_params"]["model"],
17
- model_type=self.config["llm_params"]["ctransformers_params"][
18
- "model_type"
 
 
 
 
 
 
 
19
  ],
20
- max_new_tokens=512,
21
- temperature=0.5,
22
  )
23
  else:
24
  raise ValueError("Invalid LLM Loader")
 
1
  from langchain_community.chat_models import ChatOpenAI
2
  from langchain.llms import CTransformers
3
+ from langchain.llms.huggingface_pipeline import HuggingFacePipeline
4
+ from transformers import AutoTokenizer, TextStreamer
5
+ from langchain.llms import LlamaCpp
6
+ import torch
7
+ import transformers
8
+ import os
9
+ from langchain.callbacks.manager import CallbackManager
10
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
11
 
12
 
13
  class ChatModelLoader:
14
  def __init__(self, config):
15
  self.config = config
16
+ self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
17
 
18
  def load_chat_model(self):
19
  if self.config["llm_params"]["llm_loader"] == "openai":
20
  llm = ChatOpenAI(
21
  model_name=self.config["llm_params"]["openai_params"]["model"]
22
  )
23
+ elif self.config["llm_params"]["llm_loader"] == "local_llm":
24
+ n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
25
+ model_path = self.config["llm_params"]["local_llm_params"]["model"]
26
+ llm = LlamaCpp(
27
+ model_path=model_path,
28
+ n_batch=n_batch,
29
+ n_ctx=2048,
30
+ f16_kv=True,
31
+ verbose=True,
32
+ n_threads=2,
33
+ temperature=self.config["llm_params"]["local_llm_params"][
34
+ "temperature"
35
  ],
 
 
36
  )
37
  else:
38
  raise ValueError("Invalid LLM Loader")
code/modules/constants.py CHANGED
@@ -10,15 +10,15 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
10
 
11
  # Prompt Templates
12
 
13
- prompt_template = """Use the following pieces of information to answer the user's question.
14
- If you don't know the answer, just say that you don't know, don't try to make up an answer.
15
 
16
- Context: {context}
17
- Question: {question}
18
 
19
- Only return the helpful answer below and nothing else.
20
- Helpful answer:
21
- """
22
 
23
  prompt_template_with_history = """Use the following pieces of information to answer the user's question.
24
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
@@ -31,3 +31,27 @@ Question: {question}
31
  Only return the helpful answer below and nothing else.
32
  Helpful answer:
33
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  # Prompt Templates
12
 
13
+ # prompt_template = """Use the following pieces of information to answer the user's question.
14
+ # If you don't know the answer, just say that you don't know.
15
 
16
+ # Context: {context}
17
+ # Question: {question}
18
 
19
+ # Only return the helpful answer below and nothing else.
20
+ # Helpful answer:
21
+ # """
22
 
23
  prompt_template_with_history = """Use the following pieces of information to answer the user's question.
24
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
31
  Only return the helpful answer below and nothing else.
32
  Helpful answer:
33
  """
34
+
35
# Prompt for the no-history QA chain: a system turn holding the retrieved
# {context}, one worked example exchange, then the user's {question}.
prompt_template = """
<|im_start|>system
Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question.

Context:
{context}
<|im_end|>
<|im_start|>user
Question: Who is the instructor for this course?
<|im_end|>
<|im_start|>assistant
The instructor for this course is Prof. Thomas Gardos.
<|im_end|>
<|im_start|>user
Question: {question}
<|im_end|>
<|im_start|>assistant
"""
53
+
54
+ # Model Paths
55
+
56
+ LLAMA_PATH = "storage/models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
57
+ MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"
code/modules/data_loader.py CHANGED
@@ -225,21 +225,24 @@ class DataLoader:
225
 
226
  # Handle link by link
227
  for link_index, link in enumerate(weblinks):
228
- logger.info(f"\tSplitting link {link_index+1} : {link}")
229
- if "youtube" in link:
230
- title, document_chunks = get_youtube_transcript(link)
231
- else:
232
- title, document_chunks = get_html(link)
233
-
234
- # Additional wrangling - Remove leftover delimiters and any specified chunks
235
- if self.remove_leftover_delimiters:
236
- document_chunks = remove_delimiters(document_chunks)
237
- if self.config["splitter_options"]["remove_chunks"]:
238
- document_chunks = remove_chunks(document_chunks)
239
-
240
- print(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
241
- self.document_names.append(title)
242
- self.document_chunks_full.extend(document_chunks)
 
 
 
243
 
244
  logger.info(
245
  f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
 
225
 
226
  # Handle link by link
227
  for link_index, link in enumerate(weblinks):
228
+ try:
229
+ logger.info(f"\tSplitting link {link_index+1} : {link}")
230
+ if "youtube" in link:
231
+ title, document_chunks = get_youtube_transcript(link)
232
+ else:
233
+ title, document_chunks = get_html(link)
234
+
235
+ # Additional wrangling - Remove leftover delimiters and any specified chunks
236
+ if self.remove_leftover_delimiters:
237
+ document_chunks = remove_delimiters(document_chunks)
238
+ if self.config["splitter_options"]["remove_chunks"]:
239
+ document_chunks = remove_chunks(document_chunks)
240
+
241
+ print(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
242
+ self.document_names.append(title)
243
+ self.document_chunks_full.extend(document_chunks)
244
+ except:
245
+ logger.info(f"\t\tError splitting link {link_index+1} : {link}")
246
 
247
  logger.info(
248
  f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
code/modules/embedding_model_loader.py CHANGED
@@ -1,6 +1,8 @@
1
  from langchain_community.embeddings import OpenAIEmbeddings
2
  from langchain.embeddings import HuggingFaceEmbeddings
 
3
  from modules.constants import *
 
4
 
5
 
6
  class EmbeddingModelLoader:
@@ -20,4 +22,8 @@ class EmbeddingModelLoader:
20
  model_name="sentence-transformers/all-MiniLM-L6-v2",
21
  model_kwargs={"device": "cpu"},
22
  )
 
 
 
 
23
  return embedding_model
 
1
  from langchain_community.embeddings import OpenAIEmbeddings
2
  from langchain.embeddings import HuggingFaceEmbeddings
3
+ from langchain.embeddings import LlamaCppEmbeddings
4
  from modules.constants import *
5
+ import os
6
 
7
 
8
  class EmbeddingModelLoader:
 
22
  model_name="sentence-transformers/all-MiniLM-L6-v2",
23
  model_kwargs={"device": "cpu"},
24
  )
25
+ # embedding_model = LlamaCppEmbeddings(
26
+ # model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
27
+ # )
28
+
29
  return embedding_model
code/modules/helpers.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from tqdm import tqdm
4
+ from urllib.parse import urlparse
5
+ import chainlit as cl
6
+
7
+ """
8
+ Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113
9
+ """
10
+
11
+
12
class WebpageCrawler:
    """Collect the pages of a website by repeatedly following its ``<a href>`` links.

    Links are tracked in dictionaries mapping URL -> status, where the status
    is the string ``"Not-checked"`` (found, not yet fetched) or ``"Checked"``
    (fetched and scanned for further links).
    """

    def __init__(self, timeout=10):
        """Create a crawler.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.timeout = timeout
        # Relative hrefs already seen; initialised here so get_links() also
        # works when called without going through get_all_pages().
        self.dict_href_links = {}

    def getdata(self, url):
        """Fetch *url* with a GET request and return the response body as text."""
        r = requests.get(url, timeout=self.timeout)
        return r.text

    def url_exists(self, url):
        """Return True if a HEAD request to *url* answers with status 200.

        Any request failure (connection error, timeout, invalid URL, ...) is
        reported as False.
        """
        try:
            response = requests.head(url, timeout=self.timeout)
        except requests.RequestException:
            return False
        return response.status_code == 200

    def get_links(self, website_link, base_url=None):
        """Return the links found on *website_link* as ``{url: "Not-checked"}``.

        Two kinds of href are kept: those that start with *website_link*
        itself, and root-relative ones (starting with "/") which are resolved
        against *base_url* and kept only if the resulting URL exists.

        Args:
            website_link: Page to fetch and scan.
            base_url: Prefix used to resolve root-relative links; defaults to
                *website_link*.
        """
        if base_url is None:
            base_url = website_link
        try:
            html_data = self.getdata(website_link)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f"could not fetch {website_link}: {exc}")
            return {}
        soup = BeautifulSoup(html_data, "html.parser")
        list_links = []
        for link in soup.find_all("a", href=True):
            href = str(link["href"])

            # Append to list if new link contains original link
            if href.startswith(str(website_link)):
                list_links.append(href)

            # Include all href that do not start with website link but with "/"
            if href.startswith("/") and href not in self.dict_href_links:
                print(href)
                # Remember the href so it is resolved and probed only once.
                self.dict_href_links[href] = None
                # Drop the leading "/" of the href before appending it to base_url.
                link_with_www = base_url + href[1:]
                if self.url_exists(link_with_www):
                    print("adjusted link =", link_with_www)
                    list_links.append(link_with_www)

        # Convert list of links to dictionary and define keys as the links and the values as "Not-checked"
        return dict.fromkeys(list_links, "Not-checked")

    def get_subpage_links(self, l, base_url):
        """Scan every "Not-checked" link in *l* and return the enlarged dictionary.

        Args:
            l: Dictionary of URL -> status.
            base_url: Prefix used to resolve root-relative links.

        Returns:
            A dictionary containing the links of *l* (those that were scanned
            now marked "Checked") plus any newly discovered links.
        """
        # Iteration runs over the dictionary object passed in; `l` is rebound
        # to a merged dictionary inside the loop, which always still contains
        # every key being iterated.
        for link in tqdm(l):
            # If not crawled through this page start crawling and get links
            if l[link] == "Not-checked":
                dict_links_subpages = self.get_links(link, base_url)
                # Change the dictionary value of the link to "Checked"
                l[link] = "Checked"
            else:
                # Nothing new to add for a link that was already checked
                dict_links_subpages = {}
            # Existing entries win over newly found ones so a "Checked" status is kept
            l = {**dict_links_subpages, **l}
        return l

    def get_all_pages(self, url, base_url):
        """Crawl from *url* until no "Not-checked" link is left.

        Args:
            url: Starting page.
            base_url: Prefix used to resolve root-relative links.

        Returns:
            List of all URLs whose status is "Checked".
        """
        dict_links = {url: "Not-checked"}
        self.dict_href_links = {}
        remaining = None
        while remaining != 0:
            dict_links = self.get_subpage_links(dict_links, base_url)
            # Number of links that still have to be scanned
            remaining = sum(
                status == "Not-checked" for status in dict_links.values()
            )
        return [link for link, status in dict_links.items() if status == "Checked"]
81
+
82
+
83
def get_urls_from_file(file_path: str):
    """Read URLs from a text file, one per line.

    Surrounding whitespace is stripped from every line, and blank lines are
    skipped so they do not end up as empty-string URLs.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of URL strings in file order.
    """
    with open(file_path, "r") as f:
        stripped = (line.strip() for line in f)
        return [url for url in stripped if url]
91
+
92
+
93
def get_base_url(url):
    """Return the ``scheme://netloc/`` root of *url*, with a trailing slash."""
    parts = urlparse(url)
    return "{}://{}/".format(parts.scheme, parts.netloc)
97
+
98
+
99
def get_sources(res, answer):
    """Append the source list to *answer* and build the matching UI elements.

    Args:
        res: Chain result; its ``"source_documents"`` entry (if present) holds
            documents exposing ``metadata["source"]`` and ``page_content``.
        answer: Answer text produced by the chain.

    Returns:
        Tuple ``(answer_with_sources, source_elements)``: the answer followed
        by either a "Sources: ..." line or "No source found.", and one
        ``cl.Text`` element per distinct source URL.
    """
    # Group the retrieved passages by source URL, keeping first-seen order.
    source_dict = {}
    for source in res.get("source_documents") or []:
        url = source.metadata["source"]
        source_dict.setdefault(url, []).append(source.page_content)

    source_elements = []
    found_sources = []
    for url, text_list in source_dict.items():
        # Passages are numbered per URL, starting at 1.
        full_text = "".join(
            f"Source {url_idx+1}:\n {text}\n\n\n"
            for url_idx, text in enumerate(text_list)
        )
        source_elements.append(cl.Text(name=url, content=full_text))
        found_sources.append(url)

    if found_sources:
        answer += f"\n\nSources: {', '.join(found_sources)} "
    else:
        answer += "\n\nNo source found."

    return answer, source_elements
code/modules/llm_tutor.py CHANGED
@@ -18,7 +18,7 @@ class LLMTutor:
18
  def __init__(self, config, logger=None):
19
  self.config = config
20
  self.vector_db = VectorDB(config, logger=logger)
21
- if self.config['embedding_options']['embedd_files']:
22
  self.vector_db.create_database()
23
  self.vector_db.save_database()
24
 
@@ -47,7 +47,11 @@ class LLMTutor:
47
  qa_chain = ConversationalRetrievalChain.from_llm(
48
  llm=llm,
49
  chain_type="stuff",
50
- retriever=db.as_retriever(search_kwargs={"k": self.config["embedding_options"]["search_top_k"]}),
 
 
 
 
51
  return_source_documents=True,
52
  memory=memory,
53
  combine_docs_chain_kwargs={"prompt": prompt},
@@ -56,7 +60,11 @@ class LLMTutor:
56
  qa_chain = RetrievalQA.from_chain_type(
57
  llm=llm,
58
  chain_type="stuff",
59
- retriever=db.as_retriever(search_kwargs={"k": self.config["embedding_options"]["search_top_k"]}),
 
 
 
 
60
  return_source_documents=True,
61
  chain_type_kwargs={"prompt": prompt},
62
  )
 
18
  def __init__(self, config, logger=None):
19
  self.config = config
20
  self.vector_db = VectorDB(config, logger=logger)
21
+ if self.config["embedding_options"]["embedd_files"]:
22
  self.vector_db.create_database()
23
  self.vector_db.save_database()
24
 
 
47
  qa_chain = ConversationalRetrievalChain.from_llm(
48
  llm=llm,
49
  chain_type="stuff",
50
+ retriever=db.as_retriever(
51
+ search_kwargs={
52
+ "k": self.config["embedding_options"]["search_top_k"]
53
+ }
54
+ ),
55
  return_source_documents=True,
56
  memory=memory,
57
  combine_docs_chain_kwargs={"prompt": prompt},
 
60
  qa_chain = RetrievalQA.from_chain_type(
61
  llm=llm,
62
  chain_type="stuff",
63
+ retriever=db.as_retriever(
64
+ search_kwargs={
65
+ "k": self.config["embedding_options"]["search_top_k"]
66
+ }
67
+ ),
68
  return_source_documents=True,
69
  chain_type_kwargs={"prompt": prompt},
70
  )
code/modules/vector_db.py CHANGED
@@ -6,6 +6,7 @@ from modules.embedding_model_loader import EmbeddingModelLoader
6
  from langchain.vectorstores import FAISS
7
  from modules.data_loader import DataLoader
8
  from modules.constants import *
 
9
 
10
 
11
  class VectorDB:
@@ -13,6 +14,7 @@ class VectorDB:
13
  self.config = config
14
  self.db_option = config["embedding_options"]["db_option"]
15
  self.document_names = None
 
16
 
17
  # Set up logging to both console and a file
18
  if logger is None:
@@ -43,7 +45,14 @@ class VectorDB:
43
  os.path.join(self.config["embedding_options"]["data_path"], file)
44
  for file in files
45
  ]
46
- return files
 
 
 
 
 
 
 
47
 
48
  def create_embedding_model(self):
49
  self.logger.info("Creating embedding function")
@@ -63,8 +72,8 @@ class VectorDB:
63
  def create_database(self):
64
  data_loader = DataLoader(self.config)
65
  self.logger.info("Loading data")
66
- files = self.load_files()
67
- document_chunks, document_names = data_loader.get_chunks(files, [""])
68
  self.logger.info("Completed loading data")
69
 
70
  self.create_embedding_model()
 
6
  from langchain.vectorstores import FAISS
7
  from modules.data_loader import DataLoader
8
  from modules.constants import *
9
+ from modules.helpers import *
10
 
11
 
12
  class VectorDB:
 
14
  self.config = config
15
  self.db_option = config["embedding_options"]["db_option"]
16
  self.document_names = None
17
+ self.webpage_crawler = WebpageCrawler()
18
 
19
  # Set up logging to both console and a file
20
  if logger is None:
 
45
  os.path.join(self.config["embedding_options"]["data_path"], file)
46
  for file in files
47
  ]
48
+ urls = get_urls_from_file(self.config["embedding_options"]["url_file_path"])
49
+ if self.config["embedding_options"]["expand_urls"]:
50
+ all_urls = []
51
+ for url in urls:
52
+ base_url = get_base_url(url)
53
+ all_urls.extend(self.webpage_crawler.get_all_pages(url, base_url))
54
+ urls = all_urls
55
+ return files, urls
56
 
57
  def create_embedding_model(self):
58
  self.logger.info("Creating embedding function")
 
72
  def create_database(self):
73
  data_loader = DataLoader(self.config)
74
  self.logger.info("Loading data")
75
+ files, urls = self.load_files()
76
+ document_chunks, document_names = data_loader.get_chunks(files, urls)
77
  self.logger.info("Completed loading data")
78
 
79
  self.create_embedding_model()
data/webpage.pdf DELETED
Binary file (51.3 kB)
 
requirements.txt CHANGED
@@ -11,4 +11,8 @@ ctransformers==0.2.27
11
  python-dotenv==1.0.0
12
  openai==1.6.1
13
  pymupdf==1.23.8
14
- chainlit==0.7.700
 
 
 
 
 
11
  python-dotenv==1.0.0
12
  openai==1.6.1
13
  pymupdf==1.23.8
14
+ chainlit==0.7.700
15
+ beautifulsoup4==4.12.2
16
+ fake-useragent==1.4.0
17
+ git+https://github.com/huggingface/accelerate.git
18
+ llama-cpp-python
storage/data/urls.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://dl4ds.github.io/sp2024/