hf sync commit
Browse files- Dockerfile +29 -0
- README.md +11 -11
- code/chainlit.md → chainlit.md +2 -0
- code/config.yml +11 -8
- code/main.py +68 -53
- code/modules/chat_model_loader.py +21 -7
- code/modules/constants.py +31 -7
- code/modules/data_loader.py +18 -15
- code/modules/embedding_model_loader.py +6 -0
- code/modules/helpers.py +162 -0
- code/modules/llm_tutor.py +11 -3
- code/modules/vector_db.py +12 -3
- data/webpage.pdf +0 -0
- requirements.txt +5 -1
- storage/data/urls.txt +1 -0
Dockerfile
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9
|
2 |
+
|
3 |
+
WORKDIR /code
|
4 |
+
|
5 |
+
COPY ./requirements.txt /code/requirements.txt
|
6 |
+
|
7 |
+
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
8 |
+
|
9 |
+
RUN pip install --no-cache-dir transformers==4.36.2 torch==2.1.2
|
10 |
+
|
11 |
+
RUN pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python==0.2.32
|
12 |
+
|
13 |
+
COPY . /code
|
14 |
+
|
15 |
+
RUN ls -R
|
16 |
+
|
17 |
+
# Change permissions to allow writing to the directory
|
18 |
+
RUN chmod -R 777 /code
|
19 |
+
|
20 |
+
# Create a logs directory and set permissions
|
21 |
+
RUN mkdir /code/logs && chmod 777 /code/logs
|
22 |
+
|
23 |
+
# Create a cache directory within the application's working directory
|
24 |
+
RUN mkdir /.cache && chmod -R 777 /.cache
|
25 |
+
|
26 |
+
RUN --mount=type=secret,id=HUGGINGFACEHUB_API_TOKEN,mode=0444,required=true
|
27 |
+
RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=true
|
28 |
+
|
29 |
+
CMD ["chainlit", "run", "code/main.py", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
5 |
-
2. conda activate dl4ds_tutor
|
6 |
-
3. pip install -r requirements.txt
|
7 |
-
4. Create a .env file and add your openai api key as 'OPENAI_API_KEY=XXX'
|
8 |
-
|
9 |
-
## Instructions
|
10 |
-
1. Add files to `data/`
|
11 |
-
2. cd code
|
12 |
-
3. chainlit run main.py
|
13 |
|
|
|
|
1 |
+
---
|
2 |
+
title: Dl4ds Tutor
|
3 |
+
emoji: 🏃
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: red
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
+
---
|
9 |
|
10 |
+
DL4DS Tutor
|
11 |
+
===========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
code/chainlit.md → chainlit.md
RENAMED
@@ -3,6 +3,8 @@
|
|
3 |
Hi there, this is an LLM chatbot designed to help answer questions on the course content, built using Langchain and Chainlit.
|
4 |
This is still very much a Work in Progress.
|
5 |
|
|
|
|
|
6 |
## Useful Links 🔗
|
7 |
|
8 |
- **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
|
|
|
3 |
Hi there, this is an LLM chatbot designed to help answer questions on the course content, built using Langchain and Chainlit.
|
4 |
This is still very much a Work in Progress.
|
5 |
|
6 |
+
### --- Please wait while the Tutor loads... ---
|
7 |
+
|
8 |
## Useful Links 🔗
|
9 |
|
10 |
- **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
|
code/config.yml
CHANGED
@@ -1,26 +1,29 @@
|
|
1 |
embedding_options:
|
2 |
embedd_files: True # bool
|
3 |
persist_directory: null # str or None
|
4 |
-
data_path: '
|
|
|
|
|
5 |
db_option : 'FAISS' # str
|
6 |
db_path : 'vectorstores' # str
|
7 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
|
8 |
-
search_top_k :
|
9 |
llm_params:
|
10 |
-
use_history:
|
11 |
-
llm_loader: '
|
12 |
openai_params:
|
13 |
model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
|
14 |
-
|
15 |
-
model: "
|
16 |
model_type: "llama"
|
|
|
17 |
splitter_options:
|
18 |
use_splitter: True # bool
|
19 |
split_by_token : True # bool
|
20 |
remove_leftover_delimiters: True # bool
|
21 |
remove_chunks: False # bool
|
22 |
-
chunk_size :
|
23 |
-
chunk_overlap :
|
24 |
chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
|
25 |
front_chunks_to_remove : null # int or None
|
26 |
last_chunks_to_remove : null # int or None
|
|
|
1 |
embedding_options:
|
2 |
embedd_files: True # bool
|
3 |
persist_directory: null # str or None
|
4 |
+
data_path: 'storage/data' # str
|
5 |
+
url_file_path: 'storage/data/urls.txt' # str
|
6 |
+
expand_urls: True # bool
|
7 |
db_option : 'FAISS' # str
|
8 |
db_path : 'vectorstores' # str
|
9 |
model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
|
10 |
+
search_top_k : 3 # int
|
11 |
llm_params:
|
12 |
+
use_history: False # bool
|
13 |
+
llm_loader: 'local_llm' # str [local_llm, openai]
|
14 |
openai_params:
|
15 |
model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
|
16 |
+
local_llm_params:
|
17 |
+
model: "storage/models/llama-2-7b-chat.Q4_0.gguf"
|
18 |
model_type: "llama"
|
19 |
+
temperature: 0.2
|
20 |
splitter_options:
|
21 |
use_splitter: True # bool
|
22 |
split_by_token : True # bool
|
23 |
remove_leftover_delimiters: True # bool
|
24 |
remove_chunks: False # bool
|
25 |
+
chunk_size : 300 # int
|
26 |
+
chunk_overlap : 30 # int
|
27 |
chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
|
28 |
front_chunks_to_remove : null # int or None
|
29 |
last_chunks_to_remove : null # int or None
|
code/main.py
CHANGED
@@ -12,6 +12,8 @@ import logging
|
|
12 |
from dotenv import load_dotenv
|
13 |
|
14 |
from modules.llm_tutor import LLMTutor
|
|
|
|
|
15 |
|
16 |
|
17 |
logger = logging.getLogger(__name__)
|
@@ -31,22 +33,70 @@ file_handler.setLevel(logging.INFO)
|
|
31 |
file_handler.setFormatter(formatter)
|
32 |
logger.addHandler(file_handler)
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
|
43 |
# chainlit code
|
44 |
@cl.on_chat_start
|
45 |
async def start():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
chain = llm_tutor.qa_bot()
|
47 |
-
|
|
|
48 |
await msg.send()
|
49 |
-
msg.content = "Hey, What Can I Help You With
|
50 |
await msg.update()
|
51 |
|
52 |
cl.user_session.set("chain", chain)
|
@@ -54,56 +104,21 @@ async def start():
|
|
54 |
|
55 |
@cl.on_message
|
56 |
async def main(message):
|
|
|
57 |
chain = cl.user_session.get("chain")
|
58 |
-
cb = cl.AsyncLangchainCallbackHandler(
|
59 |
-
|
60 |
-
)
|
61 |
-
cb.answer_reached = True
|
62 |
# res=await chain.acall(message, callbacks=[cb])
|
63 |
-
res = await chain.acall(message.content
|
64 |
-
|
65 |
try:
|
66 |
answer = res["answer"]
|
67 |
except:
|
68 |
answer = res["result"]
|
69 |
print(f"answer: {answer}")
|
70 |
-
source_elements_dict = {}
|
71 |
-
source_elements = []
|
72 |
-
found_sources = []
|
73 |
-
|
74 |
-
for idx, source in enumerate(res["source_documents"]):
|
75 |
-
title = source.metadata["source"]
|
76 |
|
77 |
-
|
78 |
-
source_elements_dict[title] = {
|
79 |
-
"page_number": [source.metadata["page"]],
|
80 |
-
"url": source.metadata["source"],
|
81 |
-
"content": source.page_content,
|
82 |
-
}
|
83 |
|
84 |
-
|
85 |
-
source_elements_dict[title]["page_number"].append(source.metadata["page"])
|
86 |
-
source_elements_dict[title][
|
87 |
-
"content_" + str(source.metadata["page"])
|
88 |
-
] = source.page_content
|
89 |
-
# sort the page numbers
|
90 |
-
# source_elements_dict[title]["page_number"].sort()
|
91 |
-
|
92 |
-
for title, source in source_elements_dict.items():
|
93 |
-
# create a string for the page numbers
|
94 |
-
page_numbers = ", ".join([str(x) for x in source["page_number"]])
|
95 |
-
text_for_source = f"Page Number(s): {page_numbers}\nURL: {source['url']}"
|
96 |
-
source_elements.append(cl.Pdf(name="File", path=title))
|
97 |
-
found_sources.append("File")
|
98 |
-
# for pn in source["page_number"]:
|
99 |
-
# source_elements.append(
|
100 |
-
# cl.Text(name=str(pn), content=source["content_"+str(pn)])
|
101 |
-
# )
|
102 |
-
# found_sources.append(str(pn))
|
103 |
-
|
104 |
-
if found_sources:
|
105 |
-
answer += f"\nSource:{', '.join(found_sources)}"
|
106 |
-
else:
|
107 |
-
answer += f"\nNo source found."
|
108 |
-
|
109 |
-
await cl.Message(content=answer, elements=source_elements).send()
|
|
|
12 |
from dotenv import load_dotenv
|
13 |
|
14 |
from modules.llm_tutor import LLMTutor
|
15 |
+
from modules.constants import *
|
16 |
+
from modules.helpers import get_sources
|
17 |
|
18 |
|
19 |
logger = logging.getLogger(__name__)
|
|
|
33 |
file_handler.setFormatter(formatter)
|
34 |
logger.addHandler(file_handler)
|
35 |
|
36 |
+
|
37 |
+
# Adding option to select the chat profile
|
38 |
+
@cl.set_chat_profiles
async def chat_profile():
    """Return the chat profiles the user can choose from.

    Each profile is a ``cl.ChatProfile`` with a name and a markdown
    description. ``start()`` reads the selected profile name back from the
    user session to decide which LLM to load.
    """
    # (profile name, markdown description) pairs, in display order.
    # NOTE: start() also recognises a "mistral" profile, but it is
    # intentionally not offered here.
    profile_specs = (
        ("Llama", "Use the local LLM: **Tiny Llama**."),
        ("gpt-3.5-turbo-1106", "Use OpenAI API for **gpt-3.5-turbo-1106**."),
        ("gpt-4", "Use OpenAI API for **gpt-4**."),
    )
    return [
        cl.ChatProfile(name=profile_name, markdown_description=description)
        for profile_name, description in profile_specs
    ]
|
58 |
+
|
59 |
+
|
60 |
+
@cl.author_rename
def rename(orig_author: str):
    """Map an internal author name to the name shown in the chat UI.

    Args:
        orig_author: Author name as produced by the framework.

    Returns:
        The display name for ``orig_author`` if one is configured,
        otherwise ``orig_author`` unchanged.
    """
    display_names = {"Chatbot": "AI Tutor"}
    if orig_author in display_names:
        return display_names[orig_author]
    return orig_author
|
64 |
|
65 |
|
66 |
# chainlit code
|
67 |
@cl.on_chat_start
async def start():
    """Set up a new chat session.

    Loads ``code/config.yml``, overrides the LLM settings according to the
    chat profile stored in the user session (if any), builds the QA chain
    through ``LLMTutor`` and stores it in the user session under ``"chain"``.
    A greeting naming the selected model is sent to the user.
    """
    with open("code/config.yml", "r") as f:
        config = yaml.safe_load(f)
    print(config)
    logger.info("Config file loaded")
    logger.info(f"Config: {config}")
    logger.info("Creating llm_tutor instance")

    llm_params = config["llm_params"]

    # The profile chosen in the UI (see chat_profile()) overrides the loader
    # and model configured in the YAML file. Unknown profiles leave the
    # configuration untouched.
    chat_profile = cl.user_session.get("chat_profile")
    if chat_profile is not None:
        profile = chat_profile.lower()
        if profile in ("gpt-3.5-turbo-1106", "gpt-4"):
            llm_params["llm_loader"] = "openai"
            llm_params["openai_params"]["model"] = profile
        elif profile == "llama":
            llm_params["llm_loader"] = "local_llm"
            llm_params["local_llm_params"]["model"] = LLAMA_PATH
            llm_params["local_llm_params"]["model_type"] = "llama"
        elif profile == "mistral":
            llm_params["llm_loader"] = "local_llm"
            llm_params["local_llm_params"]["model"] = MISTRAL_PATH
            llm_params["local_llm_params"]["model_type"] = "mistral"

    llm_tutor = LLMTutor(config, logger=logger)

    chain = llm_tutor.qa_bot()

    # Report the model that is actually in use: the OpenAI model name when the
    # OpenAI loader is selected, otherwise the local model path.
    if llm_params["llm_loader"] == "openai":
        model = llm_params["openai_params"]["model"]
    else:
        model = llm_params["local_llm_params"]["model"]

    msg = cl.Message(content=f"Starting the bot {model}...")
    await msg.send()
    msg.content = f"Hey, What Can I Help You With?\n\nYou can ask me questions about the course logistics, course content, about the final project, or anything else! You can find me at {model}"
    await msg.update()

    cl.user_session.set("chain", chain)
|
|
|
104 |
|
105 |
@cl.on_message
async def main(message):
    """Answer one user message with the session's QA chain.

    Args:
        message: Incoming chat message; only ``message.content`` is used.

    The chain stored under ``"chain"`` in the user session is called with the
    message text, the answer is extended with its sources via
    ``get_sources`` and the result is sent back to the user.
    """
    chain = cl.user_session.get("chain")
    # NOTE: streaming through cl.AsyncLangchainCallbackHandler is currently
    # disabled; the chain is awaited and the full answer is sent at once.
    res = await chain.acall(message.content)
    print(f"response: {res}")

    # The answer is stored under "answer" or "result" depending on the chain
    # (presumably ConversationalRetrievalChain vs. RetrievalQA - confirm).
    try:
        answer = res["answer"]
    except KeyError:
        answer = res["result"]
    print(f"answer: {answer}")

    answer_with_sources, source_elements = get_sources(res, answer)

    await cl.Message(content=answer_with_sources, elements=source_elements).send()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
code/modules/chat_model_loader.py
CHANGED
@@ -1,24 +1,38 @@
|
|
1 |
from langchain_community.chat_models import ChatOpenAI
|
2 |
from langchain.llms import CTransformers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
class ChatModelLoader:
|
6 |
def __init__(self, config):
|
7 |
self.config = config
|
|
|
8 |
|
9 |
def load_chat_model(self):
|
10 |
if self.config["llm_params"]["llm_loader"] == "openai":
|
11 |
llm = ChatOpenAI(
|
12 |
model_name=self.config["llm_params"]["openai_params"]["model"]
|
13 |
)
|
14 |
-
elif self.config["llm_params"]["llm_loader"] == "
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
],
|
20 |
-
max_new_tokens=512,
|
21 |
-
temperature=0.5,
|
22 |
)
|
23 |
else:
|
24 |
raise ValueError("Invalid LLM Loader")
|
|
|
1 |
from langchain_community.chat_models import ChatOpenAI
|
2 |
from langchain.llms import CTransformers
|
3 |
+
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
|
4 |
+
from transformers import AutoTokenizer, TextStreamer
|
5 |
+
from langchain.llms import LlamaCpp
|
6 |
+
import torch
|
7 |
+
import transformers
|
8 |
+
import os
|
9 |
+
from langchain.callbacks.manager import CallbackManager
|
10 |
+
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
11 |
|
12 |
|
13 |
class ChatModelLoader:
|
14 |
def __init__(self, config):
|
15 |
self.config = config
|
16 |
+
self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
17 |
|
18 |
def load_chat_model(self):
|
19 |
if self.config["llm_params"]["llm_loader"] == "openai":
|
20 |
llm = ChatOpenAI(
|
21 |
model_name=self.config["llm_params"]["openai_params"]["model"]
|
22 |
)
|
23 |
+
elif self.config["llm_params"]["llm_loader"] == "local_llm":
|
24 |
+
n_batch = 512 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
|
25 |
+
model_path = self.config["llm_params"]["local_llm_params"]["model"]
|
26 |
+
llm = LlamaCpp(
|
27 |
+
model_path=model_path,
|
28 |
+
n_batch=n_batch,
|
29 |
+
n_ctx=2048,
|
30 |
+
f16_kv=True,
|
31 |
+
verbose=True,
|
32 |
+
n_threads=2,
|
33 |
+
temperature=self.config["llm_params"]["local_llm_params"][
|
34 |
+
"temperature"
|
35 |
],
|
|
|
|
|
36 |
)
|
37 |
else:
|
38 |
raise ValueError("Invalid LLM Loader")
|
code/modules/constants.py
CHANGED
@@ -10,15 +10,15 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
10 |
|
11 |
# Prompt Templates
|
12 |
|
13 |
-
prompt_template = """Use the following pieces of information to answer the user's question.
|
14 |
-
If you don't know the answer, just say that you don't know
|
15 |
|
16 |
-
Context: {context}
|
17 |
-
Question: {question}
|
18 |
|
19 |
-
Only return the helpful answer below and nothing else.
|
20 |
-
Helpful answer:
|
21 |
-
"""
|
22 |
|
23 |
prompt_template_with_history = """Use the following pieces of information to answer the user's question.
|
24 |
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
@@ -31,3 +31,27 @@ Question: {question}
|
|
31 |
Only return the helpful answer below and nothing else.
|
32 |
Helpful answer:
|
33 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
# Prompt Templates
|
12 |
|
13 |
+
# prompt_template = """Use the following pieces of information to answer the user's question.
|
14 |
+
# If you don't know the answer, just say that you don't know.
|
15 |
|
16 |
+
# Context: {context}
|
17 |
+
# Question: {question}
|
18 |
|
19 |
+
# Only return the helpful answer below and nothing else.
|
20 |
+
# Helpful answer:
|
21 |
+
# """
|
22 |
|
23 |
prompt_template_with_history = """Use the following pieces of information to answer the user's question.
|
24 |
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
|
|
31 |
Only return the helpful answer below and nothing else.
|
32 |
Helpful answer:
|
33 |
"""
|
34 |
+
|
35 |
+
# Prompt used when chat history is disabled. It is written in ChatML-style
# turns and includes one worked question/answer example before the real
# question. ``{context}`` and ``{question}`` are the template placeholders.
prompt_template = """
<|im_start|>system
Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question.

Context:
{context}
<|im_end|>
<|im_start|>user
Question: Who is the instructor for this course?
<|im_end|>
<|im_start|>assistant
The instructor for this course is Prof. Thomas Gardos.
<|im_end|>
<|im_start|>user
Question: {question}
<|im_end|>
<|im_start|>assistant
"""
|
53 |
+
|
54 |
+
# Model Paths
|
55 |
+
|
56 |
+
LLAMA_PATH = "storage/models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
|
57 |
+
MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"
|
code/modules/data_loader.py
CHANGED
@@ -225,21 +225,24 @@ class DataLoader:
|
|
225 |
|
226 |
# Handle link by link
|
227 |
for link_index, link in enumerate(weblinks):
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
|
|
|
|
|
|
243 |
|
244 |
logger.info(
|
245 |
f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
|
|
|
225 |
|
226 |
# Handle link by link
|
227 |
for link_index, link in enumerate(weblinks):
|
228 |
+
try:
|
229 |
+
logger.info(f"\tSplitting link {link_index+1} : {link}")
|
230 |
+
if "youtube" in link:
|
231 |
+
title, document_chunks = get_youtube_transcript(link)
|
232 |
+
else:
|
233 |
+
title, document_chunks = get_html(link)
|
234 |
+
|
235 |
+
# Additional wrangling - Remove leftover delimiters and any specified chunks
|
236 |
+
if self.remove_leftover_delimiters:
|
237 |
+
document_chunks = remove_delimiters(document_chunks)
|
238 |
+
if self.config["splitter_options"]["remove_chunks"]:
|
239 |
+
document_chunks = remove_chunks(document_chunks)
|
240 |
+
|
241 |
+
print(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
|
242 |
+
self.document_names.append(title)
|
243 |
+
self.document_chunks_full.extend(document_chunks)
|
244 |
+
except:
|
245 |
+
logger.info(f"\t\tError splitting link {link_index+1} : {link}")
|
246 |
|
247 |
logger.info(
|
248 |
f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
|
code/modules/embedding_model_loader.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
from langchain_community.embeddings import OpenAIEmbeddings
|
2 |
from langchain.embeddings import HuggingFaceEmbeddings
|
|
|
3 |
from modules.constants import *
|
|
|
4 |
|
5 |
|
6 |
class EmbeddingModelLoader:
|
@@ -20,4 +22,8 @@ class EmbeddingModelLoader:
|
|
20 |
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
21 |
model_kwargs={"device": "cpu"},
|
22 |
)
|
|
|
|
|
|
|
|
|
23 |
return embedding_model
|
|
|
1 |
from langchain_community.embeddings import OpenAIEmbeddings
|
2 |
from langchain.embeddings import HuggingFaceEmbeddings
|
3 |
+
from langchain.embeddings import LlamaCppEmbeddings
|
4 |
from modules.constants import *
|
5 |
+
import os
|
6 |
|
7 |
|
8 |
class EmbeddingModelLoader:
|
|
|
22 |
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
23 |
model_kwargs={"device": "cpu"},
|
24 |
)
|
25 |
+
# embedding_model = LlamaCppEmbeddings(
|
26 |
+
# model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
|
27 |
+
# )
|
28 |
+
|
29 |
return embedding_model
|
code/modules/helpers.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
from tqdm import tqdm
|
4 |
+
from urllib.parse import urlparse
|
5 |
+
import chainlit as cl
|
6 |
+
|
7 |
+
"""
|
8 |
+
Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113
|
9 |
+
"""
|
10 |
+
|
11 |
+
|
12 |
+
class WebpageCrawler:
    """Collect the URLs of all pages reachable from a start page.

    Links are followed when they either start with the URL of the page they
    were found on, or are site-relative (start with ``/``) and resolve to an
    existing page under the given base URL.
    """

    def __init__(self, timeout=10):
        """Create a crawler.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.timeout = timeout
        # Site-relative hrefs already seen during the current crawl.
        # Initialised here so get_links() also works when called on its own;
        # get_all_pages() resets it at the start of every crawl.
        self.dict_href_links = {}

    def getdata(self, url):
        """Fetch ``url`` and return the response body as text.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        r = requests.get(url, timeout=self.timeout)
        return r.text

    def url_exists(self, url):
        """Return True if a HEAD request to ``url`` answers with status 200.

        Any request failure (connection error, timeout, malformed URL) is
        treated as "does not exist".
        """
        try:
            response = requests.head(url, timeout=self.timeout)
        except requests.RequestException:
            return False
        return response.status_code == 200

    def get_links(self, website_link, base_url=None):
        """Return the followable links found on one page.

        Args:
            website_link: URL of the page to scan.
            base_url: Prefix used to resolve site-relative links; defaults to
                ``website_link``.

        Returns:
            A dict mapping each discovered URL to ``"Not-checked"``. Empty if
            the page could not be fetched.
        """
        if base_url is None:
            base_url = website_link
        try:
            html_data = self.getdata(website_link)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f"skipping {website_link}: {exc}")
            return {}
        soup = BeautifulSoup(html_data, "html.parser")
        list_links = []
        for link in soup.find_all("a", href=True):
            href = link["href"]
            # Append to list if new link contains original link
            if str(href).startswith(str(website_link)):
                list_links.append(href)

            # Include all href that do not start with website link but with "/"
            if str(href).startswith("/"):
                if href not in self.dict_href_links:
                    print(href)
                    # Remember the href so it is resolved only once per crawl.
                    self.dict_href_links[href] = None
                    link_with_www = base_url + href[1:]
                    if self.url_exists(link_with_www):
                        print("adjusted link =", link_with_www)
                        list_links.append(link_with_www)

        # Keys are the links; every value starts out as "Not-checked".
        return dict.fromkeys(list_links, "Not-checked")

    def get_subpage_links(self, l, base_url):
        """Scan every not-yet-checked page in ``l`` for further links.

        Args:
            l: Dict mapping URL -> ``"Not-checked"`` / ``"Checked"``.
            base_url: Prefix used to resolve site-relative links.

        Returns:
            A new dict containing the entries of ``l`` (scanned ones marked
            ``"Checked"``) plus any newly discovered URLs.
        """
        links = l
        # Iterate over a snapshot of the keys: ``links`` is rebound below.
        for link in tqdm(list(l)):
            if links[link] == "Not-checked":
                found = self.get_links(link, base_url)
                links[link] = "Checked"
            else:
                found = {}
            # Existing entries win so an already-checked page keeps its status.
            links = {**found, **links}
        return links

    def get_all_pages(self, url, base_url):
        """Crawl from ``url`` until no unchecked page is left.

        Args:
            url: Start page.
            base_url: Prefix used to resolve site-relative links.

        Returns:
            List of all URLs that were scanned.
        """
        dict_links = {url: "Not-checked"}
        self.dict_href_links = {}
        while True:
            dict_links = self.get_subpage_links(dict_links, base_url)
            # Stop once a full pass leaves no page marked "Not-checked".
            if not any(status == "Not-checked" for status in dict_links.values()):
                break
        return [link for link, status in dict_links.items() if status == "Checked"]
|
81 |
+
|
82 |
+
|
83 |
+
def get_urls_from_file(file_path: str):
    """Read URLs from a text file, one per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline or empty line does not produce an empty URL.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of non-empty URL strings in file order.
    """
    with open(file_path, "r") as f:
        stripped_lines = (line.strip() for line in f)
        return [url for url in stripped_lines if url]
|
91 |
+
|
92 |
+
|
93 |
+
def get_base_url(url):
    """Return the ``scheme://host/`` prefix of ``url``.

    Args:
        url: URL to reduce to its scheme and network location.

    Returns:
        The scheme and netloc of ``url`` followed by a trailing slash.
    """
    parts = urlparse(url)
    return parts.scheme + "://" + parts.netloc + "/"
|
97 |
+
|
98 |
+
|
99 |
+
def get_sources(res, answer):
    """Append the list of sources to an answer and build the source elements.

    Retrieved chunks are grouped by the ``"source"`` entry of their metadata,
    keeping first-seen order. Each source becomes one ``cl.Text`` element
    holding all chunks retrieved from it.

    Args:
        res: Chain output; its ``"source_documents"`` entry holds the
            retrieved documents. A missing or empty entry means no sources.
        answer: Answer text to extend.

    Returns:
        Tuple ``(answer, source_elements)``: the answer with a sources line
        appended, and the list of ``cl.Text`` elements.
    """
    # Group chunk texts by the URL (or path) they came from.
    texts_by_url = {}
    for source in res.get("source_documents") or []:
        texts_by_url.setdefault(source.metadata["source"], []).append(
            source.page_content
        )

    source_elements = []
    for url, texts in texts_by_url.items():
        full_text = "".join(
            f"Source {number}:\n {text}\n\n\n"
            for number, text in enumerate(texts, start=1)
        )
        source_elements.append(cl.Text(name=url, content=full_text))

    if texts_by_url:
        answer += f"\n\nSources: {', '.join(texts_by_url)} "
    else:
        answer += "\n\nNo source found."

    return answer, source_elements
|
code/modules/llm_tutor.py
CHANGED
@@ -18,7 +18,7 @@ class LLMTutor:
|
|
18 |
def __init__(self, config, logger=None):
|
19 |
self.config = config
|
20 |
self.vector_db = VectorDB(config, logger=logger)
|
21 |
-
if self.config[
|
22 |
self.vector_db.create_database()
|
23 |
self.vector_db.save_database()
|
24 |
|
@@ -47,7 +47,11 @@ class LLMTutor:
|
|
47 |
qa_chain = ConversationalRetrievalChain.from_llm(
|
48 |
llm=llm,
|
49 |
chain_type="stuff",
|
50 |
-
retriever=db.as_retriever(
|
|
|
|
|
|
|
|
|
51 |
return_source_documents=True,
|
52 |
memory=memory,
|
53 |
combine_docs_chain_kwargs={"prompt": prompt},
|
@@ -56,7 +60,11 @@ class LLMTutor:
|
|
56 |
qa_chain = RetrievalQA.from_chain_type(
|
57 |
llm=llm,
|
58 |
chain_type="stuff",
|
59 |
-
retriever=db.as_retriever(
|
|
|
|
|
|
|
|
|
60 |
return_source_documents=True,
|
61 |
chain_type_kwargs={"prompt": prompt},
|
62 |
)
|
|
|
18 |
def __init__(self, config, logger=None):
|
19 |
self.config = config
|
20 |
self.vector_db = VectorDB(config, logger=logger)
|
21 |
+
if self.config["embedding_options"]["embedd_files"]:
|
22 |
self.vector_db.create_database()
|
23 |
self.vector_db.save_database()
|
24 |
|
|
|
47 |
qa_chain = ConversationalRetrievalChain.from_llm(
|
48 |
llm=llm,
|
49 |
chain_type="stuff",
|
50 |
+
retriever=db.as_retriever(
|
51 |
+
search_kwargs={
|
52 |
+
"k": self.config["embedding_options"]["search_top_k"]
|
53 |
+
}
|
54 |
+
),
|
55 |
return_source_documents=True,
|
56 |
memory=memory,
|
57 |
combine_docs_chain_kwargs={"prompt": prompt},
|
|
|
60 |
qa_chain = RetrievalQA.from_chain_type(
|
61 |
llm=llm,
|
62 |
chain_type="stuff",
|
63 |
+
retriever=db.as_retriever(
|
64 |
+
search_kwargs={
|
65 |
+
"k": self.config["embedding_options"]["search_top_k"]
|
66 |
+
}
|
67 |
+
),
|
68 |
return_source_documents=True,
|
69 |
chain_type_kwargs={"prompt": prompt},
|
70 |
)
|
code/modules/vector_db.py
CHANGED
@@ -6,6 +6,7 @@ from modules.embedding_model_loader import EmbeddingModelLoader
|
|
6 |
from langchain.vectorstores import FAISS
|
7 |
from modules.data_loader import DataLoader
|
8 |
from modules.constants import *
|
|
|
9 |
|
10 |
|
11 |
class VectorDB:
|
@@ -13,6 +14,7 @@ class VectorDB:
|
|
13 |
self.config = config
|
14 |
self.db_option = config["embedding_options"]["db_option"]
|
15 |
self.document_names = None
|
|
|
16 |
|
17 |
# Set up logging to both console and a file
|
18 |
if logger is None:
|
@@ -43,7 +45,14 @@ class VectorDB:
|
|
43 |
os.path.join(self.config["embedding_options"]["data_path"], file)
|
44 |
for file in files
|
45 |
]
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
def create_embedding_model(self):
|
49 |
self.logger.info("Creating embedding function")
|
@@ -63,8 +72,8 @@ class VectorDB:
|
|
63 |
def create_database(self):
|
64 |
data_loader = DataLoader(self.config)
|
65 |
self.logger.info("Loading data")
|
66 |
-
files = self.load_files()
|
67 |
-
document_chunks, document_names = data_loader.get_chunks(files,
|
68 |
self.logger.info("Completed loading data")
|
69 |
|
70 |
self.create_embedding_model()
|
|
|
6 |
from langchain.vectorstores import FAISS
|
7 |
from modules.data_loader import DataLoader
|
8 |
from modules.constants import *
|
9 |
+
from modules.helpers import *
|
10 |
|
11 |
|
12 |
class VectorDB:
|
|
|
14 |
self.config = config
|
15 |
self.db_option = config["embedding_options"]["db_option"]
|
16 |
self.document_names = None
|
17 |
+
self.webpage_crawler = WebpageCrawler()
|
18 |
|
19 |
# Set up logging to both console and a file
|
20 |
if logger is None:
|
|
|
45 |
os.path.join(self.config["embedding_options"]["data_path"], file)
|
46 |
for file in files
|
47 |
]
|
48 |
+
urls = get_urls_from_file(self.config["embedding_options"]["url_file_path"])
|
49 |
+
if self.config["embedding_options"]["expand_urls"]:
|
50 |
+
all_urls = []
|
51 |
+
for url in urls:
|
52 |
+
base_url = get_base_url(url)
|
53 |
+
all_urls.extend(self.webpage_crawler.get_all_pages(url, base_url))
|
54 |
+
urls = all_urls
|
55 |
+
return files, urls
|
56 |
|
57 |
def create_embedding_model(self):
|
58 |
self.logger.info("Creating embedding function")
|
|
|
72 |
def create_database(self):
|
73 |
data_loader = DataLoader(self.config)
|
74 |
self.logger.info("Loading data")
|
75 |
+
files, urls = self.load_files()
|
76 |
+
document_chunks, document_names = data_loader.get_chunks(files, urls)
|
77 |
self.logger.info("Completed loading data")
|
78 |
|
79 |
self.create_embedding_model()
|
data/webpage.pdf
DELETED
Binary file (51.3 kB)
|
|
requirements.txt
CHANGED
@@ -11,4 +11,8 @@ ctransformers==0.2.27
|
|
11 |
python-dotenv==1.0.0
|
12 |
openai==1.6.1
|
13 |
pymupdf==1.23.8
|
14 |
-
chainlit==0.7.700
|
|
|
|
|
|
|
|
|
|
11 |
python-dotenv==1.0.0
|
12 |
openai==1.6.1
|
13 |
pymupdf==1.23.8
|
14 |
+
chainlit==0.7.700
|
15 |
+
beautifulsoup4==4.12.2
|
16 |
+
fake-useragent==1.4.0
|
17 |
+
git+https://github.com/huggingface/accelerate.git
|
18 |
+
llama-cpp-python
|
storage/data/urls.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
https://dl4ds.github.io/sp2024/
|