Adds all necessary files
- .env +2 -0
- .gitattributes +1 -0
- .gitignore +12 -0
- Dockerfile +52 -0
- README_PROJECT.md +73 -0
- app/.chainlit/config.toml +97 -0
- app/.chainlit/translations/en-US.json +155 -0
- app/app.py +99 -0
- app/chainlit.md +7 -0
- app/helper.py +217 -0
- app/prompts.py +39 -0
- app/public/logo_dark.png +0 -0
- app/public/logo_light.png +0 -0
- chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/data_level0.bin +3 -0
- chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/header.bin +3 -0
- chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/length.bin +3 -0
- chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/link_lists.bin +3 -0
- chroma/chroma.sqlite3 +3 -0
- index_preparation/build_index.ipynb +250 -0
- index_preparation/create_QA_set_documents.ipynb +84 -0
- index_preparation/create_pdf_documents.ipynb +139 -0
- index_preparation/create_template_documents.ipynb +140 -0
- index_preparation/create_web_documents.ipynb +176 -0
- index_preparation/preprocess_data.ipynb +229 -0
- init_embedding_model.py +4 -0
- input_data/PDF/documents/all_documents +0 -0
- input_data/PDF/documents/new_documents +0 -0
- input_data/QA_dataset/all_documents +0 -0
- input_data/QA_dataset/golden_qa_set.json +0 -0
- input_data/Templates/documents/all_documents +5 -0
- input_data/Templates/documents/new_documents +0 -0
- input_data/Templates/template_files/processed/Backup policy.docx +0 -0
- input_data/Templates/template_files/processed/Change management policy.docx +0 -0
- input_data/Templates/template_files/processed/Encryption policy.docx +0 -0
- input_data/Templates/template_files/processed/IC-ISO-27001-Controls-Checklist.xlsx +0 -0
- input_data/Templates/template_files/processed/IC-ISO-27001-Risk-Assessment.xlsx +0 -0
- input_data/Web/URLs/cleaned_urls.txt +62 -0
- input_data/Web/URLs/uncleaned_urls.txt +0 -0
- input_data/Web/documents/all_documents +0 -0
- input_data/Web/documents/new_documents +0 -0
- requirements.txt +0 -0
- requirements_Docker.txt +0 -0
- setup.sh +16 -0
- sparse_index/sparse_1536_264 +0 -0
.env
ADDED
@@ -0,0 +1,2 @@
TESSERACT_PATH=C:\Program Files\Tesseract-OCR\tesseract.exe
CHROMA_PATH=./../chroma
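For context, both variables are read straight from the environment by the code in this commit. A minimal sketch of how they are consumed (mirroring index_preparation/create_pdf_documents.ipynb and app/helper.py; it assumes the variables are already exported, e.g. by setup.sh or a dotenv loader, and that pytesseract is installed):

# Sketch of how the .env values are consumed; not part of the committed files.
import os

import pytesseract

# OCR engine path used when new PDF files are processed
pytesseract.pytesseract.tesseract_cmd = os.environ.get("TESSERACT_PATH")

# Location of the persistent Chroma database opened by the app and the index notebooks
chroma_path = os.environ.get("CHROMA_PATH")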
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,12 @@
.DS_Store
.vscode
venv/
evaluationResults/*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Jupyter Notebook
.ipynb_checkpoints
Dockerfile
ADDED
@@ -0,0 +1,52 @@
# The Python base image, used to build the virtual environment
FROM python:3.11-slim-buster

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Create a user to run the app
RUN useradd -m -u 1000 user

# Switch to user and set environment variables
USER user
ENV HOME=/home/user \
    PATH="/home/user/.local/bin:$PATH" \
    VIRTUAL_ENV=/home/user/venv \
    LISTEN_PORT=7860 \
    HOST=0.0.0.0

# Set the working directory in the container
WORKDIR $HOME

# Create a virtual environment to isolate our package dependencies locally
RUN python -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Copy necessary files to container directory
COPY --chown=user ./app ./app/app
COPY --chown=user ./chroma ./chroma
COPY --chown=user ./embedding_model ./embedding_model
COPY --chown=user ./sparse_index ./sparse_index
COPY --chown=user ./.env ./app/.env
COPY --chown=user ./app/chainlit.md ./app/chainlit.md
COPY --chown=user ./app/.chainlit ./app/.chainlit
COPY --chown=user ./app/public ./app/public
COPY --chown=user ./input_data/Templates/template_files ./input_data/Templates/template_files
COPY --chown=user ./requirements_Docker.txt ./app/requirements_Docker.txt
COPY --chown=user ./init_embedding_model.py ./init_embedding_model.py

WORKDIR $HOME/app

# Install Python dependencies
RUN pip install --upgrade pip && \
    pip install -r ./requirements_Docker.txt

# Run the script to initialize and cache the fine-tuned embedding model
RUN python ../init_embedding_model.py

# Expose the port the app runs on
EXPOSE $LISTEN_PORT

# Run the chainlit app
CMD ["chainlit", "run", "app/app.py", "--host", "0.0.0.0", "--port", "7860"]
README_PROJECT.md
ADDED
@@ -0,0 +1,73 @@
# General Information

## 1. Project Initialization
- After pulling the project, do the following to initialize it:
  - Make sure that a Python version >= 3.11 is installed
  - Run the following command to execute the initialization script: "source setup.sh"
- If you want to insert new PDF documents and update the document base, you first need to install Tesseract, which is the OCR engine used in this code:
  - Download the Tesseract installer for Windows: https://github.com/UB-Mannheim/tesseract/wiki
  - For other platforms, see here: https://tesseract-ocr.github.io/tessdoc/Installation.html
- Create a .env file at the root directory level with the following keys:
  - TESSERACT_PATH={path}
    - Set the path to the installation path of Tesseract, e.g. "C:\Program Files\Tesseract-OCR\tesseract.exe"
  - CHROMA_PATH=./../chroma

## 2. Using the Chatbot locally
- In the app/helper.py file, comment out lines 8 to 10 if you are not on a Linux machine
- To start the chatbot locally, run "cd app" and "chainlit run app.py -w"
- To use the chatbot, you need two API keys, which you can create under the following links:
  - [OpenAI](https://openai.com/blog/openai-api)
  - [Cohere](https://dashboard.cohere.com/api-keys)

## 3. Using Docker
- Go to helper.py and uncomment the three lines for the import. This is necessary to use Chroma within the container.
- Build the Docker image: "docker build -t iso_27001_chatbot ."
- Run the Docker container: "docker run -p 7860:7860 iso_27001_chatbot"
- Access at: http://localhost:7860
- Note that the Dockerfile uses requirements_Docker.txt, which does not include CUDA support, as the free tier of HF Spaces does not come with GPU availability. If you want to include CUDA support, you need to integrate the command seen above for installing torch into the Dockerfile.

# Project Structure

## app
Contains the chatbot web application, created with Chainlit. It also includes the prompt definitions and helper functions.

## chroma
The chroma folder contains all the indices that were created with the notebooks inside the index_preparation folder.

## embedding_model
This folder contains the embedding model fine-tuned on an ISO 27001 text corpus. It is based on [bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) and can be accessed and downloaded on [HuggingFace](https://huggingface.co/Basti8499/bge-large-en-v1.5-ISO-27001).

## index_preparation
Stores all Jupyter notebooks needed to create the vector database which stores the ISO 27001 documents. Before creating the index with build_index.ipynb, the documents for PDFs, web pages and templates need to be created with the other notebooks.

## input_data

### PDF Files (/PDF)
- Directory structure:
  - PDF/files: After manually cleaning the PDFs (removing pages), the PDFs should be moved manually to this folder.
  - PDF/documents
    - /all_documents: JSON file for all processed PDF documents
    - /new_documents: JSON file for newly processed PDF documents
  - PDF/PDF_images: Empty folder in which the images created during OCR are stored and deleted afterwards.

### Web Files (/Web)
- Directory structure:
  - Web/documents:
    - /all_documents: JSON file for all processed web documents
    - /new_documents: JSON file for newly processed web documents
  - Web/URLs:
    - /cleaned_urls.txt: .txt file for URLs that were already processed and for which documents exist
    - /uncleaned_urls.txt: .txt file for URLs that have not been processed yet

### Template Files (/Templates)
- Directory structure:
  - Templates/documents:
    - /all_documents: JSON file for all processed template documents
    - /new_documents: JSON file for all newly processed template documents
  - Templates/template_files:
    - /new: Not yet processed template files
    - /processed: Already processed template files
- For templates it is important that the actual template files are stored under Templates/template_files/new for processing, as the paths are used in the chatbot.

## sparse_index
Stores the chunked documents that were created in build_index.ipynb in a .txt file for later sparse retrieval.
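To check that the initialization described above worked, here is a small illustrative sketch (not part of the repository) that lists the Chroma collections under CHROMA_PATH; it assumes chromadb and python-dotenv are installed and that the collection name matches the one used in app/app.py:

# Illustrative setup check: list the collections in the persistent Chroma database.
import os

import chromadb
from dotenv import load_dotenv  # assumption: python-dotenv is used to load the .env file

load_dotenv()
# Note: CHROMA_PATH=./../chroma is relative to the app/ folder; adjust it if you run this from the repository root.
client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))
print([collection.name for collection in client.list_collections()])  # expected to include "ISO_27001_Collection"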
app/.chainlit/config.toml
ADDED
@@ -0,0 +1,97 @@
[project]
# Whether to enable telemetry (default: true). No personal data is collected.
enable_telemetry = true


# List of environment variables to be provided by each user to use the app.
user_env = ["OPENAI_API_KEY","COHERE_API_KEY"]

# Duration (in seconds) during which the session is saved when the connection is lost
session_timeout = 3600

# Enable third parties caching (e.g LangChain cache)
cache = false

# Authorized origins
allow_origins = ["*"]

# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
# follow_symlink = false

[features]
# Show the prompt playground
prompt_playground = false

# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
unsafe_allow_html = false

# Process and display mathematical expressions. This can clash with "$" characters in messages.
latex = false

# Authorize users to upload files with messages
multi_modal = false

# Allows user to use speech to text
[features.speech_to_text]
enabled = false
# See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
# language = "en-US"

[UI]
# Name of the app and chatbot.
name = "ISO 27001 Chatbot"

# Show the readme while the thread is empty.
show_readme_as_default = true

# Description of the app and chatbot. This is used for HTML tags.
# description = ""

# Large size content are by default collapsed for a cleaner ui
default_collapse_content = true

# The default value for the expand messages settings.
default_expand_messages = false

# Hide the chain of thought details from the user in the UI.
hide_cot = true

# Link to your github repo. This will add a github button in the UI's header.
# github = ".."

# Specify a CSS file that can be used to customize the user interface.
# The CSS file can be served from the public directory or via an external link.
# custom_css = "/public/test.css"

# Specify a Javascript file that can be used to customize the user interface.
# The Javascript file can be served from the public directory.
# custom_js = "/public/test.js"

# Specify a custom font url.
# custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"

# Override default MUI light theme. (Check theme.ts)
[UI.theme]
#font_family = "Inter, sans-serif"
[UI.theme.light]
#background = "#FAFAFA"
#paper = "#FFFFFF"

[UI.theme.light.primary]
#main = "#F80061"
#dark = "#980039"
#light = "#FFE7EB"

# Override default MUI dark theme. (Check theme.ts)
[UI.theme.dark]
#background = "#FAFAFA"
#paper = "#FFFFFF"

[UI.theme.dark.primary]
#main = "#F80061"
#dark = "#980039"
#light = "#FFE7EB"


[meta]
generated_by = "1.0.401"
app/.chainlit/translations/en-US.json
ADDED
@@ -0,0 +1,155 @@
{
  "components": {
    "atoms": {
      "buttons": {
        "userButton": {
          "menu": {
            "settings": "Settings",
            "settingsKey": "S",
            "APIKeys": "API Keys",
            "logout": "Logout"
          }
        }
      }
    },
    "molecules": {
      "newChatButton": {
        "newChat": "New Chat"
      },
      "tasklist": {
        "TaskList": {
          "title": "\ud83d\uddd2\ufe0f Task List",
          "loading": "Loading...",
          "error": "An error occurred"
        }
      },
      "attachments": {
        "cancelUpload": "Cancel upload",
        "removeAttachment": "Remove attachment"
      },
      "newChatDialog": {
        "createNewChat": "Create new chat?",
        "clearChat": "This will clear the current messages and start a new chat.",
        "cancel": "Cancel",
        "confirm": "Confirm"
      },
      "settingsModal": {
        "expandMessages": "Expand Messages",
        "hideChainOfThought": "Hide Chain of Thought",
        "darkMode": "Dark Mode"
      }
    },
    "organisms": {
      "chat": {
        "history": {
          "index": {
            "lastInputs": "Last Inputs",
            "noInputs": "Such empty...",
            "loading": "Loading..."
          }
        },
        "inputBox": {
          "input": {
            "placeholder": "Type your message here..."
          },
          "speechButton": {
            "start": "Start recording",
            "stop": "Stop recording"
          },
          "SubmitButton": {
            "sendMessage": "Send message",
            "stopTask": "Stop Task"
          },
          "UploadButton": {
            "attachFiles": "Attach files"
          },
          "waterMark": {
            "text": "Built with"
          }
        },
        "Messages": {
          "index": {
            "running": "Running",
            "executedSuccessfully": "executed successfully",
            "failed": "failed",
            "feedbackUpdated": "Feedback updated",
            "updating": "Updating"
          }
        },
        "dropScreen": {
          "dropYourFilesHere": "Drop your files here"
        },
        "index": {
          "failedToUpload": "Failed to upload",
          "cancelledUploadOf": "Cancelled upload of",
          "couldNotReachServer": "Could not reach the server",
          "continuingChat": "Continuing previous chat"
        },
        "settings": {
          "settingsPanel": "Settings panel",
          "reset": "Reset",
          "cancel": "Cancel",
          "confirm": "Confirm"
        }
      },
      "threadHistory": {
        "sidebar": {
          "filters": {
            "FeedbackSelect": {
              "feedbackAll": "Feedback: All",
              "feedbackPositive": "Feedback: Positive",
              "feedbackNegative": "Feedback: Negative"
            },
            "SearchBar": {
              "search": "Search"
            }
          },
          "DeleteThreadButton": {
            "confirmMessage": "This will delete the thread as well as its messages and elements.",
            "cancel": "Cancel",
            "confirm": "Confirm",
            "deletingChat": "Deleting chat",
            "chatDeleted": "Chat deleted"
          },
          "index": {
            "pastChats": "Past Chats"
          },
          "ThreadList": {
            "empty": "Empty..."
          },
          "TriggerButton": {
            "closeSidebar": "Close sidebar",
            "openSidebar": "Open sidebar"
          }
        },
        "Thread": {
          "backToChat": "Go back to chat",
          "chatCreatedOn": "This chat was created on"
        }
      },
      "header": {
        "chat": "Chat",
        "readme": "Readme"
      }
    }
  },
  "hooks": {
    "useLLMProviders": {
      "failedToFetchProviders": "Failed to fetch providers:"
    }
  },
  "pages": {
    "Design": {},
    "Env": {
      "savedSuccessfully": "Saved successfully",
      "requiredApiKeys": "Required API Keys",
      "requiredApiKeysInfo": "To use this app, the following API keys are required. The keys are stored on your device's local storage."
    },
    "Page": {
      "notPartOfProject": "You are not part of this project."
    },
    "ResumeButton": {
      "resumeChat": "Resume Chat"
    }
  }
}
app/app.py
ADDED
@@ -0,0 +1,99 @@
import chainlit as cl
from helper import HelperMethods
from pydantic.v1.error_wrappers import ValidationError
from cohere.error import CohereAPIError

COLLECTION_NAME = "ISO_27001_Collection"


@cl.on_chat_start
async def on_chat_start():
    """
    Is called when a new chat session is created. Adds an initial message and sets important objects into the session state.
    """

    await cl.sleep(1)

    msg = cl.Message(author="ISO 27001 - Assistant", content="Hello, do you have questions on ISO 27001? Feel free to ask me.")
    await msg.send()

    helper = HelperMethods()

    try:
        llm, MAX_CONTEXT_SIZE = await helper.get_LLM()
    except ValidationError as e:
        error_message = cl.ErrorMessage(
            author="ISO 27001 - Assistant",
            content="A validation error occurred. Please ensure the OpenAI API_KEY is correctly set. You can navigate to the profile icon and then reset the keys. After that, reload the page and try to ask the question again.",
        )
        await error_message.send()
        return

    state = {"llm": llm, "max_context_size": MAX_CONTEXT_SIZE, "vectordb": helper.get_index_vector_db(COLLECTION_NAME)}
    cl.user_session.set("state_ISO", state)


@cl.on_message
async def on_message(message: cl.Message):
    """
    Is called when a new message is sent by the user. Executes the RAG pipeline (check English, retrieve contexts, check relevancy, check context size, prompt the LLM).
    """

    state = cl.user_session.get("state_ISO")
    helper = HelperMethods()
    query = message.content

    if helper.check_if_english(query):

        try:
            docs = helper.retrieve_contexts(state["vectordb"], query)
        except CohereAPIError as e:
            error_message = cl.ErrorMessage(
                author="ISO 27001 - Assistant",
                content="A Cohere API error occurred. Please ensure the Cohere API_KEY is correctly set. You can navigate to the profile icon and then reset the keys. After that, reload the page and try to ask the question again.",
            )
            await error_message.send()
            return

        if helper.check_if_relevant(docs):
            if helper.is_context_size_valid(docs, query, state["max_context_size"]):

                msg = cl.Message(author="ISO 27001 - Assistant", content="")
                await msg.send()

                full_prompt, sources, template_path, template_source = helper.get_full_prompt_sources_and_template(docs, state["llm"], query)

                try:
                    stream = state["llm"].astream(full_prompt)
                except ValidationError as e:
                    error_message = cl.ErrorMessage(
                        author="ISO 27001 - Assistant",
                        content="A validation error occurred. Please ensure the OpenAI API_KEY is correctly set. You can navigate to the profile icon and then reset the keys. After that, reload the page and try to ask the question again.",
                    )
                    await error_message.send()
                    return

                async for part in stream:
                    await msg.stream_token(part.content)

                if template_path == "":
                    sources_str = "\n\nSources: \n" + sources
                    msg.content += sources_str
                    await msg.update()
                else:
                    sources_str = "\n\nSources: \n" + sources
                    elements = [cl.File(name=template_source, path=template_path, display="inline")]
                    msg.content += sources_str
                    msg.elements = elements
                    await msg.update()
            else:
                await cl.Message(
                    author="ISO 27001 - Assistant",
                    content="I am sorry. I cannot process your question, as it would exceed my token limit. Please try to reformulate your question, or ask something else.",
                ).send()
        else:
            await cl.Message(author="ISO 27001 - Assistant", content="I am sorry. I cannot process your question, as it is not related to ISO 27001.").send()
    else:
        await cl.Message(
            author="ISO 27001 - Assistant", content="I am sorry. I cannot process your question, as I can only answer questions written in English."
        ).send()
app/chainlit.md
ADDED
@@ -0,0 +1,7 @@
# Welcome to the ISO 27001 Chatbot! 🤖

Hello, this Chainlit application lets you chat with a Retrieval Augmented Generation pipeline focused on ISO 27001. It will only answer English questions related to this topic. Please be aware that this is not a production-ready system, so do not fully trust the answers, as they can still include errors.

The RAG pipeline needs to load the underlying models, so you can start as soon as the Assistant greets you. However, to use the chatbot, you need two API keys, which you can create under the following links: [OpenAI](https://openai.com/blog/openai-api) (for the generation) and [Cohere](https://dashboard.cohere.com/api-keys) (for the retrieval re-ranking). After creating both keys, you need to assign them. This can be done by clicking on the user profile and setting the keys. After that, please reload the page.

Please be advised that because this HF Space runs on the free tier, no GPU is available. That is why answering a question takes up to 40 seconds. With GPU support this would be reduced to below 10 seconds.
app/helper.py
ADDED
@@ -0,0 +1,217 @@
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain.retrievers.document_compressors import CohereRerank
from langchain_community.retrievers import BM25Retriever
import tiktoken

# ONLY USE WITH DOCKER, then uncomment
import pysqlite3
import sys
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

import chromadb
import chainlit as cl
from langdetect import detect
from langchain_community.vectorstores import Chroma
from typing import List
from typing import Tuple
import os
import json
from langchain.docstore.document import Document
from prompts import get_system_prompt, get_human_prompt, get_system_prompt_template, get_full_prompt

class HelperMethods:
    """
    Helper class with all important methods for the RAG pipeline.
    """

    def __init__(self):
        pass

    def _get_embedding_model(self):
        """
        Gets the fine-tuned embedding model based on bge-large-en-v1.5.
        """
        path = "Basti8499/bge-large-en-v1.5-ISO-27001"
        model = HuggingFaceEmbeddings(model_name=path)
        return model

    async def get_LLM(self):
        """
        Initializes the gpt-3.5-turbo (16k context) LLM.
        """
        llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0, max_tokens=680, streaming=True, api_key=cl.user_session.get("env")["OPENAI_API_KEY"])
        max_context_size = 16385
        return llm, max_context_size

    def get_index_vector_db(self, collection_name: str):
        """
        Gets the index vector DB based on the collection name, if existent.
        """
        new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))

        # Check if collection already exists
        collection_exists = True
        try:
            new_client.get_collection(collection_name)
        except ValueError as e:
            collection_exists = False

        if not collection_exists:
            raise Exception("Error, raised exception: Collection does not exist.")
        else:
            embedding_model = self._get_embedding_model()
            vectordb = Chroma(client=new_client, collection_name=collection_name, embedding_function=embedding_model)

        return vectordb

    def _load_documents(self, file_path: str) -> List[Document]:
        documents = []
        with open(file_path, "r") as jsonl_file:
            for line in jsonl_file:
                data = json.loads(line)
                obj = Document(**data)
                documents.append(obj)
        return documents

    def check_if_english(self, query: str) -> bool:
        """
        Uses the langdetect library based on Google's language-detection library to check which language the query is in.
        Returns True if it is English.
        """
        language = detect(query)
        return language == "en"

    def check_if_relevant(self, docs: List[Document]) -> bool:

        relevance_scores = [doc.metadata["relevance_score"] for doc in docs]
        avg_score = sum(relevance_scores) / len(relevance_scores)
        return avg_score > 0.75

    def retrieve_contexts(self, vectordb, query: str, k: int = 8, rerank_k: int = 50, dense_percent: float = 0.5) -> List[Document]:
        """
        Retrieves the documents from the vector database by using a hybrid approach (dense (similarity search) + sparse (BM25)) and the Cohere re-ranking endpoint.
        """
        dense_k = int(rerank_k * dense_percent)
        sparse_k = rerank_k - dense_k

        # Sparse retrieval
        sparse_documents = self._load_documents(f"./../sparse_index/sparse_1536_264")
        bm25_retriever = BM25Retriever.from_documents(sparse_documents)
        bm25_retriever.k = sparse_k
        result_documents_BM25 = bm25_retriever.get_relevant_documents(query)

        # Dense retrieval
        result_documents_Dense = vectordb.similarity_search(query, k=dense_k)

        result_documents_all = []
        result_documents_all.extend(result_documents_BM25)
        result_documents_all.extend(result_documents_Dense)

        # Only keep unique documents and remove duplicates that were retrieved in both sparse and dense retrieval
        unique_documents_dict = {}
        for doc in result_documents_all:
            if doc.page_content not in unique_documents_dict:
                unique_documents_dict[doc.page_content] = doc
        result_documents_unique = list(unique_documents_dict.values())

        # Re-ranking with Cohere
        compressor = CohereRerank(top_n=k, user_agent="langchain", cohere_api_key=cl.user_session.get("env")["COHERE_API_KEY"])
        result_documents = compressor.compress_documents(documents=result_documents_unique, query=query)

        return result_documents

    def is_context_size_valid(self, contexts: List[Document], query: str, max_context_size: int) -> bool:
        """
        Checks if the context size is valid with the cl100k tokenizer, which is used for OpenAI LLMs.
        """
        # Transform List[Document] into a single context string
        concatenated_contexts = ""
        for index, document in enumerate(contexts):
            original_text = document.metadata.get("original_text", "")
            # Replace curly brackets, as otherwise problems can be encountered with formatting the prompt
            original_text = original_text.replace("{", "").replace("}", "")
            concatenated_contexts += f"{index+1}. {original_text}\n\n"

        if not query.endswith("?"):
            query = query + "?"

        # Get the prompts
        system_str, system_prompt = get_system_prompt()
        human_str, human_prompt = get_human_prompt(concatenated_contexts, query)
        full_prompt = system_str + "\n" + human_str

        # Count token length
        tokenizer = tiktoken.get_encoding("cl100k_base")
        token_length = len(tokenizer.encode(full_prompt))

        if token_length <= max_context_size:
            return True
        else:
            return False

    def get_full_prompt_sources_and_template(self, contexts: List[Document], llm, prompt: str) -> Tuple[str, str, str, str]:

        # Check if the query is aimed at a template and check if the context documents also have a template.
        # If it is a template question, the query and system prompt have to be altered.
        # Only check the first two documents, because otherwise the re-ranked score is not high enough to assume that the retrieved template is valid for that question.
        is_template_question = False
        template_path = ""
        template_source = ""
        if "template" in prompt.lower():
            for context in contexts[:2]:
                if "template_path" in context.metadata:
                    is_template_question = True
                    template_path = context.metadata["template_path"]
                    template_source = context.metadata["source"]
                    break

        # Concatenate all document texts and sources
        concatenated_contexts = ""
        concatenated_sources = ""
        seen_sources = set()
        if is_template_question:

            for index, document in enumerate(contexts[:2]):
                original_text = document.metadata.get('original_text', '')
                # Replace curly brackets, as otherwise problems can be encountered with formatting the prompt
                original_text = original_text.replace("{", "").replace("}", "")
                concatenated_contexts += f"{index+1}. {original_text}\n\n"

                source = document.metadata.get('source', '')
                if source not in seen_sources:
                    concatenated_sources += f"{len(seen_sources) + 1}. {source}\n"
                    seen_sources.add(source)

        else:
            for index, document in enumerate(contexts):
                original_text = document.metadata.get('original_text', '')
                # Replace curly brackets, as otherwise problems can be encountered with formatting the prompt
                original_text = original_text.replace("{", "").replace("}", "")
                concatenated_contexts += f"{index+1}. {original_text}\n\n"

                source = document.metadata.get('source', '')
                if source not in seen_sources:
                    concatenated_sources += f"{len(seen_sources) + 1}. {source}\n"
                    seen_sources.add(source)

        # Check if a question mark is at the end of the prompt
        if not prompt.endswith("?"):
            prompt = prompt + "?"

        if is_template_question:
            system_str, system_prompt = get_system_prompt_template()
            human_str, human_prompt = get_human_prompt(concatenated_contexts, prompt)
            full_prompt = get_full_prompt(system_prompt, human_prompt)
            #answer = llm.invoke(full_prompt).content
        else:
            system_str, system_prompt = get_system_prompt()
            human_str, human_prompt = get_human_prompt(concatenated_contexts, prompt)
            full_prompt = get_full_prompt(system_prompt, human_prompt)
            #answer = llm.invoke(full_prompt).content

        return full_prompt, concatenated_sources, template_path, template_source
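As a small illustration of the context-size check in HelperMethods.is_context_size_valid, the following standalone sketch counts tokens with the same cl100k_base encoding; the prompt strings are made-up placeholders and the 16385-token limit is the value returned by get_LLM:

# Illustrative token-budget check, mirroring HelperMethods.is_context_size_valid.
import tiktoken

MAX_CONTEXT_SIZE = 16385  # context window assumed for gpt-3.5-turbo-0125 in get_LLM

system_str = "You are an expert in information security, especially for ISO 27001 certifications."  # placeholder
human_str = "Question: What is an ISMS? \n Context: 1. An ISMS is a systematic approach to managing sensitive information.\n\n"  # placeholder
full_prompt = system_str + "\n" + human_str

tokenizer = tiktoken.get_encoding("cl100k_base")
token_length = len(tokenizer.encode(full_prompt))
print(token_length, token_length <= MAX_CONTEXT_SIZE)  # True means the prompt fits the context window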
app/prompts.py
ADDED
@@ -0,0 +1,39 @@
from typing import Tuple
from typing import List
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain_core.messages import BaseMessage
from langchain.docstore.document import Document

"""
Python file for getting the prompts and the respective templates.
"""

def get_system_prompt() -> Tuple[str, SystemMessagePromptTemplate]:

    prompt_str = """You are an expert in information security, especially for ISO 27001 certifications. Answer the following question as truthfully as possible, using the provided context and not prior knowledge. If the answer is not contained within the context or the question is not related to the topic of information security or ISO 27001 or the question is not written in English, respond with 'I am sorry. I do not have knowledge on that topic'. Write a maximum of 400 words."""
    template = SystemMessagePromptTemplate.from_template(prompt_str)
    return prompt_str, template

def get_system_prompt_template() -> Tuple[str, SystemMessagePromptTemplate]:

    prompt_str = f"""Answer the following question by stating that you can provide a template to the user and that it is attached to this message. After that, end your answer."""
    template = SystemMessagePromptTemplate.from_template(prompt_str)

    return prompt_str, template

def get_human_prompt(contexts: List[str], question: str) -> Tuple[str, HumanMessagePromptTemplate]:

    prompt_str = f"""Question: {question} \n Context: {contexts}"""
    template = HumanMessagePromptTemplate.from_template(prompt_str)
    return prompt_str, template

def get_full_prompt(system_prompt: SystemMessagePromptTemplate, human_prompt: HumanMessagePromptTemplate) -> List[BaseMessage]:

    full_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
    prompt_messages = full_prompt.format_prompt().to_messages()

    return prompt_messages
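For illustration, a minimal usage sketch of these builders (the question and context strings are made-up placeholders; this mirrors how helper.get_full_prompt_sources_and_template composes the final prompt):

# Illustrative only: compose a prompt the same way app/helper.py does.
from prompts import get_system_prompt, get_human_prompt, get_full_prompt

contexts = "1. Clause 6.1.2 requires an information security risk assessment process.\n\n"  # placeholder context
question = "What does ISO 27001 require for risk assessments?"  # placeholder question

_, system_prompt = get_system_prompt()
_, human_prompt = get_human_prompt(contexts, question)
messages = get_full_prompt(system_prompt, human_prompt)  # List[BaseMessage], ready to be passed to llm.astream(messages)
print(messages)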
app/public/logo_dark.png
ADDED
app/public/logo_light.png
ADDED
chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95042e844cfb77b20e578cf65635282a99d7c4dd20e589ac062f38bc389f8e58
size 4236000
chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fcc596bc1909f7cc610d5839236c90513b4fbad06776c253fa1b21bfd712e940
size 100
chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
size 4000
chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
size 0
chroma/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:491712cb0724ea8cd14691afba9f5c0ae3f07b780ec11e0a383b24c6cd711fe6
size 10010624
index_preparation/build_index.ipynb
ADDED
@@ -0,0 +1,250 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook for creating/updating the dense and sparse indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ipynb.fs.defs.preprocess_data import preprocess_data\n",
    "from ipynb.fs.defs.preprocess_data import get_documents_from_files\n",
    "from ipynb.fs.defs.preprocess_data import split_docs\n",
    "from ipynb.fs.defs.preprocess_data import clean_and_process_chunked_documents\n",
    "from ipynb.fs.defs.preprocess_data import store_documents\n",
    "import chromadb\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.docstore.document import Document\n",
    "from typing import List\n",
    "import os\n",
    "\n",
    "\n",
    "def build_or_update_index_vector_db(documents: List[Document], embeddings, collection_name: str, dist_function: str, collection_metadata: dict):\n",
    "    '''\n",
    "    Builds the index vector DB from documents with the specified embeddings and collection_name\n",
    "    If it already exists, updates the index with the new documents\n",
    "    '''\n",
    "    new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
    "\n",
    "    print(\"Starting to build index for: \", collection_metadata)\n",
    "\n",
    "    # Check if collection already exists\n",
    "    collection_exists = True\n",
    "    try:\n",
    "        collection = new_client.get_collection(collection_name)\n",
    "    except ValueError as e:\n",
    "        collection_exists = False\n",
    "\n",
    "    if not collection_exists:\n",
    "        print(\"Collection is new\")\n",
    "        # If collection does not exist, create it\n",
    "        collection = new_client.create_collection(collection_name)\n",
    "        # Each document needs an ID\n",
    "        ids = [str(i) for i in range(1, len(documents) + 1)]\n",
    "\n",
    "        # Store the text of the document and metadata separately in order to insert it into Chroma\n",
    "        texts = []\n",
    "        metadata_docs = []\n",
    "        for document in documents:\n",
    "            texts.append(document.page_content)\n",
    "            metadata_docs.append(document.metadata)\n",
    "\n",
    "        # Add them in batches (otherwise Chroma error)\n",
    "        for start_idx in range(0, len(embeddings), 1000):\n",
    "            end_idx = start_idx + 1000\n",
    "            # Ensure not to go out of bounds\n",
    "            embeddings_batch = embeddings[start_idx : min(end_idx, len(embeddings))]\n",
    "            texts_batch = texts[start_idx : min(end_idx, len(embeddings))]\n",
    "            ids_batch = ids[start_idx : min(end_idx, len(embeddings))]\n",
    "            metadatas_batch = metadata_docs[start_idx : min(end_idx, len(embeddings))]\n",
    "\n",
    "            collection.add(embeddings=embeddings_batch, documents=texts_batch, ids=ids_batch, metadatas=metadatas_batch)\n",
    "            print(f\"Added embeddings from {start_idx} to {min(end_idx, len(embeddings))-1}\")\n",
    "\n",
    "        vectordb = Chroma(\n",
    "            client=new_client,\n",
    "            collection_name=collection_name,\n",
    "            collection_metadata={\n",
    "                \"embedding_model_provider\": collection_metadata[\"embedding_model_provider\"],\n",
    "                \"embedding_model_name\": collection_metadata[\"embedding_model_name\"],\n",
    "                \"chunk_size\": collection_metadata[\"chunk_size\"],\n",
    "                \"chunk_overlap\": collection_metadata[\"chunk_overlap\"],\n",
    "                \"hnsw:space\": dist_function,  # either \"l2\" or \"ip\" or \"cosine\"\n",
    "            },\n",
    "        )\n",
    "        print(f\"Collection {collection_name} successfully created.\")\n",
    "        print(\"There are\", vectordb._collection.count(), \"entries in the collection.\")\n",
    "\n",
    "        return new_client, vectordb\n",
    "\n",
    "    else:\n",
    "        print(\"Collection already exists\")\n",
    "        vectordb = Chroma(client=new_client, collection_name=collection_name)\n",
    "\n",
    "        collection_count = vectordb._collection.count()\n",
    "        print(f\"There are {collection_count} entries in the collection {collection_name} prior to updating.\")\n",
    "\n",
    "        # Continue the IDs from the last ID\n",
    "        ids = [str(i) for i in range(collection_count + 1, collection_count + len(documents) + 1)]\n",
    "        # Store the text of the document and metadata separately in order to insert it into Chroma\n",
    "        texts = []\n",
    "        metadata_docs = []\n",
    "        for document in documents:\n",
    "            texts.append(document.page_content)\n",
    "            metadata_docs.append(document.metadata)\n",
    "\n",
    "        # Add them in batches (otherwise Chroma error)\n",
    "        for start_idx in range(0, len(embeddings), 1000):\n",
    "            end_idx = start_idx + 1000\n",
    "            # Ensure not to go out of bounds\n",
    "            embeddings_batch = embeddings[start_idx : min(end_idx, len(embeddings))]\n",
    "            texts_batch = texts[start_idx : min(end_idx, len(embeddings))]\n",
    "            ids_batch = ids[start_idx : min(end_idx, len(embeddings))]\n",
    "            metadatas_batch = metadata_docs[start_idx : min(end_idx, len(embeddings))]\n",
    "\n",
    "            collection.add(embeddings=embeddings_batch, documents=texts_batch, ids=ids_batch, metadatas=metadatas_batch)\n",
    "            print(f\"Added embeddings from {start_idx} to {min(end_idx, len(embeddings))-1}\")\n",
    "\n",
    "        collection_count = vectordb._collection.count()\n",
    "        print(f\"There are {collection_count} entries in the collection {collection_name} after updating.\")\n",
    "        return new_client, 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chunk_size = 1536\n",
    "chunk_overlap = 264\n",
    "# If update is needed, set to False\n",
    "all_docs = True\n",
    "\n",
    "documents, embedding_model, embeddings = preprocess_data(chunk_size, chunk_overlap, all_docs)\n",
    "collection_name = \"ISO_27001_Collection\"\n",
    "collection_metadata = {\n",
    "    \"embedding_model_provider\": \"Fine-tuned\",\n",
    "    \"embedding_model_name\": \"finetuned-BGE-large-ISO-27001\",\n",
    "    \"chunk_size\": str(chunk_size),\n",
    "    \"chunk_overlap\": str(chunk_overlap),\n",
    "}\n",
    "\n",
    "build_or_update_index_vector_db(documents, embeddings, collection_name, \"l2\", collection_metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def store_documents_for_sparse_retrieval(chunk_size: int, chunk_overlap: int):\n",
    "    \"\"\"\n",
    "    Stores the documents for sparse retrieval in a basic text file\n",
    "    \"\"\"\n",
    "    documents = get_documents_from_files(True)\n",
    "    chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
    "    chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents)\n",
    "\n",
    "    store_documents(chunked_cleaned_documents, f\"./../sparse_index/sparse_1536_264\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the actual sparse index\n",
    "store_documents_for_sparse_retrieval(chunk_size, chunk_overlap)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Helper methods for Chroma"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Returns the vectorDB based on the collection name if it exists\n",
    "def get_index_vector_db(collection_name: str):\n",
    "    new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
    "\n",
    "    # Check if collection already exists\n",
    "    collection_exists = True\n",
    "    try:\n",
    "        new_client.get_collection(collection_name)\n",
    "    except ValueError as e:\n",
    "        collection_exists = False\n",
    "\n",
    "    if not collection_exists:\n",
    "        raise Exception(\"Error, raised exception: Collection does not exist.\")\n",
    "    else:\n",
    "        vectordb = Chroma(client=new_client, collection_name=collection_name)\n",
    "\n",
    "    return new_client, vectordb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def delete_collection(collection_name: str):\n",
    "    new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
    "\n",
    "    try:\n",
    "        new_client.delete_collection(collection_name)\n",
    "    except ValueError as e:\n",
    "        print(\"Collection could not be deleted.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def return_collections():\n",
    "    new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
    "    collections = new_client.list_collections()\n",
    "    return collections"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
index_preparation/create_QA_set_documents.ipynb
ADDED
@@ -0,0 +1,84 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook for creating the documents based on the curated QA pair dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ipynb.fs.defs.preprocess_data import store_documents\n",
    "from langchain.docstore.document import Document\n",
    "import json\n",
    "\n",
    "# Load QA dataset\n",
    "with open(\"./../input_data/QA_dataset/golden_qa_set.json\", 'r') as file:\n",
    "    golden_qa_set = json.load(file)\n",
    "\n",
    "# Remove duplicate answers (Kersten + Secondary Literature) and template answers\n",
    "indices_to_remove = list(range(102, 121)) + list(range(122, 133)) + list(range(134, 157))\n",
    "indices_to_remove = sorted(set(indices_to_remove), reverse=True)\n",
    "for index in indices_to_remove:\n",
    "    del golden_qa_set['qa_set'][index]\n",
    "\n",
    "question_set = [qa['question'] for qa in golden_qa_set['qa_set']]\n",
    "golden_answer_set = [qa['golden_answer'] for qa in golden_qa_set['qa_set']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create one document for each question\n",
    "all_qa_dataset_documents = []\n",
    "for q, a in zip(question_set, golden_answer_set):\n",
    "\n",
    "    document = Document(\n",
    "        page_content=f\"{q} \\n {a}\", \n",
    "        metadata={\n",
    "            \"source\": \"QA Dataset\",\n",
    "            \"title\": \"QA Dataset\"\n",
    "        })\n",
    "    all_qa_dataset_documents.append(document)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "store_documents(all_qa_dataset_documents, \"./../input_data/QA_dataset/all_documents\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
index_preparation/create_pdf_documents.ipynb
ADDED
@@ -0,0 +1,139 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook for updating the PDF document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ipynb.fs.defs.preprocess_data import store_documents\n",
    "from ipynb.fs.defs.preprocess_data import load_documents\n",
    "from langchain.docstore.document import Document\n",
    "import pypdfium2 as pdfium\n",
    "import cv2\n",
    "import os\n",
    "import pytesseract\n",
    "from typing import List\n",
    "import shutil\n",
    "\n",
    "pytesseract_path = os.environ.get(\"TESSERACT_PATH\")\n",
    "pytesseract.pytesseract.tesseract_cmd = pytesseract_path\n",
    "\n",
    "\n",
    "def update_pdf_documents() -> List[Document]:\n",
    "    \"\"\"\n",
    "    Method for processing and updating documents based on the PDFs stored in input_data/PDF/documents. For that the PDFs, that were not processed yet, are converted to images and then transformed to texts. For each PDF one document is then created with all text from all pages. In the end the filename is changed, so that it is clear that it was already processed.\n",
    "    \"\"\"\n",
    "\n",
    "    # List for either all documents or only new ones\n",
    "    documents_PDF = []\n",
    "    # List for all documents\n",
    "    already_processed_documents = load_documents(\"./../input_data/PDF/documents/all_documents\")\n",
    "\n",
    "    PDF_images_path = \"./../input_data/PDF/PDF_Images\"\n",
    "    directory_path = \"./../input_data/PDF/files\"\n",
    "\n",
    "    # Go through each PDF file in the directory\n",
    "    for file in os.listdir(directory_path):\n",
    "        if \"Tesseract_processed\" not in file:\n",
    "            file_path = os.path.join(directory_path, file)\n",
    "            pdf = pdfium.PdfDocument(file_path)\n",
    "            n_pages = len(pdf)\n",
    "            # Create directory to store the image\n",
    "            os.makedirs(PDF_images_path + f\"/{file}\")\n",
    "            complete_text = \"\"\n",
    "            # Go through each page of the PDF and save the according image\n",
    "            for page_number in range(n_pages):\n",
    "                page = pdf.get_page(page_number)\n",
    "                pil_image = page.render(\n",
    "                    scale=300 / 72,\n",
    "                    rotation=0,\n",
    "                    crop=(0, 0, 0, 0),\n",
    "                ).to_pil()\n",
    "                pil_image_path = PDF_images_path + f\"/{file}/image_{page_number+1}.png\"\n",
    "                pil_image.save(pil_image_path)\n",
    "                img = cv2.imread(pil_image_path)\n",
    "                # Convert image to grayscale\n",
    "                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
    "                # Apply threshold to convert to binary image\n",
    "                threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]\n",
    "                # Pass the image through pytesseract and add the text to the whole document text\n",
    "                complete_text += pytesseract.image_to_string(threshold_img) + \"\\n\"\n",
    "                # Remove the image as it is already processed\n",
    "                os.remove(pil_image_path)\n",
    "\n",
    "            file_name_without_pdf = file\n",
    "            if file.endswith(\".pdf\"):\n",
    "                file_name_without_pdf = file[:-4]\n",
    "            # Create a document based on the whole text and metadata\n",
    "            document_PDF = Document(page_content=complete_text, metadata={\"source\": file, \"title\": file_name_without_pdf})\n",
    "            documents_PDF.append(document_PDF)\n",
    "            already_processed_documents.append(document_PDF)\n",
    "\n",
    "            # Change the filename, so that in future calls the PDF is not processed again\n",
    "            new_filename = file.replace(\".pdf\", \"_Tesseract_processed.pdf\")\n",
    "            new_pdf_path = os.path.join(directory_path, new_filename)\n",
    "            print(new_pdf_path)\n",
    "            pdf.close()\n",
    "            os.rename(file_path, new_pdf_path)\n",
    "\n",
    "    # Store docs if new documents were processed\n",
    "    if len(documents_PDF) > 0:\n",
    "        # Store all documents, including the new ones\n",
    "        store_documents(already_processed_documents, \"./../input_data/PDF/documents/all_documents\")\n",
    "        # Store the new documents\n",
    "        store_documents(documents_PDF, \"./../input_data/PDF/documents/new_documents\")\n",
    "\n",
    "    # Delete the empty folders inside the images folder\n",
    "    target_dir = \"./../input_data/PDF/PDF_images\"\n",
    "\n",
    "    # Check if the target directory exists to avoid errors\n",
    "    if os.path.exists(target_dir):\n",
    "        # List all the items in the directory\n",
    "        for item in os.listdir(target_dir):\n",
    "            item_path = os.path.join(target_dir, item)\n",
|
102 |
+
" if os.path.isdir(item_path):\n",
|
103 |
+
" # Use shutil.rmtree to delete the directory and all its contents\n",
|
104 |
+
" shutil.rmtree(item_path)"
|
105 |
+
]
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"cell_type": "code",
|
109 |
+
"execution_count": null,
|
110 |
+
"metadata": {},
|
111 |
+
"outputs": [],
|
112 |
+
"source": [
|
113 |
+
"# Uncomment update needed because of new unprocessed files\n",
|
114 |
+
"# update_pdf_documents()"
|
115 |
+
]
|
116 |
+
}
|
117 |
+
],
|
118 |
+
"metadata": {
|
119 |
+
"kernelspec": {
|
120 |
+
"display_name": "venv",
|
121 |
+
"language": "python",
|
122 |
+
"name": "python3"
|
123 |
+
},
|
124 |
+
"language_info": {
|
125 |
+
"codemirror_mode": {
|
126 |
+
"name": "ipython",
|
127 |
+
"version": 3
|
128 |
+
},
|
129 |
+
"file_extension": ".py",
|
130 |
+
"mimetype": "text/x-python",
|
131 |
+
"name": "python",
|
132 |
+
"nbconvert_exporter": "python",
|
133 |
+
"pygments_lexer": "ipython3",
|
134 |
+
"version": "3.8.5"
|
135 |
+
}
|
136 |
+
},
|
137 |
+
"nbformat": 4,
|
138 |
+
"nbformat_minor": 2
|
139 |
+
}
|
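The docstring above describes the OCR pipeline (render each page at 300 DPI, grayscale, Otsu threshold, then pytesseract). Below is a condensed, hedged sketch of that per-page step, assuming the same libraries and Tesseract configuration as the notebook; it skips the temporary PNG files and the bookkeeping around renaming and storing.

# Hedged sketch of the per-page OCR step from update_pdf_documents.
import cv2
import numpy as np
import pypdfium2 as pdfium
import pytesseract

def ocr_pdf(pdf_path: str) -> str:
    pdf = pdfium.PdfDocument(pdf_path)
    complete_text = ""
    for page_number in range(len(pdf)):
        # Render the page at 300 DPI (PDF user space is 72 DPI)
        pil_image = pdf.get_page(page_number).render(scale=300 / 72).to_pil()
        # Grayscale via PIL, then hand a numpy array to OpenCV for thresholding
        gray = np.array(pil_image.convert("L"))
        # Otsu thresholding before OCR, as in the notebook
        threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        complete_text += pytesseract.image_to_string(threshold_img) + "\n"
    pdf.close()
    return complete_text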
index_preparation/create_template_documents.ipynb
ADDED
@@ -0,0 +1,140 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"### Notebook for creating the template documents"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": [
|
16 |
+
"from ipynb.fs.defs.preprocess_data import get_template_documents\n",
|
17 |
+
"from ipynb.fs.defs.preprocess_data import store_documents\n",
|
18 |
+
"from langchain.docstore.document import Document\n",
|
19 |
+
"import os\n",
|
20 |
+
"import shutil\n",
|
21 |
+
"\n",
|
22 |
+
"# Load all already existing documents and store paths of new documents to be processed\n",
|
23 |
+
"all_template_documents = get_template_documents(True)\n",
|
24 |
+
"template_paths = os.listdir(\"./../input_data/Templates/template_files/new\")\n",
|
25 |
+
"\n",
|
26 |
+
"print(all_template_documents)\n",
|
27 |
+
"print(\"\\n\")\n",
|
28 |
+
"print(template_paths)"
|
29 |
+
]
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"cell_type": "code",
|
33 |
+
"execution_count": null,
|
34 |
+
"metadata": {},
|
35 |
+
"outputs": [],
|
36 |
+
"source": [
|
37 |
+
"# Manually create the documents for each template\n",
|
38 |
+
"full_path = \"./../input_data/Templates/template_files/processed\"\n",
|
39 |
+
"\n",
|
40 |
+
"template_document_1 = Document(\n",
|
41 |
+
" page_content=f\"You can find a possible template for the backup policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content and more for the backup policy.\", \n",
|
42 |
+
" metadata={\n",
|
43 |
+
" \"template_path\": full_path + \"/\" + template_paths[0], \n",
|
44 |
+
" \"source\": template_paths[0],\n",
|
45 |
+
" })\n",
|
46 |
+
"\n",
|
47 |
+
"template_document_2 = Document(\n",
|
48 |
+
" page_content=f\"You can find a possible template for the change management policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content, procedures, risk management and more for the change management policy.\", \n",
|
49 |
+
" metadata={\n",
|
50 |
+
" \"template_path\": full_path + \"/\" + template_paths[1], \n",
|
51 |
+
" \"source\": template_paths[1],\n",
|
52 |
+
" })\n",
|
53 |
+
"\n",
|
54 |
+
"\n",
|
55 |
+
"template_document_3 = Document(\n",
|
56 |
+
" page_content=f\"You can find a possible template for the encryption policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content and more for the encryption policy.\", \n",
|
57 |
+
" metadata={\n",
|
58 |
+
" \"template_path\": full_path + \"/\" + template_paths[2], \n",
|
59 |
+
" \"source\": template_paths[2],\n",
|
60 |
+
" })\n",
|
61 |
+
"\n",
|
62 |
+
"\n",
|
63 |
+
"template_document_4 = Document(\n",
|
64 |
+
" page_content=f\"You can find a possible template for a checklist for all ISO-27001 controls (Version 2013) attached to this message. It contains a simple checklist for the ISO 27001 controls 5 to 18.\", \n",
|
65 |
+
" metadata={\n",
|
66 |
+
" \"template_path\": full_path + \"/\" + template_paths[3], \n",
|
67 |
+
" \"source\": template_paths[3],\n",
|
68 |
+
" })\n",
|
69 |
+
"\n",
|
70 |
+
"template_document_5 = Document(\n",
|
71 |
+
" page_content=f\"You can find a possible template for a risk assessment needed for the ISO-27001 certification attached to this message. It contains a simple checklist of selected controls for which a risk assessment is needed.\", \n",
|
72 |
+
" metadata={\n",
|
73 |
+
" \"template_path\": full_path + \"/\" + template_paths[4], \n",
|
74 |
+
" \"source\": template_paths[4],\n",
|
75 |
+
" })\n",
|
76 |
+
"\n",
|
83 |
+
"\n",
|
84 |
+
"new_template_documents = []\n",
|
85 |
+
"new_template_documents.append(template_document_1)\n",
|
86 |
+
"new_template_documents.append(template_document_2)\n",
|
87 |
+
"new_template_documents.append(template_document_3)\n",
|
88 |
+
"new_template_documents.append(template_document_4)\n",
|
89 |
+
"new_template_documents.append(template_document_5)"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"cell_type": "code",
|
94 |
+
"execution_count": null,
|
95 |
+
"metadata": {},
|
96 |
+
"outputs": [],
|
97 |
+
"source": [
|
98 |
+
"# Store the new templates and all templates\n",
|
99 |
+
"store_documents(new_template_documents, \"./../input_data/Templates/documents/new_documents\")\n",
|
100 |
+
"\n",
|
101 |
+
"all_template_documents.extend(new_template_documents)\n",
|
102 |
+
"store_documents(new_template_documents, \"./../input_data/Templates/documents/all_documents\")"
|
103 |
+
]
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"cell_type": "code",
|
107 |
+
"execution_count": null,
|
108 |
+
"metadata": {},
|
109 |
+
"outputs": [],
|
110 |
+
"source": [
|
111 |
+
"# Move new templates to processed templates\n",
|
112 |
+
"for path in template_paths:\n",
|
113 |
+
" source_file = f\"./../input_data/Templates/template_files/new/{path}\"\n",
|
114 |
+
" destination_folder = \"./../input_data/Templates/template_files/processed\"\n",
|
115 |
+
" shutil.move(source_file, destination_folder)"
|
116 |
+
]
|
117 |
+
}
|
118 |
+
],
|
119 |
+
"metadata": {
|
120 |
+
"kernelspec": {
|
121 |
+
"display_name": "venv",
|
122 |
+
"language": "python",
|
123 |
+
"name": "python3"
|
124 |
+
},
|
125 |
+
"language_info": {
|
126 |
+
"codemirror_mode": {
|
127 |
+
"name": "ipython",
|
128 |
+
"version": 3
|
129 |
+
},
|
130 |
+
"file_extension": ".py",
|
131 |
+
"mimetype": "text/x-python",
|
132 |
+
"name": "python",
|
133 |
+
"nbconvert_exporter": "python",
|
134 |
+
"pygments_lexer": "ipython3",
|
135 |
+
"version": "3.11.5"
|
136 |
+
}
|
137 |
+
},
|
138 |
+
"nbformat": 4,
|
139 |
+
"nbformat_minor": 2
|
140 |
+
}
|
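The five Document blocks above differ only in the description text and the file they point to, and they rely on the ordering returned by os.listdir over the new templates. A hedged sketch (not in the notebook) of the same construction driven by an explicit filename-to-description mapping, with abbreviated descriptions:

# Hedged sketch: build template documents from an explicit mapping
# so the pairing does not depend on os.listdir ordering.
from langchain.docstore.document import Document

full_path = "./../input_data/Templates/template_files/processed"
descriptions = {
    "Backup policy.docx": "You can find a possible template for the backup policy from the Annex A of ISO 27001 attached to this message.",
    "Change management policy.docx": "You can find a possible template for the change management policy from the Annex A of ISO 27001 attached to this message.",
    # ... one entry per template file
}

new_template_documents = [
    Document(
        page_content=description,
        metadata={"template_path": f"{full_path}/{file_name}", "source": file_name},
    )
    for file_name, description in descriptions.items()
]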
index_preparation/create_web_documents.ipynb
ADDED
@@ -0,0 +1,176 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"### Notebook for cleaning, creating and updating the web documents\n",
|
8 |
+
"\n",
|
9 |
+
"First import the documents from the uncleaned URLs and store the text into separate files. Then manually clean files and update it. After that, get all already cleaned documents and store both, only the new documents and all documents (old cleaned + new cleaned)."
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": null,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": [
|
18 |
+
"from ipynb.fs.defs.preprocess_data import get_web_documents\n",
|
19 |
+
"from ipynb.fs.defs.preprocess_data import store_documents\n",
|
20 |
+
"from langchain.docstore.document import Document\n",
|
21 |
+
"from langchain.document_loaders import AsyncHtmlLoader\n",
|
22 |
+
"from langchain.document_transformers import Html2TextTransformer\n",
|
23 |
+
"from typing import List\n",
|
24 |
+
"\n",
|
25 |
+
"def get_web_documents_for_cleaning() -> List[Document]:\n",
|
26 |
+
" \"\"\"\n",
|
27 |
+
" Method for returning documents based on the URLs. Looks at the .txt file with all uncleaned urls and uses the AsyncHTMLoader and HTML2TextTransformer to get the texts.\n",
|
28 |
+
" \"\"\"\n",
|
29 |
+
" directory_path_web = \"./../input_data/Web/URLs/uncleaned_urls.txt\"\n",
|
30 |
+
"\n",
|
31 |
+
" imported_urls = []\n",
|
32 |
+
" with open(directory_path_web, \"r\") as file:\n",
|
33 |
+
" for line in file:\n",
|
34 |
+
" imported_urls.append(line.strip())\n",
|
35 |
+
"\n",
|
36 |
+
" loader_web = AsyncHtmlLoader(imported_urls)\n",
|
37 |
+
" documents_web = loader_web.load()\n",
|
38 |
+
"\n",
|
39 |
+
" html2text = Html2TextTransformer()\n",
|
40 |
+
" documents_web_transformed = html2text.transform_documents(documents_web)\n",
|
41 |
+
" print(\"Number of documents: \" + str(len(documents_web_transformed)) + \"\\n\")\n",
|
42 |
+
"\n",
|
43 |
+
" return documents_web_transformed\n",
|
44 |
+
"\n",
|
45 |
+
"documents = get_web_documents_for_cleaning()\n",
|
46 |
+
"already_cleaned_documents = get_web_documents(True)"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": null,
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": [
|
55 |
+
"# Loop over the array and store each string in a separate txt file\n",
|
56 |
+
"counter = 1\n",
|
57 |
+
"for doc in documents:\n",
|
58 |
+
" # Specify the file name for each string (e.g., file0.txt, file1.txt, ...)\n",
|
59 |
+
" file_name = f\"file_{counter}.txt\"\n",
|
60 |
+
" counter += 1\n",
|
61 |
+
" \n",
|
62 |
+
" # Open the file in write mode\n",
|
63 |
+
" with open(file_name, 'w', encoding='utf-8') as file:\n",
|
64 |
+
" # Write the string to the file\n",
|
65 |
+
" file.write(doc.page_content)\n",
|
66 |
+
"\n",
|
67 |
+
" print(f'The string has been successfully stored in {file_name}.')"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"cell_type": "code",
|
72 |
+
"execution_count": null,
|
73 |
+
"metadata": {},
|
74 |
+
"outputs": [],
|
75 |
+
"source": [
|
76 |
+
"# NOW MANUALLY CLEAN"
|
77 |
+
]
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"cell_type": "code",
|
81 |
+
"execution_count": null,
|
82 |
+
"metadata": {},
|
83 |
+
"outputs": [],
|
84 |
+
"source": [
|
85 |
+
"cleaned_texts = []\n",
|
86 |
+
"\n",
|
87 |
+
"counter = 1\n",
|
88 |
+
"for doc in documents:\n",
|
89 |
+
" # Specify the file name for each string (e.g., file0.txt, file1.txt, ...)\n",
|
90 |
+
" file_name = f\"file_{counter}.txt\"\n",
|
91 |
+
" counter += 1\n",
|
92 |
+
" \n",
|
93 |
+
" # Open the file in write mode\n",
|
94 |
+
" with open(file_name, 'r', encoding='utf-8') as file:\n",
|
95 |
+
" # Write the string to the file\n",
|
96 |
+
" text = file.read()\n",
|
97 |
+
" cleaned_texts.append(text)"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "code",
|
102 |
+
"execution_count": null,
|
103 |
+
"metadata": {},
|
104 |
+
"outputs": [],
|
105 |
+
"source": [
|
106 |
+
"# Set the new cleaned texts\n",
|
107 |
+
"if len(documents) == len(cleaned_texts):\n",
|
108 |
+
" for i in range(len(documents)):\n",
|
109 |
+
" documents[i].page_content = cleaned_texts[i]\n",
|
110 |
+
"else:\n",
|
111 |
+
" raise Exception(\"Error.\")"
|
112 |
+
]
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"cell_type": "code",
|
116 |
+
"execution_count": null,
|
117 |
+
"metadata": {},
|
118 |
+
"outputs": [],
|
119 |
+
"source": [
|
120 |
+
"# Store only the new documents and all documents\n",
|
121 |
+
"store_documents(documents, \"./../input_data/Web/documents/new_documents\")\n",
|
122 |
+
"\n",
|
123 |
+
"already_cleaned_documents.extend(documents)\n",
|
124 |
+
"store_documents(already_cleaned_documents, \"./../input_data/Web/documents/all_documents\")"
|
125 |
+
]
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"cell_type": "code",
|
129 |
+
"execution_count": null,
|
130 |
+
"metadata": {},
|
131 |
+
"outputs": [],
|
132 |
+
"source": [
|
133 |
+
"# Update the URLs list for cleaned and uncleaned\n",
|
134 |
+
"uncleaned_url_file_path = \"./../input_data/Web/URLs/uncleaned_urls.txt\"\n",
|
135 |
+
"cleaned_url_file_path = \"./../input_data/Web/URLs/cleaned_urls.txt\"\n",
|
136 |
+
"\n",
|
137 |
+
"# Read URLs from the source file and store them in a list\n",
|
138 |
+
"with open(uncleaned_url_file_path, \"r\") as source_file:\n",
|
139 |
+
" urls = source_file.readlines()\n",
|
140 |
+
"\n",
|
141 |
+
"# Open the destination file in append mode and write the URLs to it\n",
|
142 |
+
"with open(cleaned_url_file_path, \"a\") as destination_file:\n",
|
143 |
+
" destination_file.writelines(urls)\n",
|
144 |
+
"\n",
|
145 |
+
"# Remove the URLs from the source file\n",
|
146 |
+
"with open(uncleaned_url_file_path, \"w\") as source_file:\n",
|
147 |
+
" source_file.write(\"\")\n",
|
148 |
+
"\n",
|
149 |
+
"# Print moved urls\n",
|
150 |
+
"for url in urls:\n",
|
151 |
+
" print(\"Moved URL:\", url.strip())"
|
152 |
+
]
|
153 |
+
}
|
154 |
+
],
|
155 |
+
"metadata": {
|
156 |
+
"kernelspec": {
|
157 |
+
"display_name": "venv",
|
158 |
+
"language": "python",
|
159 |
+
"name": "python3"
|
160 |
+
},
|
161 |
+
"language_info": {
|
162 |
+
"codemirror_mode": {
|
163 |
+
"name": "ipython",
|
164 |
+
"version": 3
|
165 |
+
},
|
166 |
+
"file_extension": ".py",
|
167 |
+
"mimetype": "text/x-python",
|
168 |
+
"name": "python",
|
169 |
+
"nbconvert_exporter": "python",
|
170 |
+
"pygments_lexer": "ipython3",
|
171 |
+
"version": "3.8.5"
|
172 |
+
}
|
173 |
+
},
|
174 |
+
"nbformat": 4,
|
175 |
+
"nbformat_minor": 2
|
176 |
+
}
|
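The two cells that write file_<n>.txt and read it back keep a manual counter to pair each document with its file; the same round trip as a small hedged sketch using enumerate:

# Hedged sketch: dump-and-reload round trip with enumerate instead of a manual counter.
def dump_for_cleaning(documents) -> None:
    for i, doc in enumerate(documents, start=1):
        with open(f"file_{i}.txt", "w", encoding="utf-8") as file:
            file.write(doc.page_content)

def reload_cleaned(documents) -> None:
    for i, doc in enumerate(documents, start=1):
        with open(f"file_{i}.txt", "r", encoding="utf-8") as file:
            # Overwrite the raw page content with the manually cleaned text
            doc.page_content = file.read()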
index_preparation/preprocess_data.ipynb
ADDED
@@ -0,0 +1,229 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"### Notebook for processing the text data (chunking, cleaning, embeddings)"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": 2,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": [
|
16 |
+
"import json\n",
|
17 |
+
"from typing import Iterable\n",
|
18 |
+
"from langchain.docstore.document import Document\n",
|
19 |
+
"from typing import List\n",
|
20 |
+
"\n",
|
21 |
+
"# Helper methods for storing and loading already generated documents\n",
|
22 |
+
"def store_documents(documents, file_path: str) -> None:\n",
|
23 |
+
" with open(file_path, \"w\") as jsonl_file:\n",
|
24 |
+
" for doc in documents:\n",
|
25 |
+
" jsonl_file.write(doc.json() + \"\\n\")\n",
|
26 |
+
"\n",
|
27 |
+
"\n",
|
28 |
+
"def load_documents(file_path: str) -> List[Document]:\n",
|
29 |
+
" documents = []\n",
|
30 |
+
" with open(file_path, \"r\") as jsonl_file:\n",
|
31 |
+
" for line in jsonl_file:\n",
|
32 |
+
" data = json.loads(line)\n",
|
33 |
+
" obj = Document(**data)\n",
|
34 |
+
" documents.append(obj)\n",
|
35 |
+
" return documents"
|
36 |
+
]
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"cell_type": "code",
|
40 |
+
"execution_count": 3,
|
41 |
+
"metadata": {},
|
42 |
+
"outputs": [],
|
43 |
+
"source": [
|
44 |
+
"def get_pdf_documents(all_docs: bool):\n",
|
45 |
+
" \"\"\"\n",
|
46 |
+
" Method for returning the documents of the PDFs. Processing and updating takes place in update_pdf_documents.\n",
|
47 |
+
" all_docs parameter defines whether to load all documents or only new ones. Only new ones can be used if the index is already build and new documents should be added.\n",
|
48 |
+
" \"\"\"\n",
|
49 |
+
" pdf_documents = []\n",
|
50 |
+
" if all_docs:\n",
|
51 |
+
" pdf_documents = load_documents(\"./../input_data/PDF/documents/all_documents\")\n",
|
52 |
+
" else:\n",
|
53 |
+
" pdf_documents = load_documents(\"./../input_data/PDF/documents/new_documents\")\n",
|
54 |
+
"\n",
|
55 |
+
" return pdf_documents\n",
|
56 |
+
"\n",
|
57 |
+
"def get_web_documents(all_docs: bool) -> List[Document]:\n",
|
58 |
+
" \"\"\"\n",
|
59 |
+
" Method for returning the already processed documents. FIRST need to call get_web_docs_for_cleaning and clean manually. As it is a manual cleaning process, the methods are need to be called asynchronously.\n",
|
60 |
+
" \"\"\"\n",
|
61 |
+
" web_documents = []\n",
|
62 |
+
" if all_docs:\n",
|
63 |
+
" web_documents = load_documents(\"./../input_data/Web/documents/all_documents\")\n",
|
64 |
+
" else:\n",
|
65 |
+
" web_documents = load_documents(\"./../input_data/Web/documents/new_documents\")\n",
|
66 |
+
"\n",
|
67 |
+
" return web_documents\n",
|
68 |
+
"\n",
|
69 |
+
"def get_template_documents(all_docs: bool) -> List[Document]:\n",
|
70 |
+
" \"\"\"\n",
|
71 |
+
" Method for returning the documents of the templates.\n",
|
72 |
+
" \"\"\"\n",
|
73 |
+
" template_documents = []\n",
|
74 |
+
" if all_docs:\n",
|
75 |
+
" template_documents = load_documents(\"./../input_data/Templates/documents/all_documents\")\n",
|
76 |
+
" else:\n",
|
77 |
+
" template_documents = load_documents(\"./../input_data/Templates/documents/new_documents\")\n",
|
78 |
+
"\n",
|
79 |
+
" return template_documents\n",
|
80 |
+
"\n",
|
81 |
+
"def get_dataset_documents() -> List[Document]:\n",
|
82 |
+
" \"\"\"\n",
|
83 |
+
" Method for returning the documents of the templates.\n",
|
84 |
+
" \"\"\"\n",
|
85 |
+
" template_documents = []\n",
|
86 |
+
" template_documents = load_documents(\"./../input_data/QA_dataset/all_documents\")\n",
|
87 |
+
"\n",
|
88 |
+
" return template_documents"
|
89 |
+
]
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"cell_type": "code",
|
93 |
+
"execution_count": 4,
|
94 |
+
"metadata": {},
|
95 |
+
"outputs": [],
|
96 |
+
"source": [
|
97 |
+
"def get_documents_from_files(all_docs: bool):\n",
|
98 |
+
" \"\"\"\n",
|
99 |
+
" Gets documents from all document types.\n",
|
100 |
+
" \"\"\"\n",
|
101 |
+
" documents_all = []\n",
|
102 |
+
" documents_PDF = get_pdf_documents(all_docs)\n",
|
103 |
+
" document_web = get_web_documents(all_docs)\n",
|
104 |
+
" document_template = get_template_documents(all_docs)\n",
|
105 |
+
" document_dataset = get_dataset_documents()\n",
|
106 |
+
" \n",
|
107 |
+
" documents_all.extend(documents_PDF)\n",
|
108 |
+
" documents_all.extend(document_web)\n",
|
109 |
+
" documents_all.extend(document_template)\n",
|
110 |
+
" documents_all.extend(document_dataset)\n",
|
111 |
+
" \n",
|
112 |
+
" print(\"Number of documents: \" + str(len(documents_all)) + \"\\n\")\n",
|
113 |
+
" return documents_all"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"cell_type": "code",
|
118 |
+
"execution_count": 5,
|
119 |
+
"metadata": {},
|
120 |
+
"outputs": [],
|
121 |
+
"source": [
|
122 |
+
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
123 |
+
"\n",
|
124 |
+
"def split_docs(documents: List[Document], chunk_size: int, chunk_overlap: int):\n",
|
125 |
+
"\n",
|
126 |
+
" text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=[\" \"])\n",
|
127 |
+
" chunkedDocuments = text_splitter.split_documents(documents)\n",
|
128 |
+
" return chunkedDocuments"
|
129 |
+
]
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"cell_type": "code",
|
133 |
+
"execution_count": 6,
|
134 |
+
"metadata": {},
|
135 |
+
"outputs": [],
|
136 |
+
"source": [
|
137 |
+
"import re\n",
|
138 |
+
"\n",
|
139 |
+
"def clean_text(text: str) -> str:\n",
|
140 |
+
" # Replace multiple whitespaces (except newlines) with a single space\n",
|
141 |
+
" text = re.sub(r\"(?!\\n)\\s+\", \" \", text)\n",
|
142 |
+
" # Replace multiple newlines with a single newline\n",
|
143 |
+
" text = re.sub(r\"\\n+\", \"\\n\", text)\n",
|
144 |
+
" # Remove leading and trailing whitespace\n",
|
145 |
+
" text = text.strip()\n",
|
146 |
+
" return text\n",
|
147 |
+
"\n",
|
148 |
+
"def clean_and_process_chunked_documents(chunkedDocuments: List[Document]) -> List[Document]:\n",
|
149 |
+
" counter = 1\n",
|
150 |
+
" for i in chunkedDocuments:\n",
|
151 |
+
" i.page_content = clean_text(i.page_content)\n",
|
152 |
+
" i.metadata[\"original_text\"] = i.page_content\n",
|
153 |
+
" i.metadata[\"doc_ID\"] = counter\n",
|
154 |
+
" counter += 1\n",
|
155 |
+
"\n",
|
156 |
+
" i.page_content = i.page_content.lower() \n",
|
157 |
+
"\n",
|
158 |
+
" return chunkedDocuments"
|
159 |
+
]
|
160 |
+
},
|
161 |
+
{
|
162 |
+
"cell_type": "code",
|
163 |
+
"execution_count": 7,
|
164 |
+
"metadata": {},
|
165 |
+
"outputs": [],
|
166 |
+
"source": [
|
167 |
+
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
168 |
+
"\n",
|
169 |
+
"def get_embedding_model():\n",
|
170 |
+
" path = \"Basti8499/bge-large-en-v1.5-ISO-27001\"\n",
|
171 |
+
" model = HuggingFaceEmbeddings(model_name=path)\n",
|
172 |
+
" return model"
|
173 |
+
]
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"cell_type": "code",
|
177 |
+
"execution_count": 8,
|
178 |
+
"metadata": {},
|
179 |
+
"outputs": [],
|
180 |
+
"source": [
|
181 |
+
"def create_embedding_vectors(embedding_model, documents: List[Document]):\n",
|
182 |
+
" texts = []\n",
|
183 |
+
" for document in documents:\n",
|
184 |
+
" texts.append(document.page_content)\n",
|
185 |
+
"\n",
|
186 |
+
" embeddings = embedding_model.embed_documents(texts)\n",
|
187 |
+
"\n",
|
188 |
+
" return embeddings"
|
189 |
+
]
|
190 |
+
},
|
191 |
+
{
|
192 |
+
"cell_type": "code",
|
193 |
+
"execution_count": 1,
|
194 |
+
"metadata": {},
|
195 |
+
"outputs": [],
|
196 |
+
"source": [
|
197 |
+
"def preprocess_data(chunk_size: int, chunk_overlap: int, all_docs: bool):\n",
|
198 |
+
" documents = get_documents_from_files(all_docs)\n",
|
199 |
+
" chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
|
200 |
+
" chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents)\n",
|
201 |
+
" embedding_model = get_embedding_model()\n",
|
202 |
+
" embeddings = create_embedding_vectors(embedding_model, chunked_cleaned_documents)\n",
|
203 |
+
"\n",
|
204 |
+
" return chunked_cleaned_documents, embedding_model, embeddings"
|
205 |
+
]
|
206 |
+
}
|
207 |
+
],
|
208 |
+
"metadata": {
|
209 |
+
"kernelspec": {
|
210 |
+
"display_name": "venv",
|
211 |
+
"language": "python",
|
212 |
+
"name": "python3"
|
213 |
+
},
|
214 |
+
"language_info": {
|
215 |
+
"codemirror_mode": {
|
216 |
+
"name": "ipython",
|
217 |
+
"version": 3
|
218 |
+
},
|
219 |
+
"file_extension": ".py",
|
220 |
+
"mimetype": "text/x-python",
|
221 |
+
"name": "python",
|
222 |
+
"nbconvert_exporter": "python",
|
223 |
+
"pygments_lexer": "ipython3",
|
224 |
+
"version": "3.11.5"
|
225 |
+
}
|
226 |
+
},
|
227 |
+
"nbformat": 4,
|
228 |
+
"nbformat_minor": 2
|
229 |
+
}
|
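preprocess_data ties the loading, chunking, cleaning and embedding steps together. A hedged usage sketch, assuming the notebook's functions are in scope; the chunk size and overlap below are illustrative values suggested by the sparse_1536_264 index name, not confirmed by the notebook:

# Hedged usage sketch for preprocess_data (chunk_size/chunk_overlap are assumptions).
chunked_cleaned_documents, embedding_model, embeddings = preprocess_data(
    chunk_size=1536, chunk_overlap=264, all_docs=True
)
print(len(chunked_cleaned_documents), "chunks,", len(embeddings), "embedding vectors")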
init_embedding_model.py
ADDED
@@ -0,0 +1,4 @@
1 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
2 |
+
|
3 |
+
path = "Basti8499/bge-large-en-v1.5-ISO-27001"
|
4 |
+
model = HuggingFaceEmbeddings(model_name=path)
|
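The script above simply instantiates the fine-tuned embedding model, presumably so it is downloaded into the local cache ahead of time. A hedged sanity check (not part of the script) that the cached model loads and produces vectors:

# Hedged sanity check: embed a sample query with the cached model.
from langchain_community.embeddings import HuggingFaceEmbeddings

model = HuggingFaceEmbeddings(model_name="Basti8499/bge-large-en-v1.5-ISO-27001")
vector = model.embed_query("What is the scope of an ISMS under ISO 27001?")
print(len(vector))  # embedding dimensionality, typically 1024 for bge-large models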
input_data/PDF/documents/all_documents
ADDED
File without changes
|
input_data/PDF/documents/new_documents
ADDED
File without changes
|
input_data/QA_dataset/all_documents
ADDED
The diff for this file is too large to render.
See raw diff
|
|
input_data/QA_dataset/golden_qa_set.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
input_data/Templates/documents/all_documents
ADDED
@@ -0,0 +1,5 @@
1 |
+
{"page_content": "You can find a possible template for the backup policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content and more for the backup policy.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/Backup policy.docx", "source": "Backup policy.docx"}, "type": "Document"}
|
2 |
+
{"page_content": "You can find a possible template for the change management policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content, procedures, risk management and more for the change management policy.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/Change management policy.docx", "source": "Change management policy.docx"}, "type": "Document"}
|
3 |
+
{"page_content": "You can find a possible template for the encryption policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content and more for the encryption policy.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/Encryption policy.docx", "source": "Encryption policy.docx"}, "type": "Document"}
|
4 |
+
{"page_content": "You can find a possible template for a checklist for all ISO-27001 controls (Version 2013) attached to this message. It contains a simple checklist for the ISO 27001 controls 5 to 18.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/IC-ISO-27001-Controls-Checklist.xlsx", "source": "IC-ISO-27001-Controls-Checklist.xlsx"}, "type": "Document"}
|
5 |
+
{"page_content": "You can find a possible template for a risk assessment needed for the ISO-27001 certification attached to this message. It contains a simple checklist of selected controls for which a risk assessment is needed.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/IC-ISO-27001-Risk-Assessment.xlsx", "source": "IC-ISO-27001-Risk-Assessment.xlsx"}, "type": "Document"}
|
input_data/Templates/documents/new_documents
ADDED
File without changes
|
input_data/Templates/template_files/processed/Backup policy.docx
ADDED
Binary file (12.7 kB). View file
|
|
input_data/Templates/template_files/processed/Change management policy.docx
ADDED
Binary file (13.9 kB). View file
|
|
input_data/Templates/template_files/processed/Encryption policy.docx
ADDED
Binary file (13.2 kB). View file
|
|
input_data/Templates/template_files/processed/IC-ISO-27001-Controls-Checklist.xlsx
ADDED
Binary file (314 kB). View file
|
|
input_data/Templates/template_files/processed/IC-ISO-27001-Risk-Assessment.xlsx
ADDED
Binary file (48.5 kB). View file
|
|
input_data/Web/URLs/cleaned_urls.txt
ADDED
@@ -0,0 +1,62 @@
1 |
+
https://www.vanta.com/resources/who-needs-iso-27001-certification
|
2 |
+
https://www.vanta.com/resources/iso-27001-compliance-checklist
|
3 |
+
https://www.vanta.com/resources/how-long-does-it-take-to-get-iso-certified
|
4 |
+
https://www.strongdm.com/blog/iso-27001-controls
|
5 |
+
https://www.itgovernance.eu/blog/en/a-9-step-guide-to-implementing-iso-27001
|
6 |
+
https://www.itgovernance.eu/blog/en/benefits-of-iso-27001-certification
|
7 |
+
https://www.itgovernance.eu/blog/en/why-are-so-many-organisations-getting-certified-to-iso-27001
|
8 |
+
https://www.itgovernance.eu/blog/en/what-you-need-to-know-about-iso-270012022
|
9 |
+
https://www.itgovernance.eu/blog/en/how-iso-27001-can-boost-your-cloud-security
|
10 |
+
https://www.vanta.com/resources/the-ultimate-iso-27001-guide-powered-by-vanta-and-aprio
|
11 |
+
https://www.drata.com/blog/iso-27001-compliance
|
12 |
+
https://www.drata.com/blog/iso-27001-risk-assessment
|
13 |
+
https://www.drata.com/blog/iso-27001-statement-of-applicability
|
14 |
+
https://www.drata.com/blog/ask-an-auditor-demystifying-iso-27001
|
15 |
+
https://www.drata.com/blog/iso-27001-vs-iso-27002
|
16 |
+
https://www.drata.com/blog/iso-27001-2022-update
|
17 |
+
https://www.drata.com/blog/iso-27001-certification-cost
|
18 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/2022-version-transition-guide/
|
19 |
+
https://www.dataguard.co.uk/knowledge/iso-27001-certification/
|
20 |
+
https://www.dataguard.co.uk/knowledge/iso-27001-controls-annex-a/
|
21 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-4-1-requirements-of-interested-parties/
|
22 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-4-2-needs-and-expectations-of-key-parties/
|
23 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-4-3-how-to-determine-the-scope-of-your-isms/
|
24 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-4-4-information-security-management-system/
|
25 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-5-1-leadership-and-commitment/
|
26 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-5-2-information-security-policy/
|
27 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-5-3-organisational-roles-responsibilities-and-authorities/
|
28 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-6-1-actions-to-address-risks-and-opportunities/
|
29 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-6-2-information-security-objectives/
|
30 |
+
https://www.british-assessment.co.uk/insights/a-complete-guide-to-iso-iec-270012022/
|
31 |
+
https://secureframe.com/blog/iso-27001-2022
|
32 |
+
https://www.dataguard.co.uk/blog/iso-27001-risk-treatment-plan-what-you-need-to-know
|
33 |
+
https://www.creative-n.com/blog/how-much-does-it-cost-to-implement-iso-27001/
|
34 |
+
https://www.secfix.com/post/when-is-an-iso-27001-certification-required
|
35 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-1-resources-for-isms/
|
36 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-2-competence/
|
37 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-3-awareness/
|
38 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-4-communication/
|
39 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-5-documented-information/
|
40 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-8-1-operational-planning-and-control/
|
41 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-8-2-information-security-risk-assessment/
|
42 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-8-3-information-security-risk-treatment/
|
43 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-9-1-monitoring-measurement-analysis-and-evaluation/
|
44 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-9-2-internal-audit/
|
45 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-9-3-management-review/
|
46 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-10-1-continual-improvement/
|
47 |
+
https://www.dataguard.co.uk/knowledge/iso-27001/clause-10-2-nonconformity-and-corrective-action/
|
48 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a5-information-security-policies
|
49 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.6-organisation-information-security/
|
50 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.7-human-resource-security/
|
51 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.8-asset-management
|
52 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.9-access-control/
|
53 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.10-cryptography
|
54 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.11-physical-and-environmental-security/
|
55 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.12-operations-security
|
56 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.13-communications-security/
|
57 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.14-system-acquisition-development-and-maintenance/
|
58 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.15-supplier-relationships/
|
59 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.16-information-security-incident-management/
|
60 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.17-information-security-aspects-of-business-continuity-management/
|
61 |
+
https://www.dataguard.co.uk/blog/iso-27001-annex-a.18-compliance/
|
62 |
+
https://reciprocity.com/difference-between-gdpr-and-iso-27001/
|
input_data/Web/URLs/uncleaned_urls.txt
ADDED
File without changes
|
input_data/Web/documents/all_documents
ADDED
File without changes
|
input_data/Web/documents/new_documents
ADDED
File without changes
|
requirements.txt
ADDED
Binary file (7.27 kB). View file
|
|
requirements_Docker.txt
ADDED
Binary file (7.27 kB). View file
|
|
setup.sh
ADDED
@@ -0,0 +1,16 @@
1 |
+
#!/bin/bash
|
2 |
+
python -m venv venv
|
3 |
+
source venv/bin/activate
|
4 |
+
python -m pip install --upgrade pip
|
5 |
+
python -m pip install -r requirements.txt
|
6 |
+
python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
|
7 |
+
echo "Initialization completed."
|
8 |
+
|
9 |
+
# Creating all necessary directories at once
|
10 |
+
mkdir -p ./input_data/PDF/files \
|
11 |
+
./input_data/PDF/PDF_images \
|
12 |
+
./input_data/Templates/template_files/new \
|
13 |
+
./input_data/Templates/template_files/processed \
|
14 |
+
./chroma
|
15 |
+
|
16 |
+
echo "Directories and necessary files created."
|
sparse_index/sparse_1536_264
ADDED
The diff for this file is too large to render.
See raw diff
|
|