MikeCraBash committed on
Commit
d3ab2e6
β€’
1 Parent(s): ae1fad4
Files changed (5)
  1. Dockerfile +12 -0
  2. README.md +4 -4
  3. app.py +113 -0
  4. chainlit.md +3 -0
  5. requirements.txt +206 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
+ FROM python:3.9
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+ COPY --chown=user . $HOME/app
+ COPY ./requirements.txt $HOME/app/requirements.txt
+ RUN pip install -r requirements.txt
+ COPY . .
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
+
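To sanity-check the image locally before pushing it to the Space, something like the following should work (the image tag "prepr" is arbitrary, and the .env file is assumed to contain the OpenAI API key that app.py loads; port 7860 matches the CMD above):

docker build -t prepr .
docker run --env-file .env -p 7860:7860 prepr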
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: PrepGoogle4o
- emoji: 📉
- colorFrom: green
- colorTo: pink
+ title: Prepr
+ emoji: 👍
+ colorFrom: blue
+ colorTo: gray
  sdk: docker
  pinned: false
  license: openrail
  ---
app.py ADDED
@@ -0,0 +1,113 @@
+ # AI MAKERSPACE PREPR
+ # Date: 2024-5-16
+
+ # Basic Imports & Setup
+ import os
+ from openai import AsyncOpenAI
+
+ # Using Chainlit for our UI
+ import chainlit as cl
+ from chainlit.prompt import Prompt, PromptMessage
+ from chainlit.playground.providers import ChatOpenAI
+
+ # Getting the API key from the .env file
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # RAG pipeline imports and setup code
+ # Get the DeveloperWeek PDF file (future implementation: direct download from URL)
+ from langchain.document_loaders import PyMuPDFLoader
+
+ # Adjust the URL to the direct download format
+ #file_id = "1JeA-w4kvbI3GHk9Dh_j19_Q0JUDE7hse"
+ #direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+
+ file_id = "11Bq38osADZtTxGudM9OJr51BV9YwKsf3"
+ direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+
+ # Now load the document using the direct URL
+ docs = PyMuPDFLoader(direct_url).load()
+
+ import tiktoken
+ def tiktoken_len(text):
+     tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(
+         text,
+     )
+     return len(tokens)
+
+ # Split the document into chunks
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size = 500,    # 500 tokens per chunk, experiment with this value
+     chunk_overlap = 50,  # 50 tokens overlap between chunks, experiment with this value
+     length_function = tiktoken_len,
+ )
+
+ split_chunks = text_splitter.split_documents(docs)
+
+ # Load the embeddings model
+ from langchain_openai.embeddings import OpenAIEmbeddings
+
+ embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+
+ # Load the vector store and retriever from Qdrant
+ from langchain_community.vectorstores import Qdrant
+
+ qdrant_vectorstore = Qdrant.from_documents(
+     split_chunks,
+     embedding_model,
+     location=":memory:",
+     collection_name="Prepr",
+ )
+
+ qdrant_retriever = qdrant_vectorstore.as_retriever()
+
+ from langchain_openai import ChatOpenAI
+ openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
+
+ from langchain_core.prompts import ChatPromptTemplate
+ RAG_PROMPT = """
+ CONTEXT:
+ {context}
+
+ QUERY:
+ {question}
+
+ You are a personal assistant for a professional. Your tone is professional and considerate. Before proceeding to answer about which conference sessions the user should attend, be sure to ask them what key topics they are hoping to learn from the conference, and if there are any specific sessions they are keen on attending. Use the provided context to answer the user's query. You are a professional personal assistant for an executive professional in a high tech company. You help them plan for events and meetings. You always review the provided event information. You can look up the dates and locations where event sessions take place from the document. If you do not know the answer, or cannot answer, please respond with "Insufficient data for further analysis, please try again". For each session you suggest, include bullet points with the session title, speaker, company, topic, AI industry relevance, details of their work in AI, main point likely to be made, and three questions to ask the speaker. You end your successful responses with "Is there anything else that I can help you with?". If the user says NO, or any other negative response, then you ask "How did I do?"
+ """
+
+ rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
+
+ from operator import itemgetter
+ from langchain.schema.output_parser import StrOutputParser
+ from langchain.schema.runnable import RunnablePassthrough
+
+ retrieval_augmented_qa_chain = (
+     {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
+     | RunnablePassthrough.assign(context=itemgetter("context"))
+     | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
+ )
+
+ # Chainlit App
+ @cl.on_chat_start
+ async def start_chat():
+     settings = {
+         "model": "gpt-3.5-turbo",
+         "temperature": 0,
+         "max_tokens": 500,
+         "top_p": 1,
+         "frequency_penalty": 0,
+         "presence_penalty": 0,
+     }
+     cl.user_session.set("settings", settings)
+
+ @cl.on_message
+ async def main(message: cl.Message):
+     chainlit_question = message.content
+     #chainlit_question = "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"
+     response = retrieval_augmented_qa_chain.invoke({"question": chainlit_question})
+     chainlit_answer = response["response"].content
+
+     msg = cl.Message(content=chainlit_answer)
+     await msg.send()
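As a quick check outside the Chainlit UI, the chain defined above can also be invoked directly, since everything is built at module level. A minimal sketch, assuming OPENAI_API_KEY is present in .env (read by load_dotenv inside app.py) and using a purely illustrative question:

# Minimal sketch: exercise the RAG chain from app.py without the Chainlit UI.
# Importing app also runs the indexing step (PDF download, chunking, embedding),
# so the first call takes a moment. The question string is illustrative only.
from app import retrieval_augmented_qa_chain

result = retrieval_augmented_qa_chain.invoke(
    {"question": "Which sessions should an AI platform lead attend?"}
)
print(result["response"].content)   # the model's answer
print(len(result["context"]))       # number of retrieved chunks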
chainlit.md ADDED
@@ -0,0 +1,3 @@
+ # AI Makerspace Demo Day - Prepr
+
+ Welcome to Prepr, your personal preparation assistant. I can help you prepare for a conference, a meeting, or an interview ... How can I help you today?
requirements.txt ADDED
@@ -0,0 +1,206 @@
+ aiofiles==23.2.1
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ altair==5.3.0
+ annotated-types==0.6.0
+ anyio==3.7.1
+ appdirs==1.4.4
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ async-lru==2.0.4
+ asyncer==0.0.2
+ attrs==23.2.0
+ Babel==2.14.0
+ backoff==2.2.1
+ beautifulsoup4==4.12.3
+ bidict==0.23.1
+ bleach==6.1.0
+ blinker==1.8.1
+ cachetools==5.3.3
+ certifi==2024.2.2
+ cffi==1.16.0
+ chainlit
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cohere==4.37
+ contourpy==1.2.1
+ curl_cffi==0.6.2
+ cycler==0.12.1
+ dataclasses-json==0.5.14
+ datasets==2.19.0
+ defusedxml==0.7.1
+ Deprecated==1.2.14
+ dill==0.3.8
+ dirtyjson==1.0.8
+ distro==1.9.0
+ docker==7.0.0
+ docker-pycreds==0.4.0
+ duckduckgo_search==5.3.0
+ fastapi==0.100.1
+ fastapi-socketio==0.0.10
+ fastavro==1.9.4
+ fastjsonschema==2.19.1
+ filelock==3.13.4
+ filetype==1.2.0
+ fonttools==4.51.0
+ fqdn==1.5.1
+ frozenlist==1.4.1
+ fsspec==2024.3.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ googleapis-common-protos==1.63.0
+ grandalf==0.8
+ greenlet==3.0.3
+ grpcio==1.62.2
+ grpcio-tools==1.62.2
+ h11==0.14.0
+ h2==4.1.0
+ hpack==4.0.0
+ httpcore==0.17.3
+ httpx
+ huggingface-hub==0.22.2
+ hyperframe==6.0.1
+ idna==3.6
+ importlib-metadata==6.11.0
+ install==1.3.5
+ ipywidgets==8.1.2
+ isoduration==20.11.0
+ Jinja2==3.1.3
+ joblib==1.4.0
+ json5==0.9.25
+ jsonpatch==1.33
+ jsonpointer==2.4
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ jupyter==1.0.0
+ jupyter-console==6.6.3
+ jupyter-events==0.10.0
+ jupyter-lsp==2.2.5
+ jupyter_server==2.14.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.26.0
+ jupyterlab_widgets==3.0.10
+ kiwisolver==1.4.5
+ langchain==0.1.17
+ langchain-community==0.0.36
+ langchain-core==0.1.50
+ langchain-openai==0.1.6
+ langchain-text-splitters==0.0.1
+ langchainhub==0.1.15
+ langsmith==0.1.48
+ Lazify==0.4.0
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ marshmallow==3.21.1
+ matplotlib==3.8.4
+ mdurl==0.1.2
+ mistune==3.0.2
+ multidict==6.0.5
+ multiprocess==0.70.16
+ mypy-extensions==1.0.0
+ nbclient==0.10.0
+ nbconvert==7.16.3
+ nbformat==5.10.4
+ networkx
+ nltk==3.8.1
+ notebook==7.1.2
+ notebook_shim==0.2.4
+ numpy==1.26.4
+ openai==1.25.1
+ opentelemetry-api==1.24.0
+ opentelemetry-exporter-otlp==1.24.0
+ opentelemetry-exporter-otlp-proto-common==1.24.0
+ opentelemetry-exporter-otlp-proto-grpc==1.24.0
+ opentelemetry-exporter-otlp-proto-http==1.24.0
+ opentelemetry-instrumentation==0.45b0
+ opentelemetry-proto==1.24.0
+ opentelemetry-sdk==1.24.0
+ opentelemetry-semantic-conventions==0.45b0
+ orjson==3.10.1
+ overrides==7.7.0
+ packaging==23.2
+ pandas==2.2.2
+ pandocfilters==1.5.1
+ pillow==10.3.0
+ plotly==5.22.0
+ portalocker==2.8.2
+ prometheus_client==0.20.0
+ protobuf==4.25.3
+ pyarrow==16.0.0
+ pyarrow-hotfix==0.6
+ pycparser==2.22
+ pydantic==2.6.4
+ pydantic_core==2.16.3
+ pydeck==0.9.0
+ PyJWT==2.8.0
+ PyMuPDF==1.24.2
+ PyMuPDFb==1.24.1
+ pyparsing==3.1.2
+ pypdf==4.2.0
+ pysbd==0.3.4
+ python-dotenv==1.0.0
+ python-engineio==4.9.0
+ python-graphql-client==0.4.3
+ python-json-logger==2.0.7
+ python-magic==0.4.27
+ python-multipart==0.0.6
+ python-socketio==5.11.2
+ pytz==2024.1
+ PyYAML==6.0.1
+ qdrant-client==1.9.1
+ qtconsole==5.5.1
+ QtPy==2.4.1
+ ragas==0.1.7
+ referencing==0.34.0
+ regex==2024.4.16
+ requests==2.31.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==13.7.1
+ rpds-py==0.18.0
+ scikit-learn==1.4.2
+ scipy==1.13.0
+ Send2Trash==1.8.3
+ sentry-sdk==1.45.0
+ setproctitle==1.3.3
+ simple-websocket==1.0.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ soupsieve==2.5
+ SQLAlchemy==2.0.29
+ starlette==0.27.0
+ streamlit==1.33.0
+ striprtf==0.0.26
+ syncer==2.0.3
+ tenacity==8.2.3
+ terminado==0.18.1
+ threadpoolctl==3.4.0
+ tiktoken==0.6.0
+ tinycss2==1.2.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.1
+ tqdm==4.66.2
+ types-python-dateutil==2.9.0.20240316
+ types-requests==2.31.0.20240406
+ typing-inspect==0.9.0
+ tzdata==2024.1
+ uptrace==1.24.0
+ uri-template==1.3.0
+ urllib3==2.2.1
+ uvicorn==0.23.2
+ wandb==0.16.6
+ watchfiles==0.20.0
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.7.0
+ websockets==12.0
+ widgetsnbextension==4.0.10
+ wikipedia==1.4.0
+ wrapt==1.16.0
+ wsproto==1.2.0
+ xxhash==3.4.1
+ yarl==1.9.4