danielsuarez-mash
committed on
Commit
•
bdcb863
1
Parent(s):
f45a84a
New changes
Browse files
- .DS_Store +0 -0
- .gitignore +1 -0
- app.py +4 -5
- example_documents/Daniel's Resume-2.pdf +0 -0
- llm_handbook.ipynb +29 -17
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.env
|
app.py
CHANGED
@@ -8,7 +8,6 @@ from langchain_community.vectorstores import FAISS
|
|
8 |
from langchain.prompts import PromptTemplate
|
9 |
from langchain_community.llms import HuggingFaceHub
|
10 |
from langchain_core.runnables import RunnablePassthrough
|
11 |
-
from langchain_core.runnables import RunnableSequence
|
12 |
from langchain_core.output_parsers import StrOutputParser
|
13 |
|
14 |
st.title('LLM - Retrieval Augmented Generation')
|
@@ -21,7 +20,6 @@ def authenticate():
|
|
21 |
|
22 |
# if running on cloud
|
23 |
try:
|
24 |
-
os.environ["HUGGINGFACEHUB_API_TOKEN"]
|
25 |
st.write(
|
26 |
"Has environment variables been set:",
|
27 |
os.environ["HUGGINGFACEHUB_API_TOKEN"] == st.secrets["HUGGINGFACEHUB_API_TOKEN"])
|
@@ -50,12 +48,12 @@ def load_pdf(pdf):
|
|
50 |
|
51 |
return text
|
52 |
|
53 |
-
def split_text(text):
|
54 |
|
55 |
# split
|
56 |
text_splitter = RecursiveCharacterTextSplitter(
|
57 |
-
chunk_size=
|
58 |
-
chunk_overlap=
|
59 |
separators=["\n\n", "\n", " ", ""]
|
60 |
)
|
61 |
|
@@ -123,6 +121,7 @@ def main():
|
|
123 |
|
124 |
# load split store
|
125 |
vectorstore = load_split_store(pdf)
|
|
|
126 |
|
127 |
# create a retriever using vectorstore
|
128 |
retriever = vectorstore.as_retriever()
|
|
|
8 |
from langchain.prompts import PromptTemplate
|
9 |
from langchain_community.llms import HuggingFaceHub
|
10 |
from langchain_core.runnables import RunnablePassthrough
|
|
|
11 |
from langchain_core.output_parsers import StrOutputParser
|
12 |
|
13 |
st.title('LLM - Retrieval Augmented Generation')
|
|
|
20 |
|
21 |
# if running on cloud
|
22 |
try:
|
|
|
23 |
st.write(
|
24 |
"Has environment variables been set:",
|
25 |
os.environ["HUGGINGFACEHUB_API_TOKEN"] == st.secrets["HUGGINGFACEHUB_API_TOKEN"])
|
|
|
48 |
|
49 |
return text
|
50 |
|
51 |
+
def split_text(text, chunk_size=400, chunk_overlap=20):
|
52 |
|
53 |
# split
|
54 |
text_splitter = RecursiveCharacterTextSplitter(
|
55 |
+
chunk_size=chunk_size,
|
56 |
+
chunk_overlap=chunk_overlap,
|
57 |
separators=["\n\n", "\n", " ", ""]
|
58 |
)
|
59 |
|
|
|
121 |
|
122 |
# load split store
|
123 |
vectorstore = load_split_store(pdf)
|
124 |
+
st.write('PDF vectorized')
|
125 |
|
126 |
# create a retriever using vectorstore
|
127 |
retriever = vectorstore.as_retriever()
|
example_documents/Daniel's Resume-2.pdf
ADDED
Binary file (82.8 kB). View file
|
|
llm_handbook.ipynb
CHANGED
@@ -37,7 +37,7 @@
|
|
37 |
},
|
38 |
{
|
39 |
"cell_type": "code",
|
40 |
-
"execution_count":
|
41 |
"id": "9fcd2583-d0ab-4649-a241-4526f6a3b83d",
|
42 |
"metadata": {
|
43 |
"id": "9fcd2583-d0ab-4649-a241-4526f6a3b83d"
|
@@ -46,10 +46,9 @@
|
|
46 |
"source": [
|
47 |
"# import packages\n",
|
48 |
"import os\n",
|
49 |
-
"import
|
50 |
-
"import
|
51 |
-
"from langchain import
|
52 |
-
"from dotenv import load_dotenv"
|
53 |
]
|
54 |
},
|
55 |
{
|
@@ -59,21 +58,29 @@
|
|
59 |
"id": "AyRxKsE4qPR1"
|
60 |
},
|
61 |
"source": [
|
62 |
-
"#API KEY"
|
63 |
]
|
64 |
},
|
65 |
{
|
66 |
"cell_type": "code",
|
67 |
-
"execution_count":
|
68 |
"id": "cf146257-5014-4041-980c-0ead2c3932c3",
|
69 |
"metadata": {
|
70 |
"id": "cf146257-5014-4041-980c-0ead2c3932c3"
|
71 |
},
|
72 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
"source": [
|
74 |
"# LOCAL\n",
|
75 |
"load_dotenv()\n",
|
76 |
-
"os.environ.get('HUGGINGFACEHUB_API_TOKEN')
|
77 |
]
|
78 |
},
|
79 |
{
|
@@ -90,14 +97,14 @@
|
|
90 |
},
|
91 |
{
|
92 |
"cell_type": "code",
|
93 |
-
"execution_count":
|
94 |
"id": "06c54d35-e9a2-4043-b3c3-588ac4f4a0d1",
|
95 |
"metadata": {
|
96 |
"id": "06c54d35-e9a2-4043-b3c3-588ac4f4a0d1"
|
97 |
},
|
98 |
"outputs": [],
|
99 |
"source": [
|
100 |
-
"from langchain import PromptTemplate\n",
|
101 |
"\n",
|
102 |
"# create template\n",
|
103 |
"template = \"\"\"\n",
|
@@ -125,7 +132,7 @@
|
|
125 |
},
|
126 |
{
|
127 |
"cell_type": "code",
|
128 |
-
"execution_count":
|
129 |
"id": "03290cad-f6be-4002-b177-00220f22333a",
|
130 |
"metadata": {
|
131 |
"colab": {
|
@@ -136,11 +143,16 @@
|
|
136 |
},
|
137 |
"outputs": [
|
138 |
{
|
139 |
-
"
|
140 |
-
"
|
141 |
-
"
|
142 |
-
|
143 |
-
"
|
|
|
|
|
|
|
|
|
|
|
144 |
]
|
145 |
}
|
146 |
],
|
|
|
37 |
},
|
38 |
{
|
39 |
"cell_type": "code",
|
40 |
+
"execution_count": 11,
|
41 |
"id": "9fcd2583-d0ab-4649-a241-4526f6a3b83d",
|
42 |
"metadata": {
|
43 |
"id": "9fcd2583-d0ab-4649-a241-4526f6a3b83d"
|
|
|
46 |
"source": [
|
47 |
"# import packages\n",
|
48 |
"import os\n",
|
49 |
+
"from dotenv import load_dotenv\n",
|
50 |
+
"from langchain_community.llms import HuggingFaceHub\n",
|
51 |
+
"from langchain.chains import LLMChain"
|
|
|
52 |
]
|
53 |
},
|
54 |
{
|
|
|
58 |
"id": "AyRxKsE4qPR1"
|
59 |
},
|
60 |
"source": [
|
61 |
+
"# API KEY"
|
62 |
]
|
63 |
},
|
64 |
{
|
65 |
"cell_type": "code",
|
66 |
+
"execution_count": 17,
|
67 |
"id": "cf146257-5014-4041-980c-0ead2c3932c3",
|
68 |
"metadata": {
|
69 |
"id": "cf146257-5014-4041-980c-0ead2c3932c3"
|
70 |
},
|
71 |
+
"outputs": [
|
72 |
+
{
|
73 |
+
"name": "stdout",
|
74 |
+
"output_type": "stream",
|
75 |
+
"text": [
|
76 |
+
"None\n"
|
77 |
+
]
|
78 |
+
}
|
79 |
+
],
|
80 |
"source": [
|
81 |
"# LOCAL\n",
|
82 |
"load_dotenv()\n",
|
83 |
+
"print(os.environ.get('HUGGINGFACEHUB_API_TOKEN'))"
|
84 |
]
|
85 |
},
|
86 |
{
|
|
|
97 |
},
|
98 |
{
|
99 |
"cell_type": "code",
|
100 |
+
"execution_count": 18,
|
101 |
"id": "06c54d35-e9a2-4043-b3c3-588ac4f4a0d1",
|
102 |
"metadata": {
|
103 |
"id": "06c54d35-e9a2-4043-b3c3-588ac4f4a0d1"
|
104 |
},
|
105 |
"outputs": [],
|
106 |
"source": [
|
107 |
+
"from langchain.prompts import PromptTemplate\n",
|
108 |
"\n",
|
109 |
"# create template\n",
|
110 |
"template = \"\"\"\n",
|
|
|
132 |
},
|
133 |
{
|
134 |
"cell_type": "code",
|
135 |
+
"execution_count": 14,
|
136 |
"id": "03290cad-f6be-4002-b177-00220f22333a",
|
137 |
"metadata": {
|
138 |
"colab": {
|
|
|
143 |
},
|
144 |
"outputs": [
|
145 |
{
|
146 |
+
"ename": "ValidationError",
|
147 |
+
"evalue": "1 validation error for HuggingFaceHub\n__root__\n Did not find huggingfacehub_api_token, please add an environment variable `HUGGINGFACEHUB_API_TOKEN` which contains it, or pass `huggingfacehub_api_token` as a named parameter. (type=value_error)",
|
148 |
+
"output_type": "error",
|
149 |
+
"traceback": [
|
150 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
151 |
+
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
|
152 |
+
"Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# instantiate llm\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mHuggingFaceHub\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtiiuae/falcon-7b-instruct\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtemperature\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpenalty_alpha\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtop_k\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m50\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmax_length\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1000\u001b[39;49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# instantiate chain\u001b[39;00m\n\u001b[1;32m 13\u001b[0m llm_chain \u001b[38;5;241m=\u001b[39m LLMChain(\n\u001b[1;32m 14\u001b[0m llm\u001b[38;5;241m=\u001b[39mllm,\n\u001b[1;32m 15\u001b[0m prompt\u001b[38;5;241m=\u001b[39mprompt,\n\u001b[1;32m 16\u001b[0m 
verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 17\u001b[0m )\n",
|
153 |
+
"File \u001b[0;32m~/anaconda3/envs/llm/lib/python3.11/site-packages/langchain_core/load/serializable.py:107\u001b[0m, in \u001b[0;36mSerializable.__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 107\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lc_kwargs \u001b[38;5;241m=\u001b[39m kwargs\n",
|
154 |
+
"File \u001b[0;32m~/anaconda3/envs/llm/lib/python3.11/site-packages/pydantic/v1/main.py:341\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(__pydantic_self__, **data)\u001b[0m\n\u001b[1;32m 339\u001b[0m values, fields_set, validation_error \u001b[38;5;241m=\u001b[39m validate_model(__pydantic_self__\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m, data)\n\u001b[1;32m 340\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validation_error:\n\u001b[0;32m--> 341\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m validation_error\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 343\u001b[0m object_setattr(__pydantic_self__, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m__dict__\u001b[39m\u001b[38;5;124m'\u001b[39m, values)\n",
|
155 |
+
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for HuggingFaceHub\n__root__\n Did not find huggingfacehub_api_token, please add an environment variable `HUGGINGFACEHUB_API_TOKEN` which contains it, or pass `huggingfacehub_api_token` as a named parameter. (type=value_error)"
|
156 |
]
|
157 |
}
|
158 |
],
|