mtyrrell committed · verified · Commit 5729146 · 1 parent: a74ebbe

Update appStore/rag.py

Files changed (1):
  appStore/rag.py (+176 −40)
appStore/rag.py CHANGED
@@ -1,3 +1,104 @@
+# import os
+# # import json
+# import numpy as np
+# import pandas as pd
+# import openai
+# from haystack.schema import Document
+# import streamlit as st
+# from tenacity import retry, stop_after_attempt, wait_random_exponential
+
+
+# # Get openai API key
+# # openai.api_key = os.environ["OPENAI_API_KEY"]
+# hf_token = os.environ["HF_API_KEY"]
+# #model_select = "gpt-3.5-turbo-0125"
+# model_select ="gpt-4"
+
+# # define a special function for putting the prompt together (as we can't use haystack)
+# def get_prompt(context, label):
+#     base_prompt="Summarize the following context efficiently in bullet points, the less the better - but keep concrete goals. \
+#     Summarize only elements of the context that address vulnerability of "+label+" to climate change. \
+#     If there is no mention of "+label+" in the context, return nothing. \
+#     Formatting example: \
+#     - Bullet point 1 \
+#     - Bullet point 2 \
+#     "
+
+#     # Add the meta data for references
+#     # context = ' - '.join([d.content for d in docs])
+#     prompt = base_prompt+"; Context: "+context+"; Answer:"
+
+#     return prompt
+
+# # def get_prompt(context, label):
+# #     base_prompt="Summarize the following context efficiently in bullet points, the less the better - but keep concrete goals. \
+# #     Summarize only elements of the context that address vulnerability to climate change. \
+# #     Formatting example: \
+# #     - Bullet point 1 \
+# #     - Bullet point 2 \
+# #     "
+
+# #     # Add the meta data for references
+# #     # context = ' - '.join([d.content for d in docs])
+# #     prompt = base_prompt+"; Context: "+context+"; Answer:"
+
+# #     return prompt
+
+# # base_prompt="Summarize the following context efficiently in bullet points, the less the better- but keep concrete goals. \
+# #     Summarize only activities that address the vulnerability of "+label+" to climate change. \
+# #     Formatting example: \
+# #     - Collect and utilize gender-disaggregated data to inform and improve climate change adaptation efforts. \
+# #     - Prioritize gender sensitivity in adaptation options, ensuring participation and benefits for women, who are more vulnerable to climate impacts. \
+# #     "
+# # # convert df rows to Document object so we can feed it into the summarizer easily
+# # def get_document(df):
+# #     # we take a list of each extract
+# #     ls_dict = []
+# #     for index, row in df.iterrows():
+# #         # Create a Document object for each row (we only need the text)
+# #         doc = Document(
+# #             row['text'],
+# #             meta={
+# #                 'label': row['Vulnerability Label']}
+# #         )
+# #         # Append the Document object to the documents list
+# #         ls_dict.append(doc)
+
+# #     return ls_dict
+
+
+# # exception handling for issuing multiple API calls to openai (exponential backoff)
+# @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+# def completion_with_backoff(**kwargs):
+#     return openai.ChatCompletion.create(**kwargs)
+
+
+# # construct RAG query, send to openai and process response
+# def run_query(context, label):
+#     '''
+#     For non-streamed completion, enable the following 2 lines and comment out the code below
+#     '''
+#     # res = openai.ChatCompletion.create(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}])
+#     # result = res.choices[0].message.content
+
+#     # instantiate ChatCompletion as a generator object (stream is set to True)
+#     response = completion_with_backoff(model=model_select, messages=[{"role": "user", "content": get_prompt(context, label)}], stream=True)
+#     # iterate through the streamed output
+#     report = []
+#     res_box = st.empty()
+#     for chunk in response:
+#         # extract the object containing the text (totally different structure when streaming)
+#         chunk_message = chunk['choices'][0]['delta']
+#         # test to make sure there is text in the object (some don't have)
+#         if 'content' in chunk_message:
+#             report.append(chunk_message.content) # extract the message
+#             # add the latest text and merge it with all previous
+#             result = "".join(report).strip()
+#             # res_box.success(result) # output to response text box
+#     res_box.success(result)
+
+
+
 import os
 # import json
 import numpy as np
@@ -6,12 +107,12 @@ import openai
 from haystack.schema import Document
 import streamlit as st
 from tenacity import retry, stop_after_attempt, wait_random_exponential
+from huggingface_hub import InferenceClient
 
 
 # Get openai API key
-# openai.api_key = os.environ["OPENAI_API_KEY"]
-#model_select = "gpt-3.5-turbo-0125"
-model_select ="gpt-4"
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
 
 # define a special function for putting the prompt together (as we can't use haystack)
 def get_prompt(context, label):
@@ -29,59 +130,91 @@ def get_prompt(context, label):
 
     return prompt
 
-# def get_prompt(context, label):
-#     base_prompt="Summarize the following context efficiently in bullet points, the less the better - but keep concrete goals. \
-#     Summarize only elements of the context that address vulnerability to climate change. \
-#     Formatting example: \
-#     - Bullet point 1 \
-#     - Bullet point 2 \
-#     "
 
-#     # Add the meta data for references
-#     # context = ' - '.join([d.content for d in docs])
-#     prompt = base_prompt+"; Context: "+context+"; Answer:"
-
-#     return prompt
 
-# base_prompt="Summarize the following context efficiently in bullet points, the less the better- but keep concrete goals. \
-#     Summarize only activities that address the vulnerability of "+label+" to climate change. \
-#     Formatting example: \
-#     - Collect and utilize gender-disaggregated data to inform and improve climate change adaptation efforts. \
-#     - Prioritize gender sensitivity in adaptation options, ensuring participation and benefits for women, who are more vulnerable to climate impacts. \
-#     "
-# # convert df rows to Document object so we can feed it into the summarizer easily
-# def get_document(df):
-#     # we take a list of each extract
-#     ls_dict = []
-#     for index, row in df.iterrows():
-#         # Create a Document object for each row (we only need the text)
-#         doc = Document(
-#             row['text'],
-#             meta={
-#                 'label': row['Vulnerability Label']}
-#         )
-#         # Append the Document object to the documents list
-#         ls_dict.append(doc)
 
-#     return ls_dict
 
 
-# exception handling for issuing multiple API calls to openai (exponential backoff)
-@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
-def completion_with_backoff(**kwargs):
-    return openai.ChatCompletion.create(**kwargs)
+
+# # exception handling for issuing multiple API calls to openai (exponential backoff)
+# @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+# def completion_with_backoff(**kwargs):
+#     return openai.ChatCompletion.create(**kwargs)
 
 
+def get_prompt(context, label):
+    base_prompt="Summarize the following context efficiently in bullet points, the less the better - but keep concrete goals. \
+    Summarize only elements of the context that address vulnerability of "+label+" to climate change. \
+    If there is no mention of "+label+" in the context, return nothing. \
+    Do not include an introduction sentence, just the bullet points as per below. \
+    Formatting example: \
+    - Bullet point 1 \
+    - Bullet point 2 \
+    "
+
+    # Add the meta data for references
+    # context = ' - '.join([d.content for d in docs])
+    prompt = base_prompt+"; Context: "+context+"; Answer:"
+
+    return prompt
+
+
+# # construct RAG query, send to openai and process response
+# def run_query(context, label, chatbot_role):
+#     '''
+#     For non-streamed completion, enable the following 2 lines and comment out the code below
+#     '''
+#     # res = openai.ChatCompletion.create(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}])
+#     # result = res.choices[0].message.content
+
+#     messages = [
+#         ChatMessage(role="system", content=chatbot_role),
+#         ChatMessage(role="user", content=get_prompt(context, label)),
+#     ]
+#     response = llm.chat(messages)
+#     return(response)
+
+
+
+# tokenizer = AutoTokenizer.from_pretrained(
+#     "meta-llama/Meta-Llama-3.1-8B-Instruct",
+#     token=hf_token,
+# )
+
+# stopping_ids = [
+#     tokenizer.eos_token_id,
+#     tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+# ]
+
+# Define the role of the chatbot
+# chatbot_role = """You are an analyst specializing in climate change impact assessments and producing insights from policy documents."""
+
 # construct RAG query, send to openai and process response
 def run_query(context, label):
     '''
     For non-streamed completion, enable the following 2 lines and comment out the code below
     '''
+    chatbot_role = """You are an analyst specializing in climate change impact assessments and producing insights from policy documents."""
+
     # res = openai.ChatCompletion.create(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}])
     # result = res.choices[0].message.content
 
+    # Initialize the client, pointing it to one of the available models
+    client = InferenceClient()
+
+    response = client.chat.completions.create(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        messages=[
+            ChatMessage(role="system", content=chatbot_role),
+            ChatMessage(role="user", content=get_prompt(context, label)),
+        ],
+        stream=True,
+        max_tokens=500
+    )
+
+    # iterate and print stream
+    for message in chat_completion:
+        print(message.choices[0].delta.content, end="")
+
     # instantiate ChatCompletion as a generator object (stream is set to True)
-    response = completion_with_backoff(model=model_select, messages=[{"role": "user", "content": get_prompt(context, label)}], stream=True)
+    # response = completion_with_backoff(model=model_select, messages=[{"role": "user", "content": get_prompt(context, label)}], stream=True)
     # iterate through the streamed output
     report = []
     res_box = st.empty()
@@ -102,3 +235,6 @@ def run_query(context, label):
 
 
 
+
+
+
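
A note on the new streaming block: as committed, run_query references ChatMessage (never imported) and iterates chat_completion (never defined), so the Llama path will raise a NameError at runtime. Below is a minimal working sketch of the same streamed call, not the committed code: it reuses this file's get_prompt and the st.empty() pattern, and it assumes a huggingface_hub version that exposes the OpenAI-compatible chat.completions.create alias, plain dict messages instead of ChatMessage, and authentication via the HF_API_KEY variable already present in the commented-out code.

import os
import streamlit as st
from huggingface_hub import InferenceClient

# role string copied from the commit
chatbot_role = """You are an analyst specializing in climate change impact assessments and producing insights from policy documents."""

def run_query(context, label):
    # assumption: authenticate with the HF_API_KEY env var seen earlier in this file
    client = InferenceClient(token=os.environ["HF_API_KEY"])
    # plain dict messages avoid the undefined ChatMessage class
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=[
            {"role": "system", "content": chatbot_role},
            {"role": "user", "content": get_prompt(context, label)},
        ],
        stream=True,
        max_tokens=500,
    )
    # stream into the Streamlit placeholder instead of print()
    report = []
    res_box = st.empty()
    for chunk in response:
        delta = chunk.choices[0].delta.content
        if delta:
            report.append(delta)
            res_box.success("".join(report).strip())

Accumulating chunks into report and rewriting res_box on each delta mirrors the OpenAI streaming loop the commit keeps at the bottom of run_query, so the Streamlit UI behaves the same whichever backend is active.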