Weedoo committed (verified)
Commit dacd607 · 1 Parent(s): 49d6487
Files changed (1):
  1. utils.py +162 -55
utils.py CHANGED
@@ -4,7 +4,12 @@ import requests
 from pinecone import Pinecone, ServerlessSpec
 import logging
 import os
+import asyncio
+from dotenv import load_dotenv
+
+load_dotenv(".env")

 script_dir = os.path.dirname(os.path.abspath(__file__))
-os.chdir(script_dir)
+os.chdir(script_dir)
+
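Review note: load_dotenv(".env") supplies the keys that main() and recs() later read through os.getenv. A guard like the following (a sketch, not part of the commit; the variable names are the ones this diff actually reads) would fail fast instead of passing None into the API clients:

    import os
    from dotenv import load_dotenv

    load_dotenv(".env")

    # Every environment variable this script reads via os.getenv().
    REQUIRED = [
        "ZOTERO_API_KEY", "ZOTERO_LIBRARY_ID", "ZOTERO_TAG", "HF_API_KEY",
        "PINECONE_API_KEY", "INDEX_NAME", "NAMESPACE_NAME",
        "ARXIV_CATEGORY_NAME", "ARXIV_COMMENT_QUERY",
    ]
    missing = [name for name in REQUIRED if os.getenv(name) is None]
    if missing:
        raise RuntimeError("missing .env entries: " + ", ".join(missing))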
 
@@ -11,10 +16,11 @@
 def get_zotero_ids(api_key, library_id, tag):

-    base_url = 'https://api.zotero.org'
-    suffix = '/users/'+ library_id +'/items?tag='+ tag
+    base_url = "https://api.zotero.org"
+    suffix = "/users/" + library_id + "/items?tag=" + tag
+
+    header = {"Authorization": "Bearer " + api_key}
+    request = requests.get(base_url + suffix, headers=header)

-    header = {'Authorization': 'Bearer '+ api_key}
-    request = requests.get(base_url + suffix, headers= header)
-
-    return [data['data']['archiveID'].replace('arXiv:', '') for data in request.json()]
+    return [data["data"]["archiveID"].replace("arXiv:", "") for data in request.json()]

+
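Review note: the rewrite here is quoting and spacing only; the request still assumes a 200 response and that every returned item carries data.archiveID. A more defensive variant might look like this (a sketch under those assumptions, not the committed code):

    def get_zotero_ids_checked(api_key, library_id, tag):
        # Let requests encode the query string and surface HTTP errors early.
        url = f"https://api.zotero.org/users/{library_id}/items"
        resp = requests.get(
            url,
            params={"tag": tag},
            headers={"Authorization": "Bearer " + api_key},
        )
        resp.raise_for_status()  # 4xx/5xx raises here instead of inside .json()
        return [
            item["data"]["archiveID"].replace("arXiv:", "")
            for item in resp.json()
            if "archiveID" in item["data"]  # skip items with no arXiv ID
        ]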
 
@@ -21,34 +27,43 @@ def get_zotero_ids(api_key, library_id, tag):
-def get_arxiv_papers(ids = None, category = None, comment = None):
-
-    logging.getLogger('arxiv').setLevel(logging.WARNING)
+def get_arxiv_papers(ids=None, category=None, comment=None):
+
+    logging.getLogger("arxiv").setLevel(logging.WARNING)

     client = arxiv.Client()

     if category is None:
         search = arxiv.Search(
-            id_list= ids,
-            max_results= len(ids),
+            id_list=ids,
+            max_results=len(ids),
         )
-    else :
+    else:
         if comment is None:
-            custom_query = f'cat:{category}'
+            custom_query = f"cat:{category}"
         else:
-            custom_query = f'cat:{category} AND co:{comment}'
+            custom_query = f"cat:{category} AND co:{comment}"

         search = arxiv.Search(
-            query = custom_query,
-            max_results= 15,
-            sort_by= arxiv.SortCriterion.SubmittedDate
+            query=custom_query,
+            max_results=15,
+            sort_by=arxiv.SortCriterion.SubmittedDate,
         )
     if ids is None and category is None:
-        raise ValueError('not a valid query')
-
-    df = pd.DataFrame({'Title': [result.title for result in client.results(search)],
-                       'Abstract': [result.summary.replace('\n', ' ') for result in client.results(search)],
-                       'Date': [result.published.date().strftime('%Y-%m-%d') for result in client.results(search)],
-                       'id': [result.entry_id for result in client.results(search)]})
+        raise ValueError("not a valid query")
+
+    df = pd.DataFrame(
+        {
+            "Title": [result.title for result in client.results(search)],
+            "Abstract": [
+                result.summary.replace("\n", " ") for result in client.results(search)
+            ],
+            "Date": [
+                result.published.date().strftime("%Y-%m-%d")
+                for result in client.results(search)
+            ],
+            "id": [result.entry_id for result in client.results(search)],
+        }
+    )

     if ids:
-        df.to_csv('arxiv-scrape.csv', index = False)
+        df.to_csv("arxiv-scrape.csv", index=False)
     return df
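Review note: each of the four column expressions calls client.results(search) again, and in the arxiv package results() returns a fresh generator, so the same query is executed four times (and the columns could disagree if the feed changes between calls). A single-fetch equivalent (a sketch; client and search as defined in the function above):

    results = list(client.results(search))  # one network pass, reused below
    df = pd.DataFrame(
        {
            "Title": [r.title for r in results],
            "Abstract": [r.summary.replace("\n", " ") for r in results],
            "Date": [r.published.date().strftime("%Y-%m-%d") for r in results],
            "id": [r.entry_id for r in results],
        }
    )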
 
 
@@ -55,14 +70,30 @@ def get_arxiv_papers(ids = None, category = None, comment = None):
+
 def get_hf_embeddings(api_key, df):

-    title_abs = [title + '[SEP]' + abstract for title,abstract in zip(df['Title'], df['Abstract'])]
+    title_abs = [
+        title + "[SEP]" + abstract
+        for title, abstract in zip(df["Title"], df["Abstract"])
+    ]

     API_URL = "https://api-inference.huggingface.co/models/malteos/scincl"
     headers = {"Authorization": f"Bearer {api_key}"}

-    response = requests.post(API_URL, headers=headers, json={"inputs": title_abs, "options": {"wait_for_model": False}})
-
+    response = requests.post(
+        API_URL, headers=headers, json={"inputs": title_abs, "wait_for_model": False}
+    )
+    print(str(response.status_code) + "This part needs an update, causing KeyError 0")
     if response.status_code == 503:
-        response = requests.post(API_URL, headers=headers, json={"inputs": title_abs, "options": {"wait_for_model": True}})
+        response = asyncio.run(
+            asyncio.to_thread(
+                requests.post,
+                API_URL,
+                headers=headers,
+                json={"inputs": title_abs, "wait_for_model": True},
+            )
+        )
+        # response = requests.post(
+        #     API_URL, headers=headers, json={"inputs": title_abs, "wait_for_model": True}
+        # )

     embeddings = response.json()
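Review note on this hunk: the deleted lines passed wait_for_model inside an "options" object, which is the form the Hugging Face Inference API documents; the new payload moves it to the top level, where the API does not appear to look for it. The asyncio.run(asyncio.to_thread(...)) wrapper also blocks until the thread finishes, so it behaves like the plain requests.post it replaced. And as the debug print hints, while the model is still loading the JSON body is an error dict rather than a list of vectors, which is where the "KeyError 0" comes from. A retry in the documented form (a sketch, not the committed code; it reuses API_URL, headers, and title_abs from the function above):

    response = requests.post(
        API_URL,
        headers=headers,
        # "options" is the documented location for wait_for_model
        json={"inputs": title_abs, "options": {"wait_for_model": True}},
    )
    response.raise_for_status()
    embeddings = response.json()
    if isinstance(embeddings, dict) and "error" in embeddings:
        # e.g. {"error": "Model malteos/scincl is currently loading"}
        raise RuntimeError(f"Inference API error: {embeddings['error']}")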
 
@@ -70,26 +101,62 @@ def get_hf_embeddings(api_key, df):


 def upload_to_pinecone(api_key, index, namespace, embeddings, dim, df):
-    input = [{'id': df['id'][i], 'values': embeddings[i]} for i in range(len(embeddings))]
+    input = [
+        {"id": df["id"][i], "values": embeddings[i]} for i in range(len(embeddings))
+    ]

-    pc = Pinecone(api_key = api_key)
+    pc = Pinecone(api_key=api_key)
     if index in pc.list_indexes().names():
         while True:
-            logging.warning(f'Index name : {index} already exists.')
-            return f'Index name : {index} already exists'
-
+            logging.warning(f"Index name : {index} already exists.")
+            return f"Index name : {index} already exists"
+
     pc.create_index(
         name=index,
         dimension=dim,
         metric="cosine",
-        deletion_protection="disabled",
-        spec=ServerlessSpec(
-            cloud='aws',
-            region='us-east-1'
-        )
-    )
+        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+    )

     index = pc.Index(index)
     return index.upsert(vectors=input, namespace=namespace)


+def main():
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    os.chdir(script_dir)
+    logging.basicConfig(
+        filename="logs/logfile.log",
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+    )
+    logging.getLogger("arxiv").setLevel(logging.WARNING)
+    logging.info("Project Initialization Script Started (Serverless)")
+
+    ids = get_zotero_ids(
+        os.getenv("ZOTERO_API_KEY"),
+        os.getenv("ZOTERO_LIBRARY_ID"),
+        os.getenv("ZOTERO_TAG"),
+    )
+    print(ids)
+
+    df = get_arxiv_papers(ids=ids)
+
+    embeddings, dim = get_hf_embeddings(os.getenv("HF_API_KEY"), df)
+
+    feedback = upload_to_pinecone(
+        api_key=os.getenv("PINECONE_API_KEY"),
+        index=os.getenv("INDEX_NAME"),
+        namespace=os.getenv("NAMESPACE_NAME"),
+        embeddings=embeddings,
+        dim=dim,
+        df=df,
+    )
+
+    logging.info(feedback)
+    if feedback is dict:
+        return f"Retrieved {len(ids)} papers from Zotero. Successfully upserted {feedback['upserted_count']} embeddings in {os.getenv('NAMESPACE_NAME')} namespace."
+    else:
+        return feedback
+
+
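Review note: two things stand out in this hunk. The while True: loop returns on its first pass, so it is equivalent to a plain if (and the name input shadows the builtin). Further down, main() tests `feedback is dict`, which compares the value against the type object itself and is never true; isinstance(feedback, dict) is presumably the intent (depending on the Pinecone client version, the upsert response may also need attribute rather than key access). A condensed sketch of both fixes (not the committed code):

    def upload_to_pinecone(api_key, index, namespace, embeddings, dim, df):
        vectors = [  # renamed from `input` to avoid shadowing the builtin
            {"id": df["id"][i], "values": embeddings[i]} for i in range(len(embeddings))
        ]
        pc = Pinecone(api_key=api_key)
        if index in pc.list_indexes().names():  # early exit needs no loop
            logging.warning(f"Index name : {index} already exists.")
            return f"Index name : {index} already exists"
        pc.create_index(
            name=index,
            dimension=dim,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        return pc.Index(index).upsert(vectors=vectors, namespace=namespace)

    # ...and in main(), the success branch becomes reachable with:
    if isinstance(feedback, dict):  # `feedback is dict` is always False
        ...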
@@ -96,14 +163,14 @@ def upload_to_pinecone(api_key, index, namespace, embeddings, dim, df):
 def get_new_papers(df):
-    df_main = pd.read_csv('arxiv-scrape.csv')
+    df_main = pd.read_csv("arxiv-scrape.csv")
     df.reset_index(inplace=True)
-    df.drop(columns=['index'], inplace=True)
-    union_df = df.merge(df_main, how='left', indicator=True)
-    df = union_df[union_df['_merge'] == 'left_only'].drop(columns=['_merge'])
+    df.drop(columns=["index"], inplace=True)
+    union_df = df.merge(df_main, how="left", indicator=True)
+    df = union_df[union_df["_merge"] == "left_only"].drop(columns=["_merge"])
     if df.empty:
-        return 'No New Papers Found'
+        return "No New Papers Found"
     else:
-        # df_main = pd.concat([df_main, df], ignore_index= True) #persistence of recommended paper removed for demo
-        # df_main.drop_duplicates(inplace= True)
-        # df_main.to_csv('arxiv-scrape.csv', index = False)
+        df_main = pd.concat([df_main, df], ignore_index=True)
+        df_main.drop_duplicates(inplace=True)
+        df_main.to_csv("arxiv-scrape.csv", index=False)
     return df
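Review note: the left-merge with the "_merge" indicator is an anti-join that keeps only rows of df not already present in arxiv-scrape.csv, and this commit also re-enables persisting the merged frame (previously commented out "for demo"). Assuming the arXiv id uniquely identifies a paper, the same filter can be written more directly (a sketch):

    df = df.reset_index(drop=True)          # one step instead of reset_index + drop(columns=["index"])
    df = df[~df["id"].isin(df_main["id"])]  # anti-join on the arXiv id alone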
 
 
@@ -110,24 +177,64 @@ def get_new_papers(df):
+
 def recommend_papers(api_key, index, namespace, embeddings, df, threshold):

-    pc = Pinecone(api_key = api_key)
+    pc = Pinecone(api_key=api_key)
     if index in pc.list_indexes().names():
         index = pc.Index(index)
     else:
         raise ValueError(f"{index} doesnt exist. Project isnt initialized properly")
-
+
     results = []
     score_threshold = threshold
-    for i,embedding in enumerate(embeddings):
+    for i, embedding in enumerate(embeddings):
         query = embedding
-        result = index.query(namespace=namespace,vector=query,top_k=3,include_values=False)
-        sum_score = sum(match['score'] for match in result['matches'])
+        result = index.query(
+            namespace=namespace, vector=query, top_k=3, include_values=False
+        )
+        sum_score = sum(match["score"] for match in result["matches"])
         if sum_score > score_threshold:
-            results.append(f"Paper-URL : [{df['id'][i]}]({df['id'][i]}) with score: {sum_score / 3} <br />")
+            results.append(
+                f"Paper-URL : [{df['id'][i]}]({df['id'][i]}) with score: {sum_score / 3} <br />"
+            )

     if results:
-        return '\n'.join(results)
+        return "\n".join(results)
     else:
-        return 'No Interesting Paper'
+        return "No Interesting Paper"
+

+def recs(threshold):
+    logging.info("Weekly Script Started (Serverless)")

+    df = get_arxiv_papers(
+        category=os.getenv("ARXIV_CATEGORY_NAME"),
+        comment=os.getenv("ARXIV_COMMENT_QUERY"),
+    )
+
+    df = get_new_papers(df)
+
+    if not isinstance(df, pd.DataFrame):
+        return df

+    embeddings, _ = get_hf_embeddings(os.getenv("HF_API_KEY"), df)
+
+    results = recommend_papers(
+        os.getenv("PINECONE_API_KEY"),
+        os.getenv("INDEX_NAME"),
+        os.getenv("NAMESPACE_NAME"),
+        embeddings,
+        df,
+        threshold,
+    )
+
+    return results
+
+
+if __name__ == "__main__":
+    choice = int(input("1. Initialize\n2. Recommend Papers\n"))
+    if choice == 1:
+        print(main())
+    elif choice == 2:
+        threshold = float(input("Enter Similarity Threshold"))
+        print(recs(threshold))
+    else:
+        raise ValueError("Invalid Input")
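Review note on the scoring rule in recommend_papers above: for every new paper, the query takes the top three matches in the index and compares the sum of their cosine scores against the user-supplied threshold, while the output line reports the mean (sum_score / 3). The "Enter Similarity Threshold" prompt therefore expects a value on the sum scale. A worked example (illustrative numbers):

    matches = [0.85, 0.80, 0.78]   # top_k=3 cosine scores for one candidate paper
    sum_score = sum(matches)       # 2.43 -- this is what is compared to `threshold`
    print(sum_score / 3)           # 0.81 -- this is the score shown in the results line
    # so entering a threshold of 2.4 filters for a mean similarity of about 0.8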