jattokatarratto committed
Commit 0c212bd · verified · 1 Parent(s): 9eefd88

Update app.py

Files changed (1):
  1. app.py +309 -106
app.py CHANGED
@@ -1,6 +1,5 @@
  import os
 
-
  from transformers import file_utils
  print(file_utils.default_cache_path)
 
@@ -11,14 +10,21 @@ import logging
 
  import time
 
- from transformers import pipeline, AutoTokenizer
+ import sys
+
+ from transformers import pipeline, AutoTokenizer, AutoModel
  from transformers.pipelines.pt_utils import KeyDataset
+ from sentence_transformers.util import cos_sim
+ from typing import Dict
 
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from collections import Counter
 
+ #os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
+ #os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+
  import torch
- torch.cuda.empty_cache()  # Clear the torch cache
+ #torch.cuda.empty_cache()  # Clear the torch cache
 
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f"Device: {device}...")
@@ -116,6 +122,135 @@ for modelName in models_List:
      modelGliner = GLiNER.from_pretrained(modelName, map_location=device)
 
 
+ # 1. Load the retriever model and tokenizer
+ model_id_Retriever = 'mixedbread-ai/mxbai-embed-large-v1'
+ tokenizer_Retriever = AutoTokenizer.from_pretrained(model_id_Retriever)
+ modelRetriever = AutoModel.from_pretrained(model_id_Retriever).to(device)  # move the retriever to the same device as its inputs
+
+
+ def RAG_retrieval_Base(queryText, passages, min_threshold=0.0, max_num_passages=None):
+     similarities = retrievePassageSimilarities(queryText, passages)
+
+     # Create a DataFrame
+     df = pd.DataFrame({
+         'Passage': passages,
+         'Similarity': similarities.flatten()  # Flatten the similarity tensor/array to ensure compatibility
+     })
+
+     # Filter the DataFrame based on the similarity threshold
+     df_filtered = df[df['Similarity'] >= min_threshold]
+
+     # If max_num_passages is specified, limit the number of passages returned
+     if max_num_passages is not None:
+         df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
+
+     df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
+
+     # Return the filtered DataFrame
+     return df_filtered
+
+
+ def RAG_retrieval_Percentile(queryText, passages, percentile=90, max_num_passages=None, min_threshold=0.5):
+     # Encoding and similarity computation remains the same
+     similarities = retrievePassageSimilarities(queryText, passages)
+
+     # Determine threshold based on percentile
+     threshold = np.percentile(similarities.flatten(), percentile)
+
+     # Create a DataFrame
+     df = pd.DataFrame({
+         'Passage': passages,
+         'Similarity': similarities.flatten()
+     })
+
+     # Filter using percentile threshold
+     df_filtered = df[df['Similarity'] >= threshold]
+
+     if min_threshold:
+         # Filter the DataFrame also on the min similarity threshold
+         df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]
+
+     # If max_num_passages is specified, limit the number of passages returned
+     if max_num_passages is not None:
+         df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
+
+     # Sort by similarity
+     df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
+
+     return df_filtered
+
+
+ def RAG_retrieval_TopK(queryText, passages, top_fraction=0.1, max_num_passages=None, min_threshold=0.5):
+     # Encoding and similarity computation remains the same
+     similarities = retrievePassageSimilarities(queryText, passages)
+
+     # Calculate the number of passages to select based on top fraction
+     num_passages_TopFraction = max(1, int(top_fraction * len(passages)))
+
+     # Create a DataFrame
+     df = pd.DataFrame({
+         'Passage': passages,
+         'Similarity': similarities.flatten()
+     })
+
+     # Select the top passages dynamically
+     df_filtered = df.nlargest(num_passages_TopFraction, 'Similarity')
+
+     if min_threshold:
+         # Filter the DataFrame also on the min similarity threshold
+         df_filtered = df_filtered[df_filtered['Similarity'] >= min_threshold]
+
+     # If max_num_passages is specified, limit the number of passages returned
+     if max_num_passages is not None:
+         df_filtered = df_filtered.nlargest(max_num_passages, 'Similarity')
+
+     # Sort by similarity
+     df_filtered = df_filtered.sort_values(by='Similarity', ascending=False)
+
+     return df_filtered
+
+
+ # Define the transform_query function
+ def transform_query(queryText: str) -> str:
+     """For retrieval, add the prompt for queryText (not for documents)."""
+     return f'Represent this sentence for searching relevant passages: {queryText}'
+
+
+ # Define the pooling function
+ def pooling(outputs: torch.Tensor, inputs: Dict, strategy: str = 'cls') -> np.ndarray:
+     if strategy == 'cls':
+         outputs = outputs[:, 0]
+     elif strategy == 'mean':
+         outputs = torch.sum(
+             outputs * inputs["attention_mask"][:, :, None], dim=1
+         ) / torch.sum(inputs["attention_mask"], dim=1, keepdim=True)
+     else:
+         raise NotImplementedError
+     return outputs.detach().cpu().numpy()
+
+
+ def retrievePassageSimilarities(queryText, passages):
+     # Create the docs list by adding the transformed queryText and then the passages
+     docs = [transform_query(queryText)] + passages
+
+     # 2. Encode the inputs
+     inputs = tokenizer_Retriever(docs, padding=True, return_tensors='pt')
+
+     # Move the inputs to the same device as the model
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+     outputs = modelRetriever(**inputs).last_hidden_state
+     embeddings = pooling(outputs, inputs, 'cls')
+
+     similarities = cos_sim(embeddings[0], embeddings[1:])
+
+     # print('similarities:', similarities)
+
+     return similarities
+
+
  def process_row_Gliner(args, tokenizerGliner, modelGlinerBio, modelGliner, glinerlabels, row):
      context_to_annotate = row[args.source_column]
@@ -361,12 +496,12 @@ def annotate(df, args, pipeInner, tokenizerGliner, modelGliner, modelGlinerBio,
      #https://data.bioontology.org/documentation#nav_annotator
      #https://bioportal.bioontology.org/annotatorplus
 
-     #key_bioportal = ""
-     #if args.bioportalkey_filename:
-     #    fkeyname = args.bioportalkey_filename
-     #    with open(fkeyname) as f:
-     #        key_bioportal = f.read()
-     key_bioportal = os.environ['key_bioportal']
+     key_bioportal = ""
+     if args.bioportalkey_filename:
+         fkeyname = args.bioportalkey_filename
+         with open(fkeyname) as f:
+             key_bioportal = f.read()
+     #key_bioportal = os.environ['key_bioportal']
 
      df_annot = pd.DataFrame()
      for drm_idx, row in tqdm(df.iterrows()):
@@ -941,13 +1076,13 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
      entityBioeUrl = None
      ALLURIScontext = []
 
-     #key_bioportal = ""
-     #if args.bioportalkey_filename:
-     #    fkeyname = args.bioportalkey_filename
-     #    with open(fkeyname) as f:
-     #        key_bioportal = f.read()
-     key_bioportal = os.environ['key_bioportal']
-
+     key_bioportal = ""
+     if args.bioportalkey_filename:
+         fkeyname = args.bioportalkey_filename
+         with open(fkeyname) as f:
+             key_bioportal = f.read()
+     #key_bioportal = os.environ['key_bioportal']
+
      # Check if args.KG_restriction exists and is not empty
      if getattr(args, 'KG_restriction', None):
 
@@ -1225,7 +1360,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
 
 
 
- def getLinearTextualContextFromTriples(word, labelTriplesLIST, text_splitter, args, map_query_input_output, cleanInput=True):
+ def getLinearTextualContextFromTriples(word, labelTriplesLIST, text_splitter, args, map_query_input_output, cleanInput=True, questionText=""):
 
      # trial
      #return None, map_query_input_output
@@ -1233,93 +1368,160 @@ def getLinearTextualContextFromTriples(word, labelTriplesLIST, text_splitter, args
      word = word.lower()
      word = word.capitalize()
 
-     labelTriples = ". ".join(" ".join(element.capitalize() for element in triple) for triple in labelTriplesLIST)
+
+     if strtobool(args.UseRetrieverForContextCreation) == True:
+         labelTriples = ""
+         passages = []
+         nn = 200
+         for i, triple in enumerate(labelTriplesLIST, start=1):
+             #for triple in labelTriplesLIST:
+             TriplesString = " ".join(str(element).capitalize() for element in triple)
+             passages.append(TriplesString)
+             # Check if the current index is a multiple of nn
+             if i % nn == 0:
+                 #print("elaborate RAG triples")
+
+                 #df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=50)
+                 #df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=50, min_threshold=0.65)
+                 #df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=50, min_threshold=0.65)
+                 df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=50, min_threshold=0.65)
+
+                 passages = []
+
+                 df_retrieved = df_retrievedtopk.copy()
+                 if not df_retrieved.empty:
+                     labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
+                     labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
+
+                     if not labelTriples:
+                         labelTriples = labelTriplesAPP
+                     else:
+                         labelTriples = labelTriples + ". " + labelTriplesAPP
+
+         if passages:  # flush the passages left over by the last incomplete batch
+             df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=50, min_threshold=0.65)
+
+             df_retrieved = df_retrievedtopk.copy()
+             if not df_retrieved.empty:
+                 labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
+                 labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
+                 if not labelTriples:
+                     labelTriples = labelTriplesAPP
+                 else:
+                     labelTriples = labelTriples + ". " + labelTriplesAPP
+
+         if labelTriples:
+             labelTriples = labelTriples.strip().replace("..", ".").strip()
+
+     else:
+         labelTriples = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST)
+
+     if not labelTriples or labelTriples.strip() == "":
+         logging.warning("No text or prompt supplied! Skipping it!")
+         return "", map_query_input_output
 
      if token_counter(labelTriples, args.model_name) > args.tokens_max:  # THE CONTEXT IS TOO BIG, BIGGER THAN tokens_max, I need to split
          texts = text_splitter.create_documents([labelTriples])
          labelTriples = texts[0].page_content
+         if not labelTriples or labelTriples.strip() == "":
+             logging.warning("after splitting ... No text or prompt supplied! Skipping it!")
+             return "", map_query_input_output
 
-     #Can you elaborate and express better the following notes, delimited by triple backticks, about "{word}"?
-     #Don't add explanations for your answer. Do not invent. Don't use a structure or indenting. Be concise. Don't discard relevant information.
-     #made of RDF-like statements,
-
      contextText = ""
-     # myPromt = f"""
-     # Can you elaborate and express better the given notes below, delimited by triple backticks, about "{word}"?
-     # Don't add explanations for your answer.
-     # Do not invent.
-     # Don't use a structure or indenting.
-     # Be concise but exhaustive. Don't discard information reported in the notes.
-     # """
-     myPromt = f"""
-     Can you reformulate the following notes, provided between triple backticks, into clear and complete sentences about "{word}"?
-     Ensure the rewriting is human-readable and easily interpretable. Maintain conciseness and exhaustiveness, including all information from the notes.
-     Avoid using note formats or lists, and refrain from inventing additional information.
-     """
-     myDelimiter = "```"
-
-     if not(labelTriples) or labelTriples.strip=="":
-         logging.warning("No text or prompt supplied! Skipping it!")
-         return contextText, map_query_input_output
-
-     if cleanInput==True:
-         labelTriples = cleanInputText(labelTriples)
-
-     # try to read cache
-
-     if map_query_input_output is not None:
-         key = args.model_name + "__" + str(args.temperature) + "__" + myPromt
-
-         if key in map_query_input_output:
-             if labelTriples in map_query_input_output[key]:
-                 output = map_query_input_output[key][labelTriples]
-                 # if input_text.strip() == "":
-                 #     print("here")
-
-                 # if handler == api_call_dglc:
-                 #     output = clean_gpt_out(output)  #clean output
-
-                 if strtobool(args.debug):
-                     print("RETRIEVED CACHED RESULT FOR:\n", myPromt, "\n", myDelimiter, word, myDelimiter, "\n=>\n", output, "\n")
-
-                 return output, map_query_input_output
-
-     # call
-
-     try:
-
-         contextText = ""
-         # if args.service_provider == "gptjrc":
-         #     contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
-         #                              temperature=args.temperature, delimiter=myDelimiter,
-         #                              InContextExamples=[],
-         #                              handler=api_call_gptjrc,
-         #                              verbose=True, args=args)
-
-         if contextText:
-             if not isinstance(contextText, str):
-                 contextText = contextText['choices'][0]['message']['content']
-
-         if map_query_input_output is not None:
-             if not key in map_query_input_output:
-                 map_query_input_output[key] = {}
-
-             if contextText:
-                 if contextText != "":
-                     map_query_input_output[key][labelTriples] = contextText
-
-     except Exception as err:
-         return None, map_query_input_output
+     if strtobool(args.UseRetrieverForContextCreation) == True:
+
+         contextText = labelTriples
+
+     else:  # USE the LLM to summarise the triples
+
+         # Can you elaborate and express better the following notes, delimited by triple backticks, about "{word}"?
+         # Don't add explanations for your answer. Do not invent. Don't use a structure or indenting. Be concise. Don't discard relevant information.
+         # made of RDF-like statements,
+
+         # myPromt = f"""
+         # Can you elaborate and express better the given notes below, delimited by triple backticks, about "{word}"?
+         # Don't add explanations for your answer.
+         # Do not invent.
+         # Don't use a structure or indenting.
+         # Be concise but exhaustive. Don't discard information reported in the notes.
+         # """
+         myPromt = f"""
+         Can you reformulate the following notes, provided between triple backticks, into clear and complete sentences about "{word}"?
+         Ensure the rewriting is human-readable and easily interpretable. Maintain conciseness and exhaustiveness, including all information from the notes.
+         Avoid using note formats or lists, and refrain from inventing additional information.
+         """
+         myDelimiter = "```"
+
+         if cleanInput == True:
+             labelTriples = cleanInputText(labelTriples)
+
+         # try to read cache
+
+         if map_query_input_output is not None:
+             key = args.model_name + "__" + str(args.temperature) + "__" + myPromt
+
+             if key in map_query_input_output:
+                 if labelTriples in map_query_input_output[key]:
+                     output = map_query_input_output[key][labelTriples]
+                     # if input_text.strip() == "":
+                     #     print("here")
+
+                     # if handler == api_call_dglc:
+                     #     output = clean_gpt_out(output)  #clean output
+
+                     if strtobool(args.debug):
+                         print("RETRIEVED CACHED RESULT FOR:\n", myPromt, "\n", myDelimiter, word, myDelimiter, "\n=>\n", output, "\n")
+
+                     return output, map_query_input_output
+
+         # call
+
+         try:
+
+             contextText = ""
+             # if args.service_provider == "gptjrc":
+             #     contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
+             #                              temperature=args.temperature, delimiter=myDelimiter,
+             #                              InContextExamples=[],
+             #                              handler=api_call_gptjrc,
+             #                              verbose=True, args=args)
+             # elif args.service_provider == "HFonPremises":
+             #     contextText = call_model(input_text=labelTriples, prompt=myPromt, model=args.model_name,
+             #                              temperature=args.temperature, delimiter=myDelimiter,
+             #                              InContextExamples=[],
+             #                              handler=api_call_HFonPremises,
+             #                              verbose=True, args=args)
+
+             if contextText:
+                 if not isinstance(contextText, str):
+                     contextText = contextText['choices'][0]['message']['content']
+
+                 if map_query_input_output is not None:
+                     if not key in map_query_input_output:
+                         map_query_input_output[key] = {}
+
+                     if contextText:
+                         if contextText != "":
+                             map_query_input_output[key][labelTriples] = contextText
+
+         except Exception as err:
+             return None, map_query_input_output
 
      return contextText, map_query_input_output
 
 
  #@mem.cache
- def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None, UseBioportalForLinking=True):
+ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None, UseBioportalForLinking=True, questionText=""):
 
      if strtobool(args.debug):
          print(f"\n----- Starting virtuoso_api_call for {word}")
@@ -1374,7 +1576,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
      else:
 
          try:
-             entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=UseBioportalForLinking)
+             entityBioeUrl, ALLURIScontext, cache_map_virtuoso = getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=UseBioportalForLinking, questionText=questionText)
              if ALLURIScontext and isinstance(ALLURIScontext, list):
                  ALLURIScontext = list(set(ALLURIScontext))
          except Exception as err:
@@ -1404,7 +1606,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
              unique_listLabelTriples = cache_map_virtuoso[entityBioeUrl]["LabelTriples"]
              if strtobool(args.debug):
                  print("RETRIEVED CACHED RESULT FOR:\n", entityBioeUrl, " => ", "LabelTriples", "\n")
-             if "SingleContext" in cache_map_virtuoso[entityBioeUrl]:
+             if ("SingleContext" in cache_map_virtuoso[entityBioeUrl]) and (strtobool(args.UseRetrieverForContextCreation) == False):
                  singleContext = cache_map_virtuoso[entityBioeUrl]["SingleContext"]
                  if strtobool(args.debug):
                      print("RETRIEVED CACHED RESULT FOR:\n", entityBioeUrl, " => ", "SingleContext", "\n")
@@ -1414,7 +1616,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
          if unique_listLabelTriples:
              singleContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listLabelTriples,
                                                                                              text_splitter, args,
-                                                                                             load_map_query_input_output)
+                                                                                             load_map_query_input_output, cleanInput=True, questionText=questionText)
      else:
 
          query = f"""
@@ -1491,13 +1693,13 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
                      cache_map_virtuoso[entityBioeUrl] = {}
                  cache_map_virtuoso[entityBioeUrl]["LabelTriples"] = unique_listLabelTriples
 
-             singleContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listLabelTriples, text_splitter, args, load_map_query_input_output)
+             singleContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listLabelTriples, text_splitter, args, load_map_query_input_output, cleanInput=True, questionText=questionText)
 
 
          except Exception as err:
              singleContext = None
 
-     if singleContext:
+     if singleContext and (strtobool(args.UseRetrieverForContextCreation) == False):
          if cache_map_virtuoso is not None:
              if not entityBioeUrl in cache_map_virtuoso:
                  cache_map_virtuoso[entityBioeUrl] = {}
@@ -1518,7 +1720,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
              unique_listGlobalTriples = cache_map_virtuoso[word][("GlobalTriples"+" "+contextWordVirtuoso).strip()]
              if strtobool(args.debug):
                  print("RETRIEVED CACHED RESULT FOR:\n", word, " => ", ("GlobalTriples"+" "+contextWordVirtuoso).strip(), "\n")
-             if ("GlobalContext"+" "+contextWordVirtuoso).strip() in cache_map_virtuoso[word]:
+             if (("GlobalContext"+" "+contextWordVirtuoso).strip() in cache_map_virtuoso[word]) and (strtobool(args.UseRetrieverForContextCreation) == False):
                  globalContext = cache_map_virtuoso[word][("GlobalContext"+" "+contextWordVirtuoso).strip()]
                  if strtobool(args.debug):
                      print("RETRIEVED CACHED RESULT FOR:\n", word, " => ", ("GlobalContext"+" "+contextWordVirtuoso).strip(), "\n")
@@ -1528,7 +1730,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
          if unique_listGlobalTriples:
              globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listGlobalTriples,
                                                                                              text_splitter, args,
-                                                                                             load_map_query_input_output)
+                                                                                             load_map_query_input_output, cleanInput=True, questionText=questionText)
      else:
 
          if not ALLURIScontext:
@@ -1554,7 +1756,8 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
                                       endpoint,
                                       VirtuosoUsername,
                                       contextWordVirtuoso,
-                                      UseBioportalForLinking=UseBioportalForLinking)
+                                      UseBioportalForLinking=UseBioportalForLinking,
+                                      questionText=questionText)
              if ALLURIScontext and isinstance(ALLURIScontext, list):
                  ALLURIScontext = list(set(ALLURIScontext))
 
@@ -1588,7 +1791,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
                      if strtobool(args.debug):
                          print("RETRIEVED CACHED RESULT FOR:\n", xxUrl, " => ",
                                "LabelTriples", "\n")
-                     # if "SingleContext" in cache_map_virtuoso[xxUrl]:
+                     # if "SingleContext" in cache_map_virtuoso[xxUrl] and (strtobool(args.UseRetrieverForContextCreation) == False):
                      #     singleContext = cache_map_virtuoso[xxUrl]["SingleContext"]
                      #     if strtobool(args.debug):
                      #         print("RETRIEVED CACHED RESULT FOR:\n", xxUrl, " => ",
@@ -1599,7 +1802,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
                      #     singleContext, load_map_query_input_output = getLinearTextualContextFromTriples(
                      #         word, unique_listLabelTriples,
                      #         text_splitter, args,
-                     #         load_map_query_input_output)
+                     #         load_map_query_input_output, cleanInput=True, questionText=questionText)
                      # else:
 
                      if not unique_listLabelTriples:
@@ -1681,9 +1884,9 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
                                  "LabelTriples"] = unique_listLabelTriples
 
                      # singleContext, load_map_query_input_output = getLinearTextualContextFromTriples(
-                     #     word, unique_listLabelTriples, text_splitter, args, load_map_query_input_output)
+                     #     word, unique_listLabelTriples, text_splitter, args, load_map_query_input_output, cleanInput=True, questionText=questionText)
                      #
-                     # if singleContext:
+                     # if singleContext and (strtobool(args.UseRetrieverForContextCreation) == False):
                      #     if cache_map_virtuoso is not None:
                      #         if not xxUrl in cache_map_virtuoso:
                      #             cache_map_virtuoso[xxUrl] = {}
@@ -1715,9 +1918,9 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
              globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word,
                                                                                              unique_listGlobalTriples,
                                                                                              text_splitter, args,
-                                                                                             load_map_query_input_output)
+                                                                                             load_map_query_input_output, cleanInput=True, questionText=questionText)
 
-     if globalContext:
+     if globalContext and (strtobool(args.UseRetrieverForContextCreation) == False):
          if cache_map_virtuoso is not None:
              if not word in cache_map_virtuoso:
                  cache_map_virtuoso[word] = {}
@@ -1725,7 +1928,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
 
 
      if unique_listLabelTriples:
          sssingleTriples = " ,., ".join(
-             " ,,, ".join(element.capitalize() for element in triple) for triple in unique_listLabelTriples)
+             " ,,, ".join(str(element).capitalize() for element in triple) for triple in unique_listLabelTriples)
          while "\\n" in sssingleTriples:
              sssingleTriples = sssingleTriples.replace("\\n", " ")
@@ -1735,7 +1938,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
 
      if unique_listGlobalTriples:
          ggglobalTriples = " ,., ".join(
-             " ,,, ".join(element.capitalize() for element in triple) for triple in unique_listGlobalTriples)
+             " ,,, ".join(str(element).capitalize() for element in triple) for triple in unique_listGlobalTriples)
          while "\\n" in ggglobalTriples:
              ggglobalTriples = ggglobalTriples.replace("\\n", " ")
          ggglobalTriples = ggglobalTriples.strip()
@@ -1747,8 +1950,6 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso
 
 
 
-
-
  def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
 
      result = None
@@ -1790,7 +1991,7 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames
              if strtobool(args.debug):
                  print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
                  print(row[args.source_column])
-             result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True)
+             result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])
 
          else:
              if row['model'] == "Forced":
  if row['model'] == "Forced":
@@ -1815,12 +2016,12 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonam
1815
 
1816
  result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
1817
  row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
1818
- id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True)
1819
 
1820
  if not result: #try annotation without bioportal
1821
  result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
1822
  row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
1823
- id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=False)
1824
 
1825
  else:
1826
  if (row['IsBio'] == 1) or ( (pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC") ):
@@ -1844,7 +2045,7 @@ def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames
                  iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
 
              result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
-                 row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True)
+                 row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])
 
      return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
 
 
@@ -1981,6 +2182,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
1981
  help="whether to extract a readable context from the extracted triples for the concept")
1982
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
1983
  help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
 
 
1984
 
1985
  parser.add_argument("--service_provider", type=str, default="no", help="llm service provider")
1986
  parser.add_argument("--model_name", type=str, default="no", help="llm to use")
@@ -2107,12 +2310,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
      else:
          cache_map_geonames = {}
 
-     #key_geonames = ""
-     #if args.geonameskey_filename:
-     #    fkeyname = args.geonameskey_filename
-     #    with open(fkeyname) as f:
-     #        key_geonames = f.read()
-     key_geonames = os.environ['key_geonames']
+     key_geonames = ""
+     if args.geonameskey_filename:
+         fkeyname = args.geonameskey_filename
+         with open(fkeyname) as f:
+             key_geonames = f.read()
+     #key_geonames = os.environ['key_geonames']
 
      cache_map_virtuoso = None
      if strtobool(args.USE_CACHE):
@@ -2123,12 +2326,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
      else:
          cache_map_virtuoso = {}
 
-     #key_virtuoso = ""
-     #if args.virtuosokey_filename:
-     #    fkeyname = args.virtuosokey_filename
-     #    with open(fkeyname) as f:
-     #        key_virtuoso = f.read()
-     key_virtuoso = os.environ['key_virtuoso']
+     key_virtuoso = ""
+     if args.virtuosokey_filename:
+         fkeyname = args.virtuosokey_filename
+         with open(fkeyname) as f:
+             key_virtuoso = f.read()
+     #key_virtuoso = os.environ['key_virtuoso']
 
      # Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe:
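A quick way to sanity-check the new retrieval helpers in isolation. This is a minimal sketch, not part of the commit: it assumes app.py's globals (tokenizer_Retriever, modelRetriever, device) are loaded, and the query and passages are invented for illustration:

    # Hypothetical smoke test for the retrieval helpers added above.
    passages = [
        "Aspirin is a nonsteroidal anti-inflammatory drug.",
        "Aspirin inhibits cyclooxygenase enzymes.",
        "The Eiffel Tower is located in Paris.",
    ]
    queryText = "What is the mechanism of action of aspirin?"

    # Rank every passage against the query (cosine similarity of CLS embeddings).
    df_all = RAG_retrieval_Base(queryText, passages, min_threshold=0.0)
    print(df_all)

    # Keep the top 10% of passages, at most 50, with similarity >= 0.65 --
    # the same call getLinearTextualContextFromTriples now makes per batch.
    df_top = RAG_retrieval_TopK(queryText, passages, top_fraction=0.1,
                                max_num_passages=50, min_threshold=0.65)
    print(df_top['Passage'].tolist())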
 
 
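For reference, the tensor shapes flowing through retrievePassageSimilarities, sketched for one query plus two passages (the strings are invented; 1024 is the hidden size of mxbai-embed-large-v1):

    docs = [transform_query("What is aspirin?")] + ["Aspirin is a drug.", "Paris is in France."]
    inputs = tokenizer_Retriever(docs, padding=True, return_tensors='pt')  # 3 docs padded to a common seq_len
    inputs = {k: v.to(device) for k, v in inputs.items()}                  # move the tensors next to the model
    outputs = modelRetriever(**inputs).last_hidden_state                   # shape (3, seq_len, 1024)
    embeddings = pooling(outputs, inputs, 'cls')                           # shape (3, 1024), numpy array
    similarities = cos_sim(embeddings[0], embeddings[1:])                  # shape (1, 2): query vs. each passage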
 
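Schematically, the commit changes context creation as follows (a sketch under the diff's names, not the literal code; with the new default --UseRetrieverForContextCreation=True the LLM summarisation step is skipped entirely):

    def build_context_sketch(labelTriplesLIST, questionText, use_retriever=True):
        passages = [" ".join(str(e).capitalize() for e in t) for t in labelTriplesLIST]
        if use_retriever:
            # New path: rank each batch of 200 triple-passages against the
            # sentence being annotated and keep only the top-scoring ones.
            kept = []
            for start in range(0, len(passages), 200):
                df = RAG_retrieval_TopK(questionText, passages[start:start + 200],
                                        top_fraction=0.1, max_num_passages=50,
                                        min_threshold=0.65)
                kept.extend(df['Passage'].tolist())
            return ". ".join(kept)
        # Old path: concatenate everything and have the LLM rewrite it.
        return ". ".join(passages)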