jattokatarratto committed on
Commit
05e5fac
·
verified ·
1 Parent(s): 9bb2916

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -109
app.py CHANGED
@@ -20,11 +20,7 @@ from typing import Dict
20
  from concurrent.futures import ThreadPoolExecutor, as_completed
21
  from collections import Counter
22
 
23
- #os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
24
- #os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
25
-
26
  import torch
27
- #torch.cuda.empty_cache() # Clear cache ot torch
28
 
29
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
30
  print(f"Device: {device}...")
@@ -81,9 +77,9 @@ POSSIBLE_KGchoices_List = ["AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI",
81
  "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
82
  "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
83
  "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
84
- "SYMP", "FoodOn", "UBERON", "VO", "EuroSciVoc"]
85
 
86
- ONLY_Ontologies_OnBIOPORTAL = ["AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "VO"]
87
 
88
  encod = encoding_getter('microsoft/deberta-v3-large')
89
  text_splitter = TokenTextSplitter(
@@ -897,7 +893,8 @@ def entitiesFusion(df_annotated, args):
897
 
898
  #delete all the rows with score smaller than entities_filter_threshold:
899
  if args.entities_filter_threshold > 0:
900
- df_annotated = df_annotated[df_annotated['score'] >= args.entities_filter_threshold]
 
901
  if df_annotated.empty:
902
  return df_annotated
903
 
@@ -1067,7 +1064,7 @@ def geonames_api_call(word, args, key_geonames, cache_map_geonames):
1067
  return None, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames
1068
 
1069
 
1070
- def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=False ):
1071
  #UseBioportalForLinking = False #trial to del
1072
 
1073
  if strtobool(args.debug):
@@ -1177,7 +1174,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
1177
  if strtobool(args.debug):
1178
  print("Use Virtuoso Sparql endpoint for linking ... " + word.lower())
1179
 
1180
- responseText = sparqlQuery(endpoint, query, VirtuosoUsername, key_virtuoso, strtobool(args.USE_CACHE))
1181
 
1182
  # Parse the response as JSON
1183
  results = json.loads(responseText)
@@ -1360,6 +1357,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
1360
 
1361
 
1362
 
 
1363
  def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, args, map_query_input_output, cleanInput=True, questionText=""):
1364
 
1365
  # trial
@@ -1463,20 +1461,20 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
1463
  # # Check if the current index is a multiple of nn
1464
  # if i % nn == 0:
1465
  # #print("elaborate RAG triples")
1466
- #
1467
  # #df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1468
  # #df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1469
  # #df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1470
  # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1471
- #
1472
  # passages = []
1473
- #
1474
  # df_retrieved = df_retrievedtopk.copy()
1475
  # if not df_retrieved.empty:
1476
  # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1477
  # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1478
  # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1479
- #
1480
  # if not labelTriples:
1481
  # labelTriples =labelTriplesAPP
1482
  # else:
@@ -1504,14 +1502,14 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
1504
 
1505
 
1506
  if not(labelTriples) or labelTriples.strip=="":
1507
- logging.warning("No text or prompt supplied! Skypping it!")
1508
  return "", map_query_input_output
1509
 
1510
  if token_counter(labelTriples, args.model_name) > args.tokens_max: # THE CONTEXT IS TOO BIG, BIGGER THAN tokens_max, I need to split
1511
  texts = text_splitter.create_documents([labelTriples])
1512
  labelTriples = texts[0].page_content
1513
  if not (labelTriples) or labelTriples.strip == "":
1514
- logging.warning("after splitting ...No text or prompt supplied! Skypping it!")
1515
  return "", map_query_input_output
1516
 
1517
 
@@ -1604,6 +1602,7 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
1604
  return contextText, map_query_input_output
1605
 
1606
 
 
1607
  #@mem.cache
1608
  def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None,UseBioportalForLinking=True,questionText=""):
1609
 
@@ -2036,102 +2035,111 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
2036
 
2037
  def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
2038
 
2039
- result = None
2040
- singleContext = None
2041
- globalContext = None
2042
- singleTriples = None
2043
- globalTriples = None
2044
  ALLURIScontext = []
2045
 
2046
- InRagMode=False
2047
- if hasattr(args, 'useBioKgRAG') and (strtobool(args.useBioKgRAG)==True):
2048
- InRagMode = True
2049
 
2050
- if (InRagMode==False):
2051
- if row['IsGeo'] == 1:
2052
 
2053
- if strtobool(args.debug):
2054
- print(f"\n----- IsGeo ... COMPUTING {row['word']} IN THE TEXT:")
2055
- print(row[args.source_column])
2056
-
2057
- result, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames = geonames_api_call(row['word'], args, key_geonames, cache_map_geonames)
2058
-
2059
- elif row['IsBio'] == 1:
2060
-
2061
- # Check if '@id' column exists in df_Extract
2062
- iiid = None
2063
- # Check if the '@id' exists in the Series
2064
- if '@id' in row:
2065
- # Check if the value is not None or NaN
2066
- if row['@id'] is not None and not pd.isna(row['@id']):
2067
- # Assign the value to the variable iiid
2068
- iiid = row['@id']
2069
- iiiALLURIScontextFromNCBO = None
2070
- if 'ALLURIScontextFromNCBO' in row:
2071
- if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list): #and not pd.isna(row['ALLURIScontextFromNCBO']):
2072
- iiiALLURIScontextFromNCBO=row['ALLURIScontextFromNCBO']
2073
- iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
2074
 
2075
  if strtobool(args.debug):
2076
- print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
2077
  print(row[args.source_column])
2078
- result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])
2079
 
2080
- else:
2081
- if row['model'] == "Forced":
2082
- # Check if '@id' column exists in df_Extract
2083
- iiid = None
2084
- # Check if the '@id' exists in the Series
2085
- if '@id' in row:
2086
- # Check if the value is not None or NaN
2087
- if row['@id'] is not None and not pd.isna(row['@id']):
2088
- # Assign the value to the variable iiid
2089
- iiid = row['@id']
2090
- iiiALLURIScontextFromNCBO = None
2091
- if 'ALLURIScontextFromNCBO' in row:
2092
- if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'],
2093
- list): # and not pd.isna(row['ALLURIScontextFromNCBO']):
2094
- iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
2095
- iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
2096
 
2097
- if strtobool(args.debug):
2098
- print(f"\n----- isForced COMPUTING ... {row['word']} IN THE TEXT:")
2099
- print(row[args.source_column])
2100
 
2101
- result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
2102
- row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
2103
- id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True,questionText=row[args.source_column])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2104
 
2105
- if not result: #try annotation without bioportal
2106
  result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
2107
  row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
2108
- id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=False,questionText=row[args.source_column])
2109
 
2110
- else:
2111
- if (row['IsBio'] == 1) or ( (pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC") ):
 
 
2112
 
2113
- if strtobool(args.debug):
2114
- print(f"\n----- InRagMode ...COMPUTING ... {row['word']} IN THE TEXT:")
2115
- print(row[args.source_column])
 
 
 
2116
 
2117
- # Check if '@id' column exists in df_Extract
2118
- iiid = None
2119
- # Check if the '@id' exists in the Series
2120
- if '@id' in row:
2121
- # Check if the value is not None or NaN
2122
- if row['@id'] is not None and not pd.isna(row['@id']):
2123
- # Assign the value to the variable iiid
2124
- iiid = row['@id']
2125
- iiiALLURIScontextFromNCBO = None
2126
- if 'ALLURIScontextFromNCBO' in row:
2127
- if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list):
2128
- iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
2129
- iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
2130
 
2131
- result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
2132
- row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True,questionText=row[args.source_column])
2133
 
2134
- return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
 
 
 
 
2135
 
2136
 
2137
  def parallel_process_Row4Linking(df, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
@@ -2188,16 +2196,29 @@ def elinking(df_annotated_combined, text_splitter, args, key_geonames, cache_map
2188
  else:
2189
  # single processing
2190
  result = df_annotated_combined.apply(lambda row: process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output), axis=1)
 
2191
  #
2192
- df_annotated_combined['namedEntity'] = result.str[0]
2193
- df_annotated_combined['ALLURIScontext'] = result.str[1]
2194
- df_annotated_combined['Context'] = result.str[2]
2195
- df_annotated_combined['ContextGlobal'] = result.str[3]
2196
- df_annotated_combined['Triples'] = result.str[4]
2197
- df_annotated_combined['TriplesGlobal'] = result.str[5]
2198
- cache_map_geonames_AFTER = result.str[6].iloc[-1]
2199
- cache_map_virtuoso_AFTER = result.str[7].iloc[-1]
2200
- load_map_query_input_output_AFTER = result.str[8].iloc[-1] #
 
 
 
 
 
 
 
 
 
 
 
 
2201
 
2202
 
2203
  if args.num_cores_eLinking>1:
@@ -2546,19 +2567,28 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
2546
  # ##### this is to pass the links:
2547
 
2548
  # Create a new column for the entities with links
2549
- # #df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(lambda row: f"<a href='{row['namedEntity']}'>{row['word']}</a>", axis=1)
2550
  # df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2551
- # lambda row: f"<a href='{row['namedEntity']}'>{row['word']}</a>" if pd.notnull(row['namedEntity']) else row[
2552
  # 'word'], axis=1)
2553
  #include the expl-rel prefix:
2554
  #df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2555
- # lambda row: f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}'>{row['word']}</a>" if pd.notnull(row['namedEntity']) else row[
2556
  # 'word'], axis=1)
 
 
 
 
 
 
2557
  df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2558
- lambda
2559
- row: f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}'>{row['word']}</a>" if pd.notnull(
2560
- row['namedEntity']) else row[
2561
- 'word'], axis=1)
 
 
 
2562
 
2563
  # Create a new dictionary with the entity information and the link
2564
  dict_annotated_combined_NEL = df_annotated_combined[
 
20
  from concurrent.futures import ThreadPoolExecutor, as_completed
21
  from collections import Counter
22
 
 
 
 
23
  import torch
 
24
 
25
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
26
  print(f"Device: {device}...")
 
77
  "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
78
  "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
79
  "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
80
+ "SYMP", "FoodOn", "UBERON", "VO", "OGMS", "EuroSciVoc"]
81
 
82
+ ONLY_Ontologies_OnBIOPORTAL = ["AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "VO", "OGMS"]
83
 
84
  encod = encoding_getter('microsoft/deberta-v3-large')
85
  text_splitter = TokenTextSplitter(
 
893
 
894
  #delete all the rows with score smaller than entities_filter_threshold:
895
  if args.entities_filter_threshold > 0:
896
+ # df_annotated = df_annotated[df_annotated['score'] >= args.entities_filter_threshold]
897
+ df_annotated = df_annotated[df_annotated['score'] > args.entities_filter_threshold]
898
  if df_annotated.empty:
899
  return df_annotated
900
 
 
1064
  return None, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames
1065
 
1066
 
1067
+ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=False, questionText="" ):
1068
  #UseBioportalForLinking = False #trial to del
1069
 
1070
  if strtobool(args.debug):
 
1174
  if strtobool(args.debug):
1175
  print("Use Virtuoso Sparql endpoint for linking ... " + word.lower())
1176
 
1177
+ responseText = sparqlQuery(endpoint, questionText, VirtuosoUsername, key_virtuoso, strtobool(args.USE_CACHE))
1178
 
1179
  # Parse the response as JSON
1180
  results = json.loads(responseText)
 
1357
 
1358
 
1359
 
1360
+
1361
  def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, args, map_query_input_output, cleanInput=True, questionText=""):
1362
 
1363
  # trial
 
1461
  # # Check if the current index is a multiple of nn
1462
  # if i % nn == 0:
1463
  # #print("elaborate RAG triples")
1464
+ #
1465
  # #df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1466
  # #df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1467
  # #df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1468
  # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1469
+ #
1470
  # passages = []
1471
+ #
1472
  # df_retrieved = df_retrievedtopk.copy()
1473
  # if not df_retrieved.empty:
1474
  # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1475
  # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1476
  # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1477
+ #
1478
  # if not labelTriples:
1479
  # labelTriples =labelTriplesAPP
1480
  # else:
 
1502
 
1503
 
1504
  if not(labelTriples) or labelTriples.strip=="":
1505
+ logging.warning("getLinearTextualContextFromTriples - No text or prompt supplied! No relevant contextual triples retrieved...Skypping it! Word: "+str(word))
1506
  return "", map_query_input_output
1507
 
1508
  if token_counter(labelTriples, args.model_name) > args.tokens_max: # THE CONTEXT IS TOO BIG, BIGGER THAN tokens_max, I need to split
1509
  texts = text_splitter.create_documents([labelTriples])
1510
  labelTriples = texts[0].page_content
1511
  if not (labelTriples) or labelTriples.strip == "":
1512
+ logging.warning("after splitting ...No text or prompt supplied! Skypping it! Word: "+str(word))
1513
  return "", map_query_input_output
1514
 
1515
 
 
1602
  return contextText, map_query_input_output
1603
 
1604
 
1605
+
1606
  #@mem.cache
1607
  def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None,UseBioportalForLinking=True,questionText=""):
1608
 
 
2035
 
2036
  def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
2037
 
2038
+ result = "" #None
2039
+ singleContext = "" #None
2040
+ globalContext = "" #None
2041
+ singleTriples = "" #None
2042
+ globalTriples = "" #None
2043
  ALLURIScontext = []
2044
 
2045
+ try:
 
 
2046
 
2047
+ if row.empty:
2048
+ return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
2049
 
2050
+ InRagMode=False
2051
+ if hasattr(args, 'useBioKgRAG') and (strtobool(args.useBioKgRAG)==True):
2052
+ InRagMode = True
2053
+
2054
+ if (InRagMode==False):
2055
+ if row['IsGeo'] == 1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2056
 
2057
  if strtobool(args.debug):
2058
+ print(f"\n----- IsGeo ... COMPUTING {row['word']} IN THE TEXT:")
2059
  print(row[args.source_column])
 
2060
 
2061
+ result, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames = geonames_api_call(row['word'], args, key_geonames, cache_map_geonames)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2062
 
2063
+ elif row['IsBio'] == 1:
 
 
2064
 
2065
+ # Check if '@id' column exists in df_Extract
2066
+ iiid = None
2067
+ # Check if the '@id' exists in the Series
2068
+ if '@id' in row:
2069
+ # Check if the value is not None or NaN
2070
+ if row['@id'] is not None and not pd.isna(row['@id']):
2071
+ # Assign the value to the variable iiid
2072
+ iiid = row['@id']
2073
+ iiiALLURIScontextFromNCBO = None
2074
+ if 'ALLURIScontextFromNCBO' in row:
2075
+ if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list): #and not pd.isna(row['ALLURIScontextFromNCBO']):
2076
+ iiiALLURIScontextFromNCBO=row['ALLURIScontextFromNCBO']
2077
+ iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
2078
+
2079
+ if strtobool(args.debug):
2080
+ print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
2081
+ print(row[args.source_column])
2082
+ result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])
2083
+
2084
+ else:
2085
+ if row['model'] == "Forced":
2086
+ # Check if '@id' column exists in df_Extract
2087
+ iiid = None
2088
+ # Check if the '@id' exists in the Series
2089
+ if '@id' in row:
2090
+ # Check if the value is not None or NaN
2091
+ if row['@id'] is not None and not pd.isna(row['@id']):
2092
+ # Assign the value to the variable iiid
2093
+ iiid = row['@id']
2094
+ iiiALLURIScontextFromNCBO = None
2095
+ if 'ALLURIScontextFromNCBO' in row:
2096
+ if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'],
2097
+ list): # and not pd.isna(row['ALLURIScontextFromNCBO']):
2098
+ iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
2099
+ iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
2100
+
2101
+ if strtobool(args.debug):
2102
+ print(f"\n----- isForced COMPUTING ... {row['word']} IN THE TEXT:")
2103
+ print(row[args.source_column])
2104
 
 
2105
  result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
2106
  row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
2107
+ id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True,questionText=row[args.source_column])
2108
 
2109
+ if not result: #try annotation without bioportal
2110
+ result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
2111
+ row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
2112
+ id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=False,questionText=row[args.source_column])
2113
 
2114
+ else:
2115
+ if (row['IsBio'] == 1) or ( (pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC") ):
2116
+
2117
+ if strtobool(args.debug):
2118
+ print(f"\n----- InRagMode ...COMPUTING ... {row['word']} IN THE TEXT:")
2119
+ print(row[args.source_column])
2120
 
2121
+ # Check if '@id' column exists in df_Extract
2122
+ iiid = None
2123
+ # Check if the '@id' exists in the Series
2124
+ if '@id' in row:
2125
+ # Check if the value is not None or NaN
2126
+ if row['@id'] is not None and not pd.isna(row['@id']):
2127
+ # Assign the value to the variable iiid
2128
+ iiid = row['@id']
2129
+ iiiALLURIScontextFromNCBO = None
2130
+ if 'ALLURIScontextFromNCBO' in row:
2131
+ if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list):
2132
+ iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
2133
+ iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
2134
 
2135
+ result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
2136
+ row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True,questionText=row[args.source_column])
2137
 
2138
+ return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
2139
+
2140
+ except Exception as e:
2141
+ #print(f"Error occurred: {e}")
2142
+ return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
2143
 
2144
 
2145
  def parallel_process_Row4Linking(df, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
 
2196
  else:
2197
  # single processing
2198
  result = df_annotated_combined.apply(lambda row: process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output), axis=1)
2199
+
2200
  #
2201
+ try:
2202
+ df_annotated_combined['namedEntity'] = result.str[0]
2203
+ df_annotated_combined['ALLURIScontext'] = result.str[1]
2204
+ df_annotated_combined['Context'] = result.str[2]
2205
+ df_annotated_combined['ContextGlobal'] = result.str[3]
2206
+ df_annotated_combined['Triples'] = result.str[4]
2207
+ df_annotated_combined['TriplesGlobal'] = result.str[5]
2208
+ cache_map_geonames_AFTER = result.str[6].iloc[-1]
2209
+ cache_map_virtuoso_AFTER = result.str[7].iloc[-1]
2210
+ load_map_query_input_output_AFTER = result.str[8].iloc[-1] #
2211
+ except Exception as e:
2212
+ # print(f"Error occurred: {e}")
2213
+ df_annotated_combined['namedEntity'] = ""
2214
+ df_annotated_combined['ALLURIScontext'] = ""
2215
+ df_annotated_combined['Context'] = ""
2216
+ df_annotated_combined['ContextGlobal'] = ""
2217
+ df_annotated_combined['Triples'] = ""
2218
+ df_annotated_combined['TriplesGlobal'] = ""
2219
+ cache_map_geonames_AFTER = cache_map_geonames
2220
+ cache_map_virtuoso_AFTER = cache_map_virtuoso
2221
+ load_map_query_input_output_AFTER = load_map_query_input_output
2222
 
2223
 
2224
  if args.num_cores_eLinking>1:
 
2567
  # ##### this is to pass the links:
2568
 
2569
  # Create a new column for the entities with links
2570
+ # #df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(lambda row: f"<a href='{row['namedEntity']}' target='_blank'>{row['word']}</a>", axis=1)
2571
  # df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2572
+ # lambda row: f"<a href='{row['namedEntity']}' target='_blank'>{row['word']}</a>" if pd.notnull(row['namedEntity']) else row[
2573
  # 'word'], axis=1)
2574
  #include the expl-rel prefix:
2575
  #df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2576
+ # lambda row: f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>" if pd.notnull(row['namedEntity']) else row[
2577
  # 'word'], axis=1)
2578
+ # df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2579
+ # lambda
2580
+ # row: f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>" if pd.notnull(
2581
+ # row['namedEntity']) else row[
2582
+ # 'word'], axis=1)
2583
+
2584
  df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
2585
+ lambda row: (
2586
+ f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
2587
+ if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
2588
+ 'word']
2589
+ ),
2590
+ axis=1
2591
+ )
2592
 
2593
  # Create a new dictionary with the entity information and the link
2594
  dict_annotated_combined_NEL = df_annotated_combined[