Update app.py
app.py
CHANGED
@@ -20,11 +20,7 @@ from typing import Dict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from collections import Counter
 
-#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
-#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
-
 import torch
-#torch.cuda.empty_cache() # Clear cache ot torch
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"Device: {device}...")
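Note on the removed lines: PYTORCH_CUDA_ALLOC_CONF only takes effect if it is exported before torch initializes CUDA, which is why these commented-out settings sat above the torch import. A minimal sketch of that pattern (the allocator value is an example, not part of this commit):

```python
import os

# Allocator tuning must be set before the first CUDA initialization,
# i.e. before importing torch (or at least before any CUDA call).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # example value

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
```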
@@ -81,9 +77,9 @@ POSSIBLE_KGchoices_List = ["AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI",
     "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH",
     "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI",
     "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO",
-    "SYMP", "FoodOn", "UBERON", "VO", "EuroSciVoc"]
+    "SYMP", "FoodOn", "UBERON", "VO", "OGMS", "EuroSciVoc"]
 
-ONLY_Ontologies_OnBIOPORTAL = ["AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "VO"]
+ONLY_Ontologies_OnBIOPORTAL = ["AEO", "BCGO", "BFO", "BIM", "CHEBI", "CHIRO", "CL", "DCM", "DOID", "FMA", "FOODON", "GENO", "GML", "GO", "GEOSPARQL", "HL7", "HP", "HP_O", "IAO", "ICD10", "IDO", "LOINC", "MESH", "MONDO", "NCBITAXON", "NCIT", "NIFCELL", "NIFSTD", "OBCS", "OCHV", "OHPI", "OPB", "PLOSTHES", "RADLEX", "OBOREL", "SNOMEDCT", "SO", "STATO", "STY", "SYMP", "PTRANS", "UBERON", "VO", "OGMS"]
 
 encod = encoding_getter('microsoft/deberta-v3-large')
 text_splitter = TokenTextSplitter(
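The commit adds OGMS to both the user-facing KG choices and the list of ontologies resolvable on BioPortal. How app.py consumes the two lists is not shown in this diff; purely as illustration, one plausible use is filtering a user's selection down to BioPortal-linkable ontologies:

```python
# Illustrative only; names abbreviated from the lists above.
ONLY_Ontologies_OnBIOPORTAL = ["CHEBI", "SNOMEDCT", "OGMS"]

selected = ["CHEBI", "EuroSciVoc", "OGMS"]
linkable = [kg for kg in selected if kg.upper() in ONLY_Ontologies_OnBIOPORTAL]
print(linkable)  # ['CHEBI', 'OGMS']
```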
@@ -897,7 +893,8 @@ def entitiesFusion(df_annotated, args):
 
     #delete all the rows with score smaller than entities_filter_threshold:
     if args.entities_filter_threshold > 0:
-        df_annotated = df_annotated[df_annotated['score'] >= args.entities_filter_threshold]
+        # df_annotated = df_annotated[df_annotated['score'] >= args.entities_filter_threshold]
+        df_annotated = df_annotated[df_annotated['score'] > args.entities_filter_threshold]
         if df_annotated.empty:
             return df_annotated
 
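The functional change in this hunk is the comparison operator: entities scoring exactly at the threshold are now dropped as well. A toy illustration of the difference (data invented for the example):

```python
import pandas as pd

df = pd.DataFrame({"word": ["a", "b", "c"], "score": [0.5, 0.8, 0.9]})
threshold = 0.8

print(df[df["score"] >= threshold])  # old behaviour: keeps b and c
print(df[df["score"] > threshold])   # new behaviour: keeps only c
```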
@@ -1067,7 +1064,7 @@ def geonames_api_call(word, args, key_geonames, cache_map_geonames):
     return None, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames
 
 
-def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=False ):
+def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso, endpoint, VirtuosoUsername, contextWordVirtuoso, UseBioportalForLinking=False, questionText="" ):
     #UseBioportalForLinking = False #trial to del
 
     if strtobool(args.debug):
@@ -1177,7 +1174,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
     if strtobool(args.debug):
         print("Use Virtuoso Sparql endpoint for linking ... " + word.lower())
 
-    responseText = sparqlQuery(endpoint,
+    responseText = sparqlQuery(endpoint, questionText, VirtuosoUsername, key_virtuoso, strtobool(args.USE_CACHE))
 
     # Parse the response as JSON
     results = json.loads(responseText)
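The updated call passes the new questionText argument through to sparqlQuery, app.py's own helper, alongside the Virtuoso credentials and the cache flag. A hedged sketch of the call shape (assuming, as the surrounding lines suggest, that the helper returns a standard SPARQL JSON results string):

```python
import json

def link_word(endpoint, question_text, username, key, use_cache):
    # sparqlQuery is app.py's own helper, not a library function.
    response_text = sparqlQuery(endpoint, question_text, username, key, use_cache)
    results = json.loads(response_text)
    return results.get("results", {}).get("bindings", [])
```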
@@ -1360,6 +1357,7 @@ def getUrlBioAndAllOtherBioConcepts(word, args, key_virtuoso, cache_map_virtuoso
 
 
 
+
 def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, args, map_query_input_output, cleanInput=True, questionText=""):
 
     # trial
@@ -1463,20 +1461,20 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
     # # Check if the current index is a multiple of nn
     # if i % nn == 0:
     #     #print("elaborate RAG triples")
-    #
+    #
     # #df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
     # #df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
     # #df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
     # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
-    #
+    #
     # passages = []
-    #
+    #
    # df_retrieved = df_retrievedtopk.copy()
     # if not df_retrieved.empty:
     #     #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
     #     labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
     #     labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
-    #
+    #
     # if not labelTriples:
     #     labelTriples =labelTriplesAPP
     # else:
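This block stays commented out, but it documents the intended RAG step: score the candidate triple passages against the sentence (questionText) and keep only the best fraction. The helpers' internals are not in this diff; a generic sketch of a top-fraction filter in the same spirit:

```python
def top_fraction_passages(scored, top_fraction=0.1, max_num_passages=20, min_threshold=0.7):
    # scored: list of (passage, score) pairs, higher score = more relevant.
    kept = sorted((p for p in scored if p[1] >= min_threshold),
                  key=lambda p: p[1], reverse=True)
    k = max(1, int(len(kept) * top_fraction))
    return kept[:min(k, max_num_passages)]

print(top_fraction_passages([("t1", 0.9), ("t2", 0.6), ("t3", 0.75)]))  # [('t1', 0.9)]
```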
@@ -1504,14 +1502,14 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
 
 
     if not(labelTriples) or labelTriples.strip=="":
-        logging.warning("No text or prompt supplied! Skypping it!")
+        logging.warning("getLinearTextualContextFromTriples - No text or prompt supplied! No relevant contextual triples retrieved...Skypping it! Word: "+str(word))
         return "", map_query_input_output
 
     if token_counter(labelTriples, args.model_name) > args.tokens_max: # THE CONTEXT IS TOO BIG, BIGGER THAN tokens_max, I need to split
         texts = text_splitter.create_documents([labelTriples])
         labelTriples = texts[0].page_content
     if not (labelTriples) or labelTriples.strip == "":
-        logging.warning("after splitting ...No text or prompt supplied! Skypping it!")
+        logging.warning("after splitting ...No text or prompt supplied! Skypping it! Word: "+str(word))
         return "", map_query_input_output
 
 
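When the concatenated triples exceed args.tokens_max, only the first chunk produced by the splitter is kept, which silently truncates the context. A standalone sketch of that truncation (chunk sizes are examples; app.py configures its own splitter near the top of the file):

```python
from langchain.text_splitter import TokenTextSplitter

splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=0)  # example sizes
label_triples = "subject predicate object. " * 200
docs = splitter.create_documents([label_triples])
label_triples = docs[0].page_content  # keep only the first chunk, as app.py does
```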
@@ -1604,6 +1602,7 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
     return contextText, map_query_input_output
 
 
+
 #@mem.cache
 def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=None, iALLURIScontextFromNCBO=None,UseBioportalForLinking=True,questionText=""):
 
@@ -2036,102 +2035,111 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
 
 def process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
 
-    result = None
-    singleContext = None
-    globalContext = None
-    singleTriples = None
-    globalTriples = None
+    result = "" #None
+    singleContext = "" #None
+    globalContext = "" #None
+    singleTriples = "" #None
+    globalTriples = "" #None
     ALLURIScontext = []
 
-    if hasattr(args, 'useBioKgRAG') and (strtobool(args.useBioKgRAG)==True):
-        InRagMode = True
+    try:
 
+        if row.empty:
+            return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
 
-    elif row['IsBio'] == 1:
-
-        # Check if '@id' column exists in df_Extract
-        iiid = None
-        # Check if the '@id' exists in the Series
-        if '@id' in row:
-            # Check if the value is not None or NaN
-            if row['@id'] is not None and not pd.isna(row['@id']):
-                # Assign the value to the variable iiid
-                iiid = row['@id']
-        iiiALLURIScontextFromNCBO = None
-        if 'ALLURIScontextFromNCBO' in row:
-            if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list): #and not pd.isna(row['ALLURIScontextFromNCBO']):
-                iiiALLURIScontextFromNCBO=row['ALLURIScontextFromNCBO']
-                iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
+        InRagMode=False
+        if hasattr(args, 'useBioKgRAG') and (strtobool(args.useBioKgRAG)==True):
+            InRagMode = True
+
+        if (InRagMode==False):
+            if row['IsGeo'] == 1:
 
                 if strtobool(args.debug):
-                    print(f"\n-----
+                    print(f"\n----- IsGeo ... COMPUTING {row['word']} IN THE TEXT:")
                     print(row[args.source_column])
-        result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])
 
-    if row['model'] == "Forced":
-        # Check if '@id' column exists in df_Extract
-        iiid = None
-        # Check if the '@id' exists in the Series
-        if '@id' in row:
-            # Check if the value is not None or NaN
-            if row['@id'] is not None and not pd.isna(row['@id']):
-                # Assign the value to the variable iiid
-                iiid = row['@id']
-        iiiALLURIScontextFromNCBO = None
-        if 'ALLURIScontextFromNCBO' in row:
-            if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'],
-                                                                        list): # and not pd.isna(row['ALLURIScontextFromNCBO']):
-                iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
-                iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
-
-        print(f"\n----- isForced COMPUTING ... {row['word']} IN THE TEXT:")
-        print(row[args.source_column])
+                result, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames = geonames_api_call(row['word'], args, key_geonames, cache_map_geonames)
+
+            elif row['IsBio'] == 1:
+
+                # Check if '@id' column exists in df_Extract
+                iiid = None
+                # Check if the '@id' exists in the Series
+                if '@id' in row:
+                    # Check if the value is not None or NaN
+                    if row['@id'] is not None and not pd.isna(row['@id']):
+                        # Assign the value to the variable iiid
+                        iiid = row['@id']
+                iiiALLURIScontextFromNCBO = None
+                if 'ALLURIScontextFromNCBO' in row:
+                    if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list): #and not pd.isna(row['ALLURIScontextFromNCBO']):
+                        iiiALLURIScontextFromNCBO=row['ALLURIScontextFromNCBO']
+                        iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
+
+                if strtobool(args.debug):
+                    print(f"\n----- isBio COMPUTING ... {row['word']} IN THE TEXT:")
+                    print(row[args.source_column])
+                result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=True, questionText=row[args.source_column])
+
+            else:
+                if row['model'] == "Forced":
+                    # Check if '@id' column exists in df_Extract
+                    iiid = None
+                    # Check if the '@id' exists in the Series
+                    if '@id' in row:
+                        # Check if the value is not None or NaN
+                        if row['@id'] is not None and not pd.isna(row['@id']):
+                            # Assign the value to the variable iiid
+                            iiid = row['@id']
+                    iiiALLURIScontextFromNCBO = None
+                    if 'ALLURIScontextFromNCBO' in row:
+                        if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'],
+                                                                                    list): # and not pd.isna(row['ALLURIScontextFromNCBO']):
+                            iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
+                            iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
+
+                    if strtobool(args.debug):
+                        print(f"\n----- isForced COMPUTING ... {row['word']} IN THE TEXT:")
+                        print(row[args.source_column])
 
-    if not result: #try annotation without bioportal
                     result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
                         row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
-                        id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,
+                        id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True,questionText=row[args.source_column])
 
+                    if not result: #try annotation without bioportal
+                        result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
+                            row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output,
+                            id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO, UseBioportalForLinking=False,questionText=row[args.source_column])
+
+        else:
+            if (row['IsBio'] == 1) or ( (pd.isnull(row["IsBio"]) or row["IsBio"] == '' or row['IsBio'] == 0 or row["IsBio"] is None) and (row['entity_group'] == "MISC") ):
+
+                if strtobool(args.debug):
+                    print(f"\n----- InRagMode ...COMPUTING ... {row['word']} IN THE TEXT:")
+                    print(row[args.source_column])
+
+                # Check if '@id' column exists in df_Extract
+                iiid = None
+                # Check if the '@id' exists in the Series
+                if '@id' in row:
+                    # Check if the value is not None or NaN
+                    if row['@id'] is not None and not pd.isna(row['@id']):
+                        # Assign the value to the variable iiid
+                        iiid = row['@id']
+                iiiALLURIScontextFromNCBO = None
+                if 'ALLURIScontextFromNCBO' in row:
+                    if row['ALLURIScontextFromNCBO'] is not None and isinstance(row['ALLURIScontextFromNCBO'], list):
+                        iiiALLURIScontextFromNCBO = row['ALLURIScontextFromNCBO']
+                        iiiALLURIScontextFromNCBO = list(set(iiiALLURIScontextFromNCBO))
+
+                result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_virtuoso, load_map_query_input_output = virtuoso_api_call(
+                    row['word'], text_splitter, args, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, id=iiid, iALLURIScontextFromNCBO=iiiALLURIScontextFromNCBO,UseBioportalForLinking=True,questionText=row[args.source_column])
+
+        return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
+
+    except Exception as e:
+        #print(f"Error occurred: {e}")
+        return result, ALLURIScontext, singleContext, globalContext, singleTriples, globalTriples, cache_map_geonames, cache_map_virtuoso, load_map_query_input_output, row.name
 
 
 def parallel_process_Row4Linking(df, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output):
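Two things change in this function: the defaults become empty strings instead of None, and the whole body is wrapped in try/except so that every row, even a failing one, yields a tuple of the same arity (which the caller unpacks positionally). A stripped-down sketch of the pattern with a toy handler:

```python
import pandas as pd

def process_row_safe(row):
    # Defaults are empty strings, not None, so downstream string ops won't crash.
    result = singleContext = ""
    try:
        if row.empty:
            return result, singleContext, row.name
        result = str(row["word"]).upper()  # stand-in for the real linking work
        return result, singleContext, row.name
    except Exception:
        # Same-arity tuple on any error, so positional unpacking never breaks.
        return result, singleContext, row.name

df = pd.DataFrame({"word": ["fever", "cough"]})
out = df.apply(process_row_safe, axis=1)  # Series of 3-tuples
```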
@@ -2188,16 +2196,29 @@ def elinking(df_annotated_combined, text_splitter, args, key_geonames, cache_map
     else:
        # single processing
        result = df_annotated_combined.apply(lambda row: process_row4Linking(row, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output), axis=1)
+
     #
-
-
-
-
-
-
-
-
-
+    try:
+        df_annotated_combined['namedEntity'] = result.str[0]
+        df_annotated_combined['ALLURIScontext'] = result.str[1]
+        df_annotated_combined['Context'] = result.str[2]
+        df_annotated_combined['ContextGlobal'] = result.str[3]
+        df_annotated_combined['Triples'] = result.str[4]
+        df_annotated_combined['TriplesGlobal'] = result.str[5]
+        cache_map_geonames_AFTER = result.str[6].iloc[-1]
+        cache_map_virtuoso_AFTER = result.str[7].iloc[-1]
+        load_map_query_input_output_AFTER = result.str[8].iloc[-1] #
+    except Exception as e:
+        # print(f"Error occurred: {e}")
+        df_annotated_combined['namedEntity'] = ""
+        df_annotated_combined['ALLURIScontext'] = ""
+        df_annotated_combined['Context'] = ""
+        df_annotated_combined['ContextGlobal'] = ""
+        df_annotated_combined['Triples'] = ""
+        df_annotated_combined['TriplesGlobal'] = ""
+        cache_map_geonames_AFTER = cache_map_geonames
+        cache_map_virtuoso_AFTER = cache_map_virtuoso
+        load_map_query_input_output_AFTER = load_map_query_input_output
 
 
     if args.num_cores_eLinking>1:
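Because apply() returns a Series of tuples, the new code uses pandas' .str positional indexing to split the tuple elements into columns, and .iloc[-1] to take the cache maps from the last processed row. The idiom in isolation:

```python
import pandas as pd

s = pd.Series([("entA", ["uri1"], {"cacheA": 1}),
               ("entB", ["uri2"], {"cacheB": 2})])
named = s.str[0]                 # first tuple element of every row
uris = s.str[1]                  # second element
last_cache = s.str[2].iloc[-1]   # element from the final row, like the *_AFTER maps
```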
@@ -2546,19 +2567,28 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
     # ##### this is to pass the links:
 
     # Create a new column for the entities with links
-    # #df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(lambda row: f"<a href='{row['namedEntity']}'>{row['word']}</a>", axis=1)
+    # #df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(lambda row: f"<a href='{row['namedEntity']}' target='_blank'>{row['word']}</a>", axis=1)
     # df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
-    #     lambda row: f"<a href='{row['namedEntity']}'>{row['word']}</a>" if pd.notnull(row['namedEntity']) else row[
+    #     lambda row: f"<a href='{row['namedEntity']}' target='_blank'>{row['word']}</a>" if pd.notnull(row['namedEntity']) else row[
     #     'word'], axis=1)
     #include the expl-rel prefix:
     #df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
-    #    lambda row: f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}'>{row['word']}</a>" if pd.notnull(row['namedEntity']) else row[
+    #    lambda row: f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>" if pd.notnull(row['namedEntity']) else row[
     #    'word'], axis=1)
+    # df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
+    #     lambda
+    #     row: f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>" if pd.notnull(
+    #     row['namedEntity']) else row[
+    #     'word'], axis=1)
+
     df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
-        lambda
-        row['namedEntity']) else row[
+        lambda row: (
+            f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
+            if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
+                'word']
+        ),
+        axis=1
+    )
 
     # Create a new dictionary with the entity information and the link
     dict_annotated_combined_NEL = df_annotated_combined[
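The final lambda now also treats the literal strings 'NaN'/'nan' (and empty strings) as missing before building the anchor tag, and every generated link opens in a new tab via target='_blank'. The same guard on a toy frame (URL prefix as in the commit):

```python
import pandas as pd

df = pd.DataFrame({"word": ["fever", "x"],
                   "namedEntity": ["http://purl.obolibrary.org/obo/HP_0001945", "nan"]})
prefix = "https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept="

df["entity_with_link"] = df.apply(
    lambda row: f"<a href='{prefix}{row['namedEntity']}' target='_blank'>{row['word']}</a>"
    if row["namedEntity"] not in [None, "", "NaN", "nan"] and pd.notnull(row["namedEntity"])
    else row["word"],
    axis=1,
)
print(df["entity_with_link"].tolist())  # second row falls back to the bare word
```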