Spaces:
Runtime error
Runtime error
File size: 32,266 Bytes
2be1d3d 325517e 2be1d3d 258c003 2be1d3d 3500aa6 2be1d3d 87e7bb0 2be1d3d 87e7bb0 2be1d3d 87e7bb0 2be1d3d 87e7bb0 2be1d3d 87e7bb0 2be1d3d 87e7bb0 2be1d3d 3500aa6 2be1d3d 3500aa6 2be1d3d 3500aa6 2be1d3d 258c003 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 |
import spacy
import re
import nltk
from nltk.corpus import wordnet
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Download the small English spaCy model at import time (a no-op when already cached).
spacy.cli.download("en_core_web_sm")
# use spacy small because in that way we are closer to a BOW model which is the one we care in our case since we just compare words
# parser and NER are disabled: only tokenization, POS tags and lemmas are used below
nlp_comparatives = spacy.load('en_core_web_sm', disable=["parser", "ner"])
def find_comptives_symbols(sentence):
    """
    Capture unique cases of symbols like <, >, =, <=, >= and ==
    If more than one symbol exists, return []
    """
    # alternation order matters: two-character operators are tried before
    # single-character ones so ">=" is not split into ">" and "="
    pattern = r"(?<![<=>])<=|>=|==|(?<![<=>])<|>|(?<!<)=|=(?![<=>])"
    return [{'comparative': hit} for hit in re.findall(pattern, sentence)]
def find_comptives_straight_patterns(sentence):
    """
    Identify straightforward comparative mentions in a sentence.

    Detected patterns:
      * a noun followed by "equal to"               -> '='
      * "more"/"less" followed by "than"            -> '>' / '<'
      * a comparative adjective (JJR) or adverb (RBR) followed by "than"
        (not followed by a noun), classified as bigger/smaller via WordNet -> '>' / '<'

    Parameters
    ----------
    sentence : str

    Returns
    -------
    list[dict]
        One {'comparative': symbol} dict per detected pattern.
    """
    doc = nlp_comparatives(sentence)
    comparatives = []

    def _neighbor(token, offset=1):
        # token.nbor() raises IndexError at the sentence edges (e.g. "more" as
        # the last token, or "equal" as the first); return None instead of crashing
        try:
            return token.nbor(offset)
        except IndexError:
            return None

    def _lemma_names(seed_words):
        # collect the synsets of all seed words and return their head lemma names
        synsets = set()
        for seed in seed_words:
            synsets.update(wordnet.synsets(seed))
        return [syn.name().split('.')[0] for syn in synsets]

    # WordNet lookup lists are built lazily, at most once per call, instead of
    # being rebuilt for every matching token as in the original
    bigger_rel_words = None
    smaller_rel_words = None

    for token in doc:
        lowered = token.text.lower()
        # mentions of "equal" followed by "to", preceded by a noun
        if lowered == "equal":
            next_token = _neighbor(token)
            if next_token is not None and next_token.text.lower() == "to":
                prev_token = _neighbor(token, -1)
                if prev_token is not None and prev_token.pos_ == "NOUN":
                    comparatives.append({'comparative': "="})
        # mentions of "more"/"less" followed by "than"
        elif lowered in ("more", "less"):
            next_token = _neighbor(token)
            if next_token is not None and next_token.text.lower() == "than":
                comparatives.append({'comparative': '>' if lowered == 'more' else '<'})
        # comparative adjectives/adverbs followed by "than" (not followed by a noun)
        elif token.tag_ in ("JJR", "RBR"):
            next_token = _neighbor(token)
            if next_token is not None and next_token.text.lower() == "than":
                after_than = _neighbor(next_token)
                # a missing token after "than" counts as "not a noun"
                if after_than is not None and after_than.pos_ == "NOUN":
                    continue
                if bigger_rel_words is None:
                    bigger_rel_words = _lemma_names(
                        ['big', 'large', 'great', 'huge', 'enormous', 'heavy',
                         'strong', 'massive', 'immense', 'substantial',
                         'bigger', 'larger', 'greater', 'higher', 'taller',
                         'heavier', 'stronger'])
                if lowered in bigger_rel_words:
                    comparatives.append({'comparative': '>'})
                else:
                    # no bigger-synonym found: check for smaller-synonyms
                    if smaller_rel_words is None:
                        smaller_rel_words = _lemma_names(
                            ['small', 'little', 'tiny', 'petite', 'miniature',
                             'slight', 'meager', 'inconsequential', 'minor',
                             'smaller', 'lesser', 'lower', 'shorter', 'lighter',
                             'weaker'])
                    if lowered in smaller_rel_words:
                        comparatives.append({'comparative': '<'})
    return comparatives
# helper functions for 'identify_bigger_smaller_advanced'
def identify_comparison(sentence):
    """
    Capture patterns of 'word-er' followed by 'than' (e.g. 'better than', 'lesser than', etc)

    Returns the list of matched phrases, or 0 when nothing matches.
    """
    matches = re.findall(r'\b(\w+er than)\b', sentence)
    return matches if matches else 0
def find_more_than_reference(sentence):
    """
    Capture patterns of 'more' followed by 'word' followed by 'than' (e.g. more advanced than)

    Returns the matched "more <word>" phrases, or 0 when nothing matches.
    """
    hits = re.findall(r"(more) (\w+) than", sentence)
    return [' '.join(hit) for hit in hits] if hits else 0
def find_less_than_reference(sentence):
    """
    Capture patterns of 'less' followed by 'word' followed by 'than' (e.g. less advanced than)

    Returns the matched "less <word>" phrases, or 0 when nothing matches.
    """
    hits = re.findall(r"(less) (\w+) than", sentence)
    return [' '.join(hit) for hit in hits] if hits else 0
def is_related_to(word, target_word):
    """
    Returns True if the input 'word' shares at least one WordNet synset with
    'target_word', otherwise False.
    """
    shared = set(wordnet.synsets(word)) & set(wordnet.synsets(target_word))
    return bool(shared)
def is_related_to_bigger(word):
    """
    Returns True if the input 'word' is semantically related to the concept
    'bigger', otherwise False.

    "more"/"more ..." is accepted directly; otherwise the word's synsets are
    intersected with synsets derived from 'big'/'bigger' seed words.
    """
    if word.lower() == "more" or word.lower().startswith("more "):
        return True
    # seed words for the concepts of 'big' and 'bigger' (set() dedups synsets,
    # so the duplicated 'enormous' of the original is harmless and dropped here)
    seed_words = ('big', 'large', 'great', 'huge', 'enormous', 'heavy', 'strong',
                  'massive', 'immense', 'substantial',
                  'bigger', 'larger', 'greater', 'higher', 'taller', 'heavier',
                  'stronger')
    related_words = set()
    for seed in seed_words:
        related_words.update(wordnet.synsets(seed))
    # perf: look up the input word's synsets once, instead of once per candidate
    # (the original re-queried WordNet inside the loop via is_related_to)
    word_synsets = set(wordnet.synsets(word))
    for related_word in related_words:
        if word_synsets & set(wordnet.synsets(related_word.name().split('.')[0])):
            return True
    return False
def is_related_to_smaller(word):
    """
    Returns True if the input word is semantically related to the concept of
    'smaller', otherwise False.

    "less"/"less ..." is accepted directly; otherwise the word's synsets are
    intersected with synsets derived from 'small'/'smaller' seed words.
    """
    if word.lower() == "less" or word.lower().startswith("less "):
        return True
    # seed words for the concepts of 'small' and 'smaller'
    seed_words = ('small', 'little', 'tiny', 'petite', 'miniature', 'slight',
                  'meager', 'inconsequential', 'minor',
                  'smaller', 'lesser', 'lower', 'shorter', 'lighter', 'weaker')
    related_words = set()
    for seed in seed_words:
        related_words.update(wordnet.synsets(seed))
    # perf: look up the input word's synsets once, instead of once per candidate
    word_synsets = set(wordnet.synsets(word))
    for related_word in related_words:
        if word_synsets & set(wordnet.synsets(related_word.name().split('.')[0])):
            return True
    return False
def identify_bigger_smaller_advanced(sentence):
    """
    Complementary detector for 'word ending in -er' + 'than' phrases and for
    'more'/'less' + <word> + 'than' phrases.

    Returns a combined list of {'comparative': '>'} / {'comparative': '<'} dicts.
    """
    bigger_hits = []
    smaller_hits = []

    # pattern 1: 'word-er than' — classify the -er word via WordNet
    er_than = identify_comparison(sentence)
    if er_than:
        for phrase in er_than:
            core = phrase.replace("than", "").strip()
            looks_bigger = is_related_to_bigger(core)
            looks_smaller = is_related_to_smaller(core)
            # only keep unambiguous classifications
            if looks_bigger and not looks_smaller:
                bigger_hits.append({"comparative": ">"})
            elif looks_smaller and not looks_bigger:
                smaller_hits.append({"comparative": "<"})

    # pattern 2: 'more <word> than' — must be a bigger-related word
    more_hits = find_more_than_reference(sentence)
    if more_hits:
        for phrase in more_hits:
            core = phrase.replace("than", "").replace("more", "").strip()
            if is_related_to_bigger(core):
                bigger_hits.append({"comparative": ">"})

    # pattern 3: 'less <word> than' — must be a smaller-related word
    less_hits = find_less_than_reference(sentence)
    if less_hits:
        for phrase in less_hits:
            core = phrase.replace("than", "").replace("less", "").strip()
            if is_related_to_smaller(core):
                smaller_hits.append({"comparative": "<"})

    return bigger_hits + smaller_hits
def find_equal_to_comptives_ngrams(sentence):
    """
    Scan the 2- to 4-grams of the sentence for phrases semantically similar to
    an 'equal to'-style reference, using spaCy document similarity.

    Returns one {'comparative': '='} dict per similar n-gram found, or [].
    """
    # reference phrasings for the concept of 'equal to'
    possible_references = ["equal to", "same as", "similar to", "identical to", "equivalent to", "tantamount to",
                           "corresponding to", "comparable to", "akin to", "commensurate with", "in line with",
                           "on a par with", "indistinguishable from", "corresponding with", "congruent with"]
    # empirically chosen similarity threshold
    similarity_threshold = 0.85
    hits = []
    # embed every reference once, up front
    reference_docs = [nlp_comparatives(reference) for reference in possible_references]
    # examine 2-grams, 3-grams and 4-grams of the input sentence
    for n in range(2, 5):
        for gram in nltk.ngrams(sentence.split(), n):
            gram_doc = nlp_comparatives(' '.join(gram))
            for reference_doc in reference_docs:
                if gram_doc.similarity(reference_doc) >= similarity_threshold:
                    hits.append({'comparative': "="})
                    # one hit per n-gram is enough
                    break
    return hits
def single_verb_comptives(sentence):
    """
    This function takes a sentence and identifies any mention of bigger than, smaller than, equal to, expressed
    as single-word verb. It uses wordnet synsets to examine for synonyms and antonyms

    Returns a list of {'comparative': symbol} dicts ('>', '<' or '='), or [].
    """
    # base references: single-word verbs that imply the three comparative classes
    bigger_references_sg = ["surpass", "exceed", "outstrip", "outdo", "outmatch", "outclass", "eclipse", "overshadow",
                            "outrank", "overtake", "top", "beat", "transcend", "dominate", "prevail", "trump",
                            "vanquish", "outperform", "outgun", "outdistance", "outshine"]
    lesser_references_sg = ["lag", "trail", "lose", "underperform", "yield", "surrender", "straggle", "dawdle",
                            "lollygag", "loiter", "delay", "defer", "postpone", "procrastinate", "linger", "hesitate",
                            "prolong", "drag"]
    equal_references_sg = ["match", "equal", "tie", "correspond", "conform", "agree", "harmonize", "coordinate",
                           "comply", "fit", "parallel", "resemble", "mirror", "emulate", "equilibrate", "balance",
                           "counterbalance", "offset", "compensate"]
    doc = nlp_comparatives(sentence)
    # locals here intentionally shadow the module-level bigger_list/smaller_list/equal_list
    bigger_list = []
    smaller_list = []
    equal_list = []
    # search for all verbs and examine their lemma with all the synonyms of each of the previous references. Assign a label accordingly
    for token in doc:
        # first examine for 1-1 pair matching and 1-1 lemma pair matching
        # (case-sensitive: e.g. "Match" at sentence start will not match — by design or not)
        if token.text in bigger_references_sg or token.lemma_ in bigger_references_sg:
            bigger_list.append({'comparative': ">"})
            # a direct match stops the whole token scan
            break
        elif token.text in lesser_references_sg or token.lemma_ in lesser_references_sg:
            smaller_list.append({'comparative': "<"})
            break
        elif token.text in equal_references_sg or token.lemma_ in equal_references_sg:
            equal_list.append({'comparative': "="})
            break
        else:
            # if not, then try with synonyms only for verbs
            if token.pos_ == "VERB":
                for lemma in token.lemma_.split('|'):
                    synsets = wordnet.synsets(lemma, pos='v')
                    for syn in synsets:
                        # NOTE(review): these breaks exit only the innermost synset loop.
                        # Unlike the direct-match branch above, synonym matches do NOT stop
                        # the token scan, so several verbs can each contribute an entry —
                        # confirm whether that asymmetry is intended.
                        if any(lemma in bigger_references_sg for lemma in syn.lemma_names()):
                            bigger_list.append({'comparative': ">"})
                            break
                        elif any(lemma in lesser_references_sg for lemma in syn.lemma_names()):
                            smaller_list.append({'comparative': "<"})
                            break
                        elif any(lemma in equal_references_sg for lemma in syn.lemma_names()):
                            equal_list.append({'comparative': "="})
                            break
    final_list = bigger_list + smaller_list + equal_list
    if final_list:
        return final_list
    else:
        return []
# helper data for 'multiword_verb_comptives'
# Define multi-word verb lists
# NOTE(review): bigger_list / smaller_list / equal_list are module-level globals
# read by multiword_verb_comptives; several functions above reuse the same names
# for locals, which shadow but do not modify these.
bigger_list = ["is a cut above", "is ahead of", "is superior to", "is greater than", "raise the bar", "climb the ladder", "set the standard", "set the pace", "break the mold", "push the envelope", "raise the game", "is a class apart"]
smaller_list = ["fall behind", "is inferior to", "is smaller than", "lag behind", "trail behind", "is second to", "bring up the rear", "lose ground", "bring up the tail end", "fall short", "fall beneath", "fail to measure up", "put off"]
equal_list = ["is in line with", "is equal to", "is on a par with", "is on par with", "is the same as", "is comparable to", "is in sync with", "is in harmony with", "is in step with", "is in tune with", "is in accord with", "is consistent with", "is consonant with", "keep pace with", "keep up with", "is equivalent to", "balance out", "even out"]
# Calculate embeddings of multi-word verbs once, at import time
# (mean of the per-token spaCy vectors for each phrase)
bigger_embeddings = [np.mean([token.vector for token in nlp_comparatives(verb)], axis=0) for verb in bigger_list]
smaller_embeddings = [np.mean([token.vector for token in nlp_comparatives(verb)], axis=0) for verb in smaller_list]
equal_embeddings = [np.mean([token.vector for token in nlp_comparatives(verb)], axis=0) for verb in equal_list]
# Define function to check if n-gram is in multi-word verb list
def check_list(ngram, verb_list):
    """
    Return True when 'ngram' is an exact member of 'verb_list', else False.
    """
    # idiomatic membership test instead of if/else returning booleans
    return ngram in verb_list
def cosine_sim(a, b):
    """
    Cosine similarity between two 1-D vectors.

    Computed directly with numpy instead of sklearn's cosine_similarity,
    avoiding the reshape-to-matrix round trip for a single pair of vectors.
    Follows sklearn's convention of returning 0.0 when either vector has zero
    norm (sklearn substitutes 1 for zero norms, yielding 0 similarity).

    Parameters
    ----------
    a, b : array-like of float, same length

    Returns
    -------
    float in [-1, 1]
    """
    a = np.asarray(a, dtype=float).ravel()
    b = np.asarray(b, dtype=float).ravel()
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))
# we examine the n-grams reversely and any time we find a match, we "delete" that match, so that lesser ngrams will not be matched \
# (e.g. is on a par with, would also match afterwords on a par with, par with, etc)
def multiword_verb_comptives(sentence):
    """
    This function takes a sentence and identifies any mention of bigger than, smaller than, equal to, expressed
    as multi-word verbs. Based on three reference lists it performs initially a simple string comparison with each
    of their elements and the ngrams of the input sentence. If there is no match there, it performs the same procedure
    with cosine similarity to identify any similar ngrams.

    Returns a list of {'comparative': symbol} dicts ('>', '<' or '=').
    """
    # Split sentence into tokens
    tokens = sentence.split()
    # best similarity seen so far across all examined n-grams
    max_sim = 0
    # capture lists for the three classes
    bigger_l = []
    smaller_l = []
    equal_l = []
    # words of already-matched n-grams, used to suppress re-matching sub-phrases
    # (e.g. "is on a par with" should not also match "on a par with", "par with", ...)
    # NOTE(review): the set stores individual words but the skip test below checks
    # the whole n-gram string, so in practice only 1-grams can be skipped — confirm
    # whether the sub-phrase suppression was meant to work on full n-grams.
    matched_ngrams = set()
    # Iterate through n-grams of sentence, starting with the largest n-grams
    for n in range(5, 0, -1):
        for i in range(len(tokens) - n + 1):
            ngram = ' '.join(tokens[i:i + n])
            # Skip ngrams that have already been matched
            if ngram in matched_ngrams:
                continue
            # exact membership in the three multi-word verb lists
            if check_list(ngram, bigger_list):
                matched_ngrams.update(set(ngram.split()))
                bigger_l.append({"comparative": '>'})
            elif check_list(ngram, smaller_list):
                matched_ngrams.update(set(ngram.split()))
                smaller_l.append({"comparative": '<'})
            elif check_list(ngram, equal_list):
                matched_ngrams.update(set(ngram.split()))
                equal_l.append({"comparative": '='})
            else:
                # fall back to embedding similarity against the pre-computed embeddings
                ngram_emb = np.mean([token.vector for token in nlp_comparatives(ngram)], axis=0)
                max_sim_bigger = max(cosine_sim(ngram_emb, verb_emb) for verb_emb in bigger_embeddings)
                max_sim_smaller = max(cosine_sim(ngram_emb, verb_emb) for verb_emb in smaller_embeddings)
                max_sim_equal = max(cosine_sim(ngram_emb, verb_emb) for verb_emb in equal_embeddings)
                # keep the class with the strictly highest similarity, and only
                # when it also beats the best similarity seen so far; the n-gram's
                # words are marked as matched even below the 0.9 report threshold
                # (preserved from the original behavior)
                if max_sim_bigger > max_sim_smaller and max_sim_bigger > max_sim_equal and max_sim_bigger > max_sim:
                    max_sim = max_sim_bigger
                    matched_ngrams.update(set(ngram.split()))
                    if max_sim > 0.9:
                        bigger_l.append({"comparative": '>'})
                elif max_sim_smaller > max_sim_bigger and max_sim_smaller > max_sim_equal and max_sim_smaller > max_sim:
                    max_sim = max_sim_smaller
                    matched_ngrams.update(set(ngram.split()))
                    if max_sim > 0.9:
                        smaller_l.append({"comparative": '<'})
                elif max_sim_equal > max_sim_bigger and max_sim_equal > max_sim_smaller and max_sim_equal > max_sim:
                    # BUG FIX: the original assigned max_sim_smaller here (copy-paste
                    # error), corrupting the running maximum and the 0.9 threshold test
                    max_sim = max_sim_equal
                    matched_ngrams.update(set(ngram.split()))
                    if max_sim > 0.9:
                        equal_l.append({"comparative": '='})
    return bigger_l + smaller_l + equal_l
def identify_double_symbol_comparisons(sentence):
    """
    Identifies compound comparison phrases ("less than or equal to",
    "greater than or equal to", and variants) in a given sentence.

    Matching is case-insensitive; each hit is returned with its original
    casing paired with its operator.

    Returns
    -------
    list[dict]
        One {'comparative': (matched_phrase, operator)} dict per hit,
        where operator is '<=' or '>='.
    """
    comparison_phrases = [
        ["less than or equal to", "less or equal to", "smaller than or equal to",
         "smaller or equal to", "lower than or equal to", "lower or equal to",
         "inferior to or equal to", "inferior or equal to", "lesser or equal to"],
        ["greater than or equal to", "greater or equal to", "more than or equal to",
         "more or equal to", "higher than or equal to", "higher or equal to",
         "above than or equal to", "above or equal to", "larger than or equal to",
         "larger or equal to", "superior to or equal to", "superior or equal to",
         "bigger or equal to", "over or equal to", "surpassing or equal to"]
    ]
    # lower-case phrase -> operator symbol
    operators = {
        "less than or equal to": "<=",
        "less or equal to": "<=",
        "smaller than or equal to": "<=",
        "smaller or equal to": "<=",
        "lower than or equal to": "<=",
        "lower or equal to": "<=",
        "inferior to or equal to": "<=",
        "inferior or equal to": "<=",
        "greater than or equal to": ">=",
        "greater or equal to": ">=",
        "more than or equal to": ">=",
        "more or equal to": ">=",
        "higher than or equal to": ">=",
        "higher or equal to": ">=",
        "above than or equal to": ">=",
        "above or equal to": ">=",
        "larger than or equal to": ">=",
        "larger or equal to": ">=",
        "superior to or equal to": ">=",
        "superior or equal to": ">=",
        "bigger or equal to": ">=",
        "over or equal to": ">=",
        "lesser or equal to": "<=",
        "surpassing or equal to": ">="
    }
    found = []
    for variations in comparison_phrases:
        pattern = r"\b(" + "|".join([re.escape(v) for v in variations]) + r")\b"
        for match in re.findall(pattern, sentence, re.IGNORECASE):
            # BUG FIX: the regex is case-insensitive but the operators lookup was
            # case-sensitive, so e.g. "Less than or equal to" raised a KeyError
            found.append({'comparative': (match, operators[match.lower()])})
    # equivalent to the original chunk-then-flatten construction: one dict per hit
    return found
def check_substrings(lst):
    """
    This function checks all the elements of a list and if any substring exist in any other element it returns a list of tuples
    where the first element is the substring and the second the string that contains the substring
    """
    pairs = []
    for i, inner in enumerate(lst):
        for j, outer in enumerate(lst):
            # compare every ordered pair of distinct entries
            if i != j and inner['comparative'][0] in outer['comparative'][0]:
                pairs.append((inner, outer))
    return pairs
def identify_comparatives(sentence):
    """
    Combine the results of all the detection techniques above (symbolic,
    pattern-based, WordNet-based, embedding-based) to identify bigger-than,
    smaller-than and equal-to patterns, then deduplicate overlapping captures.

    Returns a list of {'comparative': value} dicts.

    NOTE(review): the deduplication below indexes item['comparative'][0] and
    compares lengths, which assumes each comparative value is a sequence;
    detectors that return bare symbol strings like '>' are keyed by their first
    character here — confirm the intended element shape.
    """
    # first identify the double symbols (<= >= ==)
    identify_double_symbols_initial = identify_double_symbol_comparisons(sentence)
    # this is because (for example) bigger than is a subset of bigger or equal than (and it returns conflicts)
    if identify_double_symbols_initial:
        for elem in identify_double_symbols_initial:
            # blank out the matched phrase so shorter patterns cannot re-match inside it
            sentence = sentence.replace(elem['comparative'][0], " ")
    identify_double_symbols = []
    for item in identify_double_symbols_initial:
        for k, v in item.items():
            if isinstance(v, tuple):
                # keep only the operator part of the (phrase, operator) tuple
                item[k] = v[1]
        identify_double_symbols.append(item)
    # Identify straightforward patterns
    straight_comptives = find_comptives_straight_patterns(sentence)
    # Identify advanced bigger/smaller comparatives
    bigger_smaller_comparatives = identify_bigger_smaller_advanced(sentence)
    # Identify advanced equal-to comparatives
    equal_to_comparatives = find_equal_to_comptives_ngrams(sentence)
    single_verb = single_verb_comptives(sentence)
    multi_verb = multiword_verb_comptives(sentence)
    # return all the patterns that were captured
    comparatives = straight_comptives + bigger_smaller_comparatives + equal_to_comparatives + single_verb + multi_verb + identify_double_symbols
    # since those different techniques might capture similar patterns, we keep only unique references. More precisely
    # we discard any duplicate reference and any reference that exists as a substring of another reference
    # sort the list by length of the comparatives, in ascending order (shortest first)
    comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=False)
    unique_comparatives = {}
    for i, item in enumerate(comparatives):
        comparative = item['comparative'][0]
        # check if the comparative is already in the dictionary or a substring/similar string of an existing comparative
        is_unique = True
        for existing_comp in unique_comparatives:
            if (comparative in existing_comp) or (existing_comp in comparative):
                is_unique = False
                break
        if is_unique:
            unique_comparatives[comparative] = item
        elif i == len(comparatives) - 1:
            # if it's the last item and it's not unique, replace the first unique item in the list with this item
            # NOTE(review): this pops from unique_comparatives while iterating its
            # values; the immediate break avoids a RuntimeError but the pattern is fragile
            for j, existing_item in enumerate(unique_comparatives.values()):
                if (existing_item['comparative'][0] in comparative) or (comparative in existing_item['comparative'][0]):
                    unique_comparatives.pop(list(unique_comparatives.keys())[j])
                    unique_comparatives[comparative] = item
                    break
    unique_output = list(unique_comparatives.values())
    clean_unique_output = []
    # this snippet is to handle the extra cases of smaller than or equal to etc
    # in case a reference of eg "smaller than" is found by the previous modules, while also a reference of "smaller than or equal to"
    # then the snippet checks whether the "smaller than" reference exists only as a substring of "smaller than or equal to" or if it
    # also exists as a separate, standalone reference on the initial sentence (in which case it is kept, otherwise it is dismissed)
    if len(unique_output) > 1:
        list_of_tuples = check_substrings(unique_output)
        for elem in list_of_tuples:
            dupl_sent = sentence
            # remove the containing phrase, then see if the contained one still occurs on its own
            dupl_sent = dupl_sent.replace(elem[1]['comparative'][0], " ")
            clean_unique_output.append(elem[1])
            if elem[0]['comparative'][0] in dupl_sent:
                clean_unique_output.append(elem[0])
    if clean_unique_output:
        return clean_unique_output
    else:
        return unique_output
def comparatives_binding(sentence):
    """
    Top-level entry point: combine the symbol detector and the textual
    detectors and return a single comparative reference, or an error triple.

    Returns
    -------
    dict | tuple
        A single {'comparative': value} dict when exactly one reference was
        found, otherwise a (0, "COMPARATIVES", reason) triple with reason in
        {"more_comparatives_mentions", "no_comparatives",
         "more_symbol_comparatives", "unknown_error"}.
    """
    try:
        comparative_symbols = find_comptives_symbols(sentence)
        comparative_mentions = identify_comparatives(sentence)
        # starting with the symbols, if exactly one was captured
        if len(comparative_symbols) == 1:
            # a symbol is accepted only when no textual mention competes with it
            if len(comparative_mentions) == 0:
                return comparative_symbols[0]
            return (0, "COMPARATIVES", "more_comparatives_mentions")
        # in case there is no symbol, we need exactly one textual mention
        elif len(comparative_symbols) == 0:
            if len(comparative_mentions) == 1:
                return comparative_mentions[0]
            elif len(comparative_mentions) == 0:
                return (0, "COMPARATIVES", "no_comparatives")
            return (0, "COMPARATIVES", "more_comparatives_mentions")
        # case of multiple symbol references
        else:
            return (0, "COMPARATIVES", "more_symbol_comparatives")
    except Exception:
        # narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit propagate
        return (0, "COMPARATIVES", "unknown_error")