ValadisCERTH commited on
Commit
e60af45
·
1 Parent(s): 4d76748

Create earthquaqeIdentification

Browse files
Files changed (1) hide show
  1. earthquaqeIdentification +142 -0
earthquaqeIdentification ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import numpy as np
3
+
4
# Shared spaCy pipeline; its word vectors back every similarity check below.
# NOTE(review): the original comment called these "GloVe embeddings" - the
# actual vector source depends on the installed en_core_web_lg version.
nlp = spacy.load("en_core_web_lg")

# Single-token earthquake vocabulary (singular forms first, then plurals)
earthquake_single_keywords = [
    'earthquake', 'seismic', 'tremor', 'quake', 'aftershock', 'seismology',
    'tectonic', 'plate', 'seismometer', 'temblor', 'trembler', 'seism',
    'shock', 'vibration', 'shake', 'groundswell',
    'earthquakes', 'seismics', 'tremors', 'quakes', 'aftershocks',
    'seismologies', 'tectonics', 'plates', 'seismometers', 'temblors',
    'tremblers', 'seisms', 'shocks', 'vibrations', 'shakes', 'groundswells',
]

# One reference vector per single-token keyword, in the same order as the list
earthquake_single_embeddings = [
    nlp(term).vector for term in earthquake_single_keywords
]

# Two-word earthquake vocabulary (singular forms first, then plurals)
earthquake_multi_keywords = [
    'seismic activity', 'earthquake risk', 'earthquake zone', 'seismic wave',
    'earthquake damage', 'seismic shift', 'tectonic plate', 'fault line',
    'seismic retrofitting', 'seismic hazard', 'aftershock sequence',
    'earthquake drill', 'seismic reflection', 'plate tectonic',
    'seismic tomography', 'seismic profiling', 'seismicity pattern',
    'earthquake swarm', 'seismic gap', 'seismic inversion',
    'seismic reflection', 'seismic scattering', 'seismic attenuation',
    'seismic imaging', 'seismic map', 'seismic data', 'earthquake monitoring',
    'earth shaking',
    'seismic activities', 'earthquake risks', 'earthquake zones',
    'seismic waves', 'earthquake damages', 'seismic shifts',
    'tectonic plates', 'fault lines', 'seismic retrofittings',
    'seismic hazards', 'aftershock sequences', 'earthquake drills',
    'seismic reflections', 'plate tectonics', 'seismic tomographies',
    'seismic profilings', 'seismicity patterns', 'earthquake swarms',
    'seismic gaps', 'seismic inversions', 'seismic reflections',
    'seismic scatterings', 'seismic attenuations', 'seismic imagings',
    'seismic maps', 'earth shakings',
]

# A multi-word keyword is represented by the mean of the vectors of its
# whitespace-separated words, each embedded on its own.
earthquake_multi_embeddings = [
    np.mean([nlp(part).vector for part in phrase.split()], axis=0)
    for phrase in earthquake_multi_keywords
]
22
+
23
def straight_pattern_matching(ngram):
    """
    Look an n-gram up verbatim in the predefined earthquake keyword lists.

    The comparison is an exact, case-sensitive membership test against
    ``earthquake_single_keywords`` and ``earthquake_multi_keywords``; no
    embeddings are involved.

    Returns the n-gram itself when it is listed, otherwise ``False``.
    """
    is_listed = any(
        ngram in vocabulary
        for vocabulary in (earthquake_single_keywords, earthquake_multi_keywords)
    )
    return ngram if is_listed else False
31
+
32
+
33
+ # Define a function to compute the semantic similarity between a word and a set of embeddings
34
def compute_similarity_earthquake(word, embeddings, excluded_keywords, threshold=0.7):
    """
    Check whether a word (or short phrase) is semantically close to any of
    the given reference embeddings.

    Args:
        word: text to embed with the module-level ``nlp`` pipeline.
        embeddings: iterable of reference vectors to compare against.
        excluded_keywords: container of terms that must never match.
        threshold: cosine similarity that must be strictly exceeded for a
            match; defaults to the previously hard-coded 0.7.

    Returns:
        ``word`` when the best cosine similarity exceeds ``threshold``,
        otherwise ``False``.
    """
    # Excluded terms are rejected before any embedding work is done
    if word in excluded_keywords:
        return False

    word_emb = nlp(word).vector
    word_norm = np.linalg.norm(word_emb)

    # Text without a usable vector comes back as all zeros; cosine similarity
    # is undefined there, so report "no match" instead of dividing by zero.
    if word_norm == 0:
        return False

    best_score = None
    for emb in embeddings:
        emb_norm = np.linalg.norm(emb)
        # Skip zero-norm references for the same reason: they would yield
        # NaN, and a NaN can hide a genuine match inside max().
        if emb_norm == 0:
            continue
        score = np.dot(word_emb, emb) / (word_norm * emb_norm)
        if best_score is None or score > best_score:
            best_score = score

    # best_score stays None when there was nothing valid to compare against
    if best_score is not None and best_score > threshold:
        return word

    return False
55
+
56
+
57
+ def identify_earthquake_event(input_sentence):
58
+ """
59
+ Compute the semantic similarity for earthquaqe events
60
+ """
61
+
62
+ try:
63
+
64
+ # Define excluded keywords to ignore (because cases like I want bars with magnituted 6 - were given as correct)
65
+ excluded_keywords = ['magnitude', 'richter', 'moment', 'scale', 'intensity', 'amplitude', 'energy', 'force',
66
+ 'power', 'seismicity', 'event',
67
+ 'magnitudes', 'richters', 'moments', 'scales', 'intensities', 'amplitudes', 'energies',
68
+ 'forces', 'powers', 'seismicities', 'events']
69
+
70
+ parsed_sentence = nlp(input_sentence)
71
+
72
+ # start with simple straight pattern matching of single keywords
73
+ for word in parsed_sentence:
74
+ if word.text not in excluded_keywords:
75
+ straight_matching_single = straight_pattern_matching(word.text)
76
+
77
+ if straight_matching_single:
78
+ # return {'earthquaqe_event': [True, straight_matching_single]}
79
+ return {"event": "earthquake"}
80
+
81
+ # Continue with embeddings single matching
82
+ earthquaqe_keywords_single = []
83
+
84
+ # Check for single-word earthquake-related keywords
85
+ earthquaqe_keywords_single = [
86
+ compute_similarity_earthquake(word.text.lower(), earthquake_single_embeddings, excluded_keywords) for word
87
+ in parsed_sentence]
88
+
89
+ single_keyword_flag = False
90
+
91
+ # check until you find one such reference and then break
92
+ for elem in earthquaqe_keywords_single:
93
+ if elem:
94
+ single_keyword_flag = True
95
+ target_elem_single = elem
96
+ break
97
+
98
+ # if there is at least one referece, we can assume that the sentence refers to earthquaqe events
99
+ if single_keyword_flag:
100
+ # return {'earthquaqe_event': [True, target_elem_single]}
101
+ return {"event":"earthquake"}
102
+
103
+ # otherwise we examine for 2grams multi-word straight patterns and embeddings
104
+ earthquaqe_keywords_multi = []
105
+
106
+ # check 2-grams
107
+ for i in range(len(parsed_sentence) - 1):
108
+ bigram = parsed_sentence[i:i + 2].text.lower()
109
+
110
+ # case of straight matching
111
+ straight_matching_multi = straight_pattern_matching(word.text)
112
+
113
+ if straight_matching_multi:
114
+ # return {'earthquaqe_event': [True, straight_matching_multi]}
115
+ return {"event": "earthquake"}
116
+
117
+ # if no straight matching then perform embeddings
118
+ else:
119
+ earthquaqe_keywords_multi.append(
120
+ compute_similarity_earthquake(bigram, earthquake_multi_embeddings, excluded_keywords))
121
+
122
+ # case that the straight multi matching did not give any output
123
+ multi_keyword_flag = False
124
+
125
+ # check until you find one such reference and then break
126
+ for elem in earthquaqe_keywords_multi:
127
+ if elem:
128
+ multi_keyword_flag = True
129
+ target_elem_multi = elem
130
+ break
131
+
132
+ # if there is at least one referece, we can assume that the sentence refers to earthquaqe events
133
+ if multi_keyword_flag:
134
+ # return {'earthquaqe_event': [True, target_elem_multi]}
135
+ return {"event":"earthquake"}
136
+
137
+ # otherwise there is no reference
138
+ else:
139
+ return (0, 'EARTHQUAKE_EVENT', 'no_earthquake_reference')
140
+
141
+ except:
142
+ return (0, 'EARTHQUAKE_EVENT', 'unknown_error')