lingbionlp committed on
Commit
ec0f536
·
1 Parent(s): 5d94a6e

Upload dic_ner.py

Browse files
Files changed (1) hide show
  1. src/dic_ner.py +165 -0
src/dic_ner.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Fri Jun 12 15:05:00 2020
4
+
5
+ @author: luol2
6
+ """
7
+ import sys
8
+ import json
9
+ import io
10
+ from src.split_tokenzier import ssplit_token_pos_lemma
11
class Trie(object):
    """Prefix tree over dictionary terms, used for substring matching.

    ``match`` scans a query string and reports every stored term that
    occurs as a substring, as ``(start_index, term_length)`` pairs.
    """

    class Node(object):
        """One trie node.

        ``term`` holds the full term ending at this node (None for
        non-terminal nodes); ``next`` maps a character to a child Node.
        """

        def __init__(self):
            self.term = None  # complete term that ends here, if any
            self.next = {}    # char -> child Node

    def __init__(self, terms=()):
        """Build a trie from an iterable of term strings.

        The default is an immutable empty tuple rather than a mutable
        list (``terms=[]``), avoiding the shared-mutable-default pitfall
        while remaining backward compatible for all callers.
        """
        self.root = Trie.Node()
        for term in terms:
            self.add(term)

    def add(self, term):
        """Insert one term into the trie, creating nodes as needed."""
        node = self.root
        for char in term:
            if char not in node.next:
                node.next[char] = Trie.Node()
            node = node.next[char]
        node.term = term

    def match(self, query):
        """Return ``(start, length)`` for every stored term found in *query*.

        For each starting offset the walk extends greedily and records
        every terminal node passed, so overlapping and nested matches are
        all reported.
        """
        results = []
        for i in range(len(query)):
            node = self.root
            for j in range(i, len(query)):
                node = node.next.get(query[j])
                if not node:
                    break
                if node.term:
                    results.append((i, len(node.term)))
        return results

    def __repr__(self):
        """Return an indented dump of the trie, one node per line."""
        output = []

        def _debug(output, char, node, depth=0):
            output.append('%s[%s][%s]' % (' ' * depth, char, node.term))
            for (key, n) in node.next.items():
                _debug(output, key, n, depth + 1)

        _debug(output, '', self.root)
        return '\n'.join(output)
51
class dic_ont():
    """Dictionary-based ontology matcher.

    Loads a term dictionary into a Trie plus two JSON lookup tables
    (term -> ontology id, ontology id -> term), then tags sentences with
    dictionary matches via ``matching``.
    """

    def __init__(self, ont_files):
        """Load dictionary and mapping files.

        ont_files: dict with keys 'dic_file' (one term per line),
        'word_id_file' (JSON term->id-list map) and 'id_word_file'
        (JSON id->term map).
        """
        win_size = 50000  # skip pathologically long dictionary entries
        Dic = []
        print("loading dict!")
        # Context managers ensure the handles are closed even if a read
        # or JSON parse raises (the original leaked on exceptions).
        with open(ont_files['dic_file'], 'r', encoding='utf-8') as dicin:
            for line in dicin:
                line = line.strip()
                if len(line.split()) <= win_size:
                    words = line.split()
                    for i in range(len(words)):
                        # Lowercase longer tokens, but preserve all-caps
                        # tokens (treated as acronyms).
                        if len(words[i]) > 3 and (not words[i].isupper()):
                            words[i] = words[i].lower()
                    line = ' '.join(words)
                    Dic.append(line.strip())
        print("Dic_len:", len(Dic))

        self.dic_trie = Trie(Dic)
        print("load dic done!")

        # load word -> ontology-id mapping
        with open(ont_files['word_id_file'], 'r', encoding='utf-8') as fin_map:
            self.word_id = json.load(fin_map)

        # load ontology-id -> word mapping
        with open(ont_files['id_word_file'], 'r', encoding='utf-8') as fin_map:
            self.id_word = json.load(fin_map)

    def matching(self, source):
        """Tag dictionary matches in tokenized text.

        *source* is expected to be tokenizer output: one token per line
        as "original<TAB>lemma[...]", with a blank line terminating each
        sentence (sentences not followed by a blank line are discarded —
        presumably the upstream tokenizer always emits one; TODO confirm).

        Returns a string: for each sentence, the original text on one
        line, then one "sid<TAB>eid<TAB>entity<TAB>ontid<TAB>1.00" line
        per match, then a blank line.
        """
        fin = io.StringIO(source)
        fout = io.StringIO()

        sent_list = []
        sent = []
        sent_ori_list = []
        sent_ori = []

        for line in fin:
            line = line.strip()
            if line == "":
                # blank line = sentence boundary
                sent_list.append(sent)
                sent_ori_list.append(sent_ori)
                sent = []
                sent_ori = []
            else:
                words = line.split('\t')
                words[1] = words[1].lower()
                sent.append(words[1])      # lowercased lemma, used for matching
                sent_ori.append(words[0])  # original surface form, used for output
        sent = []
        fin.close()

        for k in range(len(sent_list)):
            sent = sent_list[k]
            # Trailing space lets the boundary checks below index one
            # position past the last word without special-casing.
            sentence = ' '.join(sent) + " "
            sentence_ori = ' '.join(sent_ori_list[k])
            result = self.dic_trie.match(sentence)
            new_result = []
            for i in range(0, len(result)):
                # Keep only matches aligned on token boundaries:
                # preceded by start-of-sentence or a space, and followed
                # by a space.
                if result[i][0] == 0 and sentence[result[i][1]] == " ":
                    new_result.append([result[i][0], result[i][0] + result[i][1]])
                elif result[i][0] > 0 and sentence[result[i][0] - 1] == ' ' and sentence[result[i][0] + result[i][1]] == ' ':
                    new_result.append([result[i][0], result[i][0] + result[i][1]])

            if len(new_result) == 0:
                fout.write(sentence_ori + '\n\n')
            else:
                fout.write(sentence_ori + '\n')
                for ele in new_result:
                    entity_text = sentence[ele[0]:ele[1]]
                    if entity_text in self.word_id.keys():
                        ontid = self.word_id[entity_text]
                    else:
                        print('no id:', entity_text)
                        ontid = ['None']
                    # sid/eid are word offsets, derived by counting the
                    # space-separated words before the match boundaries.
                    if ele[0] == 0:
                        sid = "0"
                    else:
                        temp_sent = sentence[0:ele[0]]
                        sid = str(len(temp_sent.rstrip().split(' ')))
                    temp_sent = sentence[0:ele[1]]
                    eid = str(len(temp_sent.rstrip().split(' ')) - 1)
                    fout.write(sid + '\t' + eid + '\t' + entity_text + '\t' + ontid[0] + '\t1.00\n')
                fout.write('\n')

        return fout.getvalue()
151
+
152
+
153
if __name__ == '__main__':

    # BUGFIX: dic_ont.__init__ reads the keys 'dic_file', 'word_id_file'
    # and 'id_word_file'; the original script passed 'word_hpo_file' /
    # 'hpo_word_file', which raised KeyError before anything ran.
    ontfiles = {'dic_file': '//panfs/pan1/bionlp/lulab/luoling/HPO_project/bioTag/dict/hpo_noabb_lemma.dic',
                'word_id_file': '//panfs/pan1/bionlp/lulab/luoling/HPO_project/bioTag/dict/word_ontid_map.json',
                'id_word_file': '//panfs/pan1/bionlp/lulab/luoling/HPO_project/bioTag/dict/ontid_word_map.json'}
    biotag_dic = dic_ont(ontfiles)
    text = 'Nevoid basal cell carcinoma syndrome (NBCCS) is a hereditary condition transmitted as an autosomal dominant trait with complete penetrance and variable expressivity. The syndrome is characterised by numerous basal cell carcinomas (BCCs), odontogenic keratocysts of the jaws, palmar and/or plantar pits, skeletal abnormalities and intracranial calcifications. In this paper, the clinical features of 37 Italian patients are reviewed. Jaw cysts and calcification of falx cerebri were the most frequently observed anomalies, followed by BCCs and palmar/plantar pits. Similar to the case of African Americans, the relatively low frequency of BCCs in the Italian population is probably due to protective skin pigmentation. A future search based on mutation screening might establish a possible genotype phenotype correlation in Italian patients.'
    # tokenize/lemmatize, then run dictionary matching and print the tags
    ssplit_token = ssplit_token_pos_lemma(text)
    dic_result = biotag_dic.matching(ssplit_token)
    print(dic_result)