KoichiYasuoka committed
Commit fb432f9 (parent: aa2ea32)

preprocess improved

Files changed (1): ud.py (+11 -12)
ud.py CHANGED
```diff
@@ -2,19 +2,18 @@ from transformers import TokenClassificationPipeline
 
 class UniversalDependenciesPipeline(TokenClassificationPipeline):
   def preprocess(self,sentence,offset_mapping=None):
+    import torch
     from tokenizers.pre_tokenizers import Whitespace
-    t=[]
-    for k,(s,e) in Whitespace().pre_tokenize_str(sentence):
-      if t==[]:
-        t.append((k,(s,e)))
+    v=Whitespace().pre_tokenize_str(sentence)
+    t=[v[0]]
+    for k,(s,e) in v[1:]:
+      j=t[-1][0]+"_"+k
+      if self.tokenizer.convert_tokens_to_ids(j)!=self.tokenizer.unk_token_id:
+        t[-1]=(j,(t[-1][1][0],e))
       else:
-        j=t[-1][0]+"_"+k
-        if self.tokenizer.convert_tokens_to_ids(j)!=self.tokenizer.unk_token_id:
-          t[-1]=(j,(t[-1][1][0],e))
-        else:
-          t.append((k,(s,e)))
-    r=super().preprocess(sentence=" ".join(i for i,j in t))
+        t.append((k,(s,e)))
     m=[(0,0)]+[j for i,j in t]+[(0,0)]
+    r=super().preprocess(sentence=" ".join(i for i,j in t))
     w=self.tokenizer.convert_ids_to_tokens(r["input_ids"][0])
     if len(m)!=len(w):
       for i,j in enumerate(w):
@@ -22,7 +21,7 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
         s,e=m[i]
         m.insert(i+1,(s+len(j)-2,e))
         m[i]=(s,s+len(j)-2)
-    r["offset_mapping"]=m
+    r["offset_mapping"]=torch.tensor([m])
     r["sentence"]=sentence
     return r
   def _forward(self,model_inputs):
@@ -49,7 +48,7 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
       k,h=z[numpy.nanargmax(m[z,z])],numpy.nanmin(m)-numpy.nanmax(m)
       m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
       h=self.chu_liu_edmonds(m)
-    v=[(s,e) for s,e in model_outputs["offset_mapping"] if s<e]
+    v=[(s,e) for s,e in model_outputs["offset_mapping"][0].tolist() if s<e]
     q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
     g="aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none"
     if g:
```
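For context on the first hunk: the reworked `preprocess` seeds the word list with the first `Whitespace` pre-token (so it now assumes the sentence contains at least one word, where the old loop also handled the empty case) and then greedily glues each following word onto its predecessor whenever the underscore-joined compound is a known vocabulary item, i.e. `convert_tokens_to_ids` does not return `unk_token_id`. A minimal sketch of that merge, with a plain set standing in for the tokenizer vocabulary; `merge_words` and `vocab` are illustrative names, not part of ud.py:

```python
from tokenizers.pre_tokenizers import Whitespace

def merge_words(sentence,vocab):
  v=Whitespace().pre_tokenize_str(sentence)  # [(word,(start,end)),...]
  t=[v[0]]                                   # assumes at least one word
  for k,(s,e) in v[1:]:
    j=t[-1][0]+"_"+k                         # candidate compound token
    if j in vocab:                           # stand-in for convert_tokens_to_ids(j)!=unk_token_id
      t[-1]=(j,(t[-1][1][0],e))              # extend the previous word's span
    else:
      t.append((k,(s,e)))                    # start a new word
  return t

print(merge_words("New York is big",{"New_York"}))
# [('New_York', (0, 8)), ('is', (9, 11)), ('big', (12, 15))]
```

Because the merge is greedy and left-to-right, each word is only tried against the already-merged entry to its left; an overlapping compound starting further right is never reconsidered.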
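The second and third hunks belong together: `preprocess` now stores the offset mapping as `torch.tensor([m])`, a tensor with a leading batch axis like the tensors the tokenizer itself returns, so `postprocess` has to peel that axis off with `[0]` and convert back to plain Python lists before discarding the `(0,0)` spans of the special tokens. A small round-trip sketch under that reading; the toy `m` is invented for illustration:

```python
import torch

m=[(0,0),(0,8),(9,11),(12,15),(0,0)]  # per-token spans incl. the two special tokens
offset_mapping=torch.tensor([m])      # leading batch axis, as preprocess now stores it
assert offset_mapping.shape==(1,5,2)
v=[(s,e) for s,e in offset_mapping[0].tolist() if s<e]
print(v)                              # [(0, 8), (9, 11), (12, 15)]
```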
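The unchanged context lines in the last hunk are the multi-root repair around `chu_liu_edmonds`: among the nodes `z` whose best head is themselves, only the strongest candidate `k` keeps its self-loop; the other self-loops, and all arcs into the `z` columns from rows outside `z`, are shifted by `h=nanmin-nanmax`, which lands them at or below the global minimum before the parse is recomputed. A toy score matrix to make the masking concrete; the 4x4 `m` and `z=[1,3]` are invented for illustration:

```python
import numpy

# Invented arc-score matrix with two self-attached nodes (1 and 3),
# mimicking the state in which postprocess finds more than one root.
m=numpy.array([[0.0,0.0,0.0,0.0],
               [0.1,0.9,0.2,0.1],   # node 1 scores itself highest
               [0.1,0.8,0.3,0.2],
               [0.2,0.1,0.3,0.8]])  # node 3 does too: a second root
z=[1,3]
k=z[numpy.nanargmax(m[z,z])]        # strongest root candidate: node 1
h=numpy.nanmin(m)-numpy.nanmax(m)   # penalty, here -0.9
m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
print(m[1,1],m[3,3])                # ~0.9 and ~-0.1: only k keeps its self-loop
```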