KoichiYasuoka committed
Commit aa2ea32
1 Parent(s): 92f0b14

multi-syllable words

Files changed (1)
  1. maker.py +16 -30
maker.py CHANGED
@@ -21,36 +21,22 @@ class UDgoeswithDataset(object):
         if len(t)==10 and t[0].isdecimal():
           c.append(t)
         elif c!=[]:
-          d=list(c)
-          v=tokenizer([t[1].replace(" ","_") for t in c],add_special_tokens=False)["input_ids"]
-          for i in range(len(v)-1,-1,-1):
-            for j in range(1,len(v[i])):
-              c.insert(i+1,[c[i][0],"_","_","X","_","_",c[i][0],"goeswith","_","_"])
-          y=["0"]+[t[0] for t in c]
-          h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)]
-          p,v=[t[3]+"|"+t[5]+"|"+t[7] for t in c],sum(v,[])
-          if len(v)<tokenizer.model_max_length-3:
-            self.ids.append([cls]+v+[sep])
-            self.tags.append([dep]+p+[dep])
-            label=set(sum([self.tags[-1],list(label)],[]))
-            for i,k in enumerate(v):
-              self.ids.append([cls]+v[0:i]+[msk]+v[i+1:]+[sep,k])
-              self.tags.append([dep]+[t if h[j]==i+1 else dep for j,t in enumerate(p)]+[dep,dep])
-          c=d
-          v=tokenizer([t[1].replace("_"," ") for t in c],add_special_tokens=False)["input_ids"]
-          for i in range(len(v)-1,-1,-1):
-            for j in range(1,len(v[i])):
-              c.insert(i+1,[c[i][0],"_","_","X","_","_",c[i][0],"goeswith","_","_"])
-          y=["0"]+[t[0] for t in c]
-          h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)]
-          p,v=[t[3]+"|"+t[5]+"|"+t[7] for t in c],sum(v,[])
-          if len(v)<tokenizer.model_max_length-3:
-            self.ids.append([cls]+v+[sep])
-            self.tags.append([dep]+p+[dep])
-            label=set(sum([self.tags[-1],list(label)],[]))
-            for i,k in enumerate(v):
-              self.ids.append([cls]+v[0:i]+[msk]+v[i+1:]+[sep,k])
-              self.tags.append([dep]+[t if h[j]==i+1 else dep for j,t in enumerate(p)]+[dep,dep])
+          for x in [lambda i:i.replace(" ","_"),lambda i:i.replace("_"," ")]:
+            d=list(c)
+            v=tokenizer([x(t[1]) for t in d],add_special_tokens=False)["input_ids"]
+            for i in range(len(v)-1,-1,-1):
+              for j in range(1,len(v[i])):
+                d.insert(i+1,[d[i][0],"_","_","X","_","_",d[i][0],"goeswith","_","_"])
+            y=["0"]+[t[0] for t in d]
+            h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(d,1)]
+            p,v=[t[3]+"|"+t[5]+"|"+t[7] for t in d],sum(v,[])
+            if len(v)<tokenizer.model_max_length-3:
+              self.ids.append([cls]+v+[sep])
+              self.tags.append([dep]+p+[dep])
+              label=set(sum([self.tags[-1],list(label)],[]))
+              for i,k in enumerate(v):
+                self.ids.append([cls]+v[0:i]+[msk]+v[i+1:]+[sep,k])
+                self.tags.append([dep]+[t if h[j]==i+1 else dep for j,t in enumerate(p)]+[dep,dep])
           c=[]
     self.label2id={l:i for i,l in enumerate(sorted(label))}
   def __call__(*args):
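
The refactor above folds the two near-identical passes of the old code (one tokenizing each FORM with spaces turned into underscores, the other with underscores turned into spaces) into a single loop over two lambdas, inserting a "goeswith" placeholder row for every extra subword that a multi-syllable word produces. The following is a minimal, self-contained sketch of that pattern, not the committed maker.py itself: toy_tokenize, expand_goeswith, the fake ids and the sample CoNLL-U row are invented stand-ins for the real Hugging Face tokenizer and treebank used by the script.

# Minimal sketch of the refactored block, under invented assumptions:
# toy_tokenize stands in for tokenizer(...)["input_ids"], and the sample
# row below replaces a real CoNLL-U sentence.

def toy_tokenize(forms):
  # pretend each whitespace-separated piece of a FORM is one subword id
  return [[len(w) for w in f.split()] or [0] for f in forms]

def expand_goeswith(c,normalize,tokenize):
  # c is a list of 10-column CoNLL-U rows; whenever a FORM tokenizes into
  # n>1 subwords, insert n-1 "goeswith" placeholder rows right after it
  d=list(c)
  v=tokenize([normalize(t[1]) for t in d])
  for i in range(len(v)-1,-1,-1):
    for j in range(1,len(v[i])):
      d.insert(i+1,[d[i][0],"_","_","X","_","_",d[i][0],"goeswith","_","_"])
  return d,sum(v,[])

# one toy sentence whose only word is written with an internal space
c=[["1","New York","New York","PROPN","_","_","0","root","_","_"]]
for x in [lambda i:i.replace(" ","_"),lambda i:i.replace("_"," ")]:
  d,ids=expand_goeswith(c,x,toy_tokenize)
  print([t[1] for t in d],[t[7] for t in d],ids)

In the committed code both passes feed self.ids and self.tags, so each sentence contributes training examples for both the underscore-joined and the space-separated spelling of its multi-syllable words.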