import functools
import re
import string

import opencc


全型2半型= str.maketrans( |
|
' 0123456789' |
|
'abcdefghijklmnopqrstuvwxyz' |
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ' |
|
'!゛#$%&()*+、ー。/:;〈=〉?@[]^_‘{|}~', |
|
' 0123456789' |
|
'abcdefghijklmnopqrstuvwxyz' |
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ' |
|
'!"#$%&()*+,-./:;<=>?@[]^_`{|}~' |
|
) |
|
|
|
def 把怪字修進unicode(xStr): |
|
xStr= re.sub('\uf5c3','𪜶', xStr) |
|
return xStr |
|
|
|
def ryNormText(s): |
|
""" |
|
<<<None>>> ==> 刪除 |
|
標點 ==> 空白 |
|
連續空白 ==> 1個空白 |
|
簡繁 |
|
""" |
|
|
|
punc1= string.punctuation |
|
punc1 |
|
punc2= '。,﹐、!?::;『』「」…,\n' |
|
|
|
punc= f"[{punc1}{punc2}]" |
|
|
|
|
|
s= re.sub('<<<None>>>','',s) |
|
|
|
|
|
s= re.sub(punc,' ',s) |
|
|
|
|
|
|
|
s= re.sub('[ ]+',' ',s) |
|
|
|
|
|
|
|
s= re.sub(' ','',s) |
|
|
|
s= 把怪字修進unicode(s) |
|
|
|
|
|
s= opencc.OpenCC('s2tw').convert(s) |
|
|
|
|
|
return s |
|
|
|
import unicodedata |
|
import re |
|
|
|
|
|
def separ_char_word(inputString= '我是呂仁園 Renyuan Lyu'): |
|
|
|
inputString= 把怪字修進unicode(inputString) |
|
|
|
y= '' |
|
for x in inputString: |
|
y += x |
|
try: |
|
un= unicodedata.name(x) |
|
if un.startswith('CJK'): |
|
y += ' ' |
|
else: |
|
pass |
|
except Exception as ex: |
|
y = ' '+y+' ' |
|
print(f'ryErr:(def 中英分開:){ex= }\t【{x= }】\t{inputString= }') |
|
|
|
y= re.sub('[ ]+',' ', y) |
|
return y |
|
|
|
|
|
|
|
|