2022年4月26日 星期二

Data cleaning

def extract_words(text):
    """Return the sorted, de-duplicated words of more than one letter in *text*.

    The characters ``( ) : . _``, tabs and line breaks act as word
    separators.  Of what remains, only ASCII letters and spaces are kept;
    every other character (digits, commas, quotes, brackets, non-ASCII
    text, ...) is deleted outright, so ``"x1"`` becomes ``"x"`` and
    ``"a,b"`` becomes ``"ab"``.  Case is preserved, and the result is
    ordered by code point (all uppercase letters sort before lowercase).

    Args:
        text: The raw text to clean.

    Returns:
        A sorted list of the unique words whose length is greater than 1.
    """
    # Line breaks are separators too: deleting them would glue the last
    # word of one line onto the first word of the next.
    separators = '():._\t\n\r'
    text = text.translate(str.maketrans(separators, ' ' * len(separators)))

    # Explicit a-z / A-Z ranges: a single 'A'..'z' comparison would also let
    # through the punctuation that sits between 'Z' and 'a' in ASCII.
    kept = ''.join(
        ch for ch in text
        if ch == ' ' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'
    )

    # split() with no argument collapses runs of spaces, so no manual
    # double-space squeezing is needed.
    return sorted({word for word in kept.split() if len(word) > 1})


if __name__ == '__main__':
    # The context manager closes the file even if reading raises.
    with open('test2.txt', 'r', encoding='utf-8') as f:
        d = extract_words(f.read())

    # One word per line, followed by the number of distinct words.
    for word in d:
        print(word)
    print(len(d))