def createData():
    """Build shuffled training and testing feature sets from three word lists.

    Words come from NLTK's ``cess_esp`` corpus (labelled 'Spanish'), NLTK's
    ``brown`` corpus (labelled 'English') and ``jeita`` (labelled
    'Japanese'). Each word is turned into features with
    ``createTupleDict(word, numChars)`` and paired with its label.

    ``createTupleDict``, ``numChars`` and ``jeita`` are not defined in this
    function; they are read from the enclosing scope.

    Every word list is de-duplicated *after* normalisation (lower-casing /
    ``unidecode``). De-duplicating before normalisation would let variants
    such as "The" and "the" produce identical samples that may end up on
    both sides of the train/test split.

    NOTE: the three classes are used at their natural sizes; no balancing
    between languages is performed.

    Returns:
        tuple: ``(training_set, testing_set)`` -- the first 80% and the
        remaining 20% of the shuffled list of ``(features, label)`` pairs.
    """
    # Length filter is applied to the raw corpus token; the set removes
    # duplicates of the normalised form.
    spwords = {
        unidecode(w.lower())
        for w in nltk.corpus.cess_esp.words()
        if len(w) > 3
    }
    enwords = {w.lower() for w in nltk.corpus.brown.words() if len(w) > 3}

    # Transliterate each Japanese token once, then keep transliterations
    # longer than three characters that start with a lowercase letter.
    romanized = (unidecode(w) for w in jeita.words())
    jpwords = {r for r in romanized if len(r) > 3 and r[0].islower()}

    labelled_words = (
        ('English', enwords),
        ('Spanish', spwords),
        ('Japanese', jpwords),
    )
    featuresets = [
        (createTupleDict(word, numChars), label)
        for label, words in labelled_words
        for word in words
    ]

    random.shuffle(featuresets)
    split = int(len(featuresets) * 0.8)
    training_set = featuresets[:split]
    testing_set = featuresets[split:]
    return (training_set, testing_set)
# Comment list  (page-navigation text, not code)
# Article contents  (page-navigation text, not code)