def augment(texts, dic_thes):
    """Create synonym-substituted variants of each text.

    When ``prm.aug`` is below 2, augmentation is disabled and ``texts`` is
    returned unchanged (the very same object, not a copy).

    Otherwise each text is tokenized with ``wordpunct_tokenize`` and
    ``prm.aug`` variants are generated per text. For every variant a random
    number of replacement attempts is made; each attempt picks a random token
    position and, if that token is a key of ``dic_thes``, overwrites it with
    one of its synonyms. Tokens are re-joined with single spaces, so the
    original spacing around punctuation is not preserved. The unmodified
    texts are not included in the result.

    Args:
        texts: iterable of strings to augment.
        dic_thes: mapping from a token to an indexable sequence of synonyms.

    Returns:
        ``texts`` itself when ``prm.aug < 2``; otherwise a new list holding
        ``prm.aug`` augmented strings per input text, in input order.
    """
    if prm.aug < 2:
        return texts

    out = []
    for text in texts:
        words_orig = wordpunct_tokenize(text)
        n_words = len(words_orig)
        # Upper bound passed to randint for the number of replacement
        # attempts: 10% of the tokens, but never less than 2.
        maxrep = max(2, int(0.1 * n_words))

        for _ in range(prm.aug):
            words = list(words_orig)  # fresh copy for every variant

            # A text with no tokens has no position to pick from; calling
            # randint(0, -1) would raise. Skip replacement and still emit an
            # (empty) variant so the output stays aligned with the input.
            if n_words:
                for _ in range(randint(1, maxrep)):
                    idx = randint(0, n_words - 1)
                    word = words[idx]
                    if word not in dic_thes:
                        continue

                    synonyms = dic_thes[word]
                    if len(synonyms) == 0:
                        # Entry without synonyms: nothing to substitute.
                        continue

                    # Choose the synonym index from a geometric distribution,
                    # clamped to the last available entry.
                    # NOTE(review): np.random.geometric returns values >= 1,
                    # so index 0 is only used when there is a single entry.
                    # Confirm whether entry 0 is meant to be skipped (e.g. if
                    # it is the headword itself) or this is an off-by-one.
                    synonym = min(np.random.geometric(0.5), len(synonyms) - 1)
                    words[idx] = synonyms[synonym]

            out.append(" ".join(words))

    return out
评论列表
文章目录