import os

import jieba

def __init():
    # root_filepath is defined elsewhere in the original module
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    # the original Chinese words were lost to encoding; "??" is a placeholder
    jieba.add_word("??", 10000)
    # pass tune=True so the suggested segmentation is actually applied
    jieba.suggest_freq(("?", "??"), tune=True)
    jieba.suggest_freq(("??", "??"), tune=True)
    jieba.suggest_freq(("??", "??"), tune=True)
    jieba.suggest_freq(("??", "?"), tune=True)
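For context, a minimal sketch of what suggest_freq with tune=True does (the sentence and the word pair come from jieba's own README):

import jieba

sentence = u"如果放到post中将出错。"
print("/".join(jieba.cut(sentence, HMM=False)))
# suggest that "中将" be split into "中" / "将"; tune=True applies the change
jieba.suggest_freq((u"中", u"将"), tune=True)
print("/".join(jieba.cut(sentence, HMM=False)))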
import os
import re
from os import path

import jieba

def __init__(self, n_core=16):
    self.rootdir = os.getcwd()
    self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
    self.STOP_WORDS_LIST = {re.sub('\n', '', item) for item in self.STOP_WORDS_LIST}
    jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
    self.n_CORE = n_core
    # parallel mode forks worker processes; available on POSIX systems only
    jieba.enable_parallel(self.n_CORE - 1)
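For reference, jieba's parallel mode (from its README) can be toggled at runtime; it is not supported on Windows:

import jieba

jieba.enable_parallel(4)  # fork 4 segmentation worker processes (POSIX only)
words = list(jieba.cut("并行分词可以加速长文本的处理"))
jieba.disable_parallel()  # return to single-process segmentation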
def __init__(self):
    self.__root_path = "data/dict/"
    jieba.load_userdict("data/dict/user.dict")  # load the user dictionary
    # load the sentiment lexicons
    self.__phrase_dict = self.__get_phrase_dict()
    self.__positive_dict = self.__get_dict(self.__root_path + "positive_dict.txt")
    self.__negative_dict = self.__get_dict(self.__root_path + "negative_dict.txt")
    self.__conjunction_dict = self.__get_dict(self.__root_path + "conjunction_dict.txt")
    self.__punctuation_dict = self.__get_dict(self.__root_path + "punctuation_dict.txt")
    self.__adverb_dict = self.__get_dict(self.__root_path + "adverb_dict.txt")
    self.__denial_dict = self.__get_dict(self.__root_path + "denial_dict.txt")
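For reference, a user dictionary such as user.dict above is a plain-text file with one entry per line: the word, an optional frequency, and an optional POS tag, separated by spaces. Illustrative contents (these example entries come from jieba's README, not from the original project):

云计算 5
创新办 3 i
凱特琳 nz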
import re

import jieba
import jieba.posseg as pseg

def read(self, file_name, timelength):
    # parse a danmaku (bullet-comment) XML dump: <d p="time,...">text</d>
    temp_lines = []
    vocabulary = {}
    jieba.load_userdict("data/metadata/user_dict.txt")
    # compile the pattern once instead of on every line
    pattern = re.compile(r'^<d p="(.+)">(.+)</d>')
    # POS tags to discard (numerals, punctuation, particles, pronouns, etc.)
    skip_flags = {"m", "w", "g", "c", "o", "p", "z", "q", "un", "e", "r",
                  "x", "d", "t", "h", "k", "y", "u", "s", "uj", "ul", "eng"}
    with open(file_name, "r") as f:
        for line_no, line in enumerate(f):
            m = pattern.match(line)
            if not m:
                continue
            temp = {
                # the first field of the p attribute is the comment timestamp
                "time": int(float(m.group(1).split(',')[0])),
                "text": [word for word, flag in pseg.cut(m.group(2))
                         if word not in self.stop_words and flag not in skip_flags],
                "lineno": line_no + 1,
            }
            # keep only comments with more than three content words
            if len(temp["text"]) > 3:
                temp_lines.append(temp)
                for item in temp["text"]:
                    if item not in vocabulary:
                        vocabulary[item] = 0
    lines = sorted(temp_lines, key=lambda e: e["time"])
    self.store(lines, timelength)
    return lines, timelength, vocabulary
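For context, the regex above targets Bilibili-style danmaku XML, where each comment is a <d> element and the first field of its p attribute is the playback time in seconds. A hypothetical input line:

<d p="12.5,1,25,16777215,1442300400,0,abc123,123456789">这一幕太精彩了</d>

The method would record time 12 (int(float("12.5"))) and segment the comment text with pseg.cut.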
def __init__(self, user_dict=None):
    """
    Init WordSegment client.

    @user_dict: path to a user dictionary file; if given, it is loaded
    into jieba before any segmentation is done.
    """
    self.user_dict = user_dict
    if self.user_dict is not None:
        jieba.load_userdict(self.user_dict)
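A minimal usage sketch (the class name WordSegment comes from the docstring; the dictionary path is hypothetical):

seg = WordSegment(user_dict="data/dict/user.dict")  # loads the custom dict once
default_seg = WordSegment()  # no user dict; jieba's built-in dictionary only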
import re

import jieba

def clean():
    # synonym_dict and s_list are module-level objects in the original script
    jieba.load_userdict("../data/segmention/unigram.txt")
    output = open("./train.data", "w")
    with open("../data/prepare_data", "r") as f:
        for line in f:
            line = line.strip()
            # lowercase everything
            line = line.lower()
            # drop queries that are too short
            if len(line) <= 2:
                continue
            # drop queries that are 18-digit ID numbers
            if re.match('[0-9]{18}', line) is not None:
                continue
            # drop queries that contain no Chinese characters
            eng_flag = True
            for i in line:
                if u'\u4e00' <= i <= u'\u9fa5':
                    eng_flag = False
                    break
            if eng_flag:
                continue
            # segment the query
            ll = jieba.cut(line)
            line = []
            for i in ll:
                # skip whitespace tokens (including U+2006 six-per-em space)
                if i in (u"\u2006", u" ", " "):
                    continue
                line.append(i)
            # replace words with their canonical synonyms
            for i in range(len(line)):
                if line[i] in synonym_dict:
                    line[i] = synonym_dict[line[i]]
            # skip duplicate queries
            if line in s_list:
                continue
            l = ",".join(line)
            s_list.append(line)
            output.write(l + "\n")
    output.close()
    return
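clean() leans on two module-level objects that the excerpt doesn't define; a hedged sketch of the minimal state it assumes (the dictionary contents are made up for illustration):

# Assumed module-level state for clean(), reconstructed from how it is used.
synonym_dict = {u"番茄": u"西红柿"}  # word -> canonical synonym (illustrative)
s_list = []  # token lists already written out, used for deduplication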
import time
from collections import defaultdict

import jieba
from gensim import corpora, models, similarities

def __init__(self, itemInfos):
    # itemInfos: dict of {pid: description}
    last_time = time.time()
    # train the model
    jieba.load_userdict('./dict.txt.big.txt')
    with open("./stopWords.txt", encoding="gbk") as fh:
        stop_words = set(line.strip().lower() for line in fh)
    stop_words.update({'\n', ' ', u'\u2022', u'\xa9'})  # bullet and copyright signs
    texts = []
    self.name2id = {}
    self.id2name = []
    for k, v in itemInfos.items():
        seg_list = [w.lower() for w in jieba.cut(v, cut_all=False)
                    if w.lower() not in stop_words]
        texts.append(list(seg_list))
        self.name2id[k] = len(self.id2name)
        self.id2name.append(k)
    # drop tokens that appear only once across the whole corpus
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    print("start cost:", time.time() - last_time)
    last_time = time.time()
    dictionary = corpora.Dictionary(texts)
    print("dictionary cost:", time.time() - last_time)
    last_time = time.time()
    corpus = [dictionary.doc2bow(text) for text in texts]
    print("doc2bow cost:", time.time() - last_time)
    last_time = time.time()
    tfidf = models.TfidfModel(corpus)
    print("tfidf model cost:", time.time() - last_time)
    last_time = time.time()
    corpus_tfidf = tfidf[corpus]
    print("tfidf corpus cost:", time.time() - last_time)
    last_time = time.time()
    # project the tf-idf corpus into a 100-topic LSI space
    self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
    print("lsi model cost:", time.time() - last_time)
    self.index = similarities.MatrixSimilarity(self.lsi[corpus])
    self.corpus = corpus
    self.pidName = getPidName()
    print("init finish")