python类load_userdict()的实例源码

common_lib.py 文件源码 项目:WaiMaiOpinionMiner 作者: chaoming0625 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def __init():
    # One-shot module initialiser for the jieba segmenter (presumably called
    # once at import time by this module — the caller is outside this view).
    # NOTE(review): the "??" literals below are mojibake from a lossy scrape
    # of the original Chinese words; restore the real terms before relying
    # on this snippet.
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("??", 10000)
    # BUG FIX: jieba.suggest_freq() defaults to tune=False, in which case it
    # only *returns* the suggested frequency and changes nothing. Pass
    # tune=True so the segmentation adjustment is actually applied.
    jieba.suggest_freq(("?", "??"), tune=True)
    jieba.suggest_freq(("??", "??"), tune=True)
    jieba.suggest_freq(("??", "??"), tune=True)
    jieba.suggest_freq(("??", "?"), tune=True)
Jeffmxh_sentiment_analyse.py 文件源码 项目:emotion_analyse_py 作者: jeffmxh 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def __init__(self,n_core = 16):
        """Prepare stop words, register the jieba user dictionary and enable
        parallel segmentation using n_core - 1 worker processes."""
        self.rootdir = os.getcwd()
        stopword_path = path.join(self.rootdir, 'resources', 'stopwords_utf8.txt')
        raw_stopwords = self.load_txt(stopword_path)
        # Strip embedded newlines and de-duplicate the stop-word entries.
        self.STOP_WORDS_LIST = {re.sub('\n', '', entry) for entry in raw_stopwords}
        jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
        self.n_CORE = n_core
        # Keep one core free for the main process.
        jieba.enable_parallel(self.n_CORE - 1)
dict.py 文件源码 项目:classifier-in-action 作者: shibing624 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def __init__(self):
        """Register the user dictionary with jieba and load every sentiment
        lexicon (positive/negative words, conjunctions, punctuation,
        adverbs, denial words) from the dict directory."""
        self.__root_path = "data/dict/"
        jieba.load_userdict("data/dict/user.dict")  # custom user dictionary

        # The phrase lexicon has a dedicated loader; all remaining lexicons
        # go through the same generic dict loader.
        self.__phrase_dict = self.__get_phrase_dict()
        load_dict = self.__get_dict
        base = self.__root_path
        self.__positive_dict = load_dict(base + "positive_dict.txt")
        self.__negative_dict = load_dict(base + "negative_dict.txt")
        self.__conjunction_dict = load_dict(base + "conjunction_dict.txt")
        self.__punctuation_dict = load_dict(base + "punctuation_dict.txt")
        self.__adverb_dict = load_dict(base + "adverb_dict.txt")
        self.__denial_dict = load_dict(base + "denial_dict.txt")
ReadBulletScreen.py 文件源码 项目:T-SJTTR 作者: Wind-Ward 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def read(self,file_name,timelength):
        """Parse a bullet-screen ("danmaku") comment dump into timed, segmented entries.

        @file_name:   path to the dump; each useful line looks like
                      <d p="time,...">comment text</d>
        @timelength:  total video length (passed straight through to store()).

        Returns (lines, timelength, vocabulary) where `lines` holds the kept
        comments sorted by time and `vocabulary` maps each kept word to 0.
        """
        # Compile the regex ONCE instead of on every input line.
        pattern = re.compile("^<d p=\"(.+)\">(.+)</d>")
        # POS tags to discard (numerals, punctuation, particles, pronouns, ...).
        # NOTE(review): the original listed "r" twice; it is kept once here.
        drop_flags = frozenset(
            ["m", "w", "g", "c", "o", "p", "z", "q", "un", "e", "r", "x",
             "d", "t", "h", "k", "y", "u", "s", "uj", "ul", "eng"])

        jieba.load_userdict("data/metadata/user_dict.txt")

        kept = []
        vocabulary = {}
        # `with` guarantees the file is closed even on error (the original
        # leaked the handle), and iterating the file object directly avoids
        # readlines() materialising the whole file in memory.
        with open(file_name, "r") as f:
            for lineno, line in enumerate(f, start=1):
                m = pattern.match(line)
                if not m:
                    continue
                words = [word for word, flag in pseg.cut(m.group(2))
                         if word not in self.stop_words and flag not in drop_flags]
                # Keep only comments with more than three surviving tokens.
                if len(words) > 3:
                    kept.append({"time": int(float(m.group(1).split(',')[0])),
                                 "text": words,
                                 "lineno": lineno})
                    for item in words:
                        if item not in vocabulary:
                            vocabulary[item] = 0

        lines = sorted(kept, key=lambda e: e["time"])
        self.store(lines, timelength)
        return lines, timelength, vocabulary
word_segment.py 文件源码 项目:http_server 作者: chenguolin 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def __init__(self, user_dict=None):
        """
        Initialise the WordSegment client.

        @user_dict: optional path to a user dictionary file; when supplied it
                    is registered with jieba so custom words segment correctly.
        """
        self.user_dict = user_dict
        if user_dict is not None:
            jieba.load_userdict(user_dict)
clean.py 文件源码 项目:entity_words_identification 作者: actank 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def clean():
    """Clean raw queries into segmented training data (Python 2 code).

    Reads ../data/prepare_data line by line, drops queries that are too
    short, look like 18-digit IDs, or contain no CJK character at all;
    segments the rest with jieba, applies the module-level `synonym_dict`,
    skips duplicates already recorded in the module-level `s_list`, and
    appends the comma-joined tokens to ./train.data.
    """
    jieba.load_userdict("../data/segmention/unigram.txt")
    # `with` guarantees both handles are closed even if an exception escapes
    # the loop (the original only closed the output file on the success path).
    with open("./train.data", "w") as output:
        with open("../data/prepare_data", "r") as f:
            for line in f:
                line = unicode(line.strip())  # Python 2 `unicode`

                # lowercase for uniformity
                line = line.lower()

                # skip overly short queries
                if len(line) <= 2:
                    continue
                # skip 18-digit ID-like queries
                if re.match('[0-9]{18}', line) is not None:
                    continue
                # skip queries with no CJK character at all (pure ASCII/English)
                if not any(u'\u4e00' <= ch <= u'\u9fa5' for ch in line):
                    continue
                # segment, dropping whitespace tokens
                tokens = [t for t in jieba.cut(line)
                          if t not in (u"\u2006", u" ", " ")]
                # synonym normalisation (`dict.has_key` is deprecated -> `in`)
                for i, tok in enumerate(tokens):
                    if tok in synonym_dict:
                        tokens[i] = synonym_dict[tok]

                # skip queries already emitted
                # NOTE(review): `in s_list` is O(len(s_list)) per line; mirror
                # s_list with a set if this ever becomes a bottleneck.
                if tokens in s_list:
                    continue
                s_list.append(tokens)
                output.write(",".join(tokens) + "\n")
    return
LSIEngine.py 文件源码 项目:RecommendSystem 作者: dhjack 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def __init__(self, itemInfos):

        lastTime = time.time()
        # itemInfos : dict[(pid, description)]
        # train model
        jieba.load_userdict('./dict.txt.big.txt')
        stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
        stopWords.add('\n')
        stopWords.add(' ')
        stopWords.add(u'\u2022')
        stopWords.add(u'\xa9')
        texts = []
        self.name2id = {}
        self.id2name = []
        for k, v in itemInfos.iteritems():
            seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
            texts.append(list(seg_list))
            self.name2id[k] = len(self.id2name)
            self.id2name.append(k)

        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        print  "start cast :", (time.time() - lastTime)

        lastTime = time.time()
        dictionary = corpora.Dictionary(texts)
        print  "dictionary cast :", (time.time() - lastTime)

        lastTime = time.time()
        corpus = [dictionary.doc2bow(text) for text in texts]
        print  "doc2bow cast :", (time.time() - lastTime)

        lastTime = time.time()
        tfidf = models.TfidfModel(corpus)
        print  "tfid model cast :", (time.time() - lastTime)
        lastTime = time.time()

        lastTime = time.time()
        corpus_tfidf = tfidf[corpus]
        print  "tfidf corpus cast :", (time.time() - lastTime)

        lastTime = time.time()
        self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100) 
        print  "lsi model cast :", (time.time() - lastTime)
        lastTime = time.time()

        #corpus_lsi = lsi[corpus_tfidf] 
        self.index = similarities.MatrixSimilarity(self.lsi[corpus]) 
        self.corpus = corpus

        self.pidName = getPidName()
        print "init finish"


问题


面经


文章

微信
公众号

扫码关注公众号