Example source code for the Python function load_userdict()
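All of the snippets below revolve around jieba.load_userdict(path), which merges a custom dictionary into jieba's main dictionary before segmentation. The dictionary is a UTF-8 text file with one entry per line in the form "word [frequency] [POS tag]", where frequency and POS tag are optional. As a minimal sketch (the file name and sample words are placeholders, not taken from any project below):

# user_dict.txt, one entry per line, e.g.:
#   云计算 5 n
#   自然语言处理
import jieba

jieba.load_userdict("user_dict.txt")  # merge custom entries into the main dictionary
words = jieba.lcut("云计算与自然语言处理是热门方向")  # lcut returns a list instead of a generator
print("/".join(words))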

cut_text.py (project: internet-content-detection, author: liubo0621)
def __init__(self, dict_path = ''):
        super(Singleton, self).__init__()
        if not hasattr(self,'_stop_words'):
            # load the user dictionary if a path was given
            if dict_path:
                jieba.load_userdict(dict_path)

            self._stop_words = set((
                '', ' ', '\n', "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
                "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
                "this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
            ))
job_spider.py (project: 51job, author: chenjiandongx)
def post_desc_counter():
        """ ??????
        """
        # import thulac
        post = open(os.path.join("data", "post_require.txt"),
                    "r", encoding="utf-8").read()
        # thulac-based segmentation (alternative, commented out)
        # thu = thulac.thulac(seg_only=True)
        # thu.cut(post, text=True)

        # jieba-based segmentation
        file_path = os.path.join("data", "user_dict.txt")
        jieba.load_userdict(file_path)
        seg_list = jieba.cut(post, cut_all=False)
        counter = dict()
        for seg in seg_list:
            counter[seg] = counter.get(seg, 0) + 1  # default 0 so the first occurrence counts as 1
        counter_sort = sorted(
            counter.items(), key=lambda value: value[1], reverse=True)
        pprint(counter_sort)
        with open(os.path.join("data", "post_pre_desc_counter.csv"),
                  "w+", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_sort)
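A side note on the counting loop above: collections.Counter expresses the same word-frequency logic more compactly and already provides sorted output via most_common(). A rough sketch under the same file layout the snippet assumes (data/post_require.txt and data/user_dict.txt):

import os
from collections import Counter

import jieba

def post_desc_counter():
    with open(os.path.join("data", "post_require.txt"), encoding="utf-8") as f:
        post = f.read()
    jieba.load_userdict(os.path.join("data", "user_dict.txt"))
    counter = Counter(jieba.cut(post, cut_all=False))  # count each segmented token
    for word, count in counter.most_common(50):        # already sorted by frequency
        print(word, count)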
hot_words.py (project: LagouJob, author: EclipseXuLu)
def get_hot_words(text):
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)
    df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=()))
    print(df)
    df.to_excel('./hotwords/DM.xlsx', 'DM')
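In the call above, allowPOS=() disables part-of-speech filtering, so keywords of any POS are returned; passing a tuple of POS tags restricts the result. A small sketch (the sample sentence and tag set are illustrative only):

import jieba.analyse

text = "机器学习工程师需要扎实的数学基础和丰富的项目经验"
# restrict keywords to nouns and verb-nouns, with TF-IDF weights
for word, weight in jieba.analyse.extract_tags(text, topK=10, withWeight=True, allowPOS=('n', 'vn')):
    print(word, weight)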
manage.py (project: sentiment-analysis, author: kasheemlew)
def parse():
    """parse the comments"""
    import jieba
    import jieba.posseg as pseg

    # Load User's Dictionary
    path_list = os.getcwd().split('/')
    path_list.append("dict.txt")
    dict_path = '/'.join(path_list)
    jieba.load_userdict(dict_path)

    # Dismiss these POS flags
    dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f',
            'ud', 'ug', 'uv']

    comments = Comment.query.all()
    for comment in comments:
         word_list = []
         pseg_cut = pseg.cut(comment.body)
         for word, flag in pseg_cut:
             if flag not in dismiss:
                 word_list.append(word)
         comment.parsed = '/'.join(word_list)
         db.session.add(comment)
         print "Comment %04d Parsed!" % comment.id

    db.session.commit()
    print "ALL DONE!"
preprocessing.py (project: seq2seq_chatterbot, author: StephenLee2016)
def __init__(self):
        self.encoderFile = "./question.txt"
        self.decoderFile = './answer.txt'
        self.dictFile = 'word_dict.txt'
        # load the user-defined dictionary
        jieba.load_userdict(self.dictFile)
        # stop-words file
        self.stopwordsFile = "./preprocessing/stopwords.dat"
seq2seq.py (project: seq2seq_chatterbot, author: StephenLee2016)
def __init__(self):
        print("tensorflow version: ", tf.__version__)
        tf.reset_default_graph()

        self.encoder_vec_file = "./preprocessing/enc.vec"
        self.decoder_vec_file = "./preprocessing/dec.vec"
        self.encoder_vocabulary = "./preprocessing/enc.vocab"
        self.decoder_vocabulary = "./preprocessing/dec.vocab"
        self.dictFile = './word_dict.txt'
        self.batch_size = 1
        self.max_batches = 10000
        self.show_epoch = 100
        self.model_path = './model/'

        # load the jieba user dictionary
        jieba.load_userdict(self.dictFile)

        self.model = dynamicSeq2seq(encoder_cell=LSTMCell(20),
                                    decoder_cell=LSTMCell(40), 
                                    encoder_vocab_size=540,
                                    decoder_vocab_size=1600,
                                    embedding_size=20,
                                    attention=True,
                                    bidirectional=True,
                                    debug=False,
                                    time_major=True)
        self.location = ["??", "??", "??", "??","??"]
        self.user_info = {"__username__":"Stephen", "__location__":"??"}
        self.robot_info = {"__robotname__":"JiJi"}
        self.dec_vocab = {}
        self.enc_vocab = {}
        tag_location = ''
        with open(self.encoder_vocabulary, "r") as enc_vocab_file:
            for index, word in enumerate(enc_vocab_file.readlines()):
                self.enc_vocab[word.strip()] = index
        with open(self.decoder_vocabulary, "r") as dec_vocab_file:
            for index, word in enumerate(dec_vocab_file.readlines()):
                self.dec_vocab[index] = word.strip()
WordSegmentation.py (project: free-rider-killer, author: YukiSora)
def main(argv):
    f = open('freeRiderData.txt')
    jieba.load_userdict('KeywordDictionary.txt')
    for line in f:
        # segment the line
        seg_list = jieba.cut(line, cut_all=False)
        print("Default Mode: " + "/ ".join(seg_list))   

    return
classifiers.py (project: SentimentPolarityAnalysis, author: chaoming0625)
def __init__(self):
        self.__root_filepath = "f_dict/"

        jieba.load_userdict("f_dict/user.dict")  # ??????

        # ????????
        self.__phrase_dict = self.__get_phrase_dict()
        self.__positive_dict = self.__get_dict(self.__root_filepath + "positive_dict.txt")
        self.__negative_dict = self.__get_dict(self.__root_filepath + "negative_dict.txt")
        self.__conjunction_dict = self.__get_dict(self.__root_filepath + "conjunction_dict.txt")
        self.__punctuation_dict = self.__get_dict(self.__root_filepath + "punctuation_dict.txt")
        self.__adverb_dict = self.__get_dict(self.__root_filepath + "adverb_dict.txt")
        self.__denial_dict = self.__get_dict(self.__root_filepath + "denial_dict.txt")
common_lib.py (project: FineGrainedOpinionMining, author: chaoming0625)
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word(u"??", 10000)
    jieba.suggest_freq((u"?", u"??"))
    jieba.suggest_freq((u"??", u"??"))
    jieba.suggest_freq((u"??", u"??"))
    jieba.suggest_freq((u"??", u"?"))
common_lib.py (project: FineGrainedOpinionMining, author: chaoming0625)
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("??", 10000)
    jieba.suggest_freq(("?", "??"))
    jieba.suggest_freq(("??", "??"))
    jieba.suggest_freq(("??", "??"))
    jieba.suggest_freq(("??", "?"))
WordExtractor.py (project: Rnews, author: suemi994)
def __init__(self,userDict=None,conf={}):
        self.userDict=userDict
        self.conf={}
        self.configFromDict(conf)
        if self.userDict:
            jieba.load_userdict(userDict)
        self.configDefault()
nlu_api.py (project: KnowledgeGraph-QA-Service, author: kangzhun)
def __init__(self, custom_dict_path=CUSTOM_DICTIONARY_PATH):
        super(JiebaClient, self).__init__()
        try:
            jieba.load_userdict(custom_dict_path)
            self.debug("init JiebaClient, with custom_dict_path=%s", custom_dict_path)
        except Exception, e:
            self.exception(e)
            self.error('@@@@@@@@@@@@@@@@@@@@@@@@@@@ loading custom_dictionary failed')
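Note that except Exception, e is Python 2 syntax; under Python 3 the same guard would be written as in the sketch below (the path and the use of the logging module stand in for the project's own debug/error helpers):

import logging

import jieba

custom_dict_path = "custom_dict.txt"   # placeholder path
try:
    jieba.load_userdict(custom_dict_path)
except Exception as e:                 # Python 3 binds the exception with "as"
    logging.exception("loading custom dictionary failed: %s", e)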
Any.py (project: sentiment-analysis, author: l-passer)
def cutwords_jieba(self,sentence,userdict='dict/userdict.txt',stopwords='dict/stopwords.txt'):
        stropw = []
        if userdict:
            jieba.load_userdict(userdict)
            stropw = [line.strip() for line in open(stopwords,'r',encoding='utf-8').readlines()]

        frequency = defaultdict(int)
        l = list(jieba.cut(sentence))
        for t in l:
            frequency[t] += 1

        texts = [token for token in frequency if frequency[token] > 0]

        rtexts = list(set(texts)-set(stropw))
        return rtexts
ReadBulletScreen.py (project: TPTM, author: Wind-Ward)
def read(self,file_name,POS_tag):
        f = open(file_name, "r")
        tempLine=[]
        #vocabulary = {}
        jieba.load_userdict("data/metadata/user_dict.txt")
        for lineNo,line in enumerate(f.readlines()):
            pattern=re.compile("^<d p=\"(.+)\">(.+)</d>")
            m=pattern.match(line)
            if m:
                info=m.group(1).split(',')
                temp={"time":int(float(info[0])), \
                                   "text":[word  for word,flag in pseg.cut(m.group(2))  \
                                           if word not in self.stop_words and flag not in \
                                           POS_tag ],
                                   "lineno":lineNo+1,
                                   "user":info[6]}

                # drop single-character tokens; keep the line only if at least 3 tokens remain
                temp2=[]
                for index,text in enumerate(temp["text"]):
                    if len(text)>1:
                        temp2.append(text)
                if len(temp2)>=3:
                    print(temp2)
                    temp["text"]=temp2
                    tempLine.append(temp)


        lines=sorted(tempLine, key= lambda e:(e.__getitem__('time')))
        print(len(lines))
        return lines#,vocabulary
data_utils.py (project: deeplearning4chatbot, author: liangjz92)
def __init__(self):
        self.ut_path = '../data/ut.data'
        self.vocab_path = '../data/vocab.data'
        self.ids_path = '../data/ids.data'
        self.train_path = '../data/train.data'
        self.dev_path = '../data/dev.data'
        self.test_path = '../data/test.data'
        self.dict_path = '../data/medical.txt'
        self.emd_path = '../data/emd/ylemd.bin'
        self.tag_path = '../data/tag.data'
        jieba.load_userdict(self.dict_path)
data_utils.py (project: deeplearning4chatbot, author: liangjz92)
def __init__(self):
        self.ut_path = '../data/uterance.data'
        self.mark_path = '../data/mark.data'
        self.vocab_path = '../data/vocab.data'
        self.ids_path = '../data/ids.data'
        self.train_path = '../data/train.data'
        self.dev_path = '../data/dev.data'
        self.test_path = '../data/test.data'
        self.dict_path = '../data/medical.txt'
        self.emd_path = '../data/emd/ylemd.bin'
        jieba.load_userdict(self.dict_path)
gen.py (project: deeplearning4chatbot, author: liangjz92)
def __init__(self,size):
        self.data_path = 'skin.data'
        self.train_size = int(size*0.7)
        self.dev_size = int(size*0.1)
        self.test_size = size - self.train_size - self.dev_size
        jieba.load_userdict('medical.txt')
        self.sentences = []
        self.orders = []
        self.stop_line = []
        for line in open('goodbye.data'):
            line = line.strip()
            self.stop_line.append(line)
        self.ac_dialogs = []
fenci.py (project: SinaWeiboSpider, author: SuperSaiyanSSS)
def __init__(self):
        jieba.load_userdict("keyword.txt")
        jieba.load_userdict("mingan_word.txt")
        self.topK = 12
        self.mingan_list = []
        self.get_mingan_list()
preprocessing.py (project: dynamic-seq2seq, author: yanwii)
def __init__(self):
        self.encoderFile = "./question.txt"
        self.decoderFile = './answer.txt'
        self.dictFile = 'word_dict.txt'
        jieba.load_userdict(self.dictFile)
        self.stopwordsFile = "./preprocessing/stopwords.dat"
seq2seq.py (project: dynamic-seq2seq, author: yanwii)
def __init__(self):
        print("tensorflow version: ", tf.__version__)
        tf.reset_default_graph()

        self.encoder_vec_file = "./preprocessing/enc.vec"
        self.decoder_vec_file = "./preprocessing/dec.vec"
        self.encoder_vocabulary = "./preprocessing/enc.vocab"
        self.decoder_vocabulary = "./preprocessing/dec.vocab"
        self.dictFile = './word_dict.txt'
        self.batch_size = 1
        self.max_batches = 100000
        self.show_epoch = 100
        self.model_path = './model/'

        # load the jieba user dictionary
        jieba.load_userdict(self.dictFile)

        self.model = dynamicSeq2seq(encoder_cell=LSTMCell(40),
                                    decoder_cell=LSTMCell(40), 
                                    encoder_vocab_size=600,
                                    decoder_vocab_size=1600,
                                    embedding_size=20,
                                    attention=False,
                                    bidirectional=False,
                                    debug=False,
                                    time_major=True)
        self.location = ["??", "??", "??", "??"]
        self.user_info = {"__username__":"yw", "__location__":"??"}
        self.robot_info = {"__robotname__":"Rr"}
        self.dec_vocab = {}
        self.enc_vocab = {}
        self.dec_vecToSeg = {}
        tag_location = ''
        with open(self.encoder_vocabulary, "r") as enc_vocab_file:
            for index, word in enumerate(enc_vocab_file.readlines()):
                self.enc_vocab[word.strip()] = index
        with open(self.decoder_vocabulary, "r") as dec_vocab_file:
            for index, word in enumerate(dec_vocab_file.readlines()):
                self.dec_vecToSeg[index] = word.strip()
                self.dec_vocab[word.strip()] = index
wiki_cut.py (project: zhNewsCrawler, author: YCKung)
def cut_main():
    jieba.set_dictionary('dict.txt.big')
    #jieba.load_userdict("userdict.txt")
    if len(sys.argv) == 3:
        inputfile = sys.argv[1]
        outputfile = sys.argv[2]
    else:
        print "Usage: python cut.py filetoCut.txt cuttedFile.txt"
        sys.exit()
    readNcut(inputfile,outputfile)
cut.py (project: zhNewsCrawler, author: YCKung)
def cut_main(inputfile,outputfile):
    jieba.set_dictionary('dict.txt.big')
    #-----user define dict-----
    #jieba.load_userdict("userdict.txt")
    readNcut(inputfile,outputfile)
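The two zhNewsCrawler snippets rely on jieba.set_dictionary(), which swaps out the main dictionary (dict.txt.big is the larger dictionary published in the jieba repository, commonly used for traditional Chinese), whereas load_userdict() only merges extra entries on top of whichever main dictionary is active. A minimal sketch combining the two (paths are placeholders):

import jieba

jieba.set_dictionary("dict.txt.big")   # replace the main dictionary
jieba.load_userdict("userdict.txt")    # then merge project-specific entries on top
print("/".join(jieba.lcut("繁體中文斷詞測試")))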
main.py (project: aibot, author: Qiware)
def load_userdict():
    """
    Load user dictionary
    """
    # person-name dictionaries
    jieba.load_userdict("./dict/name/amuse.txt")
    jieba.load_userdict("./dict/name/sporter.txt")
    jieba.load_userdict("./dict/name/politicians.txt")

    # sports vocabulary
    jieba.load_userdict("./dict/sport.txt")

    # general dictionary
    jieba.load_userdict("./dict/dict.txt")
train.py (project: aibot, author: Qiware)
def load_userdict():
    # person-name dictionaries
    jieba.load_userdict("./dict/name/amuse.txt")
    jieba.load_userdict("./dict/name/sporter.txt")
    jieba.load_userdict("./dict/name/politicians.txt")

    # sports vocabulary
    jieba.load_userdict("./dict/sport.txt")

    # general dictionary
    jieba.load_userdict("./dict/dict.txt")
text_analysis.py (project: CloudMusic-Crawler, author: GreatV)
def words_split(corpus_path):

    with open(corpus_path, 'r') as f:
        content = f.read()

    jieba.load_userdict('data/userdict.txt')  # load the user-defined dictionary
    jieba.enable_parallel(4)  # enable parallel segmentation (4 processes)


    seg_list = jieba.cut(content, cut_all=False)  # precise-mode segmentation

    return seg_list


preprocessing.py (project: seq2seq, author: yanwii)
def __init__(self):
        #self.encoderFile = "/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_ask.txt"
        #self.decoderFile = '/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_answer.txt'
        #self.savePath = '/home/yanwii/Python/NLP/seq2seq/seq2seq_pytorch/data/'
        self.encoderFile = "./data/question.txt"
        self.decoderFile = "./data/answer.txt"
        self.savePath = './data/'

        jieba.load_userdict("./data/supplementvocab.txt")
contentparser.py (project: GeoNews, author: chunlaw)
def __init__(self, diction=None, content=None):
        self.diction = diction or "assets/location.dict"
        self.content = content or ""
        jieba.load_userdict(self.diction)
classifiers.py (project: FusionOfMultipleClassifers, author: chaoming0625)
def __init__(self):
        self.__root_filepath = "f_dict/"

        jieba.load_userdict("f_dict/user.dict")  # ??????

        # ????????
        self.__phrase_dict = self.__get_phrase_dict()
        self.__positive_dict = self.__get_dict(self.__root_filepath + "positive_dict.txt")
        self.__negative_dict = self.__get_dict(self.__root_filepath + "negative_dict.txt")
        self.__conjunction_dict = self.__get_dict(self.__root_filepath + "conjunction_dict.txt")
        self.__punctuation_dict = self.__get_dict(self.__root_filepath + "punctuation_dict.txt")
        self.__adverb_dict = self.__get_dict(self.__root_filepath + "adverb_dict.txt")
        self.__denial_dict = self.__get_dict(self.__root_filepath + "denial_dict.txt")
gen_dataset.py (project: KnowledgeGraph, author: SilverHelmet)
def gen_dataset_from_baike():
    doc_path = os.path.join(rel_ext_dir, 'sample_baike_doc.json')
    out_path = os.path.join(rel_ext_dir, 'data/raw_dataset.txt')

    name2fb_path = os.path.join(cache_dir, 'DatasetFinder.name2fb.cache')
    fb_ttls_path = os.path.join(cache_dir, 'DatasetFinder.fb_ttls.cache')
    finder = DatasetFinder.load_from_cache(name2fb_path, fb_ttls_path)


    Print('load userdict')
    jieba.load_userdict(os.path.join(rel_ext_dir, 'trimmed_baike_dict.txt'))

    Print('gen dataset from [%s]' %doc_path)
    outf = file(out_path, 'w')
    for line in tqdm(file(doc_path), total = nb_lines_of(doc_path)):
        p = line.split('\t')
        baike_url = p[0].decode('utf-8')
        paragraphs = json.loads(p[1])
        for paragraph in paragraphs:
            sentences = split_sentences(paragraph)
            for sentence in sentences:
                cases, words = gen_dataset(sentence, finder)
                if len(cases) > 0:
                    out_obj = {
                        'words': "#".join(words),
                        'cases': map(str, cases),
                    }
                    outf.write("%s\t%s\n" %(baike_url, json.dumps(out_obj, ensure_ascii = False)))
    outf.close()
text_segment.py (project: JiaYuan, author: EclipseXuLu)
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1]))

