python类set_dictionary()的实例源码-面圈网

jieba2.py 文件源码项目：hadan-gcloud 作者: youkpan 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def main():
    """Segment the beginning of ``allbook.txt`` with jieba.

    Reads at most ``max_bytes`` bytes of ``allbook.txt``, decodes them as
    UTF-8, cuts the text into fixed slices of ``chunk_chars`` characters,
    segments every slice with ``jieba.cut`` and writes the tokens, each
    followed by one space, to ``allbook-segment.txt``.  Progress is logged
    every 10000 slices.

    Raises:
        IOError/OSError: if either file cannot be opened.
        UnicodeDecodeError: if the input contains bytes that are not valid
            UTF-8 (a character merely cut off by the byte limit is fine).
    """
    import codecs

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    max_bytes = 190000000   # upper bound on how much of the book file is loaded
    chunk_chars = 100       # number of characters handed to jieba per call

    with open("allbook.txt", "rb") as f:
        print("geting data")
        raw = f.read(max_bytes)

    # The byte limit can fall in the middle of a multi-byte UTF-8 character.
    # A plain bytes.decode() would raise in that case; an incremental decoder
    # with final=False keeps the incomplete tail back instead of failing.
    bookdata = codecs.getincrementaldecoder('UTF-8')().decode(raw, final=False)
    print("geting data  OK ")

    texts_num = 0

    # Explicit UTF-8 so writing does not depend on the platform default
    # encoding; the with-block closes the file even if segmentation fails.
    with open('allbook-segment.txt', 'w', encoding='utf-8') as output:
        for start in range(0, len(bookdata), chunk_chars):
            line = bookdata[start:start + chunk_chars]
            for word in jieba.cut(line, cut_all=False):
                output.write(word + ' ')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("???? %d ????" % texts_num)

matcher.py 文件源码项目：PTTChatBot_DL2017 作者: thisray 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def jiebaCustomSetting(self, dict_path, usr_dict_path):
        """Select jieba's main dictionary and register additional words.

        Args:
            dict_path: dictionary file path handed to ``jieba.set_dictionary``.
            usr_dict_path: UTF-8 text file read line by line; every non-empty
                line (newline removed) is passed whole to ``jieba.add_word``.
        """
        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dic:
            for line in dic:
                word = line.strip('\n')
                # A blank line (for example a trailing one) would otherwise
                # register the empty string as a dictionary word.
                if word:
                    jieba.add_word(word)

chinese_text_processor.py 文件源码项目：DataScience-And-MachineLearning-Handbook-For-Coders 作者: wxyyxc1992 项目源码文件源码阅读 18 收藏 0 点赞 0 评论 0

def __config_jieba(self):
        """
        Configure jieba to use the dictionary referenced by ``jieba_dictionary``.

        NOTE(review): ``jieba_dictionary`` is not defined in this block;
        presumably a module-level path constant - confirm.
        """
        jieba.set_dictionary(jieba_dictionary)

chinese_text_processor.py 文件源码项目：DataScience-And-MachineLearning-Handbook-For-Coders 作者: wxyyxc1992 项目源码文件源码阅读 18 收藏 0 点赞 0 评论 0

def tokenize_file(self, text_path, text_output_path='./tokenized_texts.txt'):
        """
        Segment a text file line by line with jieba and write the result.

        For every input line the tokens that are not in the stop-word set are
        written separated by single spaces, followed by a newline, so the
        output has one line per input line.  Progress is logged every 10000
        lines.

        Args:
            text_path: path of the text file to segment (read as UTF-8).
            text_output_path: path of the file to write (UTF-8, overwritten).

        NOTE(review): ``jieba_dictionary`` and ``jieba_stopwords`` are not
        defined in this block; presumably module-level paths - confirm.
        """

        # jieba custom setting.
        jieba.set_dictionary(jieba_dictionary)

        # Load the stop-word set, one entry per line.
        with open(jieba_stopwords, 'r', encoding='utf-8') as sw:
            stopwordset = {line.strip('\n') for line in sw}

        # Number of lines processed so far.
        texts_num = 0

        # Both files are managed by the with-statement so the output is
        # flushed and closed even when segmentation raises.  The encoding is
        # spelled out to match the stop-word file instead of relying on the
        # platform default.
        with open(text_path, 'r', encoding='utf-8') as content, \
                open(text_output_path, 'w', encoding='utf-8') as output:
            for line in content:
                line = line.strip('\n')

                # Segment the line and keep everything that is not a stop word.
                for word in jieba.cut(line, cut_all=False):
                    if word not in stopwordset:
                        output.write(word + ' ')

                output.write('\n')

                texts_num += 1
                if texts_num % 10000 == 0:
                    logging.info("???? %d ????" % texts_num)

segment.py 文件源码项目：word2vec-tutorial 作者: zake7749 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def main():
    """Segment ``wiki_zh_tw.txt`` with jieba and write ``wiki_seg.txt``.

    Uses ``jieba_dict/dict.txt.big`` as the dictionary and drops every token
    listed in ``jieba_dict/stopwords.txt``.  Each kept token is written
    followed by one space.  Lines are passed to jieba without stripping their
    newline, exactly as before.  Progress is logged every 10000 lines.
    """

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    with io.open('jieba_dict/stopwords.txt','r',encoding='utf-8') as sw:
        stopwordset = {line.strip('\n') for line in sw}

    texts_num = 0

    # The output file is opened in the same with-statement as the input so it
    # is closed (and its buffer flushed) even if segmentation raises midway.
    with io.open('wiki_zh_tw.txt','r',encoding='utf-8') as content, \
            io.open('wiki_seg.txt','w',encoding='utf-8') as output:
        for line in content:
            for word in jieba.cut(line, cut_all=False):
                if word not in stopwordset:
                    output.write(word +' ')

            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("???? %d ????" % texts_num)

segment.py 文件源码项目：word2vec-tutorial 作者: zake7749 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def main():
    """Segment ``wiki_zh_tw.txt`` with jieba and write ``wiki_seg.txt``.

    Uses ``jieba_dict/dict.txt.big`` as the dictionary and drops every token
    listed in ``jieba_dict/stopwords.txt``.  Each input line has its newline
    removed before segmentation and every kept token is written followed by
    one space; no line breaks are written, so the output is a single line.
    Progress is logged every 10000 lines.
    """

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    with open('jieba_dict/stopwords.txt','r',encoding='utf-8') as sw:
        stopwordset = {line.strip('\n') for line in sw}

    texts_num = 0

    # Corpus and output are opened as UTF-8 like the stop-word file instead of
    # the platform default encoding, and both are closed by the with-statement
    # even if segmentation raises.
    with open('wiki_zh_tw.txt','r',encoding='utf-8') as content, \
            open('wiki_seg.txt','w',encoding='utf-8') as output:
        for line in content:
            line = line.strip('\n')
            for word in jieba.cut(line, cut_all=False):
                if word not in stopwordset:
                    output.write(word +' ')

            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("???? %d ????" % texts_num)

jieba_test.py 文件源码项目：Malicious_Domain_Whois 作者: h-j-13 项目源码文件源码阅读 16 收藏 0 点赞 0 评论 0

def testSetDictionary(self):
        """Segment every sample after switching jieba to ``foobar.txt``.

        For each entry of ``test_contents`` the result of ``jieba.cut`` must
        be a generator; the tokens are then printed to stderr joined by
        `` , ``.  The test name is printed to stderr at the end.
        """
        jieba.set_dictionary("foobar.txt")

        for text in test_contents:
            segments = jieba.cut(text)
            assert isinstance(segments, types.GeneratorType), "Test SetDictionary Generator error"

            # Materialise the generator so the tokens can be joined below.
            tokens = list(segments)
            assert isinstance(tokens, list), "Test SetDictionary error on content: %s" % text

            joined = " , ".join(tokens)
            print(joined, file=sys.stderr)

        print("testSetDictionary", file=sys.stderr)

momoCrawler3.py 文件源码项目：momoCrawler 作者: njames741 项目源码文件源码阅读 19 收藏 0 点赞 0 评论 0

def __init__(self, status):
        """Set up request headers, cookies, the result frame and jieba.

        Args:
            status: ``'c'`` sets ``self.with_header`` to False, ``'i'`` sets
                it to True.

        Raises:
            SystemInputError: if ``status`` is neither ``'c'`` nor ``'i'``.
        """
        # Two header/cookie pairs are prepared; how they are used is decided
        # elsewhere in the class.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36'}
        self.headers2 = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
        self.cookies = {'_ts_id': '999999999999999999'}
        self.cookies2 = {'_ts_id': '888888888888888888'}

        # Column layout of the (initially empty) result table.
        result_columns = (
            'GID', 'price', 'discount',
            'payment_CreditCard', 'payment_Arrival', 'payment_ConvenienceStore',
            'payment_ATM', 'payment_iBon',
            'preferential_count', 'img_height',
            'is_warm', 'is_cold', 'is_bright', 'is_dark',
            '12H', 'shopcart', 'superstore',
            'productFormatCount', 'attributesListArea', 'haveVideo',
            'Taiwan', 'EUandUS', 'Germany', 'UK', 'US', 'Japan',
            'Malaysia', 'Australia', 'other',
            'supplementary', 'bottle', 'combination',
            'look_times', 'label',
        )
        self.result_df = pd.DataFrame(columns=result_columns)

        if status == 'i':
            self.with_header = True
        elif status == 'c':
            self.with_header = False
        else:
            raise SystemInputError('???????: c -> ??, i -> ??????')

        jieba.set_dictionary('dict.txt.big')

    # ??

momoCrawler.py 文件源码项目：momoCrawler 作者: njames741 项目源码文件源码阅读 17 收藏 0 点赞 0 评论 0

def __init__(self, status):
        """Prepare HTTP headers, cookies, an empty result table and jieba.

        Args:
            status: mode flag; ``'c'`` stores ``with_header = False`` and
                ``'i'`` stores ``with_header = True``.

        Raises:
            SystemInputError: for any other ``status`` value.
        """
        ua_primary = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36'
        ua_secondary = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
        self.headers = {'User-Agent': ua_primary}
        self.headers2 = {'User-Agent': ua_secondary}

        self.cookies = {'_ts_id': '999999999999999999'}
        self.cookies2 = {'_ts_id': '888888888888888888'}

        # Columns of the result DataFrame, in output order.
        column_names = (
            'GID', 'price', 'discount',
            'payment_CreditCard', 'payment_Arrival', 'payment_ConvenienceStore',
            'payment_ATM', 'payment_iBon',
            'preferential_count', 'img_height',
            'is_warm', 'is_cold', 'is_bright', 'is_dark',
            '12H', 'shopcart', 'superstore',
            'productFormatCount', 'attributesListArea', 'haveVideo',
            'Taiwan', 'EUandUS', 'Germany', 'UK', 'US', 'Japan',
            'Malaysia', 'Australia', 'other',
            'supplementary', 'bottle', 'combination',
            'look_times', 'label',
        )
        self.result_df = pd.DataFrame(columns=column_names)

        if status == 'i':
            self.with_header = True
        elif status == 'c':
            self.with_header = False
        else:
            raise SystemInputError('???????: c -> ??, i -> ??????')

        jieba.set_dictionary('dict.txt.big')

    # ??

momoCrawlerTemp.py 文件源码项目：momoCrawler 作者: njames741 项目源码文件源码阅读 19 收藏 0 点赞 0 评论 0

def __init__(self, status):
        """Initialise headers, cookies, the empty result frame and jieba.

        Args:
            status: ``'c'`` sets ``self.with_header`` to False, ``'i'`` sets
                it to True.

        Raises:
            SystemInputError: if ``status`` is neither ``'c'`` nor ``'i'``.
        """
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36'}
        self.headers2 = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
        self.cookies = {'_ts_id': '999999999999999999'}
        self.cookies2 = {'_ts_id': '888888888888888888'}

        # Column layout of the (initially empty) result table.
        result_columns = (
            'GID', 'price', 'discount',
            'payment_CreditCard', 'payment_Arrival', 'payment_ConvenienceStore',
            'payment_ATM', 'payment_iBon',
            'preferential_count', 'img_height',
            'is_warm', 'is_cold', 'is_bright', 'is_dark',
            '12H', 'shopcart', 'superstore',
            'productFormatCount', 'attributesListArea', 'haveVideo',
            'Taiwan', 'EUandUS', 'Germany', 'UK', 'US', 'Japan',
            'Malaysia', 'Australia', 'other',
            'look_times', 'label',
        )
        self.result_df = pd.DataFrame(columns=result_columns)

        if status == 'i':
            self.with_header = True
        elif status == 'c':
            self.with_header = False
        else:
            raise SystemInputError('???????: c -> ??, i -> ??????')

        jieba.set_dictionary('dict.txt.big')

    # ??

jieba_test.py 文件源码项目：jieba 作者: isuhao 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def testSetDictionary(self):
        """Segment every entry of ``test_contents`` after calling
        ``jieba.set_dictionary("foobar.txt")``.

        Asserts that ``jieba.cut`` returns a generator, then prints the tokens
        joined by `` , `` to stderr.  Written with Python 2 print statements.
        """
        jieba.set_dictionary("foobar.txt")
        for content in test_contents:
            result = jieba.cut(content)
            assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
            # Materialise the generator so it can be joined and printed.
            result = list(result)
            assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
            print >> sys.stderr, " , ".join(result)
        # Mark the end of this test case in the stderr output.
        print  >> sys.stderr, "testSetDictionary"

data_desc_process.py 文件源码项目：lagou_data_analysis 作者: jasminecjc 项目源码文件源码阅读 18 收藏 0 点赞 0 评论 0

def set_dic():
    """Choose which dictionary file jieba should load.

    Order of preference:

    1. the path stored in the environment variable named ``dict.txt``, if it
       is set and the file exists;
    2. ``data/dict.txt.big`` next to this script, if it exists;
    3. otherwise jieba is left unconfigured and a notice is printed.
    """
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    settings_path = os.environ.get('dict.txt')
    bundled_path = os.path.join(_curpath, 'data/dict.txt.big')

    if settings_path and os.path.exists(settings_path):
        jieba.set_dictionary(settings_path)
    elif os.path.exists(bundled_path):
        # Hand jieba the very path that was just checked.  The relative
        # 'data/dict.txt.big' used previously is resolved against the current
        # working directory, which is a different file whenever the script is
        # started from another directory.
        jieba.set_dictionary(bundled_path)
    else:
        # Parenthesised single argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Using traditional dictionary!")

FOOD_command.py 文件源码项目：slack_emoji_bot 作者: linnil1 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def __init__(self, slack, custom):
        """Locate the configured channel and prepare jieba.

        Args:
            slack: client object; ``api_call("channels.list")`` is invoked on
                it and the reply is indexed with ``['channels']``.
            custom: mapping providing ``'data'``, ``'colorPrint'`` and
                ``'food_channelname'``.

        If no channel with the configured name is found, a failure message is
        printed through ``colorPrint``, ``self.nochannel`` is set to True and
        initialisation stops before jieba is touched.
        """
        self.slack = slack
        self.rundata = custom['data']
        self.colorPrint = custom['colorPrint']

        self.food_dir = "data/midnight.json"
        self.food_dic = "data/dict.txt.big"

        # find midnight channel
        self.nochannel = False
        rep = self.slack.api_call("channels.list")
        self.channel_id = ""
        for c in rep['channels']:
            # NOTE(review): only the channel name is lower-cased, so the
            # configured name presumably has to be lower case - confirm.
            if c['name'].lower() == custom['food_channelname']:
                self.channel_id = c['id']
                break
        if not self.channel_id:
            self.colorPrint(
                "No midnight channel",
                "Restart when midnight channel can use",
                color="FAIL")
            self.nochannel = True
            return

        jieba.set_dictionary(self.food_dic)
        jieba.initialize()

        # add and del words
        # ``get`` is called without a default, so a missing key would yield
        # None and break the loops; treat a missing list as empty.
        for word in self.rundata.get('FOOD_addword') or ():
            jieba.add_word(word)
        for word in self.rundata.get('FOOD_delword') or ():
            jieba.del_word(word)

        self.init()

wiki_cut.py 文件源码项目：zhNewsCrawler 作者: YCKung 项目源码文件源码阅读 18 收藏 0 点赞 0 评论 0

def cut_main():
    """Command-line entry point: segment one file into another.

    Expects exactly two command-line arguments, the input path and the output
    path, and forwards them to ``readNcut``.  With any other argument count
    the usage text is printed and the process exits with status 1.
    """
    jieba.set_dictionary('dict.txt.big')
    #jieba.load_userdict("userdict.txt")
    if len(sys.argv) != 3:
        # Parenthesised single argument: same output under the Python 2 print
        # statement and the Python 3 print function.
        print("Usage: python cut.py filetoCut.txt cuttedFile.txt")
        # A bare sys.exit() reports success (status 0); exit non-zero so
        # calling scripts can tell that nothing was processed.
        sys.exit(1)
    inputfile, outputfile = sys.argv[1], sys.argv[2]
    readNcut(inputfile,outputfile)

cut.py 文件源码项目：zhNewsCrawler 作者: YCKung 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def cut_main(inputfile,outputfile):
    """Set jieba's dictionary to ``dict.txt.big`` and run ``readNcut``.

    Args:
        inputfile: forwarded unchanged as the first argument of ``readNcut``.
        outputfile: forwarded unchanged as the second argument of ``readNcut``.

    NOTE(review): ``readNcut`` is defined outside this block; presumably it
    reads ``inputfile``, segments it and writes ``outputfile`` - confirm.
    """
    jieba.set_dictionary('dict.txt.big')
    # Optional user-defined dictionary, currently disabled.
    #jieba.load_userdict("userdict.txt")
    readNcut(inputfile,outputfile)

matcher.py 文件源码项目：PTT-Chat-Generator 作者: zake7749 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def jiebaCustomSetting(self, dict_path, usr_dict_path):
        """Select jieba's main dictionary and register additional words.

        Args:
            dict_path: dictionary file path handed to ``jieba.set_dictionary``.
            usr_dict_path: UTF-8 text file read line by line; every non-empty
                line (newline removed) is passed whole to ``jieba.add_word``.
        """
        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dic:
            for line in dic:
                word = line.strip('\n')
                # A blank line (for example a trailing one) would otherwise
                # register the empty string as a dictionary word.
                if word:
                    jieba.add_word(word)