def main():
    """Segment the head of ``allbook.txt`` with jieba into ``allbook-segment.txt``.

    At most the first 190,000,000 bytes of the input are read and decoded as
    UTF-8. The text is cut in consecutive 100-character windows; every token
    jieba yields is written followed by a single space. Progress is logged
    once per 10,000 windows.
    """
    import codecs  # local import: only needed for the boundary-safe decode below

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    max_bytes = 190000000
    window = 100
    texts_num = 0

    # The context manager closes the output even if reading or cutting fails.
    # The encoding is explicit because the input is explicitly UTF-8.
    with open('allbook-segment.txt', 'w', encoding='utf-8') as output:
        with open("allbook.txt", "rb") as f:
            print("geting data")
            raw = f.read(max_bytes)
        # The byte cap may land in the middle of a multi-byte character.
        # With final=False the incremental decoder keeps an incomplete
        # trailing sequence buffered instead of raising, while genuinely
        # invalid bytes elsewhere still raise UnicodeDecodeError.
        bookdata = codecs.getincrementaldecoder('UTF-8')().decode(raw, final=False)
        print("geting data OK ")

        for start in range(0, len(bookdata), window):
            line = bookdata[start:start + window]
            for word in jieba.cut(line, cut_all=False):
                output.write(word + ' ')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("???? %d ????" % texts_num)
python类set_dictionary()的实例源码
def jiebaCustomSetting(self, dict_path, usr_dict_path):
    """Select jieba's main dictionary and register extra words.

    ``dict_path`` is handed to ``jieba.set_dictionary``. ``usr_dict_path`` is
    read as UTF-8, one entry per line; each line, with newline characters
    stripped from both ends, is passed to ``jieba.add_word``.
    """
    jieba.set_dictionary(dict_path)
    with open(usr_dict_path, mode='r', encoding='utf-8') as user_dict:
        for raw_entry in user_dict:
            entry = raw_entry.strip('\n')
            jieba.add_word(entry)
chinese_text_processor.py 文件源码
项目:DataScience-And-MachineLearning-Handbook-For-Coders
作者: wxyyxc1992
项目源码
文件源码
阅读 17
收藏 0
点赞 0
评论 0
def __config_jieba(self):
    """Point jieba at the dictionary named by ``jieba_dictionary``.

    ``jieba_dictionary`` is defined outside this method.
    """
    dictionary_path = jieba_dictionary
    jieba.set_dictionary(dictionary_path)
chinese_text_processor.py 文件源码
项目:DataScience-And-MachineLearning-Handbook-For-Coders
作者: wxyyxc1992
项目源码
文件源码
阅读 17
收藏 0
点赞 0
评论 0
def tokenize_file(self, text_path, text_output_path='./tokenized_texts.txt'):
    """Segment a text file line by line with jieba, dropping stop words.

    Args:
        text_path: Path of the text file to segment (read as UTF-8).
        text_output_path: Path of the file to write (written as UTF-8).
            One output line is produced per input line; every kept token is
            followed by a single space.

    The dictionary path and the stop-word file path come from the names
    ``jieba_dictionary`` and ``jieba_stopwords`` defined outside this method.
    Progress is logged once per 10,000 input lines.
    """
    # jieba custom setting.
    jieba.set_dictionary(jieba_dictionary)

    # Load the stop-word set, one entry per line.
    with open(jieba_stopwords, 'r', encoding='utf-8') as sw:
        stopwordset = {line.strip('\n') for line in sw}

    texts_num = 0
    # Both files are managed by ``with`` so the output is flushed and closed
    # even when segmentation raises. The encoding is explicit so the result
    # does not depend on the platform locale (the stop words are UTF-8 too).
    with open(text_output_path, 'w', encoding='utf-8') as output, \
            open(text_path, 'r', encoding='utf-8') as content:
        for line in content:
            line = line.strip('\n')
            for word in jieba.cut(line, cut_all=False):
                if word not in stopwordset:
                    output.write(word + ' ')
            output.write('\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("???? %d ????" % texts_num)
def main():
    """Segment ``wiki_zh_tw.txt`` with jieba and write the result to ``wiki_seg.txt``.

    Stop words are loaded from ``jieba_dict/stopwords.txt`` (UTF-8, one per
    line). Every token that is not a stop word is written followed by a single
    space. Progress is logged once per 10,000 input lines.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    with io.open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
        stopwordset = set(line.strip('\n') for line in sw)

    texts_num = 0
    # The output is opened first (as before) but is now managed by ``with`` so
    # it is flushed and closed even if reading or cutting raises.
    with io.open('wiki_seg.txt', 'w', encoding='utf-8') as output:
        with io.open('wiki_zh_tw.txt', 'r', encoding='utf-8') as content:
            for line in content:
                # The line is handed to jieba without stripping its newline.
                for word in jieba.cut(line, cut_all=False):
                    if word not in stopwordset:
                        output.write(word + ' ')
                texts_num += 1
                if texts_num % 10000 == 0:
                    logging.info("???? %d ????" % texts_num)
def main():
    """Segment ``wiki_zh_tw.txt`` with jieba and write the result to ``wiki_seg.txt``.

    Stop words are loaded from ``jieba_dict/stopwords.txt`` (UTF-8, one per
    line). Each input line has its newline characters stripped before it is
    cut; every token that is not a stop word is written followed by a single
    space. Progress is logged once per 10,000 input lines.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
        stopwordset = {line.strip('\n') for line in sw}

    texts_num = 0
    # ``with`` guarantees the output is closed on error. The encoding is
    # explicit so the corpus is read and written as UTF-8 like the stop-word
    # file, instead of depending on the platform locale.
    with open('wiki_seg.txt', 'w', encoding='utf-8') as output:
        with open('wiki_zh_tw.txt', 'r', encoding='utf-8') as content:
            for line in content:
                line = line.strip('\n')
                # NOTE(review): the stripped newline is never written back, so
                # this code emits no line breaks - confirm that is intended.
                for word in jieba.cut(line, cut_all=False):
                    if word not in stopwordset:
                        output.write(word + ' ')
                texts_num += 1
                if texts_num % 10000 == 0:
                    logging.info("???? %d ????" % texts_num)
def testSetDictionary(self):
    """Check ``jieba.cut`` after switching the dictionary to ``foobar.txt``.

    For every entry of ``test_contents`` the cut result must be a generator
    and must materialize into a list; the joined tokens are echoed to stderr.
    """
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        segments = jieba.cut(content)
        assert isinstance(segments, types.GeneratorType), "Test SetDictionary Generator error"
        tokens = list(segments)
        assert isinstance(tokens, list), "Test SetDictionary error on content: %s" % content
        joined = " , ".join(tokens)
        print(joined, file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
def __init__(self, status):
    """Prepare request headers and cookies, an empty result table and the run mode.

    ``status`` selects the mode: ``'c'`` sets ``with_header`` to False and
    ``'i'`` sets it to True; any other value raises ``SystemInputError``.
    Finally jieba is pointed at ``dict.txt.big``.
    """
    user_agent_primary = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36'
    user_agent_secondary = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    self.headers = {'User-Agent': user_agent_primary}
    self.headers2 = {'User-Agent': user_agent_secondary}
    self.cookies = {'_ts_id': '999999999999999999'}
    self.cookies2 = {'_ts_id': '888888888888888888'}

    # Column order of the (initially empty) result table.
    result_columns = (
        'GID', 'price', 'discount',
        'payment_CreditCard', 'payment_Arrival', 'payment_ConvenienceStore',
        'payment_ATM', 'payment_iBon',
        'preferential_count', 'img_height',
        'is_warm', 'is_cold', 'is_bright', 'is_dark',
        '12H', 'shopcart', 'superstore',
        'productFormatCount', 'attributesListArea', 'haveVideo',
        'Taiwan', 'EUandUS', 'Germany', 'UK', 'US', 'Japan', 'Malaysia',
        'Australia', 'other',
        'supplementary', 'bottle', 'combination',
        'look_times', 'label',
    )
    self.result_df = pd.DataFrame(columns=result_columns)

    # Reject unknown modes first, then derive the flag from the mode letter.
    if status not in ('c', 'i'):
        raise SystemInputError('???????: c -> ??, i -> ??????')
    self.with_header = (status == 'i')

    jieba.set_dictionary('dict.txt.big')
# ??
def __init__(self, status):
    """Prepare request headers and cookies, an empty result table and the run mode.

    ``status`` selects the mode: ``'c'`` sets ``with_header`` to False and
    ``'i'`` sets it to True; any other value raises ``SystemInputError``.
    Finally jieba is pointed at ``dict.txt.big``.
    """
    user_agent_primary = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36'
    user_agent_secondary = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    self.headers = {'User-Agent': user_agent_primary}
    self.headers2 = {'User-Agent': user_agent_secondary}
    self.cookies = {'_ts_id': '999999999999999999'}
    self.cookies2 = {'_ts_id': '888888888888888888'}

    # Column order of the (initially empty) result table.
    result_columns = (
        'GID', 'price', 'discount',
        'payment_CreditCard', 'payment_Arrival', 'payment_ConvenienceStore',
        'payment_ATM', 'payment_iBon',
        'preferential_count', 'img_height',
        'is_warm', 'is_cold', 'is_bright', 'is_dark',
        '12H', 'shopcart', 'superstore',
        'productFormatCount', 'attributesListArea', 'haveVideo',
        'Taiwan', 'EUandUS', 'Germany', 'UK', 'US', 'Japan', 'Malaysia',
        'Australia', 'other',
        'supplementary', 'bottle', 'combination',
        'look_times', 'label',
    )
    self.result_df = pd.DataFrame(columns=result_columns)

    # Reject unknown modes first, then derive the flag from the mode letter.
    if status not in ('c', 'i'):
        raise SystemInputError('???????: c -> ??, i -> ??????')
    self.with_header = (status == 'i')

    jieba.set_dictionary('dict.txt.big')
# ??
def __init__(self, status):
    """Prepare request headers and cookies, an empty result table and the run mode.

    ``status`` selects the mode: ``'c'`` sets ``with_header`` to False and
    ``'i'`` sets it to True; any other value raises ``SystemInputError``.
    Finally jieba is pointed at ``dict.txt.big``.
    """
    user_agent_primary = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36'
    user_agent_secondary = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    self.headers = {'User-Agent': user_agent_primary}
    self.headers2 = {'User-Agent': user_agent_secondary}
    self.cookies = {'_ts_id': '999999999999999999'}
    self.cookies2 = {'_ts_id': '888888888888888888'}

    # Column order of the (initially empty) result table.
    result_columns = (
        'GID', 'price', 'discount',
        'payment_CreditCard', 'payment_Arrival', 'payment_ConvenienceStore',
        'payment_ATM', 'payment_iBon',
        'preferential_count', 'img_height',
        'is_warm', 'is_cold', 'is_bright', 'is_dark',
        '12H', 'shopcart', 'superstore',
        'productFormatCount', 'attributesListArea', 'haveVideo',
        'Taiwan', 'EUandUS', 'Germany', 'UK', 'US', 'Japan', 'Malaysia',
        'Australia', 'other',
        'look_times', 'label',
    )
    self.result_df = pd.DataFrame(columns=result_columns)

    # Reject unknown modes first, then derive the flag from the mode letter.
    if status not in ('c', 'i'):
        raise SystemInputError('???????: c -> ??, i -> ??????')
    self.with_header = (status == 'i')

    jieba.set_dictionary('dict.txt.big')
# ??
def testSetDictionary(self):
    """Check ``jieba.cut`` after switching the dictionary to ``foobar.txt``.

    For every entry of ``test_contents`` the cut result must be a generator
    and must materialize into a list; the joined tokens are echoed to stderr.
    """
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
        result = list(result)
        assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
        # ``print >> stream, value`` is Python 2 only; on Python 3 it raises
        # TypeError at runtime. Use the print function with ``file=`` instead.
        print(" , ".join(result), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
def set_dic():
    """Choose the jieba dictionary file, if a custom one can be found.

    Lookup order:
      1. the path stored in the environment variable named ``dict.txt``,
         when it is set and the path exists;
      2. ``data/dict.txt.big`` located next to this module;
      3. otherwise jieba's dictionary is left untouched and a notice is
         printed.
    """
    _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    settings_path = os.environ.get('dict.txt')
    bundled_path = os.path.join(_curpath, 'data/dict.txt.big')

    if settings_path and os.path.exists(settings_path):
        jieba.set_dictionary(settings_path)
    elif os.path.exists(bundled_path):
        # Pass the same path that was just checked. The bare relative
        # 'data/dict.txt.big' would be resolved against the current working
        # directory, which need not be this module's directory.
        jieba.set_dictionary(bundled_path)
    else:
        # Parenthesized call works on Python 2 and Python 3 alike.
        print("Using traditional dictionary!")
def __init__(self, slack, custom):
    """Locate the configured channel and prepare jieba for it.

    Args:
        slack: Client object; its ``api_call("channels.list")`` result is
            indexed with ``['channels']`` and each entry with ``['name']``
            and ``['id']``.
        custom: Mapping providing ``'data'``, ``'colorPrint'`` and
            ``'food_channelname'``.

    When no channel matches, a failure message is emitted through
    ``colorPrint``, ``nochannel`` is set to True and initialization stops
    before jieba is touched.
    """
    self.slack = slack
    self.rundata = custom['data']
    self.colorPrint = custom['colorPrint']
    self.food_dir = "data/midnight.json"
    self.food_dic = "data/dict.txt.big"

    # find midnight channel
    self.nochannel = False
    rep = self.slack.api_call("channels.list")
    wanted_name = custom['food_channelname']
    # First channel whose lower-cased name equals the configured name.
    self.channel_id = next(
        (c['id'] for c in rep['channels'] if c['name'].lower() == wanted_name),
        "")
    if not self.channel_id:
        self.colorPrint(
            "No midnight channel",
            "Restart when midnight channel can use",
            color="FAIL")
        self.nochannel = True
        return

    jieba.set_dictionary(self.food_dic)
    jieba.initialize()

    # add and del words
    # ``get`` yields None when a key is absent; fall back to an empty
    # sequence so a missing word list is skipped instead of raising TypeError.
    for word in self.rundata.get('FOOD_addword') or ():
        jieba.add_word(word)
    for word in self.rundata.get('FOOD_delword') or ():
        jieba.del_word(word)

    self.init()
def cut_main():
    """Command-line entry point: segment one file into another.

    Expects exactly two command-line arguments, the input file and the output
    file, and forwards them to ``readNcut``. With any other argument count a
    usage line is printed and the process exits.
    """
    jieba.set_dictionary('dict.txt.big')
    #jieba.load_userdict("userdict.txt")
    if len(sys.argv) == 3:
        inputfile = sys.argv[1]
        outputfile = sys.argv[2]
    else:
        # The Python 2 print statement is a SyntaxError on Python 3; the
        # parenthesized form behaves the same on both.
        print("Usage: python cut.py filetoCut.txt cuttedFile.txt")
        sys.exit()
    readNcut(inputfile, outputfile)
def cut_main(inputfile,outputfile):
    """Select ``dict.txt.big`` as jieba's dictionary, then run ``readNcut``.

    ``inputfile`` and ``outputfile`` are forwarded to ``readNcut`` unchanged.
    """
    dictionary_file = 'dict.txt.big'
    jieba.set_dictionary(dictionary_file)
    # A user dictionary could be loaded here with jieba.load_userdict(...).
    readNcut(inputfile, outputfile)
def jiebaCustomSetting(self, dict_path, usr_dict_path):
    """Configure jieba with a main dictionary plus extra user words.

    The main dictionary is set from ``dict_path``. The file at
    ``usr_dict_path`` is opened as UTF-8 and every line, once newline
    characters are stripped from both ends, is registered via
    ``jieba.add_word``.
    """
    jieba.set_dictionary(dict_path)
    with open(usr_dict_path, mode='r', encoding='utf-8') as extra_words:
        for text_line in extra_words:
            jieba.add_word(text_line.strip('\n'))