def fetch(self):
    # Cut the text into semi-redundant sequences of maxlen characters.
    # text = self.text
    text = self.next_text()
    chars = self.chars
    maxlen = self.maxlen
    step = self.step
    maxlen = 20  # note: overrides self.maxlen
    step = 3     # note: overrides self.step
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    # One-hot encode each input window and the character that follows it.
    # np.bool is deprecated in recent NumPy; plain bool is equivalent here.
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, self.char_indices[char]] = 1
        y[i, self.char_indices[next_chars[i]]] = 1
    return text, X, y
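As a quick sanity check on the vectorization above, a toy sketch of the window and one-hot shapes it produces; text, chars, maxlen and step here are stand-ins for the instance attributes fetch() assumes:

import numpy as np

text = 'abcabcabc'
chars = sorted(set(text))                      # ['a', 'b', 'c']
char_indices = {c: i for i, c in enumerate(chars)}
maxlen, step = 4, 1

sentences = [text[i:i + maxlen] for i in range(0, len(text) - maxlen, step)]
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
print(X.shape)   # (5, 4, 3): one row per window, one one-hot vector per character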
Example source code for Python's cut()
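All of the snippets below lean on the same segmentation call. As orientation, a minimal sketch of that API under jieba's default dictionary; the sample sentence follows jieba's own README example, so the segmentation shown is only indicative:

import jieba

text = '我来到北京清华大学'
# jieba.cut() returns a generator of tokens; cut_all=False is the default "accurate" mode.
tokens = jieba.cut(text, cut_all=False)
print(' '.join(tokens))     # e.g. 我 来到 北京 清华大学
# jieba.lcut() returns the tokens as a list directly.
print(jieba.lcut(text))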
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    # Segment every line with jieba and write the space-joined tokens out.
    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
def predict(text):
    # Segment the input text and join with spaces so it matches the training format.
    words = jieba.cut(text)
    words = " ".join(words)
    # Note: this local mapping is unused; tv_classfication.index2label is used below.
    index2label = {i: l.strip() for i, l in enumerate(tv_classfication.label_list)}
    word2vec_model = Word2Vec.load(tv_classfication.word2vec_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, 80, None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([words]):
        x_test.append(doc)
    x_test = np.array(x_test)

    # Restore the trained Bi-LSTM and print the predicted label.
    graph = tf.Graph()
    with graph.as_default(), tf.Session() as sess:
        model = bi_lstm_model.Bi_lstm()
        model.restore_model(sess)
        print(tv_classfication.index2label.get(model.predict(sess, x_test)[0]))
def lyrics():
    with open('lyrics.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    tokens = list()
    for v in data.values():
        # Segment each lyric; keep only non-blank tokens of at least two characters.
        tokens += [seg for seg in jieba.cut(v) if seg.split() and len(seg) > 1]

    # Count how often each token appears and show the ten most common.
    counter = Counter(tokens)
    print(counter.most_common(10))

    # Draw the word cloud; a CJK font is required to render Chinese characters.
    wcloud = WordCloud(font_path='NotoSansMonoCJKtc-Regular.otf').generate(' '.join(tokens))
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
def cut_words(input_file, output_file):
    count = 0
    with io.open(output_file, mode='w', encoding='utf-8') as outfile:
        with io.open(input_file, mode='r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if len(line) < 1:  # empty line
                    continue
                if line.startswith('doc'):  # start or end of a passage
                    if line == 'doc':  # end of a passage
                        outfile.write(u'\n')
                        count = count + 1
                        if (count % 1000 == 0):
                            print('%s articles were finished.......' % count)
                    continue
                for word in jieba.cut(line):
                    outfile.write(word + ' ')
    print('%s articles were finished.......' % count)
def extract_tags(key_word, a_name):
    '''
    Build a short tag string for an item name: take the first eight tokens
    produced by jieba.cut(), keep only those that jieba.analyse.extract_tags()
    also considers keywords, put the search keyword first, and return at most
    five space-joined tags.
    '''
    cut_tags = [tag for tag in jieba.cut(a_name)][:8]
    analyse_tags = jieba.analyse.extract_tags(a_name)
    tags = [tag for tag in cut_tags if tag in analyse_tags]

    # Make sure the keyword appears exactly once, at the front of the tag list.
    try:
        tags.remove(key_word)
    except ValueError:
        pass
    tags.insert(0, key_word)

    if len(tags) > 5:
        tags = tags[:5]
    return ' '.join(tags)
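A hedged usage sketch of extract_tags above; the keyword and item title are made-up examples, and the exact tags returned depend on jieba's dictionary and TF-IDF statistics:

import jieba
import jieba.analyse

title = '小米 红米Note5A 全网通版 4GB+64GB 香槟金 移动联通电信4G手机'
print(extract_tags('手机', title))
# Returns a space-joined string of at most five tags, always starting with '手机'.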
def handleLine(self, line):
    # Strip spaces, newlines and the literal string 'em' from the line.
    line = line.replace(' ', '')
    line = line.replace('\n', '')
    line = line.replace('em', '')
    # Segment the cleaned line and count every token longer than one character.
    words = jieba.cut(line)
    for word in words:
        if len(word) <= 1:
            continue
        if word in self.data:
            self.data[word] = self.data[word] + 1
        else:
            self.data[word] = 1
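The manual dictionary counting in handleLine can also be expressed with collections.Counter; a small standalone sketch of the same cleanup and filter (handle_line and the sample line are hypothetical, not part of the original class):

from collections import Counter
import jieba

def handle_line(line, counter):
    # Same cleanup and length filter as handleLine above, accumulated into a Counter.
    line = line.replace(' ', '').replace('\n', '').replace('em', '')
    counter.update(w for w in jieba.cut(line) if len(w) > 1)

counter = Counter()
handle_line('今天 天气 真好 em', counter)
print(counter.most_common(3))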
eval_data_helpers.py (project: question-classification-cnn-rnn-attention, author: sefira)
def process_data(line):
    """
    Word-break the line and strip unwanted characters.
    Returns the cleaned, space-separated sentence, or "UNK" if almost nothing is left.
    """
    # Word break
    seg_list = jieba.cut(line)
    line = u' '.join(seg_list)
    # Keep only whitespace, CJK characters and the markers nmovie / nrcelebrity.
    ss = re.findall('[\n\s*\r\u4e00-\u9fa5]|nmovie|nrcelebrity', line)
    line = u"".join(ss).strip()
    if (len(line) < 2):
        return "UNK"
    return line
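A small worked example of the "remove word" regex above, using a hand-written, already space-separated string so the result does not depend on jieba's segmentation:

import re

line = u'周星驰 演过 哪些 nmovie 2017?'   # hypothetical pre-segmented question text
ss = re.findall('[\n\s*\r\u4e00-\u9fa5]|nmovie|nrcelebrity', line)
print(u"".join(ss).strip())
# -> '周星驰 演过 哪些 nmovie' : Latin letters, digits and punctuation are dropped,
#    while CJK characters, whitespace and the nmovie/nrcelebrity markers survive.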
def mainTestInteractive(self, sess):
    """ Try predicting the sentences that the user will enter in the console
    Args:
        sess: The current running session
    """
    # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also)
    # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode)
    # TODO: Log the questions asked for later re-use (merge with test/samples.txt)

    print('Testing: Launch interactive mode:')
    print('')
    print('Welcome to the interactive mode, here you can ask Deep Q&A the sentence you want. Don\'t have high '
          'expectations. Type \'exit\' or just press ENTER to quit the program. Have fun.')

    import jieba
    while True:
        question = input(self.SENTENCES_PREFIX[0])
        if question == '' or question == 'exit':
            break

        # Segment the question so it matches the space-separated training data.
        # The original code called .decoder("GBK") here, which is not a str method;
        # on Python 3 the joined string can be used directly.
        questionc = jieba.cut(question, cut_all=False)
        question = " ".join(questionc)
        print(question)

        questionSeq = []  # Will contain the question as seen by the encoder
        answer = self.singlePredict(question, questionSeq)
        if not answer:
            print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
            continue  # Back to the beginning, try again

        print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

        if self.args.verbose:
            print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
            print(self.textData.sequence2str(answer))

        print()
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    # jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    # stopwordset = set()
    # with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
    #     for line in sw:
    #         stopwordset.add(line.strip('\n'))

    output = open('allbook-segment.txt', 'w')
    texts_num = 0

    with open("allbook.txt", "rb") as f:
        # if(f.readline() == ""):
        print("getting data")
        bookdata = f.read(190000000).decode('UTF-8')
        print("getting data OK")
        lineu = bookdata
        p = 0
        # Segment the book in 100-character chunks and write space-joined tokens out.
        for p in range(0, len(bookdata), 100):
            line = bookdata[p:p + 100]
            # print(line)
            words = jieba.cut(line, cut_all=False)
            for word in words:
                output.write(word + ' ')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Segmented %d chunks so far" % texts_num)
    output.close()
def word_seg_cn(docs):
    # Segment each Chinese sentence into a list of tokens.
    docs = [list(jieba.cut(sent)) for sent in docs]
    return docs
def cutandsplit(s):
    for ln in filterlist(splitsentence(stripblank(s))):
        l = RE_BRACKETS.sub(brcksub, ln.strip())
        if notchinese(l):
            continue
        # Normalise quote marks before segmenting; the source characters were garbled
        # in extraction and are assumed here to be the corner brackets 「」『』.
        yield ' '.join(cut(l.replace('「', '“').replace('」', '”')
                            .replace('『', '‘').replace('』', '’')
                            .lstrip(tailpunct).rstrip(headpunct)))