def process_sent(sent, vocab_int, steps):
"""
this file token sentence and make it into numpy array, return a fixed length 2d array
:param sent:
:param vocab_int:
:param steps:
:return:
"""
sent_list = jieba.lcut(sent)
    # words missing from the vocab get a random index in [0, 90), which may collide with real word indices
    index_list = [vocab_int[i] if i in vocab_int else np.random.randint(0, 90) for i in sent_list]
if len(index_list) < steps:
index_list = np.hstack((index_list, np.random.randint(0, 90, steps - len(index_list))))
else:
index_list = index_list[0: steps]
return np.array([index_list])
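A minimal, hypothetical usage sketch for process_sent; the vocabulary and sentence below are invented for illustration, and the only guaranteed property is the fixed output shape of (1, steps).

import jieba
import numpy as np

vocab_int = {u'今天': 0, u'天气': 1, u'很': 2, u'好': 3}   # toy vocabulary, not from the project
arr = process_sent(u'今天天气很好', vocab_int, steps=10)
print(arr.shape)   # (1, 10): known words map to their indices, the rest are filled with random ones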
def prepare_data(self):
corpus_cut = np.array([jieba.lcut(s) for s in self.raw_corpus])
vocabs = []
for l in corpus_cut:
for i in l:
vocabs.append(i)
    # vocabs = reduce(lambda x, y: x + y, corpus_cut)
    # count the frequency of every word; no special handling of the most frequent ones for now
counter = collections.Counter(vocabs)
counter = counter.most_common()
vocabs_set, _ = zip(*counter)
vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}
data_flatten = np.array([vocab_int_map[v] for v in vocabs])
    # slide a window of n_steps + 1 indices over the flattened corpus with a stride of 3
    data = np.array([data_flatten[i: i + self.n_steps + 1]
                     for i in range(0, data_flatten.shape[0] - self.n_steps - 1, 3)])
    # shuffle samples so training batches are not drawn in corpus order
np.random.shuffle(data)
return len(vocabs_set), vocab_int_map, data
def prepare_data(self):
corpus_cut = np.array([jieba.lcut(s) for s in self.raw_corpus])
vocabs = []
for l in corpus_cut:
for i in l:
vocabs.append(i)
    # vocabs = reduce(lambda x, y: x + y, corpus_cut)
    # count the frequency of every word; no special handling of the most frequent ones for now
counter = collections.Counter(vocabs)
counter = counter.most_common()
vocabs_set, _ = zip(*counter)
vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}
data_flatten = np.array([vocab_int_map[v] for v in vocabs])
    # take stride-1 windows of n_steps + 1 indices, but only from the first len // (n_steps + 1) start positions
    data = np.array([data_flatten[i: i + self.n_steps + 1]
                     for i in range(data_flatten.shape[0] // (self.n_steps + 1))])
    # shuffle samples so training batches are not drawn in corpus order
np.random.shuffle(data)
return len(vocabs_set), vocab_int_map, data
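A toy sketch, separate from either class above, contrasting the two sampling loops: the first variant takes a window of n_steps + 1 indices every 3 positions, while the second takes stride-1 windows but only from the first len // (n_steps + 1) start positions. The array and n_steps here are invented.

import numpy as np

flat = np.arange(20)   # stand-in for data_flatten
n_steps = 4            # each sample holds n_steps + 1 = 5 indices

strided = np.array([flat[i: i + n_steps + 1]
                    for i in range(0, flat.shape[0] - n_steps - 1, 3)])
limited = np.array([flat[i: i + n_steps + 1]
                    for i in range(flat.shape[0] // (n_steps + 1))])
print(strided.shape, limited.shape)   # (5, 5) (4, 5)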
def delNOTNeedWords(content, customstopwords=None):
    """Segment content with POS tags, dropping stopwords and words whose POS flag is not wanted."""
    if customstopwords is None:
        customstopwords = "stopwords.txt"
    import os
    if os.path.exists(customstopwords):
        stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
        customstopwords = stop_words
    # note: if the stopword file is missing, customstopwords stays a plain string,
    # so the membership test below degrades to a substring check
    result = ''
    return_words = []
    # POS flags to keep (nouns, verbs, adjectives, time/locative words, numerals, etc.)
    keep_flags = [u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg', u'f',
                  u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg',
                  u'a', u'an', u'ag', u'al', u'm', u'mq', u'o', u'x']
    words = pseg.lcut(content)
    for word, flag in words:
        tempword = word.encode('utf-8').strip(' ')
        if word not in customstopwords and len(tempword) > 0 and flag in keep_flags:
            result += tempword
            return_words.append(tempword)
    return result, return_words
def result_by_time(self, sentence):
seg_list = jieba.lcut(sentence, cut_all=False)
n, cleaned_dict = self.clean_list(seg_list)
time_scores = {}
for term in cleaned_dict.keys():
r = self.fetch_from_db(term)
if r is None:
continue
docs = r[2].split('\n')
for doc in docs:
docid, date_time, tf, ld = doc.split('\t')
if docid in time_scores:
continue
news_datetime = datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
now_datetime = datetime.now()
td = now_datetime - news_datetime
docid = int(docid)
td = (timedelta.total_seconds(td) / 3600) # hour
time_scores[docid] = td
time_scores = sorted(time_scores.items(), key = operator.itemgetter(1))
if len(time_scores) == 0:
return 0, []
else:
return 1, time_scores
def gen_idf_file(self):
files = listdir(self.doc_dir_path)
n = float(len(files))
idf = {}
for i in files:
root = ET.parse(self.doc_dir_path + i).getroot()
title = root.find('title').text
body = root.find('body').text
seg_list = jieba.lcut(title + '?' + body, cut_all=False)
seg_list = set(seg_list) - self.stop_words
for word in seg_list:
word = word.strip().lower()
if word == '' or self.is_number(word):
continue
if word not in idf:
idf[word] = 1
else:
idf[word] = idf[word] + 1
idf_file = open(self.idf_path, 'w', encoding = 'utf-8')
for word, df in idf.items():
idf_file.write('%s %.9f\n'%(word, math.log(n / df)))
idf_file.close()
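The value written per word above is the standard inverse document frequency, idf(w) = log(N / df(w)), with N the number of documents and df(w) the number of documents containing w. A minimal in-memory sketch of the same computation, using invented documents and no XML parsing:

import math
import jieba

docs = [u'今天天气很好', u'今天股票上涨', u'明天有天气预报']
n = float(len(docs))
df = {}
for doc in docs:
    for word in set(jieba.lcut(doc)):   # count each word at most once per document
        word = word.strip().lower()
        if word:
            df[word] = df.get(word, 0) + 1
idf = {word: math.log(n / count) for word, count in df.items()}
print(idf[u'今天'])   # '今天' appears in 2 of 3 documents -> log(3 / 2)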
def delstopwords(content):
    """Segment content and drop any word found in the app-level stopword list."""
    result = ''
    words = jieba.lcut(content)
    return_words = []
    for w in words:
        if w not in app.config['stopwords']:
            result += w.encode('utf-8')
            return_words.append(w.encode('utf-8'))
    # POS-filtered alternative:
    # words = pseg.lcut(content)
    # with app.test_request_context():
    #     for word, flag in words:
    #         if word not in app.config['stopwords'] and flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]:
    #             result += word.encode('utf-8')
    # print result
    return result, return_words
def delNOTNeedWords(content, stopwords):
    """Segment content with POS tags and drop stopwords and words whose flag is in the exclusion list."""
    result = ''
    words = pseg.lcut(content)
    for word, flag in words:
        # note: jieba POS flags carry no leading '/', so as written this flag filter excludes nothing
        if word not in stopwords and flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]:
            result += word.encode('utf-8')
    return result
def cut_with_stop_words(string):
    # segment the string and drop stopwords
    segs = jieba.lcut(string)
    return [seg for seg in segs if seg not in stopwords]
def cut_with_stop_words(string):
    # stopword filtering is disabled in this variant; return the raw segmentation
    return jieba.lcut(string)
def cut_with_stop_words(string):
segs = jieba.lcut(string)
final = ''
for seg in segs:
if seg not in stopwords:
final = final + seg
return final
def main(_):
print("Loading vocabulary")
cn_vocab_path = os.path.join(FLAGS.data_dir, "source_vocab.txt")
en_vocab_path = os.path.join(FLAGS.data_dir, "target_vocab.txt")
cn_vocab, _ = data_utils.initialize_vocabulary(cn_vocab_path)
_, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)
print("Building model...")
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
model = create_model(sess, False)
# Decode from standard input.
sys.stdout.write("> ")
sys.stdout.flush()
sentence = sys.stdin.readline()
while sentence:
seg_list = jieba.lcut(sentence.strip())
#print(" ".join(seg_list))
token_ids = [cn_vocab.get(w.encode(encoding="utf-8"), data_utils.UNK_ID) for w in seg_list]
#print(token_ids)
outputs = model.test(sess, token_ids)
outputs = outputs.tolist()
if data_utils.EOS_ID in outputs:
outputs = outputs[:outputs.index(data_utils.EOS_ID)]
output = " ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs])
print(output.capitalize())
print("> ")
sys.stdout.flush()
sentence = sys.stdin.readline()
def jieba_example():
raw = "????S5????,123,?,?"
raw_seq = jieba.cut(raw)
raw_seq_list = jieba.lcut(raw)
raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
raw_with_ictclas = pseg.cut(raw)
for word, flag in raw_with_ictclas:
print word, flag
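The sample string in jieba_example is garbled in this copy, so here is a quick, self-contained sketch of the same jieba calls on a readable sentence (jieba's own demo sentence); the output comments show typical results with the default dictionary.

import jieba
import jieba.analyse
import jieba.posseg as pseg

text = u'我来到北京清华大学'
print(jieba.lcut(text))                            # accurate mode: [u'我', u'来到', u'北京', u'清华大学']
print(jieba.lcut(text, cut_all=True))              # full mode: every word the dictionary can find
print(jieba.analyse.extract_tags(text, topK=3))    # top-3 TF-IDF keywords
for word, flag in pseg.cut(text):
    print('%s %s' % (word, flag))                  # word with its POS flag, e.g. 北京 ns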
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
"""
:param raw_str: str
:return: list[(str, str)]
"""
res = [(a, b) for a, b in pseg.lcut(raw_str)]
if filter_invalid_word_flag:
return filter_invalid_word(res)
else:
return res
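A quick usage sketch for cut_with_flag; filtering is turned off here because filter_invalid_word is defined elsewhere in the project, and the POS output shown is what jieba's default model typically produces.

pairs = cut_with_flag(u'我爱北京', filter_invalid_word_flag=False)
print(pairs)   # e.g. [(u'我', u'r'), (u'爱', u'v'), (u'北京', u'ns')]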
def process_lyrics(file_name):
lyrics = []
content = clean_cn_corpus(file_name, clean_level='all', is_save=False)
for l in content:
if len(l) < 40:
continue
l = start_token + l + end_token
lyrics.append(l)
lyrics = sorted(lyrics, key=lambda line: len(line))
print('all %d songs...' % len(lyrics))
# if not os.path.exists(os.path.dirname(segment_list_file)):
# os.mkdir(os.path.dirname(segment_list_file))
# if os.path.exists(segment_list_file):
# print('load segment file from %s' % segment_list_file)
# with open(segment_list_file, 'rb') as p:
# all_words = pickle.load(p)
# else:
all_words = []
for lyric in lyrics:
all_words += jieba.lcut(lyric, cut_all=False)
# with open(segment_list_file, 'wb') as p:
# pickle.dump(all_words, p)
# print('segment result have been save into %s' % segment_list_file)
    # count how many times each word appears
counter = collections.Counter(all_words)
print(counter['E'])
    # sort words by descending frequency
counter_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*counter_pairs)
print('E' in words)
    words = words + (' ',)  # append a space entry to the vocabulary
word_int_map = dict(zip(words, range(len(words))))
    # map every lyric to an int vector; note this iterates over the characters of each lyric string,
    # not over the jieba tokens collected above, and unseen characters fall back to index len(words)
    lyrics_vector = [list(map(lambda word: word_int_map.get(word, len(words)), lyric)) for lyric in lyrics]
return lyrics_vector, word_int_map, words
def segement(self, strs):
return jieba.lcut(strs)
def word_tokenize(tokens):
# return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
#from my.corenlp_interface import CoreNLPInterface
#url = 'vision-server2.corp.ai2'
#port = 8000
#interface = CoreNLPInterface(url, port)
#sent_tokenize = interface.split_doc
#word_tokenize = interface.split_sent
def get_train_data(language):
# Load data from files
path = "./data/" + language + "/"
positive_examples = list(open(path + "rt-polarity.pos", "r").readlines())
positive_examples = [s.strip() for s in positive_examples[:100]] # -1000
negative_examples = list(open(path + "rt-polarity.neg", "r").readlines())
negative_examples = [s.strip() for s in negative_examples[:100]]
x_text = positive_examples + negative_examples
x_text = [sent for sent in x_text]
# Generate labels
positive_labels = [[0, 1] for _ in positive_examples]
negative_labels = [[1, 0] for _ in negative_examples]
y = np.concatenate([positive_labels, negative_labels], 0)
# Build vocabulary
max_length_of_sentence = max([len(jieba.lcut(x)) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_length_of_sentence)
x = np.array(list(vocab_processor.fit_transform(x_text)))
# Randomly shuffle data
np.random.seed(1234)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# Split train/cross-validation set
cross_validation_indices = np.array(random.sample(np.arange(len(y)), int(len(y) * 0.1) ))
train_indices = np.array(list(set(np.arange(len(y))) - set(cross_validation_indices)))
x_train, x_dev = x_shuffled[train_indices], x_shuffled[cross_validation_indices]
y_train, y_dev = y_shuffled[train_indices], y_shuffled[cross_validation_indices]
return [x_train, x_dev, y_train, y_dev, vocab_processor]
def cut(sentence):
if not __init_seg:
__init()
return jieba.lcut(sentence)
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
"""
????????????????????
simType=simple, simple_POS, vec
"""
self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: maxSimTxt received an unsupported simType: {}'.format(simType)
    # fall back to simple_pos matching when simType is 'vec' but no word-vector model is loaded
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'
for t in self.zhishiku:
questions = t.q_vec if simType == 'vec' else t.q_word
in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)
t.sim = max(
similarity(in_vec, question, method=simType, embedding=embedding)
for question in questions
)
maxSim = max(self.zhishiku, key=lambda x: x.sim)
logger.info('maxSim=' + format(maxSim.sim, '.0%'))
if maxSim.sim < simCondision:
        return 'Sorry, no sufficiently similar question was found in the knowledge base.'
return maxSim.a
def tokenize(question, on='jieba'):
    """ Segment a question into words and remove stopwords.
    :param question: the question text
    :return: list of tokens with stopwords removed
    """
    if on == 'ltp':
        # segment with LTP
        words = segmentor.segment(question.encode('utf-8'))
        rv = _remove_stopwords([i.decode('utf-8') for i in words])
    else:
        # segment with jieba
        rv = _remove_stopwords(jieba.lcut(question))
    logging.debug("NLP:tokenize: {}".format(" ".join(rv)))
    return rv
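A hypothetical call, relying on the module's own _remove_stopwords helper; the exact output depends on the stopword list in use.

tokens = tokenize(u'北京今天的天气怎么样', on='jieba')
# with a typical Chinese stopword list this yields something like [u'北京', u'今天', u'天气']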
def words_extract(news_folder):
    """Segment every news article found under news_folder.
    Expected layout:
        news_folder/
            <category 1>/
            <category 2>/
            ...
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder)
                      if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], news_class)
    jieba.enable_parallel(4)
    # walk every category subfolder and segment each article in it
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                data_list.append((word_list, news_class))  # element: ([word1, word2, ...], news_class)
    jieba.disable_parallel()
    return data_list
def predict_with_content(classifier, news_content, feature_words):
word_list = jieba.lcut(news_content)
x = np.array([1 if word in word_list else 0 for word in feature_words]).reshape(1, -1)
return classifier.predict(x)[0]
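A small end-to-end sketch of how predict_with_content builds its binary bag-of-words vector over feature_words. Everything here is invented: the feature words, the toy training data, and the scikit-learn MultinomialNB stand-in, which is not necessarily the classifier the project actually uses.

import numpy as np
import jieba
from sklearn.naive_bayes import MultinomialNB   # stand-in classifier, assumed for this sketch

feature_words = [u'比赛', u'球队', u'股票', u'上涨']   # toy feature vocabulary
x_train = np.array([[1, 1, 0, 0], [0, 0, 1, 1]])       # binary presence features
y_train = np.array([u'sports', u'finance'])
clf = MultinomialNB().fit(x_train, y_train)

print(predict_with_content(clf, u'这支球队赢得了比赛', feature_words))   # -> sports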