import codecs

import pandas as pd
import jieba.posseg as psg


def extract_dictionary_feature(file_name, col_tag=0, col_content=1):
    # Load the sentiment lexicons: degree adverbs, inversion words,
    # negative words and positive words, one entry per line.
    adv = codecs.open('./data/vocabulary/adv.txt', 'rb', encoding='utf-8').read().split('\n')
    inverse = codecs.open('./data/vocabulary/inverse.txt', 'rb', encoding='utf-8').read().split('\n')
    negdict = codecs.open('./data/vocabulary/negdict.txt', 'rb', encoding='utf-8').read().split('\n')
    posdict = codecs.open('./data/vocabulary/posdict.txt', 'rb', encoding='utf-8').read().split('\n')

    contents = pd.read_excel(file_name, header=None)

    print('cut words...')
    # `stopwords`, `reviews2matrix` and `matrix2vec` are module-level names
    # defined elsewhere in the original source.
    cw = lambda x: [pair for pair in psg.lcut(x) if pair.word not in stopwords]
    contents['pairs'] = contents[col_content].apply(cw)
    matrix = reviews2matrix(list(contents['pairs']), posdict, negdict, inverse, adv)
    x = matrix2vec(matrix)
    y = list(contents[col_tag])
    return x, y
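A minimal usage sketch for the function above. The Excel path and column layout here are hypothetical, and it assumes the vocabulary files plus the `stopwords`, `reviews2matrix` and `matrix2vec` helpers from the original source are in scope:

# Hypothetical file: column 0 holds the label, column 1 the review text.
x, y = extract_dictionary_feature('./data/reviews.xlsx', col_tag=0, col_content=1)
print(len(x), len(y))  # one feature vector and one label per review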
Python lcut() example source code
def delNOTNeedWords(content, customstopwords=None):
    # Keep only content-bearing words: nouns, time words, verbs, adjectives,
    # numerals, onomatopoeia and unknown tokens; drop everything else.
    if customstopwords is None:
        customstopwords = "stopwords.txt"
    import os
    if os.path.exists(customstopwords):
        stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
        customstopwords = stop_words
    else:
        customstopwords = []  # no stopword file found: filter on POS flags only

    result = ''
    return_words = []
    kept_flags = [u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg', u'f',
                  u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg',
                  u'a', u'an', u'ag', u'al', u'm', u'mq', u'o', u'x']
    words = pseg.lcut(content)
    for word, flag in words:
        tempword = word.strip()
        if word not in customstopwords and len(tempword) > 0 and flag in kept_flags:
            result += tempword
            return_words.append(tempword)
    return result, return_words
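A quick check of the whitelist filter above; it only assumes jieba is installed (a missing stopwords.txt simply disables stopword filtering):

import jieba.posseg as pseg

joined, kept = delNOTNeedWords(u'今天天气很好,我们去公园散步')
print(joined)  # the kept words concatenated into one string
print(kept)    # the same words as a list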
def delNOTNeedWords(content, stopwords):
    # Variant that takes the stopword list directly and returns only the joined string.
    result = ''
    words = pseg.lcut(content)
    for word, flag in words:
        # jieba POS flags carry no leading slash, so the original "/x"-style
        # list never matched anything; compare against the bare flags instead.
        if word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y"]:
            result += word
    return result
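The two variants take opposite approaches: the first whitelists content-bearing POS tags, while this one blacklists function-word tags. A small comparison call, with a hypothetical two-word stopword list:

import jieba.posseg as pseg

stop = [u'的', u'了']
print(delNOTNeedWords(u'今天的天气真好', stop))  # drops stopwords and function-word tags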
def jieba_example():
    raw = "????S5????,123,?,?"  # the original Chinese sample text was lost to encoding
    raw_seq = jieba.cut(raw)        # lazy generator of tokens
    raw_seq_list = jieba.lcut(raw)  # the same tokens as a list
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)  # tokens paired with ICTCLAS-style POS flags
    for word, flag in raw_with_ictclas:
        print(word, flag)
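For reference, a self-contained run of the same four jieba entry points on a recoverable sample sentence (the original sample string above was garbled):

import jieba
import jieba.analyse
import jieba.posseg as pseg

sample = u'我买了一台三星S5手机'  # any Chinese sentence works here
print(jieba.lcut(sample))                          # plain token list
print(jieba.analyse.extract_tags(sample, topK=3))  # top-3 TF-IDF keywords
for word, flag in pseg.lcut(sample):               # tokens with POS flags
    print(word, flag)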
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """
    :param raw_str: str
    :param filter_invalid_word_flag: if True, drop words whose POS flag marks them invalid
    :return: list[(str, str)]
    """
    res = [(a, b) for a, b in pseg.lcut(raw_str)]
    if filter_invalid_word_flag:
        return filter_invalid_word(res)
    return res
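A usage sketch; filter_invalid_word is defined elsewhere in the original source, so the example bypasses it:

pairs = cut_with_flag(u'今天天气很好', filter_invalid_word_flag=False)
print(pairs)  # e.g. [('今天', 't'), ('天气', 'n'), ('很', 'd'), ('好', 'a')]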
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
    """
    Find the knowledge-base question most similar to the input text and return its answer.
    simType: 'simple', 'simple_pos' or 'vec'
    """
    self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: invalid simType for maxSimTxt: {}'.format(simType)

    # Without a loaded word-vector model, fall back to the simple_pos method.
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'

    for t in self.zhishiku:
        questions = t.q_vec if simType == 'vec' else t.q_word
        in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)

        t.sim = max(
            similarity(in_vec, question, method=simType, embedding=embedding)
            for question in questions
        )
    maxSim = max(self.zhishiku, key=lambda x: x.sim)
    logger.info('maxSim=' + format(maxSim.sim, '.0%'))
    if maxSim.sim < simCondision:
        return 'Sorry, I could not find an answer similar enough to your question.'
    return maxSim.a
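The loop above assumes each knowledge-base entry exposes tokenised questions (q_word), optional vectorised questions (q_vec), an answer (a) and a writable similarity slot (sim). A minimal stand-in for that contract (the similarity function itself comes from elsewhere in the source):

import jieba

class KBItem(object):
    def __init__(self, question, answer):
        self.q_word = [jieba.lcut(question)]  # tokenised form(s) of the question
        self.q_vec = None                     # filled in once a word2vec model is loaded
        self.a = answer
        self.sim = 0.0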
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted, (txt, hyp)
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    global stop_word_path
    self.stop = stop
    # Split into a set of words; the original kept the raw file text and
    # relied on substring matching.
    self.stopwords = set(codecs.open(stop_word_path + 'stopwords.txt', encoding='UTF-8').read().split())
    # Chinese negation cue words (the literals were garbled by a bad encoding).
    self.negwords = set([u"?", u"??", u"??", u"?", u"??", u"??", u"??", u"??", u"??"])
    text_words = pseg.lcut(rtepair[0])
    hyp_words = pseg.lcut(rtepair[1])
    self.text_words = set()
    self.hyp_words = set()

    # (reserved step; the original comment here was unreadable)
    pass

    # Optionally lemmatize via wordnet (left unimplemented in the source).
    if lemmatize:
        pass

    # Drop stopwords from both sides.
    for word, flag in text_words:
        if word not in self.stopwords:
            self.text_words.add((word, flag))
    for word, flag in hyp_words:
        if word not in self.stopwords:
            self.hyp_words.add((word, flag))

    # Set operations between the two word sets.
    self._overlap = self.hyp_words & self.text_words     # words in both hyp and text
    self._hyp_extra = self.hyp_words - self.text_words   # words in hyp but not in text
    self._txt_extra = self.text_words - self.hyp_words   # words in text but not in hyp
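The three sets computed above are standard word-overlap features for textual entailment. A hypothetical scoring rule built on them might look like:

def overlap_ratio(extractor):
    # Fraction of hypothesis words that also occur in the text;
    # a simple (illustrative) entailment signal over the sets above.
    if not extractor.hyp_words:
        return 0.0
    return len(extractor._overlap) / float(len(extractor.hyp_words))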
def delstopwords(content):
    result = ''
    # Strip all whitespace before segmentation.
    words = pseg.lcut("".join(content.split()))
    for word, flag in words:
        # As above, jieba flags have no leading slash; compare bare flags.
        if word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y"]:
            result += word
    return result
def prefix_process(curr_index, sentence, score):
    """
    Adjust a sentiment word's score according to the words just before it.
    :param curr_index: index of the sentiment word within sentence
    :param score: the word's base sentiment score
    :param sentence: the full sentence
    :return: the adjusted score
    """
    num_cnt = 5  # look back at most 5 characters
    if curr_index - num_cnt > 0:
        seg = sentence[curr_index - num_cnt:curr_index]
    else:
        seg = sentence[0:curr_index]

    # Double-negation prefix: soften the score.
    for curr_neg_prefix in double_none_prefix:
        if seg.endswith(curr_neg_prefix):
            return 0.8 * score

    # Negation prefix: usually flip the score.
    for curr_neg_prefix in set_neg_prefix:
        if seg.endswith(curr_neg_prefix):
            temp_pair = pseg.lcut(sentence[0:curr_index])
            for i, (w, f) in enumerate(reversed(temp_pair)):
                if f.startswith(u"x"):
                    break
                elif f.startswith(u"r") or f.startswith(u"n") or f.startswith(u"m"):
                    # A second negation right before a pronoun/noun/numeral
                    # cancels the flip.
                    if (len(temp_pair) - i - 2) > 0 and temp_pair[len(temp_pair) - i - 2].word in set_neg_prefix:
                        return 1.3 * score
            return -1.3 * score

    temp_pair = pseg.lcut(seg)
    for i, (w, f) in enumerate(reversed(temp_pair)):
        if f.startswith(u"x"):
            break
        elif f.startswith(u"r") or f.startswith(u"n") or f.startswith(u"m"):
            # Guard the index: the original could wrap around to -1 here.
            if (len(temp_pair) - i - 2) >= 0 and temp_pair[len(temp_pair) - i - 2].word in set_neg_prefix:
                return -0.6 * score

    # Intensifier prefix: amplify the score.
    for curr_very_prefix in set_very_prefix:
        if seg.endswith(curr_very_prefix):
            return 1.3 * score

    return score
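A worked sketch of the rules above, with hypothetical prefix sets standing in for the ones loaded elsewhere in the source. A sentiment word scored 1.0 is amplified to 1.3 when an intensifier ends the five-character window before it:

# Hypothetical prefix sets; the real ones are defined elsewhere.
double_none_prefix = {u'不得不', u'不能不'}
set_neg_prefix = {u'不', u'没', u'没有'}
set_very_prefix = {u'很', u'非常', u'特别'}

sentence = u'这个手机非常好'
# The sentiment word '好' sits at index 6; the window before it ends with '非常'.
print(prefix_process(6, sentence, 1.0))  # -> 1.3 (intensified; exact path depends on segmentation)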