def parse_item(item):
    """
    Enrich a record in place with segmented-text fields and a string id.

    :param item: dictionary providing the keys 'abstract', 'school',
        'title' and '_id'
    :return: the same dictionary object, now also holding
        'abstract_seq_tf', 'abstract_seq', 'school_seq' and 'title_seq',
        with '_id' converted to ``str``
    """
    # build_tf returns a pair; the first element is stored under
    # 'abstract_seq_tf' and the second under 'abstract_seq'
    # (presumably term-frequency data and the token sequence -- confirm
    # against build_tf).
    abstract_tf, abstract_tokens = build_tf(item['abstract'])
    item['abstract_seq_tf'] = abstract_tf
    item['abstract_seq'] = abstract_tokens

    # The school name is segmented with jieba's search-mode cutter.
    item['school_seq'] = jieba.lcut_for_search(item['school'])

    # Only the second element of build_tf's result is kept for the title.
    item['title_seq'] = build_tf(item['title'])[1]

    item['_id'] = str(item['_id'])
    return item
python类lcut_for_search()的实例源码
def __init__(self):
# Store the connection and cursor on the instance. `conn` and `cursor` are
# not defined in this method; they must come from an enclosing/module scope.
self.conn = conn
self.cursor = cursor
# Create the Competition table if it does not exist yet, then commit.
sql = '''
CREATE TABLE IF NOT EXISTS Competition(
id INT PRIMARY KEY AUTO_INCREMENT,
title VARCHAR(100),
publishdate datetime,
detail TEXT
)ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;'''
self.cursor.execute(sql)
self.conn.commit()
# NOTE(review): indentation was lost in this listing, so it is unclear
# whether the loop below is part of __init__ or separate code.
# `competition_list` and `competition_split` are not defined in the
# visible source -- confirm where they come from.
# For every entry: segment it with jieba's search-mode cutter and keep only
# the words that do NOT match the filter pattern; append the kept words as
# one list per entry to competition_split.
for match in competition_list:
splitword = jieba.lcut_for_search(match) # segment the entry into words
onesplit = []
for word in splitword:
# NOTE(review): the alternatives in this pattern appear to have been
# non-ASCII words that an encoding error turned into '?'; restore the
# literal from the upstream source. Calling .decode('utf8') on a string
# literal implies this code targets Python 2.
if re.match('??|??|??|???|??|??|??|?|??|??|??'.decode('utf8'), word) == None:
onesplit.append(word)
competition_split.append(onesplit)
def search_cut(sentence):
    """
    Segment ``sentence`` with jieba's search-engine mode.

    HMM handling is left at jieba's default because no ``HMM`` argument
    is passed.

    :param sentence: text to segment
    :return: the result of ``jieba.lcut_for_search(sentence)``
    """
    tokens = jieba.lcut_for_search(sentence)
    return tokens
inverted_files.py 文件源码
项目:Information_retrieva_Projectl-
作者: Google1234
项目源码
文件源码
阅读 16
收藏 0
点赞 0
评论 0
def make_inverted_index(filename,read_buff_size,output_file_record_size,web_record_numbers=100000):
    '''
    Build an inverted index and a dictionary file for the corpus in ``filename``.

    Documents are read through ``read_block``, segmented with jieba's
    search-mode cutter and written in blocks of ``output_file_record_size``
    documents to numbered files inside a temporary ``<name>_buff`` directory.
    The block files are then merged, a dictionary is derived from the merged
    file, and both results are copied next to the input as
    ``<name>_inverted_index.txt`` and ``<name>_index_Dictionary.txt``.
    The temporary directory is removed at the end.

    :param filename: path of the input corpus; the last four characters are
        stripped to form output names, so a ``.txt``-style suffix is assumed
    :param read_buff_size: buffer size handed to ``read_block``,
        ``merge_inverted_files.merge_file`` and ``Dictionary.establish_ditionary``
    :param output_file_record_size: number of documents written to each
        intermediate block file; must not be 0
    :param web_record_numbers: total number of documents, used only to
        print a progress ratio
    :return: None
    :raises ValueError: if ``output_file_record_size`` is 0
    '''
    if output_file_record_size == 0:
        # A block that may hold no document can never make progress.
        raise ValueError("output_file_record_size must not be 0")

    block_read=read_block(read_buff_size,filename)
    # Single-character tokens that are never indexed.
    # NOTE(review): the '?' runs in this literal look like non-ASCII
    # punctuation lost to an encoding error -- restore from upstream.
    punct = set(u'''/+%#:!),.:;?]}¢'"????????????????
?????????????????????????????
??•·???--?’”([{£¥'"??????????????????
?????????“‘-—_…''')
    Letters_and_numbers=set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')

    # Temporary directory that holds the per-block inverted files.
    buff_dir=filename[:-4]+'_buff'
    if not os.path.exists(buff_dir):
        os.mkdir(buff_dir)

    file_numbers=1
    # Record read from the input but not yet indexed because the current
    # block was already full; it becomes the first record of the next block.
    pending=None
    while True:
        progress=file_numbers*(output_file_record_size)*1.0/web_record_numbers
        # Single-argument print keeps the output identical on Python 2 and 3.
        print("process :cuting word +making inverted_index files---->>>> %s" % progress)
        spimi=SPIMI_Invert(buff_dir+'/'+str(file_numbers)+'.txt')
        count=0
        while True:
            if pending is None:
                doc_id,content=block_read.pop_token()
            else:
                doc_id,content=pending
                pending=None
            if content=='':
                # An empty content string marks the end of the input.
                break
            if count==output_file_record_size:
                # Block is full: keep the record for the next block instead
                # of discarding it.
                pending=(doc_id,content)
                break
            spimi.push_id(doc_id)
            for word in jieba.lcut_for_search(content):
                if word not in punct and word not in Letters_and_numbers:
                    spimi.push_word(word)
            count+=1
        # An empty word is pushed after every block (presumably to flush or
        # terminate the block file -- confirm against SPIMI_Invert).
        spimi.push_word('')
        file_numbers+=1
        if content=='':
            break
    print("process :cuting word +making inverted_index files---->>>>Finish")

    # Merge the numbered block files into a single inverted index.
    merged_filename=merge_inverted_files.merge_file([str(i) for i in range(1,file_numbers)],read_buff_size,buff_dir+'/')
    print("process:mergeing inverted index files----->Finish")

    # Derive the dictionary file from the merged index.
    Dictionary.establish_ditionary(buff_dir+'/'+merged_filename+'.txt',read_buff_size,buff_dir+'/'+"Dictionary.txt")

    # Publish both results next to the input, then drop the temporary files.
    shutil.copy(buff_dir+'/'+merged_filename+'.txt',filename[:-4]+'_inverted_index.txt')
    shutil.copy(buff_dir+'/'+"Dictionary.txt",filename[:-4]+'_index_Dictionary.txt')
    shutil.rmtree(buff_dir)
def releventScore(self, text, ques, tfidf={}):
    """Score how relevant ``text`` is to the question ``ques``.

    Both strings are segmented with jieba's search-mode cutter and stripped
    of stop words. Every (text word, question word) pair then contributes:

    * ``2.5 * weight`` when the two words are equal;
    * ``similarity * weight`` when their similarity exceeds 0.4;
    * nothing otherwise.

    ``weight`` is ``tfidf[text_word]`` when present, else 1. The sum is
    divided by both token counts and multiplied by 100.

    :param text: candidate text
    :param ques: question text
    :param tfidf: mapping from word to weight; only read, never modified
    :return: the relevance score, or 0 when either side has no tokens left
        after stop-word filtering
    """
    def filtWord(li):
        # Drop stop words, keeping the original order.
        return [tok for tok in li if tok not in STOPWORDS]

    def sims(t, q):
        # Cosine similarity of the two vectors in self.dic when both words
        # are known; otherwise an edit-distance ratio scaled by 0.7.
        if t in self.dic and q in self.dic:
            vector1 = self.dic[t]
            vector2 = self.dic[q]
            dot_product = 0.0
            normA = 0.0
            normB = 0.0
            for a, b in zip(vector1, vector2):
                dot_product += a * b
                normA += a**2
                normB += b**2
            if normA == 0.0 or normB == 0.0:
                return 0
            return dot_product / ((normA * normB)**0.5)
        longest = max(len(t), len(q))
        distance = Levenshtein.distance(t, q)
        if distance < longest:
            # float() keeps this a true division under Python 2 as well,
            # where int / int would floor the ratio to 0.
            return float(longest - distance) / longest * 0.7
        return 0

    ttoks = filtWord(jieba.lcut_for_search(text))
    qtoks = filtWord(jieba.lcut_for_search(ques))
    # Either side being empty means there is nothing to compare; returning
    # early also avoids dividing by zero in the normalisation below.
    if not ttoks or not qtoks:
        return 0

    score = 0
    for tword in ttoks:
        # The weight depends only on the text word, so look it up once.
        rate = tfidf.get(tword, 1)
        for qword in qtoks:
            if tword == qword:
                # exact match
                score += rate * 2.5
            else:
                similarity = sims(tword, qword)
                if similarity > 0.4:
                    # similar
                    score += similarity * rate
    # remove advantage of length
    return score / len(ttoks) / len(qtoks) * 100