def _index_docs(indexFile, writer):
    """Index each tab-separated line of *indexFile* into the Lucene *writer*.

    Each line must hold 7 tab-separated fields:
    ind, ent_name, info, keywords, imgurl, filename, url.
    Only ind, ent_name and keywords are indexed; ent_name and keywords
    are pre-tokenized with jieba's search-mode segmentation.
    """
    for line in indexFile:
        ind, ent_name, info, keywords, imgurl, filename, url = line.split('\t')
        print("adding %s" % ind)
        # Derive a zero-padded image filename from the numeric index.
        filename = "{:05d}".format(int(ind)) + '.jpg'
        # '%' is used as a keyword separator in the source data.
        keywords = keywords.replace('%', ' ')
        ent_name = " ".join(x.strip() for x in jieba.cut_for_search(ent_name))
        keywords = " ".join(x.strip() for x in jieba.cut_for_search(keywords))
        try:
            doc = Document()
            # 'ind' is stored for later retrieval but not tokenized.
            doc.add(Field('ind', ind, Field.Store.YES, Field.Index.NO))
            doc.add(Field('ent_name', ent_name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field('keywords', keywords, Field.Store.NO, Field.Index.ANALYZED))
            writer.addDocument(doc)
        except Exception as e:  # was py2-only "except Exception, e"
            print("Failed in indexDocs: %r" % e)
# Example source code for Python's jieba.cut_for_search()
def GET(self):
    """Serve the search page.

    Segments the query with jieba, ranks matching news items via
    `query.calculate`, and collects word2vec-based topic suggestions.
    NOTE(review): indentation was lost in this copy and has been
    reconstructed from the control flow — confirm against the original.
    """
    data=web.input()
    if data:
        searchword=data.searchword
    else:
        searchword=''
    news_list=list()
    topic=list()
    if searchword:
        # Search-mode segmentation of the raw query string.
        cut = jieba.cut_for_search(searchword)
        word_list = []
        for word in cut:
            # Drop punctuation and plain letter/digit tokens.
            if word not in punct and word not in Letters_and_numbers:
                word_list.append(word.encode("utf-8"))
        # Top-ranked document ids for the segmented query terms.
        topK=query.calculate(word_list,config.query_return_numbers)
        for k in topK:
            data = dict()
            title, content, url= id_index.get_data(k)
            data['id'] = k
            # Truncate the body to the configured snippet length.
            data['content'] = content.decode("utf-8")[:config.query_return_snipper_size]
            data['title']=title.decode("utf-8")
            data['url'] = url.decode("utf-8")
            news_list.append(data)
        # NOTE(review): this del raises NameError if topK is empty
        # (title/content/url would be unbound) — presumably never hit.
        del data,cut,word_list,word,topK,title,content,url
        # word2vec related-topic lookup (original comment was mojibake).
        word2vec.cal(searchword.encode('utf-8'))
        print word2vec.result.length
        if word2vec.result.length==0:
            # No related words found; skip topic recommendations.
            pass
        else:
            for i in range(config.recommand_topic_numbers):
                topic.append(word2vec.result.word[i].char)
    return render.index(searchword,news_list,topic)
def GET(self):
    """Serve a single news page by id, with content-based recommendations.

    NOTE(review): indentation was lost in this copy and has been
    reconstructed from the control flow — confirm against the original.
    """
    data=web.input()
    if data:
        ID=data.id
        news = dict()
        title, content, url=id_index.get_data(int(ID))
        news['content'] = content.decode("utf-8")
        news['title'] = title.decode("utf-8")
        news['url'] = url.decode("utf-8")
        recomand=[]
        # Segment the article body to build a similarity query.
        cut = jieba.cut_for_search(content)
        word_list = []
        for word in cut:
            if word not in punct and word not in Letters_and_numbers:
                # Skip stop words (py2-only dict.has_key).
                if recommand.stopword.has_key(word.encode("utf-8")):
                    pass
                else:
                    word_list.append(word.encode("utf-8"))
        topk= recommand.calculate(word_list, config.recommand_numbers, 10)
        for i in topk:
            # Exclude the article itself from its own recommendations.
            if i !=int(ID):
                title, content, url=id_index.get_data(i)
                recomand.append([title.decode('utf-8'),content.decode('utf-8'),url.decode('utf-8')])
        news['recommand']=recomand
        del title,content,url,recomand
    else:
        # No id supplied: render a placeholder page with empty slots.
        ID=''
        news = dict()
        news['title'] = "No Such News"
        news['content'] = "Oh No!"
        news['url'] = "#"
        news['recommand']=[['','',''] for m in range(config.recommand_numbers)]
    return render.news(news)
def calculate(self,doc_id,Top_numbers=10,multiple=10):
    """Return the top similar documents for *doc_id*.

    Segments the document body in jieba search mode, filters out
    punctuation, letters/digits and stop words, then delegates
    scoring to self.FastCos.

    :param doc_id: id of the source document in self.index.
    :param Top_numbers: number of similar documents to return.
    :param multiple: candidate-pool multiplier passed to FastCos.
    """
    title,content,url=self.index.get_data(doc_id)
    word_list=[]
    for word in jieba.cut_for_search(content):
        if word in self.punct or word in self.Letters_and_numbers:
            continue
        encoded = word.encode("utf-8")
        # 'in' replaces the py2-only dict.has_key() stop-word test.
        if encoded not in self.stopword:
            word_list.append(encoded)
    return self.FastCos.calculate(word_list,Top_numbers,multiple)
def cut_search(data):
    """Segment *data* with jieba's search-mode tokenizer and return the
    tokens joined by '/' (e.g. "tok1/tok2/tok3").
    """
    return '/'.join(jieba.cut_for_search(data))
def search_func_factory(analyzer, searcher, vm_env):
    """Search function factory"""
    def retrieve(doc):
        return doc.get('ind')

    def search(**kwargs):
        vm_env.attachCurrentThread()
        query = BooleanQuery()
        print("Searched keywords:")
        for field_name, keywords in kwargs.items():
            # Segment, strip, and drop empty tokens.
            terms = [t for t in (k.strip() for k in jieba.cut_for_search(keywords)) if t]
            for term in terms:
                print(term)
            # One optional (SHOULD) clause per segmented term.
            for term in terms:
                parsed = QueryParser(Version.LUCENE_CURRENT, field_name, analyzer).parse(term)
                query.add(parsed, BooleanClause.Occur.SHOULD)
            # Keyword terms additionally match the entity-name field.
            if field_name == 'keywords':
                for term in terms:
                    parsed = QueryParser(Version.LUCENE_CURRENT, 'ent_name', analyzer).parse(term)
                    query.add(parsed, BooleanClause.Occur.SHOULD)
        hits = searcher.search(query, 50).scoreDocs
        return [retrieve(searcher.doc(hit.doc)) for hit in hits]
    return search
def CUT(f):
    """Segment *f* in jieba search mode and return tokens space-joined."""
    return ' '.join(jieba.cut_for_search(f))
def testCutForSearch(self):
    """cut_for_search must return a generator whose items form a list."""
    for content in test_contents:
        gen = jieba.cut_for_search(content)
        assert isinstance(gen, types.GeneratorType), "Test CutForSearch Generator error"
        words = list(gen)
        assert isinstance(words, list), "Test CutForSearch error on content: %s" % content
        print(" , ".join(words), file=sys.stderr)
    print("testCutForSearch", file=sys.stderr)
def testCutForSearch_NOHMM(self):
    """Same contract as testCutForSearch, but with the HMM disabled."""
    for content in test_contents:
        gen = jieba.cut_for_search(content, HMM=False)
        assert isinstance(gen, types.GeneratorType), "Test CutForSearch Generator error"
        words = list(gen)
        assert isinstance(words, list), "Test CutForSearch error on content: %s" % content
        print(" , ".join(words), file=sys.stderr)
    print("testCutForSearch_NOHMM", file=sys.stderr)
def cuttest(test_sent):
    """Print each search-mode token of *test_sent* followed by ' / '."""
    for tok in jieba.cut_for_search(test_sent):
        print(tok, "/", end=' ')
    print("")
def run(self):
    """Demo of jieba's segmentation modes (py3; sample strings were
    garbled to '?' in this copy of the file)."""
    # Full mode: emit every possible word.
    seg_list = jieba.cut("?????????",cut_all=True)
    print("Full Mode:" + "/ ".join(seg_list))  # full mode (comment was mojibake)
    # Accurate (default) mode.
    seg_list = jieba.cut("?????????",cut_all=False)
    print("Default Mode:" + "/ ".join(seg_list))  # accurate mode
    seg_list = jieba.cut("??????????")
    print(", ".join(seg_list))
    # Search-engine mode: finer-grained cut for indexing.
    seg_list = jieba.cut_for_search("??????????????????????????")  # search-engine mode
    print(", ".join(seg_list))
def cuttest(test_sent):
    """Run search-mode segmentation on *test_sent*; print tokens inline
    separated by ' / ', then end the line."""
    words = jieba.cut_for_search(test_sent)
    for w in words:
        print(w, "/", end=' ')
    print("")
def cut_for_search(self, text):
    '''
    @summary: Search-mode segmentation of *text* with stop words removed.
    ---------
    @param text: the input text to segment
    ---------
    @result: list of tokens remaining after stop-word filtering
    '''
    tokens = list(jieba.cut_for_search(text))
    return self.__del_stop_key(tokens)
def extractSearchWords(self,sentence):
    """Return the search-mode jieba tokens of *sentence* as a list."""
    tokens = jieba.cut_for_search(sentence)
    return list(tokens)
def seg_for_search(self, sentence):
    """Segment *sentence* in jieba search mode and return the tokens.

    :param sentence: text to segment.
    :return: list of token strings.
    """
    # list() materializes the generator directly — no manual append loop.
    return list(jieba.cut_for_search(sentence))
def testCutForSearch(self):
    """cut_for_search must return a generator whose items form a list (py2)."""
    for content in test_contents:
        result = jieba.cut_for_search(content)
        assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
        # py2 print-to-file syntax.
        print >> sys.stderr, " , ".join(result)
    print >> sys.stderr, "testCutForSearch"
def cuttest(test_sent):
    # Print each search-mode token of test_sent followed by " / " (py2).
    result = jieba.cut_for_search(test_sent)
    for word in result:
        # Trailing comma suppresses the newline (py2 print statement).
        print word, "/",
    print ""
def run(self):
    """Demo of jieba's segmentation modes (py2; sample strings were
    garbled to '?' in this copy of the file)."""
    # Full mode: emit every possible word.
    seg_list = jieba.cut("?????????",cut_all=True)
    print "Full Mode:" + "/ ".join(seg_list)  # full mode (comment was mojibake)
    # Accurate (default) mode.
    seg_list = jieba.cut("?????????",cut_all=False)
    print "Default Mode:" + "/ ".join(seg_list)  # accurate mode
    seg_list = jieba.cut("??????????")
    print ", ".join(seg_list)
    # Search-engine mode: finer-grained cut for indexing.
    seg_list = jieba.cut_for_search("??????????????????????????")  # search-engine mode
    print ", ".join(seg_list)
def cuttest(test_sent):
    # Print each search-mode token of test_sent followed by " / " (py2).
    result = jieba.cut_for_search(test_sent)
    for word in result:
        # Trailing comma suppresses the newline (py2 print statement).
        print word, "/",
    print ""
def test_demo1():
    """Compare jieba's segmentation modes on a sample string (py2;
    the sample text and labels were garbled to '?' in this copy)."""
    text = "?????????"
    # Full mode.
    seg_list = jieba.cut(text, cut_all=True)
    print u"[???]: ", "/ ".join(seg_list)
    # Accurate mode.
    seg_list = jieba.cut(text, cut_all=False)
    print u"[????]: ", "/ ".join(seg_list)
    # Default mode (accurate).
    seg_list = jieba.cut(text)
    print u"[????]: ", "/ ".join(seg_list)
    seg_list = jieba.cut("??????????")
    print u"[????]: ", "/ ".join(seg_list)
    # Search-engine mode.
    seg_list = jieba.cut_for_search(text)
    print u"[??????]: ", "/ ".join(seg_list)
# Read file and cut