def parse():
    """Parse the comments."""
    import os
    import jieba
    import jieba.posseg as pseg
    # Load the user's dictionary
    dict_path = os.path.join(os.getcwd(), "dict.txt")
    jieba.load_userdict(dict_path)
    # POS flags to dismiss (function words: conjunctions, pronouns, particles, etc.)
    dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f',
               'ud', 'ug', 'uv']
    comments = Comment.query.all()
    for comment in comments:
        word_list = []
        pseg_cut = pseg.cut(comment.body)
        for word, flag in pseg_cut:
            if flag not in dismiss:
                word_list.append(word)
        comment.parsed = '/'.join(word_list)
        db.session.add(comment)
        print("Comment %04d Parsed!" % comment.id)
    db.session.commit()
    print("ALL DONE!")
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with the default flag of the word segmentation tool.
    Args:
        pattern: 'w' - words only, 'k' - single top keyword, 't' - top-10
            keywords, 'wf' - (word, flag) pairs, 'tf' - (keyword, flag) pairs.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result
        #                   if item.word not in punctuation_all]
        # Modified 2017-04-27: re-tag short flags with a second posseg pass.
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
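A minimal driver for synonym_cut above. The module globals tone_words and punctuation_all are not shown in the snippet, so stand-in definitions are supplied here as assumptions:

import jieba
import jieba.analyse as analyse
import jieba.posseg as posseg

tone_words = "。？！?!"                        # assumed sentence-final tone marks
punctuation_all = set("，。？！、；：,.?!;:")  # assumed punctuation set

print(synonym_cut("今天的天气怎么样？", pattern="wf"))
# e.g. [('今天', 't'), ('的', 'uj'), ('天气', 'n'), ('怎么样', 'r')]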
def cutfunc(sentence, _, HMM=True):
    for w, f in jieba.posseg.cut(sentence, HMM):
        yield w + posdelim + f
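cutfunc comes from jieba's command-line front end; the unused second argument apparently mirrors jieba.cut's cut_all parameter so both cutters share a call signature, and posdelim is the module-level POS delimiter (the CLI default is '_'). A short driver for the cutfunc above, with posdelim supplied as an assumption:

import jieba.posseg

posdelim = "_"  # assumed; jieba's CLI uses '_' unless -p DELIM overrides it

print(" / ".join(cutfunc("我爱北京天安门", None)))
# e.g. 我_r / 爱_v / 北京_ns / 天安门_ns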
def __init__(self, idf_path=None):
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_freq, self.median_idf = self.idf_loader.get_idf()
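This is the constructor of jieba.analyse's TF-IDF extractor: jieba.dt and jieba.posseg.dt are the default lazy tokenizers, and IDFLoader reads the IDF table, exposing per-word IDF values plus their median as a fallback for unseen words. The usual way to swap in a domain-specific IDF file is the public wrapper:

import jieba.analyse

# Replace the bundled idf.txt with a domain-specific table
# ("my_idf.txt" is an example path; one "word idf" pair per line).
jieba.analyse.set_idf_path("my_idf.txt")
print(jieba.analyse.extract_tags("深度学习模型的训练需要大量标注数据", topK=5))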
def testPosseg(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
def testPosseg_NOHMM(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg_NOHMM", file=sys.stderr)
def isJux(answer):
    """
    Check whether the answer is a juxtaposed (parallel) structure;
    returns 1 or 0.
    :param answer:
    :return:
    """
    tag = []
    seg = jieba.posseg.cut(answer)
    for i in seg:
        # The flag '*' and the split character below were garbled in the
        # source; originally a specific POS flag and a Chinese delimiter.
        if i.flag == '*':
            tag.extend((i.word, i.flag))
    if len(tag) / len(answer.split(u'?')) > 0.5:
        return 0
    else:
        return 1
def getLocations(self):
    seg_list = jieba.posseg.cut(self.content)
    ns = []
    lastNs = False
    for i in seg_list:
        if i.flag == 'nnl':
            print(i.word)
            if lastNs:
                # Merge consecutive location tokens into one place name
                ns[-1] += i.word
            else:
                ns.append(i.word)
            lastNs = True
        else:
            lastNs = False
    return ns
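getLocations collects tokens tagged 'nnl' and merges adjacent ones into a single place name; 'nnl' is not a standard jieba POS flag, so it presumably comes from a user dictionary. A hedged driver for the method above (the words and the tag registrations are illustrative assumptions):

import jieba
import jieba.posseg

# 'nnl' is assumed to be a custom location tag supplied via a user dictionary.
jieba.add_word("中关村", tag="nnl")
jieba.add_word("南大街", tag="nnl")

class Article:
    def __init__(self, content):
        self.content = content

doc = Article("他住在中关村南大街附近")
print(getLocations(doc))  # adjacent 'nnl' tokens merge, e.g. ['中关村南大街']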
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
    """
    Extract keywords from sentence using TF-IDF algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
          if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v', 'nr'].
          if the POS of w is not in this list, it will be filtered.
        - withFlag: only works when allowPOS is not empty.
          if True, return a list of pair(word, flag) like posseg.cut
          if False, return a list of words
    """
    if allowPOS:
        allowPOS = frozenset(allowPOS)
        words = self.postokenizer.cut(sentence)
    else:
        words = self.tokenizer.cut(sentence)
    freq = {}
    word_list = []
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                w = w.word
        wc = w.word if allowPOS and withFlag else w
        if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
        word_list.append(w)
    total = sum(freq.values())
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
    res_list = []
    for word in word_list:
        weight = freq[word]
        res_list.append((word, weight))
    return res_list
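Unlike the stock jieba implementation further below, this variant returns a (token, TF-IDF weight) pair for every retained token in sentence order, ignoring topK and withWeight. A hedged usage sketch (the class name TFIDF and its no-argument construction are assumptions based on jieba.analyse's layout):

tfidf = TFIDF()  # hypothetical instantiation; the enclosing class is not shown
for word, weight in tfidf.extract_tags("今天天气很好，适合出门散步"):
    print(word, round(weight, 4))  # one weighted entry per retained token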
utils.py, from project Content-Based-News-Recommendation-System-in-Spark (author: Labyrinth108)
def preprocess_WithSpeech(stopword_file, news):
    content = jieba.posseg.cut(news)  # segment with POS tags
    content = filter(lambda x: hasMeaningfulWords(x), content)  # keep content-bearing POS only
    content = [i.word for i in content]
    content = filter(lambda x: len(x) > 1, content)  # drop single-character tokens
    stopw = [line.strip() for line in open(stopword_file, encoding='utf-8').readlines()]
    parsed = set(content) - set(stopw)
    return ' '.join(parsed)
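Both preprocessing functions here and below call a helper hasMeaningfulWords that the snippets do not include. A plausible sketch, under the assumption that it whitelists content-bearing POS prefixes (the exact flag set is a guess):

def hasMeaningfulWords(pair):
    """Assumed helper: keep nouns, verbs, adjectives, adverbs, and English tokens."""
    meaningful_prefixes = ('n', 'v', 'a', 'd', 'eng')  # assumed POS whitelist
    return pair.flag.startswith(meaningful_prefixes)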
util.py, from project Content-Based-News-Recommendation-System-in-Spark (author: Labyrinth108)
def preprocess_per_news(news):
    content = jieba.posseg.cut(news)  # segment with POS tags
    content = filter(lambda x: hasMeaningfulWords(x), content)  # keep content-bearing POS only
    content = [i.word for i in content]
    content = filter(lambda x: len(x) > 1, content)  # drop single-character tokens
    stopword_file = "/Users/luoyi/Scala/OnlineRS/com/Recsys_engine/data/stop_word.txt"
    stopw = [line.strip() for line in open(stopword_file, encoding='utf-8').readlines()]
    parsed = set(content) - set(stopw)
    return ' '.join(parsed)
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
    """
    Extract keywords from sentence using TF-IDF algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
          if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v', 'nr'].
          if the POS of w is not in this list, it will be filtered.
        - withFlag: only works when allowPOS is not empty.
          if True, return a list of pair(word, flag) like posseg.cut
          if False, return a list of words
    """
    if allowPOS:
        allowPOS = frozenset(allowPOS)
        words = self.postokenizer.cut(sentence)
    else:
        words = self.tokenizer.cut(sentence)
    freq = {}
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                w = w.word
        wc = w.word if allowPOS and withFlag else w
        if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags
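This is the stock TF-IDF keyword extractor from jieba.analyse: term frequencies are weighted by IDF (falling back to the median IDF for unseen words), then sorted. Typical usage goes through the module-level wrapper:

import jieba.analyse

text = "自然语言处理是人工智能领域的一个重要方向"
# Top keywords with weights, restricted to nouns and verb-nouns.
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True,
                                               allowPOS=('n', 'vn')):
    print(word, round(weight, 3))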