def getData(Mentions, S, E, contextMention, contextEntity, id):
    for mention in Mentions:
        jieba.add_word(mention.name)
        S.append(mention.name)
        id.append('-')  # placeholder id for the mention itself; candidate ids follow below
        contextMention[mention.name] = mention.context
    for item in Mentions:
        temp = []
        cnt = 0
        for candidate in item.candidates:
            if cnt > 100:  # cap the number of candidates kept per mention
                break
            cnt += 1
            temp.append(candidate.title)
            contextEntity[candidate.title] = candidate.context
            id.append(candidate.id)
        E.append(temp)
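The helper above just flattens mention and candidate objects into parallel lists. A hypothetical usage sketch follows; Mention, Candidate, and the sample values are illustrative stand-ins, not the project's real classes:

# Hypothetical stand-ins for the project's Mention/Candidate objects.
class Candidate:
    def __init__(self, title, context, id):
        self.title, self.context, self.id = title, context, id

class Mention:
    def __init__(self, name, context, candidates):
        self.name, self.context, self.candidates = name, context, candidates

S, E, id_list = [], [], []
contextMention, contextEntity = {}, {}
m = Mention('Apple', 'Apple released a phone', [Candidate('Apple Inc.', '...', 'Q312')])
getData([m], S, E, contextMention, contextEntity, id_list)
# S == ['Apple']; E == [['Apple Inc.']]; id_list == ['-', 'Q312']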
Python jieba.add_word() examples
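Before the individual project snippets, a minimal sketch of the call itself: jieba.add_word registers a word in the in-memory dictionary so the segmenter keeps it as a single token. The example word is taken from jieba's own documentation.

import jieba

jieba.add_word('杭研')  # register an out-of-vocabulary word at runtime
print(jieba.lcut('他来到了网易杭研大厦'))  # '杭研' is guaranteed to stay whole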
def test():
    x = u"??????????????????Nintendo???2006-11-02???????????????????????????3???????????????????????????????????????"
    x = u'???????????????'
    sentences = split_sentences(x)
    # jieba.add_word(u'????????', 5, 'baike')
    # jieba.add_word(u'Nintendo', 5, 'baike')
    # jieba.add_word(u'????', 5, 'baike')
    # jieba.add_word(u'???', 5, 'baike')
    # jieba.add_word(u'????', 5, 'baike')
    # name2fb_path = os.path.join(cache_dir, 'DatasetFinder.name2fb.sample.cache')
    # fb_ttls_path = os.path.join(cache_dir, 'DatasetFinder.fb_ttls.sample.cache')
    name2fb_path = os.path.join(cache_dir, 'DatasetFinder.name2fb.cache')
    fb_ttls_path = os.path.join(cache_dir, 'DatasetFinder.fb_ttls.cache')
    finder = DatasetFinder.load_from_cache(name2fb_path, fb_ttls_path)
    for x in gen_dataset(sentences[0], finder):
        print(x)
    print('-' * 50)
    for x in gen_dataset(sentences[1], finder):
        print(x)
def add_word_dict(word, freq=None, tag=None):
    '''
    Add a word to the jieba dictionary.
    '''
    jieba.add_word(word, freq=freq, tag=tag)
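A brief usage sketch for the wrapper above; the words, frequency, and POS tag are illustrative:

add_word_dict('深度学习')                     # let jieba pick a frequency
add_word_dict('云计算', freq=20000, tag='n')  # explicit frequency and POS tag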
def jiebaCustomSetting(self, dict_path, usr_dict_path):
    jieba.set_dictionary(dict_path)
    with open(usr_dict_path, 'r', encoding='utf-8') as dic:
        for word in dic:
            jieba.add_word(word.strip('\n'))

def TaibaCustomSetting(self, usr_dict):
    with open(usr_dict, 'r', encoding='utf-8') as dic:
        for word in dic:
            Taiba.add_word(word.strip('\n'))
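Both helpers expect a plain-text user dictionary with one word per line; a hypothetical usr_dict file could look like this:

雞排
珍珠奶茶
滷肉飯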
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word(u"??", 10000)
    # tune=True is required for suggest_freq to actually adjust the dictionary
    jieba.suggest_freq((u"?", u"??"), True)
    jieba.suggest_freq((u"??", u"??"), True)
    jieba.suggest_freq((u"??", u"??"), True)
    jieba.suggest_freq((u"??", u"?"), True)
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("??", 10000)
    # tune=True is required for suggest_freq to actually adjust the dictionary
    jieba.suggest_freq(("?", "??"), True)
    jieba.suggest_freq(("??", "??"), True)
    jieba.suggest_freq(("??", "??"), True)
    jieba.suggest_freq(("??", "?"), True)
Source file: data_preprocess.py, from project Neural-Headline-Generator-CN by QuantumLiu
def cut(text, custom_words=('FLOAT', 'TIME', 'DATE', 'EOS')):
    jieba.enable_parallel(32)  # parallel segmentation (POSIX only)
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
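A usage sketch for cut (POSIX only, since it enables parallel mode; the sample sentence is illustrative):

print(cut('今天 TIME 股价上涨了 FLOAT 个百分点 EOS'))
# the placeholder tokens FLOAT/TIME/EOS come back as single words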
def __init__(self, slack, custom):
    self.slack = slack
    self.rundata = custom['data']
    self.colorPrint = custom['colorPrint']
    self.food_dir = "data/midnight.json"
    self.food_dic = "data/dict.txt.big"
    # find midnight channel
    self.nochannel = False
    rep = self.slack.api_call("channels.list")
    self.channel_id = ""
    for c in rep['channels']:
        if c['name'].lower() == custom['food_channelname']:
            self.channel_id = c['id']
            break
    if not self.channel_id:
        self.colorPrint(
            "No midnight channel",
            "Restart this plugin once the midnight channel is available",
            color="FAIL")
        self.nochannel = True
        return
    jieba.set_dictionary(self.food_dic)
    jieba.initialize()
    # re-apply custom words added and deleted in previous runs
    for word in self.rundata.get('FOOD_addword'):
        jieba.add_word(word)
    for word in self.rundata.get('FOOD_delword'):
        jieba.del_word(word)
    self.init()
def __init__(self):
    self.negative = []
    self.adverb = []
    self.questionMark = []
    self.rootPath = r"E:\workout\data\senitment_data"  # raw string avoids accidental escapes
    self.wordtypeDict, self.wordfreqDict = self.UserDefineLibrary()
    for word in self.wordfreqDict.keys():
        jieba.add_word(str(word))
    self.initialize()
def main(self, datadict):
    if self.nochannel:
        return
    # index image uploads posted in the midnight channel
    if datadict['type'] == 'message' and \
            datadict.get('subtype') == "file_share" and \
            datadict.get('channel') == self.channel_id:
        self.imageAdd(datadict['file'])
    if datadict['type'] != 'message' or 'subtype' in datadict:
        return
    if datadict['text'].startswith("food "):
        text = re.search(
            r"(?<=food ).*", datadict['text'], re.DOTALL).group().strip()
        payload = {
            "username": "?? Midnight",
            "icon_emoji": ":_e9_a3_9f:",
            "thread_ts": datadict.get("thread_ts") or '',
            "channel": datadict['channel']}
        try:
            ans = self.wordSearch(text)
            self.slack.api_call("chat.postMessage",
                                attachments=[self.wordParse(ans)],
                                **payload)
        except BaseException:
            self.slack.api_call("chat.postMessage",
                                text="Sorry Not Found",
                                **payload)
    elif datadict['text'].startswith("foodadd "):
        text = re.search(r"(?<=foodadd ).*",
                         datadict['text']).group().strip()
        jieba.add_word(text)  # teach the segmenter the new food name
        self.rundata.append("FOOD_addword", text)
        self.init()
    elif datadict['text'].startswith("fooddel "):
        text = re.search(r"(?<=fooddel ).*",
                         datadict['text']).group().strip()
        jieba.del_word(text)
        self.rundata.append("FOOD_delword", text)
        self.init()