def getChList(docStrByte):
## ??????????????????????????????????
inputStr = str(docStrByte, encoding = 'gbk', errors = 'ignore').lower()#?????????????????
strList = ''.join(inputStr.split('\n'))#????????????????
rawTokens = list(jieba.tokenize(strList))#????
#stopWord ? ???????key ???????value??None
fSW = open('stopwords.txt', 'r', encoding = 'utf-8', errors = 'ignore').read()
stopWord = {}.fromkeys(fSW.split('\n'))
stopWord[''] = None
final = []
s = nltk.stem.SnowballStemmer('english')
for seg in rawTokens:
# print(seg[0].strip())
rawWord = seg[0].strip()#strip()?????????????
if (rawWord.isalpha()):#?????????????
word = s.stem(rawWord)
else:
word = rawWord
if word not in stopWord:#?????
final.append(word)#????list
return final
python类tokenize()的实例源码
def __call__(self, text, **kargs):
words = jieba.tokenize(text, mode="search")
token = Token()
for (w, start_pos, stop_pos) in words:
if not accepted_chars.match(w) and len(w) <= 1:
continue
token.original = token.text = w
token.pos = start_pos
token.startchar = start_pos
token.endchar = stop_pos
yield token
def __call__(self, text, **kargs):
words = jieba.tokenize(text, mode="search")
token = Token()
for (w, start_pos, stop_pos) in words:
if not accepted_chars.match(w) and len(w) <= 1:
continue
token.original = token.text = w
token.pos = start_pos
token.startchar = start_pos
token.endchar = stop_pos
yield token
def testTokenize(self):
for content in test_contents:
result = jieba.tokenize(content)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print("testTokenize", file=sys.stderr)
def testTokenize_NOHMM(self):
for content in test_contents:
result = jieba.tokenize(content,HMM=False)
assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
result = list(result)
assert isinstance(result, list), "Test Tokenize error on content: %s" % content
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
print("testTokenize_NOHMM", file=sys.stderr)
def cuttest(test_sent):
global g_mode
result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
def cuttest(test_sent):
global g_mode
result = jieba.tokenize(test_sent,mode=g_mode)
for tk in result:
print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))
def __call__(self, text, **kargs):
words = jieba.tokenize(text, mode="search")
token = Token()
for (w, start_pos, stop_pos) in words:
if not accepted_chars.match(w) and len(w) <= 1:
continue
token.original = token.text = w
token.pos = start_pos
token.startchar = start_pos
token.endchar = stop_pos
yield token
def __call__(self, text, **kargs):
words = jieba.tokenize(text, mode="search")
token = Token()
for (w, start_pos, stop_pos) in words:
if not accepted_chars.match(w) and len(w) <= 1:
continue
token.original = token.text = w
token.pos = start_pos
token.startchar = start_pos
token.endchar = stop_pos
yield token
def __call__(self, text, **kargs):
words = jieba.tokenize(text, mode="search")
token = Token()
for (w, start_pos, stop_pos) in words:
if not accepted_chars.match(w) and len(w) <= 1:
continue
token.original = token.text = w
token.pos = start_pos
token.startchar = start_pos
token.endchar = stop_pos
yield token
def train(self, training_data, config, **kwargs):
# type: (TrainingData, RasaNLUConfig, **Any) -> None
if config['language'] != 'zh':
raise Exception("tokenizer_jieba is only used for Chinese. Check your configure json file.")
for example in training_data.training_examples:
example.set("tokens", self.tokenize(example.text))
def process(self, message, **kwargs):
# type: (Message, **Any) -> None
message.set("tokens", self.tokenize(message.text))
def tokenize(self, text):
# type: (Text) -> List[Token]
tokenized = jieba.tokenize(text)
tokens = [Token(word, start) for (word, start, end) in tokenized]
return tokens
def __call__(self, text, **kargs):
words = jieba.tokenize(text, mode="search")
token = Token()
for (w, start_pos, stop_pos) in words:
if not accepted_chars.match(w) and len(w) <= 1:
continue
token.original = token.text = w
token.pos = start_pos
token.startchar = start_pos
token.endchar = stop_pos
yield token
def __call__(self, text, **kargs):
words = jieba.tokenize(text, mode="search")
token = Token()
for (w, start_pos, stop_pos) in words:
if not accepted_chars.match(w) and len(w) <= 1:
continue
token.original = token.text = w
token.pos = start_pos
token.startchar = start_pos
token.endchar = stop_pos
yield token