python类tokenize()的实例源码-面圈网

categorizing.py 文件源码项目：nlp-chinese_text_classification 作者: iamiamn 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def getChList(docStrByte):
    """Turn a GBK-encoded document into a list of filtered, stemmed tokens.

    The bytes are decoded as GBK (undecodable bytes are dropped), lower-cased,
    stripped of newline characters and segmented with ``jieba.tokenize``.
    Tokens for which ``str.isalpha()`` is true are passed through the NLTK
    Snowball English stemmer; all other tokens are kept as-is. Tokens that
    appear in ``stopwords.txt`` (one entry per line, UTF-8), as well as empty
    tokens, are discarded.

    Args:
        docStrByte: The raw document as a bytes-like object.

    Returns:
        A list of token strings in document order.

    Raises:
        OSError: If ``stopwords.txt`` cannot be opened from the current
            working directory.
    """
    inputStr = str(docStrByte, encoding='gbk', errors='ignore').lower()
    # Join the document into a single line before segmentation.
    strList = inputStr.replace('\n', '')

    # Read the stop-word list inside a context manager so the file handle is
    # always closed (the previous version leaked it).
    with open('stopwords.txt', 'r', encoding='utf-8', errors='ignore') as fSW:
        stopWord = set(fSW.read().split('\n'))
    # Tokens that are empty after stripping are always dropped.
    stopWord.add('')

    final = []
    stemmer = nltk.stem.SnowballStemmer('english')
    # jieba.tokenize yields tuples whose first element is the word text.
    for seg in jieba.tokenize(strList):
        rawWord = seg[0].strip()
        # Only purely alphabetic tokens are stemmed; digits, punctuation and
        # mixed tokens pass through unchanged.
        word = stemmer.stem(rawWord) if rawWord.isalpha() else rawWord
        if word not in stopWord:
            final.append(word)
    return final

analyzer.py 文件源码项目：PTTChatBot_DL2017 作者: thisray 项目源码文件源码阅读 33 收藏 0 点赞 0 评论 0

def __call__(self, text, **kargs):
        """Yield a token for every accepted segment of *text*.

        *text* is segmented with ``jieba.tokenize`` in ``"search"`` mode.
        A segment is skipped when it does not match ``accepted_chars`` and is
        at most one character long. A single ``Token`` instance is created up
        front and re-populated before each ``yield``, so consumers receive the
        same object every time. Extra keyword arguments are accepted but not
        used.
        """
        segments = jieba.tokenize(text, mode="search")
        shared = Token()
        for word, begin, end in segments:
            # Keep the segment if it matches accepted_chars or is longer than
            # one character.
            if accepted_chars.match(word) or len(word) > 1:
                shared.original = word
                shared.text = word
                shared.pos = begin
                shared.startchar = begin
                shared.endchar = end
                yield shared

analyzer.py 文件源码项目：ChineseSA 作者: cwlseu 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def __call__(self, text, **kargs):
        """Tokenize *text* with jieba (search mode) and yield the kept tokens.

        Segments that fail ``accepted_chars.match`` and have length <= 1 are
        dropped. One ``Token`` object is allocated per call and mutated in
        place for each yielded segment. ``**kargs`` is ignored.
        """
        pieces = jieba.tokenize(text, mode="search")
        tok = Token()
        for piece, first, last in pieces:
            keep = accepted_chars.match(piece) or len(piece) > 1
            if keep:
                tok.original = piece
                tok.text = piece
                tok.pos = first
                tok.startchar = first
                tok.endchar = last
                yield tok

jieba_test.py 文件源码项目：Malicious_Domain_Whois 作者: h-j-13 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def testTokenize(self):
        """Check that ``jieba.tokenize`` returns a generator for every sample.

        For each entry of ``test_contents`` the generator is materialized into
        a list and every (word, start, end) triple is written to stderr,
        followed by the test name once all samples are processed.
        """
        for content in test_contents:
            token_stream = jieba.tokenize(content)
            assert isinstance(token_stream, types.GeneratorType), "Test Tokenize Generator error"
            tokens = list(token_stream)
            assert isinstance(tokens, list), "Test Tokenize error on content: %s" % content
            for word, start, end in tokens:
                print("word %s\t\t start: %d \t\t end:%d" % (word, start, end), file=sys.stderr)
        print("testTokenize", file=sys.stderr)

jieba_test.py 文件源码项目：Malicious_Domain_Whois 作者: h-j-13 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def testTokenize_NOHMM(self):
        """Check ``jieba.tokenize`` with ``HMM=False`` for every sample.

        Each entry of ``test_contents`` must produce a generator; its items
        are collected into a list and each (word, start, end) triple is
        written to stderr, followed by the test name at the end.
        """
        for content in test_contents:
            token_stream = jieba.tokenize(content, HMM=False)
            assert isinstance(token_stream, types.GeneratorType), "Test Tokenize Generator error"
            tokens = list(token_stream)
            assert isinstance(tokens, list), "Test Tokenize error on content: %s" % content
            for word, start, end in tokens:
                print("word %s\t\t start: %d \t\t end:%d" % (word, start, end), file=sys.stderr)
        print("testTokenize_NOHMM", file=sys.stderr)

test_tokenize_no_hmm.py 文件源码项目：Malicious_Domain_Whois 作者: h-j-13 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def cuttest(test_sent):
    """Print every token of *test_sent* with its start and end offsets.

    Segmentation uses ``jieba.tokenize`` with the module-level ``g_mode`` as
    the mode and with ``HMM=False``.
    """
    # g_mode is only read here, so no ``global`` declaration is required.
    for word, start, end in jieba.tokenize(test_sent, mode=g_mode, HMM=False):
        print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

test_tokenize.py 文件源码项目：Malicious_Domain_Whois 作者: h-j-13 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def cuttest(test_sent):
    """Print every token of *test_sent* with its start and end offsets.

    Segmentation uses ``jieba.tokenize`` with the module-level ``g_mode`` as
    the mode.
    """
    # g_mode is only read here, so no ``global`` declaration is required.
    for word, start, end in jieba.tokenize(test_sent, mode=g_mode):
        print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

analyzer.py 文件源码项目：Malicious_Domain_Whois 作者: h-j-13 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def __call__(self, text, **kargs):
        """Yield a token for every accepted segment of *text*.

        *text* is segmented with ``jieba.tokenize`` in ``"search"`` mode.
        A segment is skipped when it does not match ``accepted_chars`` and is
        at most one character long. A single ``Token`` instance is created up
        front and re-populated before each ``yield``, so consumers receive the
        same object every time. Extra keyword arguments are accepted but not
        used.
        """
        segments = jieba.tokenize(text, mode="search")
        shared = Token()
        for word, begin, end in segments:
            # Keep the segment if it matches accepted_chars or is longer than
            # one character.
            if accepted_chars.match(word) or len(word) > 1:
                shared.original = word
                shared.text = word
                shared.pos = begin
                shared.startchar = begin
                shared.endchar = end
                yield shared

analyzer.py 文件源码项目：jieba-GAE 作者: liantian-cn 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def __call__(self, text, **kargs):
        """Tokenize *text* with jieba (search mode) and yield the kept tokens.

        Segments that fail ``accepted_chars.match`` and have length <= 1 are
        dropped. One ``Token`` object is allocated per call and mutated in
        place for each yielded segment. ``**kargs`` is ignored.
        """
        pieces = jieba.tokenize(text, mode="search")
        tok = Token()
        for piece, first, last in pieces:
            keep = accepted_chars.match(piece) or len(piece) > 1
            if keep:
                tok.original = piece
                tok.text = piece
                tok.pos = first
                tok.startchar = first
                tok.endchar = last
                yield tok

analyzer.py 文件源码项目：my_bit_v1 作者: iSawyer 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def __call__(self, text, **kargs):
        """Yield a token for every accepted segment of *text*.

        *text* is segmented with ``jieba.tokenize`` in ``"search"`` mode.
        A segment is skipped when it does not match ``accepted_chars`` and is
        at most one character long. A single ``Token`` instance is created up
        front and re-populated before each ``yield``, so consumers receive the
        same object every time. Extra keyword arguments are accepted but not
        used.
        """
        segments = jieba.tokenize(text, mode="search")
        shared = Token()
        for word, begin, end in segments:
            # Keep the segment if it matches accepted_chars or is longer than
            # one character.
            if accepted_chars.match(word) or len(word) > 1:
                shared.original = word
                shared.text = word
                shared.pos = begin
                shared.startchar = begin
                shared.endchar = end
                yield shared

jieba_tokenizer.py 文件源码项目：Rasa_NLU_Chi 作者: crownpku 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUConfig, **Any) -> None
        """Attach a ``"tokens"`` entry to every training example.

        Raises ``Exception`` when ``config['language']`` is not ``'zh'``.
        Otherwise each item of ``training_data.training_examples`` gets the
        result of ``self.tokenize(item.text)`` stored under ``"tokens"``.
        Extra keyword arguments are accepted but not used.
        """
        language = config['language']
        if language != 'zh':
            raise Exception("tokenizer_jieba is only used for Chinese. Check your configure json file.")

        for sample in training_data.training_examples:
            sample_tokens = self.tokenize(sample.text)
            sample.set("tokens", sample_tokens)

jieba_tokenizer.py 文件源码项目：Rasa_NLU_Chi 作者: crownpku 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Tokenize ``message.text`` and store the result under ``"tokens"``.

        Extra keyword arguments are accepted but not used.
        """
        message_tokens = self.tokenize(message.text)
        message.set("tokens", message_tokens)

jieba_tokenizer.py 文件源码项目：Rasa_NLU_Chi 作者: crownpku 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def tokenize(self, text):
        # type: (Text) -> List[Token]
        """Segment *text* with ``jieba.tokenize`` and wrap each piece in a Token.

        Each ``Token`` is built from the segment's word and start offset; the
        end offset reported by jieba is not used.
        """
        tokens = []
        for word, start, _end in jieba.tokenize(text):
            tokens.append(Token(word, start))
        return tokens

analyzer.py 文件源码项目：http_server 作者: chenguolin 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def __call__(self, text, **kargs):
        """Tokenize *text* with jieba (search mode) and yield the kept tokens.

        Segments that fail ``accepted_chars.match`` and have length <= 1 are
        dropped. One ``Token`` object is allocated per call and mutated in
        place for each yielded segment. ``**kargs`` is ignored.
        """
        pieces = jieba.tokenize(text, mode="search")
        tok = Token()
        for piece, first, last in pieces:
            keep = accepted_chars.match(piece) or len(piece) > 1
            if keep:
                tok.original = piece
                tok.text = piece
                tok.pos = first
                tok.startchar = first
                tok.endchar = last
                yield tok

analyzer.py 文件源码项目：http_server 作者: chenguolin 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def __call__(self, text, **kargs):
        """Yield a token for every accepted segment of *text*.

        *text* is segmented with ``jieba.tokenize`` in ``"search"`` mode.
        A segment is skipped when it does not match ``accepted_chars`` and is
        at most one character long. A single ``Token`` instance is created up
        front and re-populated before each ``yield``, so consumers receive the
        same object every time. Extra keyword arguments are accepted but not
        used.
        """
        segments = jieba.tokenize(text, mode="search")
        shared = Token()
        for word, begin, end in segments:
            # Keep the segment if it matches accepted_chars or is longer than
            # one character.
            if accepted_chars.match(word) or len(word) > 1:
                shared.original = word
                shared.text = word
                shared.pos = begin
                shared.startchar = begin
                shared.endchar = end
                yield shared