def tokenize(str_stream, eos=True, remove_punct=False):
"""
Given a str or str_stream (f.read()) convert the str to a list of sentences,
e.g.: [[word, word], [word, word, ...], ...]
:param str_stream: a str or a str_stream
:param eos: wether turns '.' into <eos> tag
:param remove_punct: wether to remove punctuations: ':', ';', '--', ',', "'"
:return: a list of sentences, each sentence is a list of words (str)
"""
    # lazy import because importing nltk is slow
    import nltk
    try:
        nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        print('punkt resource not found, downloading it via nltk.download("punkt")...')
        nltk.download('punkt')
    tokens = [nltk.word_tokenize(t) for t in nltk.sent_tokenize(str_stream.lower())]
    # get POS tags (before numbers are collapsed, so the tagger sees the raw tokens)
    tokens_tags = nltk.pos_tag_sents(tokens, tagset='universal')
    pos_tags = []
    for token_tags in tokens_tags:
        _, tags = zip(*token_tags)
        pos_tags.append(tags)
    # replace numeric tokens with the placeholder 'N'
    tokens = [['N' if isfloat(t) else t for t in sublist] for sublist in tokens]
    if eos:
        for token in tokens:
            # replace the sentence-final token (the period, for well-formed text) with <eos>
            token[-1] = '<eos>'
    if remove_punct:
        tokens = [[t for t in sublist if t not in __punct_set] for sublist in tokens]
    return tokens, pos_tags
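
# Note: isfloat and __punct_set are module-level helpers referenced above but not
# shown in this snippet. A minimal sketch of plausible definitions, assuming
# isfloat() tests whether a token parses as a number and __punct_set holds the
# punctuation listed in the docstring:

__punct_set = {':', ';', '--', ',', "'"}

def isfloat(s):
    """Return True if s parses as a float (covers ints too, e.g. '42', '3.14')."""
    try:
        float(s)
        return True
    except ValueError:
        return False

# Example usage (hypothetical corpus file):
# with open('corpus.txt') as f:
#     sentences, pos_tags = tokenize(f.read(), eos=True, remove_punct=True)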