def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word, so that punctuation is caught as its own token.
    Relies on module-level `nltk`, `re`, and `stemmer` (e.g. an NLTK SnowballStemmer).
    """
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # keep only tokens containing letters (drops numeric tokens and raw punctuation),
    # and blank out a few domain-specific words so they are skipped before stemming
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if token in ('intern', 'student', 'and'):
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
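A minimal usage sketch for the function above, assuming NLTK's punkt data is available and that the module-level `stemmer` it expects is a SnowballStemmer (both are assumptions, not shown in the original source):

import re
import nltk
from nltk.stem.snowball import SnowballStemmer

# nltk.download('punkt')  # tokenizer models, needed once
stemmer = SnowballStemmer('english')   # assumed module-level dependency

print(tokenize_and_stem("An intern and a student ran quickly."))
# roughly ['an', 'a', 'ran', 'quick'] -- 'intern', 'student' and 'and' are dropped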
Python word_tokenize() examples

Source file: readdata.py, from project Natural-Language-Processing-Python-and-NLTK (author: PacktPublishing)
def preprocessing(text):
    # accept bytes input (the original assumed a Python 2 str) as well as str
    if isinstance(text, bytes):
        text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords (note: this runs before lowercasing, so capitalized stopwords survive)
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lowercase
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
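A quick invocation sketch, assuming the imports the snippet relies on (nltk, stopwords, WordNetLemmatizer) and that the punkt, stopwords, and wordnet corpora are installed:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
print(preprocessing("The cats were chasing the mice all afternoon."))
# roughly 'the cat chasing mouse afternoon'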
def ne_tagging(text):
    """Extract contiguous named-entity strings from text."""
    # requires: from nltk import ne_chunk, pos_tag, word_tokenize
    #           from nltk.tree import Tree
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
    # flush a trailing entity (the original version dropped it when the text ended with one)
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
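A usage sketch, assuming the NLTK imports noted above and that the punkt, averaged_perceptron_tagger, maxent_ne_chunker, and words resources have been downloaded:

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

print(ne_tagging("Barack Obama visited New York with Angela Merkel."))
# roughly ['Barack Obama', 'New York', 'Angela Merkel']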
def get_sentence_tokens(text):
    '''
    Given a text (review), return the token list of each sentence.
    Assumes module-level `sent_tokenize`, `word_tokenize`, and a `stopwords` collection.
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)
    sent_tokens = []
    for sentence in sentences:
        sent_token = word_tokenize(sentence)
        # remove stop words and empty tokens
        sent_token = [token for token in sent_token if token.strip() != '' and token not in stopwords]
        sent_tokens.append(sent_token)
    # stemming was also tried, but experiments showed it did not help:
    # if stemming:
    #     stemmer = PorterStemmer()
    #     texts = [[stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens
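A short invocation sketch, assuming NLTK provides the tokenizers and that `stopwords` is a module-level set (an assumption; the original's definition is not shown):

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))   # assumed module-level name

print(get_sentence_tokens("The battery lasts all day. I love it!"))
# roughly [['The', 'battery', 'lasts', 'day', '.'], ['I', 'love', '!']]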
def createbigramvocabulary(reviewfile, vocabfile):
    # assumes each review line starts with a +/- label character;
    # `createvocabulary`, `word_tokenize`, `bigrams` and `Counter` come from the surrounding module
    createvocabulary(reviewfile, vocabfile)
    finput = open(reviewfile, "r")
    foutput = open(vocabfile, "a")
    all_bigrams = []
    for line in finput:
        tokenized_line = ['*']                      # sentence-start marker
        tokenized_line.extend(word_tokenize(line[1:]))
        tokenized_line.append('$')                  # sentence-end marker
        all_bigrams.extend(bigrams(tokenized_line))
    c = Counter(all_bigrams)
    for b in c:
        # keep bigrams seen at least 3 times whose first token is not a label character
        if b[0] != "+" and b[0] != "-" and c[b] >= 3:
            foutput.write(b[0] + " " + b[1] + "\n")
    finput.close()
    foutput.close()
def word_count(message, word):
    """
    Computes the number of times a word appears in a message (case-insensitive).

    Args:
        message: A Message object.
        word: A string with no spaces.

    Returns:
        An int representing the number of times word (case-insensitive)
        appears among the tokens of message.text (tokenized with nltk.word_tokenize).
    """
    if ' ' in word:
        raise ValueError('word cannot contain spaces')
    lowercase_tokens = [token.lower() for token in nltk.word_tokenize(message.text)]
    return lowercase_tokens.count(word.lower())
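A small usage sketch; the Message type here is a hypothetical stand-in (any object with a .text attribute works):

import collections
import nltk

Message = collections.namedtuple('Message', ['text'])   # hypothetical stand-in

msg = Message(text="Ping? Ping! ping...")
print(word_count(msg, "ping"))   # 3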
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
        for question in questions:
            for word in nltk.word_tokenize(question):
                counter[word.lower()] += 1
        for answer in answers:
            for word in nltk.word_tokenize(answer):
                counter[word.lower()] += 1
    # no OOV handling here because the dataset vocabulary is small
    word2idx = {w: (i + 1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v: k for k, v in word2idx.items()}
    return word2idx, idx2word
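A toy invocation, assuming the bAbI-style (stories, questions, answers) tuples the function expects, where each story is a list of sentence strings:

import collections
import nltk

train = ([["John went to the hallway .", "Mary moved to the bathroom ."]],
         ["Where is Mary ?"],
         ["bathroom"])
test = ([["Sandra got the apple ."]],
        ["Where is the apple ?"],
        ["Sandra"])

word2idx, idx2word = build_vocab(train, test)
print(word2idx["PAD"])               # 0
print(idx2word[word2idx["mary"]])    # 'mary' -- everything is lowercased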
Source file: QnARecurAtteLatest2GRU.py, from project recurrent-attention-for-QA-SQUAD-based-on-keras (author: wentaozhu)
def tokenizeVal(sent):
    '''Tokenize a sentence (keeping punctuation) and map each token index to its
    character offset in the original string.
    >>> tokenizeVal('Bob dropped the apple. Where is the apple?')[0]
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    tokenizedSent = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sent)]
    tokenIdx2CharIdx = [None] * len(tokenizedSent)
    idx = 0
    token_idx = 0
    # walk the original string, aligning each token to the character position where it starts
    while idx < len(sent) and token_idx < len(tokenizedSent):
        word = tokenizedSent[token_idx]
        if sent[idx:idx + len(word)] == word:
            tokenIdx2CharIdx[token_idx] = idx
            idx += len(word)
            token_idx += 1
        else:
            idx += 1
    return tokenizedSent, tokenIdx2CharIdx
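A brief usage sketch for the helper above (punkt data assumed installed):

import nltk

sent = 'Bob dropped the apple. Where is the apple?'
tokens, tok2char = tokenizeVal(sent)
for tok, start in zip(tokens, tok2char):
    print(start, tok)    # each token with the character offset where it starts in `sent`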
The same tokenizeVal helper appears verbatim in QnARecurAtteLatest3Atten.py, QnARecurAtteLatest2Attenenhance.py, QnARecurAtteLatest2GRU1SATTE.py, QnARecurAtteLatest3Attenenhance.py, and QnARecurAtteLatest.py from the same project.
def tokenize_text(sample_text):
    # relies on module-level names not shown here: `sequence_lengths`, `cfg`,
    # `t_table` (a str.translate table), `tknzr` (e.g. an NLTK casual tokenizer), and `stopwords`
    global sequence_lengths
    processed_text = []
    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text
    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')
    if cfg.remove_stopwords:
        tokens = [w for w in tokens if w not in stopwords.words('english')]
    sequence_lengths.append(len(tokens))
    processed_text.extend(tokens)
    return processed_text
def process_imdb(fname, setting):
    labels, sentences = [], []
    filename = setting + ".csv"
    quota = [0, 0]
    # cap the number of examples kept per class
    maxquota = 5000 if setting == 'test' else 15000
    # the original opened the file in binary mode and decoded each field (Python 2 style);
    # opening in text mode with an explicit encoding does the same under Python 3
    with open(os.path.join(fname, filename), 'r', encoding='utf-8', newline='') as f:
        csvreader = csv.reader(f)
        for line in csvreader:
            label = 0 if line[0] == "1" else 1
            quota[label] += 1
            if quota[label] > maxquota:
                continue
            sentence = line[2].replace("\"", "")
            text = nltk.word_tokenize(sentence)
            labels.append(int(label))
            sentences.append(text)
    return sentences, labels
def tokenize(self, sentence):
    """
    Given a string, tokenize it into words (with the conventional notion
    of word).

    Parameters
    ----------
    sentence: str
        The string to tokenize.

    Returns
    -------
    tokenized_sentence: List[str]
        The tokenized representation of the string, as a list of tokens.
    """
    return nltk.word_tokenize(sentence.lower())
def add(self, filename, document):
    """
    Add a document string to the index.
    """
    # You can uncomment the following line to see the words found in each image.
    # print("Words found in %s: %s" % (filename, document))
    for token in [t.lower() for t in nltk.word_tokenize(document)]:
        if token in self.stopwords:
            continue
        if token in ['.', ',', ':', '']:
            continue
        if self.stemmer:
            token = self.stemmer.stem(token)
        # Add the filename to the set associated with the token.
        self.redis_token_client.sadd(token, filename)
    # Store the 'document text' for the filename.
    self.redis_docs_client.set(filename, document)
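A minimal sketch of how this index might be wired up and queried, assuming the redis-py client and NLTK are installed; the class name and the `search` method are hypothetical additions, not part of the original project:

import nltk
import redis
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

class ImageTextIndex:                                   # hypothetical wrapper class
    def __init__(self, host='localhost'):
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.redis_token_client = redis.Redis(host=host, db=0)
        self.redis_docs_client = redis.Redis(host=host, db=1)

    def search(self, query):
        """Return filenames whose text contains every (stemmed) query token."""
        tokens = [self.stemmer.stem(t.lower()) for t in nltk.word_tokenize(query)]
        return self.redis_token_client.sinter(tokens) if tokens else set()

ImageTextIndex.add = add        # attach the add() method shown above

# requires a running Redis server
index = ImageTextIndex()
index.add("cat.jpg", "A sleepy cat on a warm windowsill")
print(index.search("sleepy cats"))   # likely {b'cat.jpg'}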
Source file: feature_construction.py, from project Automatic-Question-Generation (author: bwanglzu)
def _identify_pronoun(self, answer):
    """Calculate the percentage of pronouns within the answer.
    - Args:
        answer(str): answer text
    - Returns:
        percentage(float): ratio of pronouns in the answer
    """
    text = nltk.word_tokenize(answer)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    num_pronouns = 0
    num_terms = len(post)
    percentage = 0
    for k, v in post:
        if v in pronoun_list:
            num_pronouns += 1
    # compute the ratio once, guarding against an empty answer
    if num_terms:
        percentage = float(num_pronouns) / num_terms
    return percentage
def _identify_pronoun2(self, sentence):
    """Collect the pronouns appearing in the question sentence, along with its length.
    - Args:
        sentence(str): question sentence
    - Returns:
        pronoun_in_sentence(list): pronouns in the sentence
        sentence_len(int): length of the current sentence
    """
    text = nltk.word_tokenize(sentence)
    post = nltk.pos_tag(text)
    pronoun_list = ['PRP', 'PRP$', 'WP', 'WP$']
    pronoun_in_sentence = []
    sentence_len = len(post)
    for k, v in post:
        if v in pronoun_list:
            pronoun_in_sentence.append(k)
    return pronoun_in_sentence, sentence_len
def _first_tagger_after_answer_span(self, question):
    """Get the POS tag of the first term after the answer span (the '_____' placeholder).
    - Args:
        question(string): string of the current question
    - Returns:
        tagger(string): POS tag of the first term after the span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == '_____':
            index = idx + 1
            break
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def _first_tagger_before_answer_span(self, question):
    """Get the POS tag of the first term before the answer span (the '_____' placeholder).
    - Args:
        question(string): string of the current question
    - Returns:
        tagger(string): POS tag of the first term before the span
    """
    index = 0
    text = nltk.word_tokenize(question)
    post = nltk.pos_tag(text)
    for idx, t in enumerate(post):
        if t[0] == "_____":
            index = idx - 1
            break
    # if the placeholder is the first token there is no preceding term
    # (the original wrapped around to the last tag via post[-1])
    if index < 0:
        return 'dummy'
    try:
        return post[index][1]
    except IndexError:
        return 'dummy'
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("env_data", type=str, help="Generated environment data filename in JSON format")
    args = parser.parse_args()

    print("-- Initialized environment")
    env = SquadEnv(args.env_data)

    context, question = env.reset()
    done = False
    while not done:
        print("Context ids: {}".format(context))
        print("Question ids: {}".format(question))
        print("Context tokens: {}".format(ids2tokens(context, env.id2token)))
        print("Question tokens: {}".format(ids2tokens(question, env.id2token)))

        answer_tokens = tokens2ids(word_tokenize(input("Answer: ")) + ["#eos#"], env.token2id)
        question_reward = 0
        for token in answer_tokens:
            (context, question), reward, done, _ = env.step(token)
            question_reward += reward
        print("You got {} reward".format(question_reward))
def LemNormalize(text):
    # replace non-ASCII characters
    text = text.encode('ascii', 'replace').decode()
    # map punctuation and digits to spaces
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # shortword = re.compile(r'\W*\b\w{1,2}\b')
    # transformed = shortword.sub('', transformed)
    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)
    # remove short words (3 characters or fewer)
    tokenized = [w for w in tokenized if len(w) > 3]
    # lemmatize via the LemTokens helper (defined elsewhere in the module)
    lemmas = LemTokens(tokenized)
    return lemmas
def LemNormalizeIt(text):
    # replace non-ASCII characters
    text = text.encode('ascii', 'replace').decode()
    # map punctuation and digits to spaces
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)
    # lemmatize with the morph-it lexicon, keeping only words longer than 3 characters
    morph_it = load_morph_it()
    tokenized = [morph_it.get(w, w) for w in tokenized if len(w) > 3]
    return tokenized
def tag(self, lines):
    '''
    Tokenize and POS-tag the words in the given text.
    :param lines: the text to tag
    :type lines: ``str``
    :return: a list of (word, tag) tuples as produced by nltk.pos_tag
    '''
    try:
        tokenized_words = nltk.word_tokenize(lines)
        return nltk.pos_tag(tokenized_words)
    except LookupError as le:
        print("Run install_words.py first")
        raise le
def _generate_candidate_keywords(self, sentences, max_length=3):
    """Create a list of candidate keyword phrases (at most max_length words each) from a set of sentences.

    Stopwords act as phrase boundaries ('|'), as does punctuation (via the is_punctuation helper)."""
    phrase_list = []
    for sentence in sentences:
        words = map(lambda x: "|" if x in self.stopwords else x,
                    nltk.word_tokenize(sentence.lower()))
        phrase = []
        for word in words:
            if word == "|" or is_punctuation(word):
                if 0 < len(phrase) <= max_length:
                    phrase_list.append(phrase)
                phrase = []
            else:
                phrase.append(word)
    return phrase_list
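A small self-contained sketch of the same RAKE-style candidate extraction, with hypothetical stand-ins for the is_punctuation helper and the instance's stopword list (neither is shown in the original source):

import string
import nltk
from nltk.corpus import stopwords

def is_punctuation(word):                       # hypothetical helper
    return all(ch in string.punctuation for ch in word)

class _Demo:                                    # hypothetical holder for the stopword set
    stopwords = set(stopwords.words('english'))

sentences = nltk.sent_tokenize("Compatibility of systems of linear constraints "
                               "over the set of natural numbers is studied.")
phrases = _generate_candidate_keywords(_Demo(), sentences, max_length=3)
print(phrases)
# roughly [['compatibility'], ['systems'], ['linear', 'constraints'], ['set'], ['natural', 'numbers'], ['studied']]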
def get_tokenizer(name, lowercase):
    if name == 'char':
        if lowercase:
            return lambda s: list(s.strip().lower())
        else:
            return lambda s: list(s.strip())
    elif (name == 'space') or (name == 'bpe'):
        if lowercase:
            return lambda s: s.lower().split()
        else:
            return str.split
    elif name == 'word':
        if lowercase:
            return lambda s: word_tokenize(s.lower())
        else:
            return word_tokenize
    else:
        raise ValueError('Unknown tokenizer: "%s"' % name)
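A quick usage sketch of the factory above (the 'word' branch assumes word_tokenize has been imported from nltk):

from nltk import word_tokenize

char_tok = get_tokenizer('char', lowercase=False)
print(char_tok("Hi!"))                    # ['H', 'i', '!']
word_tok = get_tokenizer('word', lowercase=True)
print(word_tok("Hello, World!"))          # ['hello', ',', 'world', '!']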
def _set_tokenizer(self, tokenizer):
    """
    Set the tokenization method.
    :param tokenizer: tokenization method ("nltk" or "spacy")
    :return: None
    """
    if tokenizer == "nltk":
        self.tokenizer = nltk.word_tokenize
    elif tokenizer == "spacy":
        # note: recent spaCy versions use the full model name, e.g. spacy.load("en_core_web_sm")
        spacy_en = spacy.load("en")

        def spacy_tokenizer(seq):
            return [w.text for w in spacy_en(seq)]

        self.tokenizer = spacy_tokenizer
    else:
        raise ValueError("Invalid tokenizing method %s" % tokenizer)
def map_coocurence(context_size, data):
    # assumes the langdetect package (e.g. from langdetect import detect;
    # from langdetect.lang_detect_exception import LangDetectException);
    # `_context_windows` and `isWord` are helpers defined elsewhere in the module
    coocurrence_list = []
    try:
        if detect(data) == 'en':
            region = nltk.word_tokenize(data)
            for l_context, word, r_context in _context_windows(region, context_size, context_size):
                if isWord(word):
                    # weight each co-occurring context word by the inverse of its distance
                    for i, context_word in enumerate(l_context[::-1]):
                        if isWord(context_word):
                            coocurrence_list.append(((word, context_word), 1 / (i + 1)))
                    for i, context_word in enumerate(r_context):
                        if isWord(context_word):
                            coocurrence_list.append(((word, context_word), 1 / (i + 1)))
    except LangDetectException:
        return coocurrence_list
    return coocurrence_list
def from_sentence(sent):
    # assumes: import nltk; from collections import defaultdict;
    #          from nltk.parse.dependencygraph import DependencyGraph
    tokens = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(tokens)
    dg = DependencyGraph()
    for (index, (word, tag)) in enumerate(tagged):
        dg.nodes[index + 1] = {
            'word': word,
            'lemma': '_',
            'ctag': tag,
            'tag': tag,
            'feats': '_',
            'rel': '_',
            'deps': defaultdict(),
            'head': '_',
            'address': index + 1,
        }
    dg.connect_graph()
    return dg
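A brief usage sketch, assuming the imports noted above and NLTK's punkt and averaged_perceptron_tagger data:

import nltk
from collections import defaultdict
from nltk.parse.dependencygraph import DependencyGraph

dg = from_sentence("The cat sat on the mat")
print(sorted(dg.nodes))                           # node addresses: 0 (root) plus one per token
print(dg.nodes[2]['word'], dg.nodes[2]['tag'])    # e.g. 'cat' 'NN'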