from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part-of-speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token
            # If stopword, ignore token and continue
            # if token in self.stopwords:
            #     continue
            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue
            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
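For context, a minimal sketch of a class this generator method could live on, assuming `lower`, `strip`, `punct`, `stopwords` and a WordNet-based `lemmatize` as shown; the class name `NLTKPreprocessor` and every default below are illustrative assumptions, not taken from the snippet.

import string

from nltk.corpus import stopwords as sw, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

class NLTKPreprocessor(object):
    # Hypothetical container that supplies the attributes tokenize() expects on `self`
    def __init__(self, lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.punct = set(string.punctuation)
        self.stopwords = set(sw.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def lemmatize(self, token, tag):
        # Map the Penn Treebank tag prefix to a WordNet POS before lemmatizing
        pos = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, pos)

    # Reuse the tokenize() generator defined above as the class's tokenize method
    tokenize = tokenize

# Example: list(NLTKPreprocessor().tokenize("The cats were chasing the mice."))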
import json

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def parseTweetSet(tweets_data_path):
    tweets_text = []
    english_stopwords_set = set(stopwords.words('english'))
    with open(tweets_data_path, "r") as tweets_file:
        for line in tweets_file:
            tweet = json.loads(line)
            text = tweet['text']
            tokens = wordpunct_tokenize(text)
            words = [word.lower() for word in tokens]
            words_set = set(words)
            common_elements = words_set.intersection(english_stopwords_set)
            # Keep tweets that share more than two tokens with the English
            # stopword list, i.e. tweets that are probably written in English
            if len(common_elements) > 2:
                tweets_text.append(tweet['text'])
    # Deduplicate the collected tweet texts
    tweets_text_set = set(tweets_text)
    #print len(tweets_text)
    #print len(tweets_text_set)
    #print tweets_text_set
    return list(tweets_text_set)
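A small usage sketch, assuming the input file holds one JSON-encoded tweet per line; the file name is illustrative.

# Each line of tweets.json is expected to be a JSON object with a 'text' field,
# e.g. {"text": "Just landed in Mumbai, the weather is great"}
english_tweets = parseTweetSet("tweets.json")
print(len(english_tweets), "probable-English tweets kept")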
import nltk

def tokenize(self, text):
    """
    Performs tokenization in addition to normalization.
    """
    return self.normalize(nltk.wordpunct_tokenize(text))
import nltk

def parse(sent):
    # `grammar` is a module-level CFG defined elsewhere in the source file
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)
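The snippet depends on a `grammar` object that is not shown; a minimal sketch of how one could be declared with `nltk.CFG.fromstring` (the toy rules are illustrative only).

import nltk

# Toy context-free grammar standing in for the real, larger grammar
grammar = nltk.CFG.fromstring("""
    S  -> NP VP
    NP -> DT NN
    VP -> VB NP
    DT -> 'the'
    NN -> 'dog' | 'ball'
    VB -> 'chased'
""")

# parse() returns an iterator over parse trees
for tree in parse("the dog chased the ball"):
    print(tree)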
import nltk

def tokenize(string, lower=True):
    if lower:
        return nltk.wordpunct_tokenize(string.lower().strip())
    else:
        return nltk.wordpunct_tokenize(string.strip())
import nltk

def tokenize_and_normalize(string, lower=True):
    # normalize() is defined elsewhere in the source module (see the sketch below)
    if lower:
        return nltk.wordpunct_tokenize(normalize(string).lower().strip())
    else:
        return nltk.wordpunct_tokenize(normalize(string).strip())
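`normalize` itself is not shown; one plausible minimal sketch, assuming it strips accents and other non-ASCII characters via Unicode NFKD decomposition (this is a guess, not the original implementation).

import unicodedata

def normalize(string):
    # Decompose accented characters, then drop the combining marks and any non-ASCII bytes
    return unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('ascii')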
import nltk

def nonenglish(string):
    '''Description: takes in a string of descriptions and returns the string with
    non-English words removed (useful for course syllabi).
    Parameters: a string of descriptions
    Output: the string with non-English words removed'''
    words = set(nltk.corpus.words.words())
    result = [w for w in nltk.wordpunct_tokenize(string) if w.lower() in words]
    return " ".join(result)
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def calculate_languages_ratios(text):
    """
    Compute, for each language included in nltk, the number of unique
    stopwords from that language that appear in the analyzed text.
    """
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = {word.lower() for word in tokens}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words & stopwords_set
        languages_ratios[language] = len(common_elements)
    return languages_ratios
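These scores are typically used to guess the language of a text by taking the maximum; a small companion sketch (the helper name `detect_language` is an assumption, not from the snippet).

def detect_language(text):
    # The language whose stopword list overlaps the text the most is the best guess
    ratios = calculate_languages_ratios(text)
    return max(ratios, key=ratios.get)

# Example: detect_language("Ceci est un petit texte en français.")  -> most likely 'french'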
import pickle

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def translateHinglishTweets(tweets_text):
    counter = 0
    tweets_text_translated = []
    n = len(tweets_text)
    # Load the Hinglish-to-English word dictionary
    open_file = open("dictionary.pickle", "rb")
    dictionary = pickle.load(open_file)
    open_file.close()
    english_stopwords_set = set(stopwords.words('english'))
    for i in range(n):
        text = tweets_text[i]
        translated_text = ""
        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        for word in words:
            if word in english_stopwords_set:
                translated_text = translated_text + " " + word
            elif word in dictionary:
                # Replace Hinglish words found in the dictionary with their English translation
                #print word + "-" + dictionary[word]
                translated_text = translated_text + " " + dictionary[word]
                counter = counter + 1
            else:
                translated_text = translated_text + " " + word
        tweets_text_translated.append(translated_text)
    #print counter
    return tweets_text_translated
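`dictionary.pickle` is expected to hold a plain Hinglish-to-English dict; a hedged sketch of how such a file could be produced (the entries shown are illustrative).

import pickle

# Illustrative Hinglish -> English mapping; the real dictionary is much larger
hinglish_to_english = {"kya": "what", "nahi": "no", "accha": "good", "yaar": "friend"}
with open("dictionary.pickle", "wb") as out_file:
    pickle.dump(hinglish_to_english, out_file)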
import string

import nltk

def __call__(self, text):
    '''
    @param text: the string of text to be tagged
    @returns: a list of tags respecting the order in the text
    '''
    sentences = nltk.sent_tokenize(text)
    punctuation = set(string.punctuation)
    proper_noun = lambda tag: tag == 'NN'
    # Tag is a tag container class defined or imported elsewhere in the source module
    tags = []
    # Giving importance to first sentence words.
    if len(sentences) > 0:
        # Stripping away punctuation
        words = nltk.pos_tag([word.lower() for word in nltk.wordpunct_tokenize(sentences[0])
                              if word not in punctuation])
        if len(words) > 1:
            tags.append(Tag(str(words[0][0])))
            for word, tag in words[1:-1]:
                tags.append(Tag(str(word), proper=proper_noun(tag)))
            tags.append(Tag(str(words[-1][0]),
                            proper=proper_noun(str(words[-1][1])),
                            terminal=True))
        elif len(words) == 1:
            tags.append(Tag(str(words[0][0]), terminal=True))
    # Rest of the sentences
    for sent in sentences[1:]:
        words = nltk.pos_tag([word.lower() for word in nltk.wordpunct_tokenize(sent)
                              if word not in punctuation])
        if len(words) > 1:
            for word, tag in words[:-1]:
                tags.append(Tag(str(word), proper=proper_noun(tag)))
        if len(words) > 0:
            tags.append(Tag(str(words[-1][0]),
                            proper=proper_noun(str(words[-1][1])),
                            terminal=True))
    return tags
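A minimal usage sketch, assuming `__call__` belongs to a tagger class and that `Tag` is a simple container with `proper` and `terminal` flags; the names `Tag`, `SentenceTagger` and the defaults below are illustrative stand-ins, not the original classes.

from collections import namedtuple

# Illustrative stand-in for the real Tag class
Tag = namedtuple('Tag', ['string', 'proper', 'terminal'])
Tag.__new__.__defaults__ = (False, False)   # proper and terminal default to False

class SentenceTagger(object):
    pass

# Attach the __call__ defined above so instances become callable
SentenceTagger.__call__ = __call__

tags = SentenceTagger()("NLTK splits text into sentences. Then it tags every word.")
print([t.string for t in tags])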