from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part-of-speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token
            # If stopword, ignore token and continue
            # if token in self.stopwords:
            #     continue
            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue
            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
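For context, a minimal sketch of a class this generator method could live on, assuming `lower`, `strip`, `punct`, `stopwords` and a WordNet-based `lemmatize` as shown; the class name `NLTKPreprocessor` and every default below are illustrative assumptions, not taken from the snippet.

import string

from nltk.corpus import stopwords as sw, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

class NLTKPreprocessor(object):
    # Hypothetical container that supplies the attributes tokenize() expects on `self`
    def __init__(self, lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.punct = set(string.punctuation)
        self.stopwords = set(sw.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def lemmatize(self, token, tag):
        # Map the Penn Treebank tag prefix to a WordNet POS before lemmatizing
        pos = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, pos)

    # Reuse the tokenize() generator defined above as the class's tokenize method
    tokenize = tokenize

# Example: list(NLTKPreprocessor().tokenize("The cats were chasing the mice."))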
import json

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def parseTweetSet(tweets_data_path):
    tweets_text = []
    english_stopwords_set = set(stopwords.words('english'))
    with open(tweets_data_path, "r") as tweets_file:
        for line in tweets_file:
            tweet = json.loads(line)
            text = tweet['text']
            tokens = wordpunct_tokenize(text)
            words = [word.lower() for word in tokens]
            words_set = set(words)
            common_elements = words_set.intersection(english_stopwords_set)
            # Keep tweets that share more than two tokens with the English
            # stopword list, i.e. tweets that are probably written in English
            if len(common_elements) > 2:
                tweets_text.append(tweet['text'])
    # Deduplicate the collected tweet texts
    tweets_text_set = set(tweets_text)
    #print len(tweets_text)
    #print len(tweets_text_set)
    #print tweets_text_set
    return list(tweets_text_set)
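A small usage sketch, assuming the input file holds one JSON-encoded tweet per line; the file name is illustrative.

# Each line of tweets.json is expected to be a JSON object with a 'text' field,
# e.g. {"text": "Just landed in Mumbai, the weather is great"}
english_tweets = parseTweetSet("tweets.json")
print(len(english_tweets), "probable-English tweets kept")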
import nltk

def tokenize(self, text):
    """
    Performs tokenization in addition to normalization.
    """
    return self.normalize(nltk.wordpunct_tokenize(text))
import nltk

def parse(sent):
    # `grammar` is a module-level CFG defined elsewhere in the source file
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)
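The snippet depends on a `grammar` object that is not shown; a minimal sketch of how one could be declared with `nltk.CFG.fromstring` (the toy rules are illustrative only).

import nltk

# Toy context-free grammar standing in for the real, larger grammar
grammar = nltk.CFG.fromstring("""
    S  -> NP VP
    NP -> DT NN
    VP -> VB NP
    DT -> 'the'
    NN -> 'dog' | 'ball'
    VB -> 'chased'
""")

# parse() returns an iterator over parse trees
for tree in parse("the dog chased the ball"):
    print(tree)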
import nltk

def tokenize(string, lower=True):
    if lower:
        return nltk.wordpunct_tokenize(string.lower().strip())
    else:
        return nltk.wordpunct_tokenize(string.strip())
import nltk

def tokenize_and_normalize(string, lower=True):
    # normalize() is defined elsewhere in the source module (see the sketch below)
    if lower:
        return nltk.wordpunct_tokenize(normalize(string).lower().strip())
    else:
        return nltk.wordpunct_tokenize(normalize(string).strip())
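`normalize` itself is not shown; one plausible minimal sketch, assuming it strips accents and other non-ASCII characters via Unicode NFKD decomposition (this is a guess, not the original implementation).

import unicodedata

def normalize(string):
    # Decompose accented characters, then drop the combining marks and any non-ASCII bytes
    return unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('ascii')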
import nltk

def nonenglish(string):
    '''Description: takes in a string of descriptions and returns the string with
    non-English words removed (useful for course syllabi).
    Parameters: a string of descriptions
    Output: the string with non-English words removed'''
    words = set(nltk.corpus.words.words())
    result = [w for w in nltk.wordpunct_tokenize(string) if w.lower() in words]
    return " ".join(result)
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def calculate_languages_ratios(text):
    """
    Compute, for each language included in nltk, the number of unique
    stopwords from that language that appear in the analyzed text.
    """
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = {word.lower() for word in tokens}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words & stopwords_set
        languages_ratios[language] = len(common_elements)
    return languages_ratios
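These scores are typically used to guess the language of a text by taking the maximum; a small companion sketch (the helper name `detect_language` is an assumption, not from the snippet).

def detect_language(text):
    # The language whose stopword list overlaps the text the most is the best guess
    ratios = calculate_languages_ratios(text)
    return max(ratios, key=ratios.get)

# Example: detect_language("Ceci est un petit texte en français.")  -> most likely 'french'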
import pickle

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def translateHinglishTweets(tweets_text):
    counter = 0
    tweets_text_translated = []
    n = len(tweets_text)
    # Load the Hinglish-to-English word dictionary
    open_file = open("dictionary.pickle", "rb")
    dictionary = pickle.load(open_file)
    open_file.close()
    english_stopwords_set = set(stopwords.words('english'))
    for i in range(n):
        text = tweets_text[i]
        translated_text = ""
        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        for word in words:
            if word in english_stopwords_set:
                translated_text = translated_text + " " + word
            elif word in dictionary:
                # Replace Hinglish words found in the dictionary with their English translation
                #print word + "-" + dictionary[word]
                translated_text = translated_text + " " + dictionary[word]
                counter = counter + 1
            else:
                translated_text = translated_text + " " + word
        tweets_text_translated.append(translated_text)
    #print counter
    return tweets_text_translated
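`dictionary.pickle` is expected to hold a plain Hinglish-to-English dict; a hedged sketch of how such a file could be produced (the entries shown are illustrative).

import pickle

# Illustrative Hinglish -> English mapping; the real dictionary is much larger
hinglish_to_english = {"kya": "what", "nahi": "no", "accha": "good", "yaar": "friend"}
with open("dictionary.pickle", "wb") as out_file:
    pickle.dump(hinglish_to_english, out_file)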
import string

import nltk

def __call__(self, text):
    '''
    @param text: the string of text to be tagged
    @returns: a list of tags respecting the order in the text
    '''
    sentences = nltk.sent_tokenize(text)
    punctuation = set(string.punctuation)
    proper_noun = lambda tag: tag == 'NN'
    # Tag is a tag container class defined or imported elsewhere in the source module
    tags = []
    # Giving importance to first sentence words.
    if len(sentences) > 0:
        # Stripping away punctuation
        words = nltk.pos_tag([word.lower() for word in nltk.wordpunct_tokenize(sentences[0])
                              if word not in punctuation])
        if len(words) > 1:
            tags.append(Tag(str(words[0][0])))
            for word, tag in words[1:-1]:
                tags.append(Tag(str(word), proper=proper_noun(tag)))
            tags.append(Tag(str(words[-1][0]),
                            proper=proper_noun(str(words[-1][1])),
                            terminal=True))
        elif len(words) == 1:
            tags.append(Tag(str(words[0][0]), terminal=True))
    # Rest of the sentences
    for sent in sentences[1:]:
        words = nltk.pos_tag([word.lower() for word in nltk.wordpunct_tokenize(sent)
                              if word not in punctuation])
        if len(words) > 1:
            for word, tag in words[:-1]:
                tags.append(Tag(str(word), proper=proper_noun(tag)))
        if len(words) > 0:
            tags.append(Tag(str(words[-1][0]),
                            proper=proper_noun(str(words[-1][1])),
                            terminal=True))
    return tags
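A minimal usage sketch, assuming `__call__` belongs to a tagger class and that `Tag` is a simple container with `proper` and `terminal` flags; the names `Tag`, `SentenceTagger` and the defaults below are illustrative stand-ins, not the original classes.

from collections import namedtuple

# Illustrative stand-in for the real Tag class
Tag = namedtuple('Tag', ['string', 'proper', 'terminal'])
Tag.__new__.__defaults__ = (False, False)   # proper and terminal default to False

class SentenceTagger(object):
    pass

# Attach the __call__ defined above so instances become callable
SentenceTagger.__call__ = __call__

tags = SentenceTagger()("NLTK splits text into sentences. Then it tags every word.")
print([t.string for t in tags])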