def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
                                           text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)
    return words
# return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
# re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
# not token.isdigit() and not token in stop_words]
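A minimal usage sketch for tiny_tokenize above, assuming the imports the snippet relies on; the input is passed as a UTF-8 byte string because the function calls text.decode():
# Hypothetical usage sketch; these imports are assumed by the snippet above.
import re
import string
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer

raw = b"Stemming reduces words such as running and runs to a common root."
print(tiny_tokenize(raw, stem=True, stop_words=['a', 'and', 'to']))
# punctuation and digits are stripped, stop words are dropped, remaining tokens are stemmed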
Python PorterStemmer() class usage examples (source code)
def select_top_words(word_list, n=10):
    """Filter out cluster term names."""
    import re
    from nltk.stem.porter import PorterStemmer
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
    st = PorterStemmer()
    out_st = []
    out = []
    for word in word_list:
        word_st = st.stem(word)
        if (len(word_st) <= 2 or
                re.match(r'\d+', word_st) or
                re.match(r'[^a-zA-Z0-9]', word_st) or
                word in COMMON_FIRST_NAMES or
                word in CUSTOM_STOP_WORDS or
                word in ENGLISH_STOP_WORDS or
                word_st in out_st):  # ignore stemming duplicates
            continue
        out_st.append(word_st)
        out.append(word)
        if len(out) >= n:
            break
    return out
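A hypothetical usage sketch for select_top_words. COMMON_FIRST_NAMES and CUSTOM_STOP_WORDS are module-level collections in the original project, so placeholder values are supplied here; note that on recent scikit-learn versions ENGLISH_STOP_WORDS is imported from sklearn.feature_extraction.text instead.
# Placeholder globals; the original project defines its own lists.
COMMON_FIRST_NAMES = {'john', 'mary'}
CUSTOM_STOP_WORDS = {'etc'}

terms = ['running', 'runs', 'john', 'the', '42', 'clustering', 'clusters']
print(select_top_words(terms, n=5))
# -> ['running', 'clustering']: 'runs' and 'clusters' are dropped as stemming
# duplicates; 'john', 'the' and '42' are filtered out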
def porter(inputpath=None, text=None):
    """
    Apply the Porter stemmer either to a single text string or to every
    comma-separated token in each file under inputpath.
    """
    data = ''
    p = PorterStemmer()
    if inputpath:
        filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
        pstemmed_list = []
        for file in filenames:
            with open(file, 'r') as f:
                data = f.read()
            if data:
                texts = data.split(',')
                stemmedfile = []
                for text in texts:
                    pstemmed = p.stem(text)
                    stemmedfile.append(pstemmed)
                pstemmed_list.extend(stemmedfile)
        return pstemmed_list
    if text:
        pstemmed = p.stem(text)
        return pstemmed
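A minimal usage sketch for porter(), assuming os and PorterStemmer are imported at module level; the directory path is illustrative.
# Usage sketch (hypothetical paths).
import os
from nltk.stem.porter import PorterStemmer

print(porter(text="connections"))      # e.g. 'connect'
# stems = porter(inputpath="data/")    # stems every comma-separated token in each file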
def stem_split(tokens):
    """ Takes a list of tokens and splits stemmed tokens into
    stem, ending - inserting ending as extra token.
    returns: revised (possibly longer) list of tokens. """
    stemmer = PorterStemmer()
    token_list = list()
    for token in tokens:
        stem = stemmer.stem(token)
        split_list = token.split(stem)
        if token == stem:
            token_list.append(token)
        elif len(split_list) > 1:
            token_list.append(stem)
            token_list.append(split_list[1])
        else:
            token_list.append(split_list[0])
    return token_list
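A small usage sketch showing how stem_split re-inserts stripped endings as separate tokens:
from nltk.stem.porter import PorterStemmer

print(stem_split(['run', 'running', 'cats']))
# -> ['run', 'run', 'ning', 'cat', 's']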
def stem(words, stem_dic, mode="nltk", silent=1):
    if silent == 0:
        print("stem ...")
    if mode == "nltk":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    else:
        print("unknown mode", mode)
        assert 0
    for word in set(words):
        if word not in stem_dic:
            stem_dic[word] = stemmer.stem(word)
    words = [stem_dic[word] for word in words]
    return words
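Usage sketch: stem_dic acts as a cache owned by the caller, so repeated calls reuse previously computed stems.
stem_dic = {}
print(stem(["running", "runs", "running"], stem_dic, silent=0))
# -> ['run', 'run', 'run']; stem_dic now maps each surface form to its stem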
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
        re.sub('[%s]' % re.escape(string.punctuation), ' ', text.encode(encoding='ascii', errors='ignore'))) if
        not token.isdigit() and token not in stop_words]
def __porter_stemmer(self):
    """Initializes PorterStemmer.

    Returns:
        None. The initialized PorterStemmer is stored on self.stemmer.
    """
    self.stemmer = PorterStemmer()
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = list(map(lambda token: PorterStemmer().stem(token), words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length, tokens))
    return filtered_tokens
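A hypothetical usage sketch; cachedStopWords is a module-level stop-word list in the original code, and the NLTK 'punkt' and 'stopwords' data need to be available.
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

cachedStopWords = stopwords.words('english')
print(tokenize("The striped bats were hanging on their feet."))
# stop words, short tokens and non-alphabetic tokens are dropped; the rest are stemmed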
def __init__(self, full_word):
    self.full_word = full_word
    # TODO: Lemmatization requires downloads
    # wnl = WordNetLemmatizer()
    # lemmas = [wnl.lemmatize(token) for token in tokens]
    self.stem = PorterStemmer().stem(full_word).lower()
def get_list():
    stop_words = set(stopwords.words('english'))
    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k, v in data.items():
        if k == "WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])
    s = "two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")
    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')
    for index, para in enumerate(paragraph_list):
        raw = para.lower()
        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if t not in stop_words]
        number_tokens = [x for x in stopped_tokens if x.isalpha()]  # isalpha() must be called, not just referenced
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])
        taggeddoc.append(td)
    return taggeddoc
def tokenizer_porter(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split() if word not in stop]
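A hypothetical usage sketch; stop is a module-level stop-word list in the original project.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stop = stopwords.words('english')
print(tokenizer_porter('runners like running and thus they run'))
# stop words are dropped and the remaining words are Porter-stemmed, e.g. 'runners' -> 'runner'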
# We switch to this stemmer because it has support for Spanish
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def tweet_stemming(tweet, token_freqs):
    """
    Stems tweet words and counts diversity

    :param tweet: the tweet to analyze
    :type tweet: str or unicode
    :param token_freqs: counter of word frequency
    :type token_freqs: Counter
    :returns: number of words added to token_freqs
    :rtype: int
    """
    pattern_url = r'((https?:\/\/)|www\.)([\da-z\.-]+)\.([\/\w \.-]*)( |$)'
    regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    porter = PorterStemmer()
    counter_tokens = 0
    tweet_url_removed = re.sub(pattern_url, '', tweet, flags=re.MULTILINE)  # remove URLs
    tweet_url_removed_tokenized = word_tokenize(tweet_url_removed)  # tokenize tweet
    tweet_url_removed_tokenized_cleaned_stemming = []  # cleaned of URLs and hashes, then stemmed
    for token in tweet_url_removed_tokenized:
        new_token = regex_punctuation.sub(u'', token)  # remove punctuation and hash signs
        if not new_token == u'':
            new_token_stemming = porter.stem(new_token)
            tweet_url_removed_tokenized_cleaned_stemming.append(new_token_stemming)
            token_freqs[new_token_stemming] += 1
            counter_tokens += 1
    return counter_tokens
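A minimal usage sketch for tweet_stemming, assuming the module-level imports it relies on; the example tweet is illustrative.
import re
import string
from collections import Counter
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer

freqs = Counter()
n = tweet_stemming("Loving the new #NLTK release! https://www.nltk.org", freqs)
print(n, freqs)  # number of kept tokens and per-stem frequencies, with the URL and punctuation removed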
def stem_list(word_list):
    """
    Return a list with each word stemmed.

    :param word_list: word list to be stemmed.
    :return: list
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in word_list]
snowball.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
sse_client.py (project: Searchable-Symmetric-Encryption, author: IanVanHoudt)
def __init__(self):
    # TODO: placeholder for password. Will eventually take
    # as an arg of some sort
    self.password = b"password"

    # TODO: need to sort out use of salt. Previously, salt was
    # randomly generated in initKeys, but the resulting pass-
    # words k & kPrime were different on each execution, and
    # decryption was impossible. Hardcoding salt makes decryption
    # possible but may be a bad shortcut
    self.iv = None
    self.salt = "$2b$12$ddTuco8zWXF2.kTqtOZa9O"

    # Two keys, generated/initialized by KDF
    (self.k, self.kPrime) = self.initKeys()

    # Two K's: generated/initialized by PRF
    self.k1 = None
    self.k2 = None

    # client's cipher (AES w/ CBC)
    self.cipher = self.initCipher()

    # Stemming tool (cuts words to their roots/stems)
    self.stemmer = PorterStemmer()
def stem(tokens):
    """ Stem passed text tokens. """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def __init__(self):
    self.ps = PorterStemmer()
def __init__(self):
    self.ps = PorterStemmer()
def getAllReviews(movieList):
    reviews = np.array(map(lambda x: x["reviews"], movieList))
    reviews = np.concatenate(reviews)
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))
    return tokenizeReview
def getAllCritics(movieList):
    reviews = np.array(map(lambda x: x["critics"], movieList))
    reviews = np.concatenate(reviews)
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = map(lambda x: PorterStemmer().stem(x), s)
        s = filter(lambda x: x not in stopwords.words('english'), s)
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))
    return tokenizeReview
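getAllReviews and getAllCritics rely on Python 2 semantics, where map and filter return lists; under Python 3 those calls would need list(...) wrappers before np.array and np.concatenate. A hedged, Python 3-oriented sketch of the per-review step (tokenize_review is a hypothetical helper name):
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

stemmer = PorterStemmer()
english_stops = set(stopwords.words('english'))

def tokenize_review(review, threshold=30):
    # lowercase, tokenize on word characters, stem, then drop stop words (same order as above)
    tokens = RegexpTokenizer(r'\w+').tokenize(review['review'].lower())
    stems = [stemmer.stem(t) for t in tokens]
    stems = [t for t in stems if t not in english_stops]
    return (stems, 'pos' if review['score'] >= threshold else 'neg')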
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens = list(map(lambda token: PorterStemmer().stem(token), words))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and
                                  len(token) >= min_length, tokens))
    return filtered_tokens
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def tokenizer_porter(text):
    return [PorterStemmer().stem(word) for word in text.split()]
def __init__(self):
    self.stemmer = PorterStemmer()
graphssl.py (project: graph-based-semi-supervised-learning, author: deerishi)
def __init__(self):
    self.stemmer = PorterStemmer()
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def __init__(self, ignore_stopwords=False):
    _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
    porter.PorterStemmer.__init__(self)
def __init__(self, lang="spanish"):
    """
    Initializes the parameters for a specific language
    """
    self.languages = ["spanish", "english", "italian", "german"]
    self.lang = lang
    if self.lang not in self.languages:
        raise LangDependencyError("Language not supported: " + lang)
    self.stopwords = LangDependency.STOPWORDS_CACHE.get(lang, None)
    if self.stopwords is None:
        self.stopwords = self.load_stopwords(os.path.join(PATH, "{0}.stopwords".format(lang)))
        LangDependency.STOPWORDS_CACHE[lang] = self.stopwords
    self.neg_stopwords = LangDependency.NEG_STOPWORDS_CACHE.get(lang, None)
    if self.neg_stopwords is None:
        self.neg_stopwords = self.load_stopwords(os.path.join(PATH, "{0}.neg.stopwords".format(lang)))
        LangDependency.NEG_STOPWORDS_CACHE[lang] = self.neg_stopwords
    if self.lang not in SnowballStemmer.languages:
        raise LangDependencyError("Language not supported for stemming: " + lang)
    if self.lang == "english":
        self.stemmer = PorterStemmer()
    else:
        self.stemmer = SnowballStemmer(self.lang)
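An illustrative sketch of the stemmer selection this constructor performs; make_stemmer is a hypothetical helper and covers only the stemmer choice, not the stop-word loading.
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer

def make_stemmer(lang):
    # English keeps the classic Porter stemmer; other supported languages use Snowball.
    return PorterStemmer() if lang == "english" else SnowballStemmer(lang)

print(make_stemmer("english").stem("connections"))  # e.g. 'connect'
print(make_stemmer("spanish").stem("corriendo"))    # Spanish Snowball stem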