Python RegexpTokenizer() class: example source code

Source file: aligned.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, root, fileids,
             sep='/', word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             alignedsent_block_reader=read_alignedsent_block,
             encoding='latin1'):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
Source file: topic_modeler.py (project: Artificial-Intelligence-with-Python, author: PacktPublishing)
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the list of stop words
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words
    tokens = [x for x in tokens if x not in stop_words]

    # Perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed
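A minimal usage sketch for process(), assuming NLTK is installed and the 'stopwords' corpus has been downloaded; the exact stems depend on the stemmer version:

# import nltk; nltk.download('stopwords')   # one-time setup
print(process("The cats are chasing the mice in the garden"))
# -> e.g. ['cat', 'chase', 'mice', 'garden']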
def paragraph_to_words(paragraph, remove_stopwords=False, lemmatize=True, stem=False):
    # NOTE: relies on module-level `lemmatizer`, `stemmer` and `LabelDoc`
    # objects defined elsewhere in the original project.
    words = BeautifulSoup(paragraph["review"], "html.parser").get_text()
    words = re.sub("[^a-zA-Z]", " ", words)
    # tokenizer = RegexpTokenizer(r'\w+')
    # words = tokenizer.tokenize(words.strip().lower())
    words = words.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmatize:
        words = [lemmatizer.lemmatize(w) for w in words]
    if stem:
        words = [stemmer.stem(w) for w in words]
    return LabelDoc(words, paragraph["id"])
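A hedged usage sketch: `LabelDoc`, `lemmatizer` and `stemmer` are not shown in this snippet, so the stand-ins below are assumptions chosen only to make the call runnable (the 'wordnet' corpus must be downloaded for lemmatization):

from collections import namedtuple
from nltk.stem import WordNetLemmatizer, PorterStemmer

LabelDoc = namedtuple("LabelDoc", ["words", "tags"])   # assumed stand-in
lemmatizer = WordNetLemmatizer()                       # assumed stand-in
stemmer = PorterStemmer()                              # assumed stand-in

doc = paragraph_to_words({"review": "<p>The movies were great!</p>", "id": "r1"})
print(doc.words)  # e.g. ['the', 'movie', 'were', 'great']
print(doc.tags)   # 'r1'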
def tweets(word_len, sent_len, train_valid_ratio=[5, 1]):
    # NOTE: CharNumberEncoder, CatNumberEncoder and make_one_hot are
    # project-specific helpers from the surrounding codebase.
    df = pandas.read_csv('tweets_large.csv')
    field = 'text'
    label = 'label'
    tokenizer = RegexpTokenizer(r'\w+')

    # encode characters into numbers
    encoder = CharNumberEncoder(df[field].values, tokenizer=tokenizer,
                                word_len=word_len, sent_len=sent_len)
    encoder.build_char_map()
    encode_X = encoder.make_char_embed()

    # encode categories into one hot array
    cat_encoder = CatNumberEncoder(df[label])
    cat_encoder.build_cat_map()
    encode_y = cat_encoder.make_cat_embed()

    nclass = len(np.unique(encode_y))
    encode_y = make_one_hot(encode_y, nclass)

    return encode_X, encode_y, nclass
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()

    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
          FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
          FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if pos == 'NN']
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if pos == 'JJ']
    print("--- Most frequent adjectives ---\n", FreqDist(adjts).most_common(15))
    # lexical_density() is a helper defined elsewhere in the project
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
def __init__(self, fname):
    words_map = {}
    for line in csv.reader(open(fname)):
        word, syn = line
        if word.startswith('#'):
            continue
        words_map[word] = syn
    super(CSVWordReplacer, self).__init__(words_map)
######### for now just a wrapper to RegexpTokenizer #########
def __init__(self, pattern):
    self.pattern = pattern
    self.tokenizer = RegexpTokenizer(self.pattern)
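The rest of this wrapper class is not shown; a minimal delegating method it would plausibly expose might look like the sketch below (the method name is an assumption, not taken from the source):

def tokenize(self, text):
    # Hypothetical convenience method: forward to the wrapped nltk RegexpTokenizer
    return self.tokenizer.tokenize(text)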
######## defining a default stopwords set #############
def rm_punctuation(data, pattern=r'[a-zA-Z]+-?[0-9]*', silent=1):
    if silent == 0:
        print("remove punctuation ...")
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(pattern)
    return tokenizer.tokenize(" ".join(data))
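A short usage sketch; note that `data` is expected to be an iterable of strings, which the function joins into one text before tokenizing (tokens with no letters, such as bare numbers, are dropped by the default pattern):

docs = ["Hello, world!", "COVID-19 cases rose 5%."]
print(rm_punctuation(docs))
# -> ['Hello', 'world', 'COVID-19', 'cases', 'rose']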
Source file: topic_modeling.py (project: Python-Machine-Learning-Cookbook, author: PacktPublishing)
def __init__(self):
    # Create a regular expression tokenizer
    self.tokenizer = RegexpTokenizer(r'\w+')

    # Get the list of stop words
    self.stop_words_english = stopwords.words('english')

    # Create a Snowball stemmer
    self.stemmer = SnowballStemmer('english')

# Tokenizing, stop word removal, and stemming
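The method that the comment above introduces is cut off in this listing; below is an assumed reconstruction based only on the attributes created in __init__ (a sketch, not the verbatim cookbook code):

def process(self, input_text):
    # Tokenize, drop English stop words, then stem what remains
    tokens = self.tokenizer.tokenize(input_text.lower())
    tokens = [x for x in tokens if x not in self.stop_words_english]
    return [self.stemmer.stem(x) for x in tokens]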
def get_list():
    stop_words = set(stopwords.words('english'))
    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k, v in data.items():
        if k == "WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])
    s = "two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")
    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')
    for index, para in enumerate(paragraph_list):
        raw = para.lower()
        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if t not in stop_words]
        # keep only purely alphabetic tokens (the original used `x.isalpha`
        # without the call parentheses, which is always truthy)
        number_tokens = [x for x in stopped_tokens if x.isalpha()]
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])
        taggeddoc.append(td)
    return taggeddoc
def get_summarized(self, input_data, num_sentences):
    # TODO: allow the caller to specify the tokenizer they want
    # TODO: allow the user to specify the sentence tokenizer they want
    # TODO multilingual!
    tokenizer = RegexpTokenizer(r'\w+')
    stopwords_ = [smart_text(word) for word in stopwords.words('english')]

    # get the frequency of each word in the input
    base_words = [smart_text(word.lower())
                  for word in tokenizer.tokenize(smart_text(input_data))]
    words = [smart_text(word) for word in base_words if word not in stopwords_]
    word_frequencies = FreqDist(words)

    # now create a set of the most frequent words
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # break the input up into sentences.  working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(input_data)
    working_sentences = [sentence.lower() for sentence in actual_sentences]

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break

    # sort the output sentences back to their original order
    return self.reorder_sentences(output_sentences=output_sentences,
                                  input_data=input_data)
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower())

def remove_punctuation(str):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(str)
def tokenize(text, level):
    """Tokenize a text into a list of strings.

    Args:
        text (str): An arbitrary string.
        level (str): Either "char" or "word". For "char", the string is split into characters.
            For "word", letters and numbers are glued to themselves and everything else is split.
            Example: "asdf df!?123 as12" -> "asdf", " ", "df", "!", "?", "123", " ", "as", "12"

    Returns:
        list[str]: The tokens

    Raises:
        ValueError: If the level is not "char" or "word"
    """
    if level == "char":
        # No need for tokenizing
        return list(text)
    elif level == "word":
        # Tokenize while keeping indentation. Glue letters and numbers to themselves but
        # keep all other chars isolated.
        tokenizer = RegexpTokenizer(r'\w+|\S|\s')
        return tokenizer.tokenize(text)
    else:
        raise ValueError("Unknown token level: {}".format(level))
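A quick usage sketch showing both levels (assuming nltk is installed):

print(tokenize("as12 df!", "word"))   # -> ['as12', ' ', 'df', '!']
print(tokenize("ab!", "char"))        # -> ['a', 'b', '!']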
Source file: rte_classify.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                          'have', 'are', 'were', 'and', 'very', '.', ','])

    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                         'denied'])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        # NOTE: inside this method the name `lemmatize` is the boolean
        # parameter, so passing lemmatize=True would try to call a bool
        # here; this quirk is preserved from the original snippet.
        self.text_words = set(lemmatize(token) for token in self.text_tokens)
        self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
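A hedged usage sketch, assuming this constructor belongs to NLTK's RTEFeatureExtractor and that the RTE corpus is available (nltk.download('rte')):

import nltk

rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[0]
extractor = RTEFeatureExtractor(rtepair)
print(extractor._overlap)    # word types shared by text and hypothesis
print(extractor._hyp_extra)  # word types found only in the hypothesis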
Source file: ycoe.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, root, items, encoding='utf8'):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(self, root, items, sep='_',
                                sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
def remove_punc(string):
    '''Description: This function takes in a string of descriptions and returns a tokenized string without punctuation
    Parameters: String of descriptions
    Output: Tokenized string with punctuation removed'''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(string)
    return " ".join(tokens)
def getAllReviews(movieList):
    # List comprehensions replace the lazy map/filter objects of the original,
    # which break np.array/np.concatenate under Python 3; the stemmer and
    # stop-word set are hoisted out of the loop for speed.
    reviews = np.concatenate([movie["reviews"] for movie in movieList])
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = [stemmer.stem(x) for x in s]
        s = [x for x in s if x not in stop_words]
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))
    return tokenizeReview
def getAllCritics(movieList):
    # Same Python 3 fixes as getAllReviews above.
    reviews = np.concatenate([movie["critics"] for movie in movieList])
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = [stemmer.stem(x) for x in s]
        s = [x for x in s if x not in stop_words]
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))
    return tokenizeReview