from nltk.tokenize import WordPunctTokenizer

def getredundantComponents(sentences):
    """Extract the opening, middle, and closing word windows of each sentence."""
    window_size = 4
    introList = []
    midlist = []
    endlist = []
    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent = len(words)
        m_point = length_sent // 2        # midpoint of the token list
        index_span = window_size // 2     # half-window taken on each side of the midpoint
        intro = ' '.join(words[0:window_size])
        mid = ' '.join(words[m_point - index_span:m_point + index_span])
        end = ' '.join(words[-window_size:])
        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)
    return introList, midlist, endlist
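A minimal usage sketch; the sample sentences are made up for illustration:

    # Pull the intro/mid/end windows from two sample sentences.
    sentences = [
        "The quick brown fox jumps over the lazy dog near the river bank.",
        "Natural language processing often begins with simple tokenization steps.",
    ]
    intros, mids, ends = getredundantComponents(sentences)
    print(intros[0])  # 'The quick brown fox'
    print(ends[0])    # 'the river bank .' -- WordPunctTokenizer splits off the period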
Python examples for the WordPunctTokenizer() class
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='utf8', **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining arguments
    are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._good_tags = tags or self.TAGS
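For context, a hedged sketch of the surrounding class this __init__ plausibly belongs to; the class name HTMLCorpusReader, the pattern constants, and the TAGS default are illustrative assumptions, not part of the snippet above:

    import nltk
    from nltk.corpus.reader.api import CategorizedCorpusReader, CorpusReader
    from nltk.tokenize import WordPunctTokenizer

    # Assumed module-level constants referenced by __init__ above.
    DOC_PATTERN = r'(?!\.)[\w\s]+/[\w\s\d\-]+\.json'  # hypothetical fileid pattern
    CAT_PATTERN = r'([\w\s]+)/.*'                     # hypothetical category pattern

    class HTMLCorpusReader(CategorizedCorpusReader, CorpusReader):
        TAGS = ['h1', 'h2', 'h3', 'p', 'li']  # assumed default tag whitelist

        # ... __init__ as defined above ...

    # reader = HTMLCorpusReader('/path/to/corpus')  # hypothetical corpus root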
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- i.e., tags are ignored. Like the xml() method,
    fileid can only specify one file.

    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    encoding = self.encoding(fileid)
    word_tokenizer = WordPunctTokenizer()
    # Element.getiterator() was removed in Python 3.9; Element.iter() is the replacement.
    iterator = elt.iter()
    out = []
    for node in iterator:
        text = node.text
        if text is not None:
            if isinstance(text, bytes):
                text = text.decode(encoding)
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)
    return out
Source: xmldocs.py, from the PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda project by SignalMedia.
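A usage sketch, assuming an NLTK XMLCorpusReader pointed at a corpus directory containing a file sample.xml (the path and fileid are hypothetical); words() flattens every XML text node into one token list:

    from nltk.corpus.reader import XMLCorpusReader

    # Hypothetical corpus layout: /path/to/corpus/sample.xml
    reader = XMLCorpusReader('/path/to/corpus', r'.*\.xml')
    tokens = reader.words('sample.xml')  # e.g. ['Hello', ',', 'world', '!']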
def __init__(self):
    self.model = WordPunctTokenizer()
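WordPunctTokenizer splits text on the regex \w+|[^\w\s]+, so runs of word characters and runs of punctuation become separate tokens. A quick demonstration:

    from nltk.tokenize import WordPunctTokenizer

    tokenizer = WordPunctTokenizer()
    print(tokenizer.tokenize("Can't stop, won't stop!"))
    # ['Can', "'", 't', 'stop', ',', 'won', "'", 't', 'stop', '!']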
def __init__(self):
    self.tokenizers = {
        'en': TweetTokenizer(),
        'de': WordPunctTokenizer(),
        'it': WordPunctTokenizer(),
        'fr': WordPunctTokenizer(),
        'default': WordPunctTokenizer()
    }
    self.tokenizer = TweetTokenizer()
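A minimal sketch of how the per-language lookup might be used; the wrapper class name MultilingualTokenizer and its tokenize() method are assumptions for illustration, only the dict itself comes from the snippet above:

    from nltk.tokenize import TweetTokenizer, WordPunctTokenizer

    class MultilingualTokenizer:
        """Hypothetical wrapper around a language-keyed tokenizer dict."""
        def __init__(self):
            self.tokenizers = {
                'en': TweetTokenizer(),       # keeps hashtags, @mentions, emoticons intact
                'de': WordPunctTokenizer(),
                'it': WordPunctTokenizer(),
                'fr': WordPunctTokenizer(),
                'default': WordPunctTokenizer()
            }

        def tokenize(self, text, lang='default'):
            # Fall back to the 'default' tokenizer for unknown language codes.
            tokenizer = self.tokenizers.get(lang, self.tokenizers['default'])
            return tokenizer.tokenize(text)

    # MultilingualTokenizer().tokenize('@user so cool! #nlp', lang='en')
    # -> ['@user', 'so', 'cool', '!', '#nlp']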