def __init__(
    self,
    root,
    fileids,
    sep='/',
    word_tokenizer=WhitespaceTokenizer(),
    sent_tokenizer=RegexpTokenizer('\n', gaps=True),
    alignedsent_block_reader=read_alignedsent_block,
    encoding='latin1'
):
    """
    Create an aligned-corpus reader over the documents found under
    the given root directory.

    Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*') # doctest: +SKIP

    Note that the default tokenizer objects are created once, when the
    method is defined, so readers built with the defaults share the
    same tokenizer instances.

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this
        corpus.
    :param sep: Separator string, kept on the reader as ``_sep``.
        NOTE(review): presumably the separator used inside tokens;
        confirm against the methods that read ``_sep``.
    :param word_tokenizer: Tokenizer object kept as
        ``_word_tokenizer``; defaults to a ``WhitespaceTokenizer``.
    :param sent_tokenizer: Tokenizer object kept as
        ``_sent_tokenizer``; defaults to a ``RegexpTokenizer`` that
        treats the newline pattern as a gap.
    :param alignedsent_block_reader: Callable kept as
        ``_alignedsent_block_reader``; defaults to
        ``read_alignedsent_block``.
    :param encoding: Encoding forwarded to the ``CorpusReader``
        initializer together with ``root`` and ``fileids``.
    """
    # Location, file selection and encoding are handed to the base class.
    CorpusReader.__init__(self, root, fileids, encoding)

    # Tokenization strategy.
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer

    # Remaining reader configuration.
    self._sep = sep
    self._alignedsent_block_reader = alignedsent_block_reader
aligned.py 文件源码
python
阅读 22
收藏 0
点赞 0
评论 0
评论列表
文章目录