/**
 * Reads an XML corpus file and collects the tagged sentences it contains.
 *
 * <p>The file is scanned event by event. Each time a {@code source} start element is
 * encountered, the sentence and its tagged tokens are read from the event stream via
 * {@code extractSentence} and {@code extractTaggedTokens}. A sentence is kept only if
 * {@code allTagsRecognized} accepts its tokens and it has at least
 * {@code minNumberOfTokens} tokens.
 *
 * <p>Both the XML reader and the underlying file stream are released before this method
 * returns, whether it completes normally or throws.
 *
 * @param xmlCorpusPath path of the XML corpus file to read
 * @param minNumberOfTokens minimum number of tagged tokens a sentence must have to be kept
 * @return the accepted sentences, in the order they appear in the file; never {@code null}
 * @throws XMLStreamException if the XML cannot be read or parsed
 * @throws FactoryConfigurationError if no StAX {@code XMLInputFactory} can be instantiated
 * @throws FileNotFoundException if {@code xmlCorpusPath} cannot be opened for reading
 */
public static List<TaggedSentence> getTaggedSentences(String xmlCorpusPath, int minNumberOfTokens)
        throws XMLStreamException, FactoryConfigurationError, FileNotFoundException {
    List<TaggedSentence> taggedSentences = new ArrayList<>();
    FileInputStream corpusStream = new FileInputStream(xmlCorpusPath);
    try {
        XMLEventReader xmlEventReader = XMLInputFactory.newInstance().createXMLEventReader(corpusStream);
        try {
            while (xmlEventReader.hasNext()) {
                XMLEvent event = xmlEventReader.nextEvent();
                if (event.isStartElement() && "source".equals(event.asStartElement().getName().getLocalPart())) {
                    // Both helpers advance the shared reader, so their call order matters.
                    String sentence = extractSentence(xmlEventReader);
                    List<StringTaggedToken> taggedTokens = extractTaggedTokens(xmlEventReader);
                    if (!allTagsRecognized(taggedTokens) || (taggedTokens.size() < minNumberOfTokens)) {
                        continue;
                    }
                    taggedSentences.add(createTaggedSentence(sentence, taggedTokens));
                }
            }
        } finally {
            // XMLEventReader.close() frees parser resources but does NOT close the input stream.
            xmlEventReader.close();
        }
    } finally {
        try {
            corpusStream.close();
        } catch (java.io.IOException ignored) {
            // The stream was only read from; a failure while closing it cannot affect the
            // result, and the method's declared exceptions must stay unchanged for callers.
        }
    }
    return taggedSentences;
}
// Source file: XMLCorpusReader.java (language: java)
// Project: pos-tagger