/**
* Parses all document present in the referenced file path
*
* @param stringsQueue to parse
* @return list with all documents with it's content in untokenized/unstemmed raw keywords
*/
public List<Document> parse(ConcurrentLinkedQueue<String> stringsQueue) {
//compile our corpus regex so we can apply it on our parsing process
Pattern id_content = Pattern.compile(CORPUS_REGEX_DOCUMENT);
//parsing process
return stringsQueue.parallelStream()
.filter(line -> !line.isEmpty()) // line is not empty
.map(id_content::matcher)// regex it
.filter(Matcher::find) // did we regex anything? if so create document
.map(match ->
{
//get the corpusID for this new file that we processing
int corpusID = corpusCount.getAndIncrement();
//map the corpusID to its corresponding filepath
corpusIDToPath.computeIfAbsent(corpusID, v -> new ImmutablePair<>(match.group(4), Integer.parseInt(match.group(1))));
return new Document(
corpusID, //first match is doc id and used to create our own doc id
Arrays.asList(match.group(5).split(" ")).parallelStream() // split document content in words
.collect(Collectors.toList())); // and put them in a list
})
.collect(Collectors.toList()); //collect all parsed lines
}
CorpusReader.java 文件源码
java
阅读 45
收藏 0
点赞 0
评论 0
项目:information-retrieval
作者:
评论列表
文章目录