CorpusReader.java 文件源码-java代码片段

/**
 * Parses every line of the given queue into a {@link Document}.
 *
 * <p>Empty lines and lines in which {@code CORPUS_REGEX_DOCUMENT} finds no match are skipped.
 * For each remaining line a fresh corpus ID is taken from {@code corpusCount}, an entry for that
 * ID is recorded in {@code corpusIDToPath} (capture group 4 paired with capture group 1 parsed
 * as an int), and a document is built from the ID and the words of capture group 5 split on
 * single spaces. The words are not transformed in any other way.
 *
 * <p>The lines are processed with a parallel stream, so corpus IDs are handed out in whatever
 * order the worker threads reach them and need not follow the order of the queue.
 * NOTE(review): this assumes {@code corpusCount} and {@code corpusIDToPath} are safe for
 * concurrent use - confirm against their declarations.
 *
 * @param stringsQueue raw corpus lines to parse
 * @return one document per matching line, each holding its corpus ID and its raw words
 * @throws NumberFormatException if capture group 1 of a matching line is not a valid int
 */
public List<Document> parse(ConcurrentLinkedQueue<String> stringsQueue) {

    // compile the corpus regex once up front; every line gets its own Matcher from it
    Pattern documentPattern = Pattern.compile(CORPUS_REGEX_DOCUMENT);

    return stringsQueue.parallelStream()
            .filter(line -> !line.isEmpty()) // ignore empty lines
            .map(documentPattern::matcher)
            .filter(Matcher::find) // keep only lines where the regex found a document
            .map(match ->
            {
                // our own ID for this document, taken from the shared counter
                int corpusID = corpusCount.getAndIncrement();

                // remember what this corpus ID refers to: (group 4, numeric group 1)
                corpusIDToPath.computeIfAbsent(corpusID,
                        id -> new ImmutablePair<>(match.group(4), Integer.parseInt(match.group(1))));

                // The outer stream is already parallel, so the words of a single document are
                // split and collected sequentially; a nested parallel stream here only adds
                // scheduling overhead.
                return new Document(
                        corpusID,
                        Arrays.stream(match.group(5).split(" "))
                                .collect(Collectors.toList()));
            })
            .collect(Collectors.toList()); // one Document per matching line
}