/**
 * Runs a one-off test crawl for the given spider template and returns the pages it collected.
 *
 * <p>A freshly generated UUID identifies the run: it is passed to {@code taskManager.initTask}
 * as the task id, embedded in the callback parameter string, and used as the key under which
 * the spider is stored in {@code spiderMap}. {@code spider.run()} is invoked directly on the
 * calling thread, and the collected results are converted only after that call returns.
 *
 * @param info spider template under test; its domain, callback URL, id and start URLs are read
 * @return the collected result items, each converted to a {@link Webpage}, in the order the
 *         collector pipeline yields them
 * @throws JMException declared by the signature; presumably propagated from
 *                     {@code makeSpider} -- TODO confirm
 */
public List<Webpage> testSpiderInfo(SpiderInfo info) throws JMException {
    final String spiderUuid = UUID.randomUUID().toString();
    final String callbackParams = "spiderInfoId=" + info.getId() + "&spiderUUID=" + spiderUuid;
    final ResultItemsCollectorPipeline collector = new ResultItemsCollectorPipeline();

    // Register a task for this run and attach the template so it travels with the task.
    Task testTask = taskManager.initTask(spiderUuid, info.getDomain(), info.getCallbackURL(), callbackParams);
    testTask.addExtraInfo("spiderInfo", info);

    // Build the spider with an in-memory collector pipeline and its own queue scheduler.
    MySpider testSpider = (MySpider) makeSpider(info, testTask)
            .addPipeline(collector)
            .setScheduler(new QueueScheduler());
    testSpider.startUrls(info.getStartURL());

    // Use the spider monitor with caution: it may cause memory leaks.
    // spiderMonitor.register(testSpider);
    spiderMap.put(spiderUuid, testSpider);
    taskManager.getTaskById(spiderUuid).setState(State.RUNNING);

    // Called directly rather than handed to another thread.
    testSpider.run();

    // Convert every collected ResultItems into a Webpage, preserving encounter order.
    List<Webpage> pages = Lists.newLinkedList();
    collector.getCollected()
            .stream()
            .map(collected -> CommonWebpagePipeline.convertResultItems2Webpage(collected))
            .forEachOrdered(pages::add);
    return pages;
}
CommonSpider.java 文件源码
java
阅读 20
收藏 0
点赞 0
评论 0
项目:Gather-Platform
作者:
评论列表
文章目录