niconico 爬虫
<version>0.5.2</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
## process
/**
-
Created by ray on 2017/7/16.
-
爬虫管道 */ @Service public class NicoNicoProcessor implements PageProcessor {
private Logger logger = LoggerFactory.getLogger(this.getClass());
//https://api.bilibili.com/archive_rank/getarchiverankbypartion?type=jsonp&tid=20&pn=1 private String bashUrl = “http://www.nicovideo.jp/tag/%E8%B8%8A%E3%81%A3%E3%81%A6%E3%81%BF%E3%81%9F?page=" ;// + i //http://www.nicovideo.jp/watch/sm23385186 private String detailUrl = “http://www.nicovideo.jp/watch/";
@Value("${spider.niconico.maxSize}”) int maxSize;
@Autowired NicoNicoPipeLine pipeLine;
@Override public Site getSite() {
//HttpHost httpHost = new HttpHost("127.0.0.1",1087); Site site = Site.me() //.setHttpProxy(httpHost) .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36") .setSleepTime(10 * 1000) .setTimeOut(20 * 1000) .setRetryTimes(3) .setCycleRetryTimes(3); return site;
}
@Override public void process(Page page){
String pageUrl = page.getUrl().toString(); //新增请求列表 List<String> requestUrls =new ArrayList<>(); List<SpiderNico> resList = new ArrayList<>(); logger.info(pageUrl); try { if (pageUrl.contains(bashUrl)){ //解析列表 List <String> htmlList = page.getHtml().xpath("//div/ul[@class='list']/li[@class='item']").all(); for (String tmp: htmlList) { if (tmp.length() < 5) continue; if (tmp.contains("data-id")){ try { Html html = new Html(tmp); //id String dataId = html.xpath("//li[@class='item']/@data-id").toString(); //标题 String title = html.xpath("//p[@class='itemTitle']/a/text()").toString(); //封面 String icon = html.xpath("//img[@class='jsLazyImage thumb']/@data-original").toString(); String view = "0"; String comment = "0"; String wrapTitle = null; String createTime = "0000-00-00"; //光看人数 view = html.xpath("//ul[@class='list']/li[@class='count view']/span/text()").toString().replace(",",""); //回复人数 comment = html.xpath("//ul[@class='list']/li[@class='count comment']/span/text()").toString().replace(",",""); //其他 wrapTitle = html.xpath("//div[@class='wrap']/p/@title").toString().replace(",",""); //创建时间 createTime = html.xpath("//p[@class='itemTime']/span/span/text()").toString().replace("/","-"); createTime = "20" + createTime; //增加nico对象 SpiderNico nico = new SpiderNico (); nico.setAid(dataId); nico.setTitle(title); nico.setDescription(wrapTitle); nico.setCreate(createTime); nico.setComment(Integer.valueOf(comment)); nico.setPic(icon); nico.setView(Integer.valueOf(view)); resList.add(nico); //增加请求地址 String url = detailUrl + dataId; requestUrls.add(url); } catch (Exception e){ logger.error("nico xpath:" + pageUrl ); } } } //批量增加请求 if (resList.size() > 0 ){ page.putField("type", 0); page.putField("data", resList); } } else if (pageUrl.contains(detailUrl)){ logger.info(pageUrl); } } catch (Exception e){ logger.error("url:" + pageUrl ); }
}
public void run(){
Spider spider = Spider.create(new NicoNicoProcessor()) //.setDownloader(new HttpClientDownloader()) //.setDownloader(new HttpDownloader()) .setDownloader(new SslDownloader()) //.addPipeline(new ConsolePipeline())//打印到控制台 .addPipeline(pipeLine); for (int i = 1; i < maxSize; i++) { String tmp = bashUrl + i; spider.addUrl(tmp); } spider.run();
} }
## pipeline
/**
-
Created by ray on 2017/7/16.
-
爬虫进程 */ @Service public class NicoNicoPipeLine implements Pipeline {
@Autowired SpiderNicoService service;
@Override public void process(ResultItems resultItems, Task task){
if (resultItems.getAll().isEmpty() == false) { int type = resultItems.get("type"); if (type == 0){ //列表内容 List<SpiderNico> list = resultItems.get("data"); for (SpiderNico obj: list) { service.updateBySpider(obj); } } else if(type == 1){ } }
} }