```xml
    <version>0.5.2</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
```

## process

```java
/**
 * Created by ray on 2017/7/16.
 * 爬虫管道
 */
@Service
public class NicoNicoProcessor implements PageProcessor {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    //https://api.bilibili.com/archive_rank/getarchiverankbypartion?type=jsonp&tid=20&pn=1
    private String bashUrl = "http://www.nicovideo.jp/tag/%E8%B8%8A%E3%81%A3%E3%81%A6%E3%81%BF%E3%81%9F?page=" ;// + i
    //http://www.nicovideo.jp/watch/sm23385186
    private String detailUrl = "http://www.nicovideo.jp/watch/";

    @Value("${spider.niconico.maxSize}")
    int maxSize;

    @Autowired
    NicoNicoPipeLine pipeLine;

    @Override
    public Site getSite() {
        //HttpHost httpHost = new HttpHost("127.0.0.1",1087);
        Site site = Site.me()
                //.setHttpProxy(httpHost)
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setSleepTime(10 * 1000)
                .setTimeOut(20 * 1000)
                .setRetryTimes(3)
                .setCycleRetryTimes(3);
        return site;
    }

    @Override
    public void process(Page page){
        String pageUrl = page.getUrl().toString();
        //新增请求列表
        List<String> requestUrls =new ArrayList<>();
        List<SpiderNico> resList = new ArrayList<>();
        logger.info(pageUrl);
        try {
            if (pageUrl.contains(bashUrl)){
                //解析列表
                List <String> htmlList = page.getHtml().xpath("//div/ul[@class='list']/li[@class='item']").all();
                for (String tmp: htmlList) {
                    if (tmp.length()……
```
阅读全文