<version>0.5.2</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
	</dependency>

## Processor

package win.raychow.modules.spider.base.processor;

import com.alibaba.fastjson.JSON; import org.json.XML; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service;

import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import win.raychow.core.base.dao.CacheKey; import win.raychow.core.base.service.HtmlTool; import win.raychow.demo.spider.tool.SslDownloader; import win.raychow.modules.spider.base.dao.SpiderTumblr; import win.raychow.modules.spider.base.domain.TumblrRecModel;

import java.util.ArrayList; import java.util.Arrays; import java.util.List;

/**
 * WebMagic {@link PageProcessor} that reads video posts from the Tumblr
 * {@code /api/read} XML endpoint and converts them into {@link SpiderTumblr} records.
 *
 * <p>{@link #run()} seeds one first-page URL per configured blog prefix. When a
 * first page is processed, the remaining pages of that blog are queued based on
 * the total post count reported in the response.
 *
 * <p>Created by ray on 2017/11/19.
 */
@Service
public class TumblrProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(TumblrProcessor.class);

    /** Number of posts requested per page; must match the {@code num=} value in {@link #bashUrl}. */
    private static final int PAGE_SIZE = 20;

    /**
     * Offset used for the first page of every blog. The Tumblr v1 read API documents
     * {@code start} as a zero-based post offset, so starting at 1 would skip the newest post.
     */
    private static final String FIRST_PAGE_OFFSET = "0";

    /** Marker query parameter that is only present on a blog's first (seed) page URL. */
    private static final String FIRST_PAGE_FLAG = "&fffff=0";

    /** Marker query parameter that carries the category through every page URL of a blog. */
    private static final String CATEGORY_PARAM = "&ggggg=";

    @Autowired
    TumblrPipeLine pipeLine;

    @Value("${spider.tumblr.prefixSexList}")
    private String prefixSexList;

    @Value("${spider.tumblr.prefixAnimalList}")
    private String prefixAnimalList;

    /** URL fragment shared by every API request; the post offset is appended directly after it. */
    public final static String bashUrl = ".tumblr.com/api/read?type=video&num=20&start=";

    /**
     * Determines the category of a blog by looking for a configured blog ID inside {@code url}.
     * The "sex" list is checked before the "animal" list.
     *
     * @param url a URL or blog prefix to classify
     * @return {@code SpiderTumblr.Category_AV}, {@code SpiderTumblr.Category_Animal}, or
     *         {@code SpiderTumblr.Category_Null} when no configured ID matches
     */
    private String getCategory(String url) {
        if (containsAnyId(url, prefixSexList)) {
            return SpiderTumblr.Category_AV;
        }
        if (containsAnyId(url, prefixAnimalList)) {
            return SpiderTumblr.Category_Animal;
        }
        return SpiderTumblr.Category_Null;
    }

    /**
     * Returns whether {@code url} contains at least one non-blank entry of {@code idList}.
     * Blank entries are skipped because {@code String.contains("")} is always true and
     * would otherwise make every URL match.
     */
    private static boolean containsAnyId(String url, String idList) {
        for (String id : idList.split(CacheKey.Split)) {
            if (!id.trim().isEmpty() && url.contains(id)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the crawl settings: a desktop Chrome user agent, the sleep time and
     * timeout passed below (30 * 1000 and 20 * 1000), and three retries plus three
     * cycle retries.
     *
     * @return a freshly configured {@link Site}
     */
    @Override
    public Site getSite() {
        // A local HTTP proxy was previously wired in here via setHttpProxy; it is disabled.
        return Site.me()
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setSleepTime(30 * 1000)
                .setTimeOut(20 * 1000)
                .setRetryTimes(3)
                .setCycleRetryTimes(3);
    }

    /**
     * Parses one API response page. On a first page the follow-up pages are queued;
     * every video post that can be converted is collected and published under the
     * result fields {@code "type"} (always 0) and {@code "data"} (the record list).
     *
     * <p>Pages whose URL does not contain {@link #bashUrl} are ignored. A post that
     * fails to convert is logged and skipped without affecting the other posts.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        logger.info(pageUrl);

        if (!pageUrl.contains(bashUrl)) {
            return;
        }

        try {
            // The endpoint returns XML; convert it to JSON so fastjson can bind it to the model.
            String xml = page.getJson().toString();
            String json = XML.toJSONObject(xml).toString();
            TumblrRecModel rec = JSON.parseObject(json, TumblrRecModel.class);

            List<TumblrRecModel.Post> posts = rec.getTumblr().getPosts().getPost();
            TumblrRecModel.Tumblelog tumblelog = rec.getTumblr().getTumblelog();

            // Only the seed page schedules the rest of the blog, so pages are queued once.
            if (pageUrl.contains(FIRST_PAGE_FLAG)) {
                long total = Long.valueOf(rec.getTumblr().getPosts().getTotal());
                page.addTargetRequests(buildFollowUpUrls(pageUrl, total));
            }

            if (posts == null || posts.isEmpty()) {
                return;
            }

            // The category is the same for every post on the page, so parse it once.
            String[] urlParts = pageUrl.split(CATEGORY_PARAM);
            if (urlParts.length < 2) {
                logger.error("missing category parameter, url:" + pageUrl);
                return;
            }
            String category = urlParts[1].toLowerCase();

            List<SpiderTumblr> list = new ArrayList<>();
            for (TumblrRecModel.Post post : posts) {
                String playerHtml = "";
                try {
                    // Normalise double quotes to single quotes before attribute matching.
                    playerHtml = post.getVideoPlayer().get(0).replace("\"", "'");
                    list.add(toSpiderTumblr(post, playerHtml, category, tumblelog));
                } catch (Exception e) {
                    // Best effort: one malformed post must not discard the rest of the page.
                    logger.error("xml to data error :" + playerHtml, e);
                }
            }

            if (!list.isEmpty()) {
                page.putField("type", 0);
                page.putField("data", list);
            }
        } catch (Exception e) {
            logger.error("url:" + pageUrl, e);
        }
    }

    /**
     * Builds the URLs of all pages after the first one for the blog behind {@code firstPageUrl}.
     * The offset and first-page marker of the seed URL are replaced by the page offset, so
     * follow-up URLs never carry the marker and are not expanded again.
     *
     * @param firstPageUrl seed URL containing {@link #bashUrl} and the first-page marker
     * @param totalPosts   total number of posts reported by the API
     * @return follow-up URLs at offsets {@code PAGE_SIZE, 2 * PAGE_SIZE, ...}; empty when one
     *         page already covers all posts or the marker cannot be located
     */
    private List<String> buildFollowUpUrls(String firstPageUrl, long totalPosts) {
        List<String> urls = new ArrayList<>();

        int offsetStart = firstPageUrl.indexOf(bashUrl) + bashUrl.length();
        int flagStart = firstPageUrl.indexOf(FIRST_PAGE_FLAG, offsetStart);
        if (flagStart < 0) {
            return urls;
        }
        String head = firstPageUrl.substring(0, offsetStart);
        String tail = firstPageUrl.substring(flagStart + FIRST_PAGE_FLAG.length());

        // Ceiling division: an exact multiple of PAGE_SIZE must not yield an extra empty page.
        long pageCount = (totalPosts + PAGE_SIZE - 1) / PAGE_SIZE;
        for (long pageIndex = 1; pageIndex < pageCount; pageIndex++) {
            urls.add(head + (pageIndex * PAGE_SIZE) + tail);
        }
        return urls;
    }

    /**
     * Converts one API post into a {@link SpiderTumblr} record.
     *
     * @param post       the parsed post
     * @param playerHtml the post's first video player snippet with double quotes normalised
     * @param category   category taken from the page URL
     * @param tumblelog  blog metadata of the response; supplies the blog title
     * @return the populated record
     * @throws RuntimeException if a required attribute is missing from the snippet
     */
    private SpiderTumblr toSpiderTumblr(TumblrRecModel.Post post, String playerHtml,
                                        String category, TumblrRecModel.Tumblelog tumblelog) {
        String poster = HtmlTool.match(playerHtml, "video", "poster").get(0);
        String optionsJson = HtmlTool.match(playerHtml, "video", "data-crt-options").get(0);
        TumblrRecModel.Options options = JSON.parseObject(optionsJson, TumblrRecModel.Options.class);
        String file = HtmlTool.match(playerHtml, "source", "src").get(0);

        // Prefer the HD URL when one is present; otherwise keep the <source> URL.
        // NOTE(review): the "> 10" length threshold presumably filters placeholder values - confirm.
        String hdUrl = options.getHdUrl();
        if (hdUrl != null && hdUrl.length() > 10) {
            file = hdUrl;
        }

        String videoCaption = HtmlTool.removeHtmlTag(post.getVideoCaption());

        // The video ID is the last path segment of the post URL.
        String postUrl = post.getUrl();
        String videoId = "tumblr_" + postUrl.substring(postUrl.lastIndexOf("/") + 1);

        SpiderTumblr tumblr = new SpiderTumblr();
        tumblr.setVideoId(videoId);
        tumblr.setPosterImage(poster);
        tumblr.setVideoImage(options.getFilmstrip().getUrl());
        tumblr.setVideoUrl(file);
        tumblr.setVideoType(detectVideoType(playerHtml));
        tumblr.setTitle(videoCaption);
        tumblr.setBaseUrl(postUrl);
        tumblr.setCategory(category);
        tumblr.setBlogTitle(tumblelog.getTitle());
        return tumblr;
    }

    /**
     * Derives the video container type from the MIME type mentioned in the player snippet.
     *
     * @return {@code "mp4"}, {@code "ogg"}, {@code "webm"}, or an empty string when none is found
     */
    private static String detectVideoType(String playerHtml) {
        String lower = playerHtml.toLowerCase();
        if (lower.contains("video/mp4")) {
            return "mp4";
        }
        if (lower.contains("video/ogg")) {
            return "ogg";
        }
        if (lower.contains("video/webm")) {
            return "webm";
        }
        return "";
    }

    /**
     * Creates the spider, seeds the first page of every configured blog (animal list
     * first, then sex list) and invokes {@code spider.run()} on the calling thread.
     *
     * <p>The spider is bound to this Spring-managed instance rather than a new,
     * un-injected one, so the processor that handles pages has its configuration
     * and pipeline available.
     */
    public void run() {
        Spider spider = Spider.create(this)
                .setDownloader(new SslDownloader())
                .addPipeline(pipeLine);

        addSeedUrls(spider, prefixAnimalList);
        addSeedUrls(spider, prefixSexList);

        spider.run();
    }

    /**
     * Adds one first-page URL per non-blank blog prefix of {@code prefixList} to the spider.
     * Blank prefixes are skipped because they would produce a host-less URL.
     */
    private void addSeedUrls(Spider spider, String prefixList) {
        for (String prefix : prefixList.split(CacheKey.Split)) {
            if (prefix.trim().isEmpty()) {
                continue;
            }
            String seedUrl = "https://" + prefix + bashUrl + FIRST_PAGE_OFFSET
                    + FIRST_PAGE_FLAG + CATEGORY_PARAM + getCategory(prefix);
            spider.addUrl(seedUrl);
        }
    }

}


## PipeLine

package win.raychow.modules.spider.base.processor;

import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import win.raychow.modules.spider.base.dao.SpiderTumblr; import win.raychow.modules.spider.base.dao.SpiderTumblrService;

import java.util.List;

/**
 * WebMagic {@link Pipeline} that persists the {@link SpiderTumblr} records
 * published under the result fields {@code "type"} and {@code "data"}.
 *
 * <p>Created by ray on 2017/11/19.
 */
@Service
public class TumblrPipeLine implements Pipeline {

    @Autowired
    SpiderTumblrService service;

    /**
     * Stores every record of a type-0 result through
     * {@code SpiderTumblrService.updateBySpider}, after deriving the blog name
     * from the record's base URL. Results without a {@code "type"} field, or with
     * any other type value, are ignored.
     *
     * @param resultItems fields extracted for one page
     * @param task        the crawl task the page belongs to (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.getAll().isEmpty()) {
            return;
        }

        // Read as Integer: unboxing straight into an int would throw when the field is absent.
        Integer type = resultItems.get("type");
        if (type == null) {
            return;
        }

        if (type == 0) {
            // Type 0 carries a list of records.
            List<SpiderTumblr> list = resultItems.get("data");
            if (list == null) {
                return;
            }
            for (SpiderTumblr tumblr : list) {
                String baseUrl = tumblr.getBaseUrl();
                if (baseUrl != null) {
                    tumblr.setBlogName(extractBlogName(baseUrl));
                }
                service.updateBySpider(tumblr);
            }
        }
        // Type 1 is recognised by the original code but has no handling.
    }

    /**
     * Returns the first host label of {@code baseUrl}: the text between an optional
     * {@code http://} or {@code https://} scheme and the first dot, or the whole
     * remainder when there is no dot.
     */
    private static String extractBlogName(String baseUrl) {
        String host = baseUrl.replace("https://", "").replace("http://", "");
        int firstDot = host.indexOf('.');
        return firstDot >= 0 ? host.substring(0, firstDot) : host;
    }
}