# Tumblr 爬虫 (Tumblr crawler)
<version>0.5.2</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
## process
package win.raychow.modules.spider.base.processor;
import com.alibaba.fastjson.JSON; import org.json.XML; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import win.raychow.core.base.dao.CacheKey; import win.raychow.core.base.service.HtmlTool; import win.raychow.demo.spider.tool.SslDownloader; import win.raychow.modules.spider.base.dao.SpiderTumblr; import win.raychow.modules.spider.base.domain.TumblrRecModel;
import java.util.ArrayList; import java.util.Arrays; import java.util.List;
/**
 * WebMagic {@link PageProcessor} that reads the video posts of the configured
 * Tumblr blogs through the {@code /api/read} XML feed and hands them to
 * {@link TumblrPipeLine} as {@link SpiderTumblr} records.
 *
 * <p>URL protocol used between {@link #run()} and {@link #process(Page)}:
 * <ul>
 *   <li>a <em>seed</em> URL is {@code https://<blog>} + {@link #bashUrl} +
 *       {@code <offset>&fffff=0&ggggg=<category>}; the {@code &fffff=0} flag marks
 *       the first page of a blog, from which the remaining pages are scheduled;</li>
 *   <li>a <em>follow-up</em> URL is the same without the flag:
 *       {@code ...start=<offset>&ggggg=<category>}.</li>
 * </ul>
 *
 * Created by ray on 2017/11/19.
 */
@Service
public class TumblrProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(TumblrProcessor.class);

    /** Number of posts per API page; must agree with {@code num=20} in {@link #bashUrl}. */
    private static final int PAGE_SIZE = 20;

    /** Dummy query parameter that marks a seed (first page) URL. */
    private static final String SEED_FLAG = "&fffff=0";

    /** Query parameter that carries the category through the crawl. */
    private static final String CATEGORY_PARAM = "&ggggg=";

    @Autowired
    TumblrPipeLine pipeLine;

    @Value("${spider.tumblr.prefixSexList}")
    private String prefixSexList;

    @Value("${spider.tumblr.prefixAnimalList}")
    private String prefixAnimalList;

    /** Host suffix plus query of the read API; the post offset is appended directly after it. */
    public final static String bashUrl = ".tumblr.com/api/read?type=video&num=20&start=";

    /**
     * Splits a configured id list on {@code CacheKey.Split}, dropping blank entries.
     *
     * <p>Blank entries must be skipped: an empty id would match every URL in
     * {@link #getCategory(String)} and would produce the host {@code https://.tumblr.com}
     * in {@link #run()}.
     *
     * @param idList raw configuration value, may be {@code null}
     * @return trimmed, non-empty ids in configuration order (possibly empty)
     */
    private static List<String> splitIds(String idList) {
        List<String> ids = new ArrayList<>();
        if (idList == null) {
            return ids;
        }
        for (String raw : idList.split(CacheKey.Split)) {
            String id = raw.trim();
            if (!id.isEmpty()) {
                ids.add(id);
            }
        }
        return ids;
    }

    /**
     * Resolves the category of a blog id or URL by substring match against the
     * configured lists. The sex list is checked before the animal list.
     *
     * @param url blog id or URL to classify
     * @return {@code SpiderTumblr.Category_AV}, {@code SpiderTumblr.Category_Animal},
     *         or {@code SpiderTumblr.Category_Null} when nothing matches
     */
    private String getCategory(String url) {
        // sex
        for (String id : splitIds(prefixSexList)) {
            if (url.contains(id)) {
                return SpiderTumblr.Category_AV;
            }
        }
        // animal
        for (String id : splitIds(prefixAnimalList)) {
            if (url.contains(id)) {
                return SpiderTumblr.Category_Animal;
            }
        }
        return SpiderTumblr.Category_Null;
    }

    /**
     * Crawl settings: desktop Chrome user agent, 30 s pause between requests,
     * 20 s timeout, 3 retries and 3 cycle retries.
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36")
                .setSleepTime(30 * 1000)
                .setTimeOut(20 * 1000)
                .setRetryTimes(3)
                .setCycleRetryTimes(3);
    }

    /**
     * Parses one API page. For a seed page the remaining pages of the blog are
     * scheduled first; then every post that can be parsed is converted and the
     * resulting list is published under the fields {@code "type"} (0) and {@code "data"}.
     * Pages whose URL does not contain {@link #bashUrl} are ignored.
     *
     * @param page downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        logger.info(pageUrl);
        if (!pageUrl.contains(bashUrl)) {
            return;
        }
        try {
            String xml = page.getJson().toString();
            String json = XML.toJSONObject(xml).toString();
            TumblrRecModel rec = JSON.parseObject(json, TumblrRecModel.class);
            List<TumblrRecModel.Post> posts = rec.getTumblr().getPosts().getPost();
            TumblrRecModel.Tumblelog tumblelog = rec.getTumblr().getTumblelog();

            // add requests for the remaining pages (seed page only)
            if (pageUrl.contains(SEED_FLAG)) {
                long total = Long.valueOf(rec.getTumblr().getPosts().getTotal());
                page.addTargetRequests(followUpUrls(pageUrl, total));
            }

            if (posts == null || posts.isEmpty()) {
                return;
            }

            // The category is the same for every post of the page, so it is read once.
            // A URL without the parameter throws here and is reported by the outer catch.
            String category = pageUrl.split(CATEGORY_PARAM)[1].toLowerCase();

            List<SpiderTumblr> list = new ArrayList<>(posts.size());
            for (TumblrRecModel.Post post : posts) {
                SpiderTumblr tumblr = convertPost(post, category, tumblelog);
                if (tumblr != null) {
                    list.add(tumblr);
                }
            }

            if (!list.isEmpty()) {
                page.putField("type", 0);
                page.putField("data", list);
            }
        } catch (Exception e) {
            // Keep the crawl alive, but do not lose the cause.
            logger.error("url:" + pageUrl, e);
        }
    }

    /**
     * Builds the follow-up URLs for a seed URL: offsets {@code 20, 40, ...} strictly
     * below {@code total}, so no page that is known to be empty is requested.
     * The offset and the seed flag of the seed URL are replaced by the new offset,
     * which works for any seed offset.
     *
     * @param seedUrl URL containing {@link #bashUrl} and the seed flag
     * @param total   total number of posts reported by the API
     * @return follow-up URLs, empty when the seed flag is not found after the offset
     */
    private static List<String> followUpUrls(String seedUrl, long total) {
        List<String> urls = new ArrayList<>();
        int offsetAt = seedUrl.indexOf(bashUrl) + bashUrl.length();
        int flagAt = seedUrl.indexOf(SEED_FLAG, offsetAt);
        if (flagAt < 0) {
            return urls;
        }
        String head = seedUrl.substring(0, offsetAt);
        String tail = seedUrl.substring(flagAt + SEED_FLAG.length());
        for (long offset = PAGE_SIZE; offset < total; offset += PAGE_SIZE) {
            urls.add(head + offset + tail);
        }
        return urls;
    }

    /**
     * Converts one API post into a {@link SpiderTumblr}. Any failure is logged
     * together with the player markup and the post is skipped.
     *
     * @param post      post as parsed from the feed
     * @param category  lower-cased category taken from the page URL
     * @param tumblelog blog metadata of the page
     * @return the converted record, or {@code null} when the post cannot be parsed
     */
    private SpiderTumblr convertPost(TumblrRecModel.Post post, String category, TumblrRecModel.Tumblelog tumblelog) {
        String playerHtml = "";
        try {
            playerHtml = post.getVideoPlayer().get(0);
            // Double quotes are turned into single quotes before the attributes are matched.
            playerHtml = playerHtml.replace("\"", "'");

            String poster = HtmlTool.match(playerHtml, "video", "poster").get(0);
            String optionsJson = HtmlTool.match(playerHtml, "video", "data-crt-options").get(0);
            TumblrRecModel.Options optionsRec = JSON.parseObject(optionsJson, TumblrRecModel.Options.class);
            String file = HtmlTool.match(playerHtml, "source", "src").get(0);

            // Use the HD URL when one is present; a missing value keeps the <source> URL
            // instead of failing the whole post.
            String hdUrl = optionsRec.getHdUrl();
            if (hdUrl != null && hdUrl.length() > 10) {
                file = hdUrl;
            }

            String postUrl = post.getUrl();
            String videoId = "tumblr_" + postUrl.substring(postUrl.lastIndexOf("/") + 1);

            SpiderTumblr tumblr = new SpiderTumblr();
            tumblr.setVideoId(videoId);
            tumblr.setPosterImage(poster);
            tumblr.setVideoImage(optionsRec.getFilmstrip().getUrl());
            tumblr.setVideoUrl(file);
            tumblr.setVideoType(detectVideoType(playerHtml));
            tumblr.setTitle(HtmlTool.removeHtmlTag(post.getVideoCaption()));
            tumblr.setBaseUrl(postUrl);
            tumblr.setCategory(category);
            tumblr.setBlogTitle(tumblelog.getTitle());
            return tumblr;
        } catch (Exception e) {
            logger.error("xml to data error :" + playerHtml, e);
            return null;
        }
    }

    /**
     * Derives the container type from the MIME type mentioned in the player markup.
     *
     * @param playerHtml player markup of a post
     * @return {@code "mp4"}, {@code "ogg"}, {@code "webm"}, or {@code ""} when none is found
     */
    private static String detectVideoType(String playerHtml) {
        String lower = playerHtml.toLowerCase();
        if (lower.contains("video/mp4")) {
            return "mp4";
        }
        if (lower.contains("video/ogg")) {
            return "ogg";
        }
        if (lower.contains("video/webm")) {
            return "webm";
        }
        return "";
    }

    /**
     * Starts a blocking crawl over every configured blog (animal list first, then
     * sex list), downloading through {@link SslDownloader} and storing through the
     * injected pipeline.
     */
    public void run() {
        // Use this Spring-managed instance rather than a new, un-injected one.
        Spider spider = Spider.create(this)
                .setDownloader(new SslDownloader())
                .addPipeline(pipeLine);

        // animal
        addSeedUrls(spider, prefixAnimalList);
        // sex
        addSeedUrls(spider, prefixSexList);

        spider.run();
    }

    /**
     * Adds one seed URL per configured blog id. Seeds start at offset 0 so that the
     * newest post of the blog is included (the read API's {@code start} is an offset).
     *
     * @param spider     spider to seed
     * @param prefixList raw configuration value holding the blog ids
     */
    private void addSeedUrls(Spider spider, String prefixList) {
        for (String prefix : splitIds(prefixList)) {
            String seedUrl = "https://" + prefix + bashUrl + "0" + SEED_FLAG + CATEGORY_PARAM + this.getCategory(prefix);
            spider.addUrl(seedUrl);
        }
    }
}
## PipeLine
package win.raychow.modules.spider.base.processor;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import win.raychow.modules.spider.base.dao.SpiderTumblr; import win.raychow.modules.spider.base.dao.SpiderTumblrService;
import java.util.List;
/**
-
Created by ray on 2017/11/19. */ @Service public class TumblrPipeLine implements Pipeline {
@Autowired SpiderTumblrService service;
@Override public void process(ResultItems resultItems, Task task){
if (resultItems.getAll().isEmpty() == false) { int type = resultItems.get("type"); if (type == 0){ //列表内容 List<SpiderTumblr> list = resultItems.get("data"); for (SpiderTumblr tumblr: list) { try { String blogName = tumblr.getBaseUrl().replace("https://","").replace("http://","").split("\\.")[0]; tumblr.setBlogName(blogName); } catch (Exception e){ } service.updateBySpider(tumblr); } } else if(type == 1){ } }
} }