The Gecco crawler framework dates back a few years, but it is still quite convenient today for crawling ordinary websites.
Add the dependency to your pom.xml:
<dependency>
    <groupId>com.geccocrawler</groupId>
    <artifactId>gecco</artifactId>
    <version>1.3.21</version>
</dependency>
The code breaks down into three parts:
1. Startup code
import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.request.HttpGetRequest;

// the class/main wrapper is added here so the snippet runs as-is; the class name is arbitrary
public class CrawlerStarter {
    public static void main(String[] args) {
        HttpGetRequest start = new HttpGetRequest("https://www.xxxx.com/aa/bb/493645.shtml");
        start.setCharset("UTF-8");
        GeccoEngine.create()
                // package scanned for @Gecco beans and @PipelineName pipelines
                .classpath("com.lujing.mydemo")
                // the page address to start crawling from
                .start(start)
                // number of crawler threads to run
                .thread(2)
                // pause (in ms) after each completed request, per crawler thread
                .interval(100)
                .run();
    }
}
2. The HtmlBean implementation
For example:
import com.geccocrawler.gecco.annotation.*;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HrefBean;
import com.geccocrawler.gecco.spider.HtmlBean;
import java.util.List;

/**
 * Crawls a product list page.
 *
 * @author memory
 */
@Gecco(matchUrl = "https://www.xxxx.com/aa/bb/index_{page}.shtml", pipelines = {"consolePipeline", "myProductListPipeline"})
public class ProductList implements HtmlBean {

    private static final long serialVersionUID = 4369792078959596706L;

    // the request that produced this page; needed later to build sub-requests
    @Request
    private HttpRequest request;

    // bound to the {page} placeholder in matchUrl
    @RequestParameter
    private int page;

    /**
     * Captures the details of each list item, including the title,
     * price, detail-page URL, etc.
     */
    @HtmlField(cssPath = "#top > div.wrap > div > div.content > div.container > div.content_main > div.column_list > ul > li > div.left_con > a")
    private List<HrefBean> details;

    // reads the page's JavaScript variable gioParam and extracts its channel_en field via JSONPath
    @JSVar(var = "gioParam", jsonpath = "$.channel_en")
    private String gioParam;

    // the raw HTML of the entire page
    @Html
    @HtmlField(cssPath = "html")
    private String allHtml;

    // src attributes of the list items' images
    @Image
    @HtmlField(cssPath = "#top > div.wrap > div > div.content > div.container > div.content_main > div.column_list > ul > li > div.img_con > img")
    private List<String> images;

    /**
     * The total number of pages in the product list.
     */
    @Text
    @HtmlField(cssPath = "#top > div.wrap > div > div.content > div.container > div.content_main > div:nth-child(2) > div > ul > li:nth-last-child(2) > a")
    private int totalPage;

    public String getGioParam() {
        return gioParam;
    }

    public void setGioParam(String gioParam) {
        this.gioParam = gioParam;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    public void setPage(int page) {
        this.page = page;
    }

    public void setDetails(List<HrefBean> details) {
        this.details = details;
    }

    public String getAllHtml() {
        return allHtml;
    }

    public void setAllHtml(String allHtml) {
        this.allHtml = allHtml;
    }

    public void setTotalPage(int totalPage) {
        this.totalPage = totalPage;
    }

    public HttpRequest getRequest() {
        return request;
    }

    public int getPage() {
        return page;
    }

    public List<HrefBean> getDetails() {
        return details;
    }

    public int getTotalPage() {
        return totalPage;
    }

    public List<String> getImages() {
        return images;
    }

    public void setImages(List<String> images) {
        this.images = images;
    }
}
The job of this implementation class is to match crawled URLs and process the pages they return: for a matching list page it pulls the page number out of the URL and the desired information (links, images, total page count, etc.) out of the HTML.
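Note that the start URL from part 1 (.../493645.shtml) does not actually match this bean's matchUrl, which only covers index_{page}.shtml list pages; a detail page like that would need its own HtmlBean. A minimal sketch of one, where the class name ProductDetail, the pipeline name myProductDetailPipeline, and the cssPath are placeholder assumptions rather than values from the original:

import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.RequestParameter;
import com.geccocrawler.gecco.annotation.Text;
import com.geccocrawler.gecco.spider.HtmlBean;

// note: this wildcard pattern would also match index_{page}.shtml URLs,
// so in practice the two patterns may need to be disambiguated
@Gecco(matchUrl = "https://www.xxxx.com/aa/bb/{id}.shtml", pipelines = {"myProductDetailPipeline"})
public class ProductDetail implements HtmlBean {

    private static final long serialVersionUID = 1L;

    // bound to the {id} placeholder in matchUrl
    @RequestParameter
    private String id;

    // placeholder selector; take the real one from the target page's DOM
    @Text
    @HtmlField(cssPath = "h1.title")
    private String title;

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
}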
3. The Pipeline implementation
This class processes the detailed information extracted in step 2; it can also queue follow-up requests, e.g. to keep crawling through the remaining pages.
For example:
import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;
import com.geccocrawler.gecco.spider.HrefBean;
import com.huadongfeng.project.utils.FileUtil;
import java.util.List;

@PipelineName("myProductListPipeline")
public class ProductListPipeline implements Pipeline<ProductList> {

    @Override
    public void process(ProductList productList) {
        HttpRequest currRequest = productList.getRequest();
        int page = productList.getPage();
        int totalPage = productList.getTotalPage();
        System.out.println(totalPage);

        // queue the next list page; capped at page 4 here, but totalPage could serve as the bound instead
        if (page < 4) {
            int i = page + 1;
            String url = "https://www.xxxx.com/aa/bb/index_" + i + ".shtml";
            System.out.println(url);
            SchedulerContext.into(currRequest.subRequest(url));
        }

        String gioParam = productList.getGioParam();
        System.out.println(gioParam);

        // the full page HTML is also available if needed
        String allHtml = productList.getAllHtml();

        List<String> images = productList.getImages();
        for (String img : images) {
            System.out.println("list image: " + img);
            // DownloadImage.download("E:\\img", img);
        }

        List<HrefBean> details = productList.getDetails();
        for (HrefBean bean : details) {
            // System.out.println("list item title: " + bean.getTitle());
            // follow into the detail page and crawl it
            // SchedulerContext.into(currRequest.subRequest(bean.getUrl()));
            // FileUtil is a project-local helper that appends text to the given file
            FileUtil.writeFile("entering detail page==" + bean.getUrl() + System.lineSeparator(), "E:\\", "answer.txt", "UTF-8");
        }
    }
}
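For completeness, a matching pipeline for the hypothetical ProductDetail bean sketched in part 2 could look like this (again an assumption, not code from the original):

import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;

@PipelineName("myProductDetailPipeline")
public class ProductDetailPipeline implements Pipeline<ProductDetail> {

    @Override
    public void process(ProductDetail detail) {
        // persist or post-process the extracted fields; printing keeps the sketch self-contained
        System.out.println(detail.getTitle());
    }
}

Any detail-page request pushed into the scheduler with SchedulerContext.into(...) in the list pipeline above would then be rendered into a ProductDetail bean and handed to this pipeline.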