Bootstrap

gecco爬虫框架使用指南

Gecco 爬虫框架虽然是几年前的框架,但现阶段用来爬取一些普通网站仍然很方便。

pom引入:

<dependency>
    <groupId>com.geccocrawler</groupId>
    <artifactId>gecco</artifactId>
    <version>1.3.21</version>
</dependency>

代码可分为三部分:

一、启动代码

// Seed request: the first page the crawler fetches, decoded as UTF-8.
// NOTE(review): this placeholder URL does not follow the "index_{page}.shtml"
// pattern used by the ProductList bean's matchUrl - confirm the real seed URL matches.
HttpGetRequest seedRequest = new HttpGetRequest("https://www.xxxx.com/aa/bb/493645.shtml");
seedRequest.setCharset("UTF-8");

GeccoEngine.create()
        // package handed to classpath(...); per the Gecco docs this is where annotated classes are looked up
        .classpath("com.lujing.mydemo")
        // address of the page where crawling begins
        .start(seedRequest)
        // how many crawler threads to start
        .thread(2)
        // pause of a single crawler after it finishes each request (presumably milliseconds - confirm)
        .interval(100)
        .run();

二、HtmlBean 实现类

如:

import com.geccocrawler.gecco.annotation.*;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HrefBean;
import com.geccocrawler.gecco.spider.HtmlBean;

import java.util.List;

/**
 * Bean describing one list page of the crawled site.
 *
 * <p>Requests whose URL fits {@code matchUrl} are bound to this class, and the
 * populated bean is then handed to the pipelines named in {@code pipelines}.
 * The original note describes the target as a JD.com product list page; the
 * URL used here is only a placeholder.
 *
 * @author memory
 */
@Gecco(matchUrl="https://www.xxxx.com/aa/bb/index_{page}.shtml", pipelines={"consolePipeline", "myProductListPipeline"})
public class ProductList implements HtmlBean {

   private static final long serialVersionUID = 4369792078959596706L;

   /** The request this bean was built from; pipelines use it to create sub-requests. */
   @Request
   private HttpRequest request;

   /** Value of the {@code {page}} placeholder in {@code matchUrl}. */
   @RequestParameter
   private int page;

   /**
    * Links of the list items (title, detail-page address, ...), one entry per
    * anchor selected by the CSS path.
    */
   @HtmlField(cssPath="#top > div.wrap > div > div.content > div.container > div.content_main > div.column_list > ul > li > div.left_con > a")
   private List<HrefBean> details;

   /** {@code channel_en} property of the page's {@code gioParam} JavaScript variable. */
   @JSVar(var = "gioParam",jsonpath = "$.channel_en")
   private String gioParam;

   /** Markup of the whole document, selected through the {@code html} element. */
   @Html
   @HtmlField(cssPath="html")
   private String allHtml;

   /** Image addresses of the list items. */
   @Image
   @HtmlField(cssPath="#top > div.wrap > div > div.content > div.container > div.content_main > div.column_list > ul > li > div.img_con > img")
   private List<String> images;

   /** Total number of list pages, read from the text of a pager link. */
   @Text
   @HtmlField(cssPath="#top > div.wrap > div > div.content > div.container > div.content_main > div:nth-child(2) > div > ul > li:nth-last-child(2) > a")
   private int totalPage;

   // ---- accessors, grouped per field in declaration order ----

   public HttpRequest getRequest() {
      return request;
   }

   public void setRequest(HttpRequest request) {
      this.request = request;
   }

   public int getPage() {
      return page;
   }

   public void setPage(int page) {
      this.page = page;
   }

   public List<HrefBean> getDetails() {
      return details;
   }

   public void setDetails(List<HrefBean> details) {
      this.details = details;
   }

   public String getGioParam() {
      return gioParam;
   }

   public void setGioParam(String gioParam) {
      this.gioParam = gioParam;
   }

   public String getAllHtml() {
      return allHtml;
   }

   public void setAllHtml(String allHtml) {
      this.allHtml = allHtml;
   }

   public List<String> getImages() {
      return images;
   }

   public void setImages(List<String> images) {
      this.images = images;
   }

   public int getTotalPage() {
      return totalPage;
   }

   public void setTotalPage(int totalPage) {
      this.totalPage = totalPage;
   }
}

该实现类的作用是匹配待爬取的 URL 并解析对应的页面:只有符合 matchUrl 的请求(启动代码中的起始 URL 也需要符合该规则)才会交给它处理,并按照各字段上的注解从 HTML 中提取页码以及其他想要的信息。

三、管道类Pipeline实现类

该类用于处理第二步中提取到的信息,例如保存结果,或者把下一页的请求加入调度队列以继续翻页爬取。

如:

import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;
import com.geccocrawler.gecco.spider.HrefBean;
import com.huadongfeng.project.utils.FileUtil;

import java.util.List;

/**
 * Pipeline that consumes every populated {@link ProductList} bean.
 *
 * <p>For each list page it schedules the following list page (up to
 * {@link #MAX_PAGE}), prints the extracted values, and appends the
 * detail-page address of every list item to a text file.
 */
@PipelineName("myProductListPipeline")
public class ProductListPipeline implements Pipeline<ProductList> {

   /** Paging stops once this page number is reached; the scraped total page count is only printed. */
   private static final int MAX_PAGE = 4;

   /**
    * Leading part of a list-page URL. The host and path have to agree with the
    * {@code matchUrl} declared on {@link ProductList}; otherwise the scheduled
    * request is never bound to that bean and paging silently stops.
    */
   private static final String LIST_URL_PREFIX = "https://www.xxxx.com/aa/bb/index_";

   /** Trailing part of a list-page URL, placed after the page number. */
   private static final String LIST_URL_SUFFIX = ".shtml";

   /**
    * Handles one rendered list page.
    *
    * @param productList bean filled from the page that was just downloaded
    */
   @Override
   public void process(ProductList productList) {
      HttpRequest currRequest = productList.getRequest();

      int page = productList.getPage();
      int totalPage = productList.getTotalPage();
      System.out.println(totalPage);
      if (page < MAX_PAGE) {
         // Queue the next list page as a sub-request of the current one.
         String url = LIST_URL_PREFIX + (page + 1) + LIST_URL_SUFFIX;
         System.out.println(url);
         SchedulerContext.into(currRequest.subRequest(url));
      }

      System.out.println(productList.getGioParam());

      // Guard against a page on which the image selector produced no list at all.
      List<String> images = productList.getImages();
      if (images != null) {
         for (String img : images) {
            System.out.println("img循环详情页" + img);
            // Optional: download the image, e.g.
            // DownloadImage.download("E:\\img", img);
         }
      }

      // Same guard for the list-item links.
      List<HrefBean> details = productList.getDetails();
      if (details != null) {
         for (HrefBean bean : details) {
            // Optional: crawl the detail page itself, e.g.
            // SchedulerContext.into(currRequest.subRequest(bean.getUrl()));
            FileUtil.writeFile("进入详情页==" + bean.getUrl() + System.lineSeparator(), "E:\\", "answer.txt", "UTF-8");
         }
      }
   }

}

;