Requirements analysis
1: Crawl the Huxiu home page and collect the article URLs on it: https://www.huxiu.com/
2: Crawl the paginated article list and collect the article URLs on each page.
3: Crawl every article detail page and extract the article information (title, body, author, publish time, comment count, like count, favorite count).
4: Save the extracted article information to the database.

Implementation approach
1: Crawl the home page
Request URL: https://www.huxiu.com/
Method: GET
Parameters: none
Headers: User-Agent
Response: HTML document
Parse the HTML with Jsoup and extract the article ids.

2: Crawl an article detail page
Request URL: https://www.huxiu.com/article/231552.html
Method: GET
Parameters: none
Headers: User-Agent
Response: HTML document
Parse the HTML with Jsoup, extract the article information (title, body, author, publish time, comment count, like count, favorite count), then save it to the database.

3: Crawl the next page (a code sketch for this request follows the crawler class below)
Request URL: https://www.huxiu.com/v2_action/article_list
Method: POST
Parameters:
huxiu_hash_code: bc1acc4ae8cc354069293a255b8140fc   // fixed value
page: 2   // the page to crawl, starting from 2
last_dateline: 1516942440   // timestamp of the previous page; for page 2 it comes from the home page, for page 3 from the response of page 2
Headers: User-Agent
Response:
{
    result: 1,
    msg: 获取成功,
    data: <div class="mod-b mod-art" dat…v> </div> </div>,
    total_page: 1703,
    last_dateline: 1516788540
}
Parsing: use Gson to parse the JSON, then Jsoup to parse the HTML fragment in the data field and extract the ids of the articles on that page.

4: Same as step 2.

Summary: the approach matters more than the code. Where does the approach come from?
1: analysis
2: guessing
3: F12 debugging

Preparation
1: Create a plain Maven project.
2: pom dependencies:

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.3</version>
</dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
<dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-jdbc</artifactId>
    <version>4.2.6.RELEASE</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.41</version>
</dependency>
<dependency>
    <groupId>c3p0</groupId>
    <artifactId>c3p0</artifactId>
    <version>0.9.1.2</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.31</version>
</dependency>
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.1</version>
</dependency>
<dependency>
    <groupId>redis.clients</groupId>
    <artifactId>jedis</artifactId>
    <version>2.9.0</version>
</dependency>

Java implementation

package cn.itcast.huxiu;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HuXiuTest {

    public static void main(String[] args) throws Exception {
        for (int i = 0; i < 100000; i++) {
            // Crawl the home page
            String indexHtml = getIndex();
            // Parse the home page and collect all article ids (each id identifies one detail page)
            ArrayList<String> ids = parseIndexHtml(indexHtml);
            // Fetch every detail page by id, extract the article information and save it to the database
            parseXianQingYeMian(ids);
        }
    }

    // Crawl and parse the article detail pages
    private static void parseXianQingYeMian(ArrayList<String> ids) throws IOException, ClientProtocolException {
        if (ids != null && ids.size() != 0) {
            // One DAO (and one underlying connection pool) is enough for all articles
            ArticleDao articleDao = new ArticleDao();
            for (String pid : ids) {
                int id = Integer.parseInt(pid);
                // Build the request for the detail page
                HttpGet httpGet = new HttpGet("https://www.huxiu.com/article/" + id + ".html");
                // Request header
                httpGet.addHeader("user-agent",
                        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
                CloseableHttpClient httpClient = HttpClients.createDefault();
                // Send the request
                CloseableHttpResponse execute = httpClient.execute(httpGet);
                // 200 means the detail page was loaded successfully
                if (execute.getStatusLine().getStatusCode() == 200) {
                    HttpEntity entity = execute.getEntity();
                    String html = EntityUtils.toString(entity);
                    Article article = new Article();
                    article.setId(id);
                    // Turn the detail page into a Jsoup document
                    Document document = Jsoup.parse(html);
                    // Title
                    String ownText = document.select(".t-h1").get(0).ownText();
                    article.setTitle(ownText);
                    // Author
                    String author = document.select(".author-name").get(0).text();
                    article.setAuthor(author);
                    // Publish time; the page uses two different markups for it
                    Elements elements = document.select("span[class=article-time pull-left]");
                    if (elements.size() == 0) {
                        String createTime = document.select(".article-time").get(0).ownText();
                        article.setCreateTime(createTime);
                    } else {
                        String createTime = elements.get(0).ownText();
                        article.setCreateTime(createTime);
                    }
                    // Body
                    String content = document.select(".article-content-wrap").get(0).text();
                    article.setContent(content);
                    // Like count
                    article.setZan(document.select(".num").get(0).ownText());
                    // Comment count
                    article.setPl(document.select(".article-pl").get(0).ownText());
                    System.out.println(article);
                    // articleDao.save(article);
                }
                // Release the connection
                execute.close();
            }
        }
    }

    // Parse the home page HTML and return the article ids
    private static ArrayList<String> parseIndexHtml(String indexHtml) {
        if (indexHtml != null) {
            ArrayList<String> urls = new ArrayList<String>();
            // Turn the home page into a Jsoup document
            Document document = Jsoup.parse(indexHtml);
            // Every article block in the info flow carries its id in the data-aid attribute
            Elements elements = document.select(".mod-info-flow div[data-aid]");
            for (Element element : elements) {
                String url = element.attr("data-aid");
                urls.add(url);
                System.out.println(url);
            }
            return urls;
        }
        return null;
    }

    // Fetch the home page
    private static String getIndex() throws Exception {
        String url = "https://www.huxiu.com";
        // Build a GET request
        HttpGet httpGet = new HttpGet(url);
        // Request header
        httpGet.addHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
        // Return the page HTML
        return getHtml(httpGet);
    }

    // Execute a GET request and return the response body
    private static String getHtml(HttpGet httpGet) throws Exception {
        String html = null;
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse execute = httpClient.execute(httpGet);
        // Only read the body when the response code is 200
        if (execute.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = execute.getEntity();
            html = EntityUtils.toString(entity);
            System.out.println(html); // the full page HTML
        }
        return html;
    }
}
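Paging request sketch

Step 3 of the approach (the paginated article list) is not implemented in the class above. The sketch below shows what that POST request could look like, reusing the parameter names, hash code value and response fields observed during the F12 analysis; the class name HuXiuPageTest and the fixed page range are only for illustration, and the hash code may change over time.

package cn.itcast.huxiu;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

public class HuXiuPageTest {

    public static void main(String[] args) throws Exception {
        // last_dateline for page 2 comes from the home page; later pages use the
        // value returned by the previous page
        String lastDateline = "1516942440";
        for (int page = 2; page <= 3; page++) {
            HttpPost httpPost = new HttpPost("https://www.huxiu.com/v2_action/article_list");
            httpPost.addHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
            // Form parameters observed in the F12 analysis
            List<NameValuePair> params = new ArrayList<NameValuePair>();
            params.add(new BasicNameValuePair("huxiu_hash_code", "bc1acc4ae8cc354069293a255b8140fc"));
            params.add(new BasicNameValuePair("page", String.valueOf(page)));
            params.add(new BasicNameValuePair("last_dateline", lastDateline));
            httpPost.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

            CloseableHttpClient httpClient = HttpClients.createDefault();
            CloseableHttpResponse response = httpClient.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == 200) {
                String json = EntityUtils.toString(response.getEntity());
                // The response is JSON; the data field is an HTML fragment
                JsonObject result = new JsonParser().parse(json).getAsJsonObject();
                // Remember the timestamp to send with the next page
                lastDateline = result.get("last_dateline").getAsString();
                Document document = Jsoup.parse(result.get("data").getAsString());
                for (Element element : document.select("div[data-aid]")) {
                    System.out.println(element.attr("data-aid"));
                }
            }
            response.close();
            httpClient.close();
        }
    }
}

Each response carries the last_dateline for the following request, so the loop simply passes it forward; the printed ids can then be fed to parseXianQingYeMian exactly like the ids from the home page.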
Entity class

package cn.itcast.huxiu;

public class Article {

    private int id;
    private String title;
    private String author;
    private String createTime;
    private String sc;      // favorite count
    private String zan;     // like count
    private String pl;      // comment count
    private String content;
    private String url;

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getCreateTime() {
        return createTime;
    }

    public void setCreateTime(String createTime) {
        this.createTime = createTime;
    }

    public String getSc() {
        return sc;
    }

    public void setSc(String sc) {
        this.sc = sc;
    }

    public String getZan() {
        return zan;
    }

    public void setZan(String zan) {
        this.zan = zan;
    }

    public String getPl() {
        return pl;
    }

    public void setPl(String pl) {
        this.pl = pl;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    @Override
    public String toString() {
        return "Article [id=" + id + ", title=" + title + ", author=" + author + ", createTime=" + createTime
                + ", sc=" + sc + ", zan=" + zan + ", pl=" + pl + ", content=" + content + ", url=" + url + "]";
    }
}

Database access code

package cn.itcast.huxiu;

import org.springframework.jdbc.core.JdbcTemplate;

import com.mchange.v2.c3p0.ComboPooledDataSource;

public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        // Build the C3P0 data source in code (it could also come from configuration):
        // jdbc url, username and password; the MySQL driver is registered automatically
        // through DriverManager, so no driver class is set here
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        dataSource.setUser("root");
        dataSource.setPassword("123");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    public void save(Article article) {
        String sql = "INSERT INTO huxiu_article (id, title, author, createTime, zan, pl, sc, content, url) VALUES (?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(), article.getTitle(), article.getAuthor(), article.getCreateTime(),
                article.getZan(), article.getPl(), article.getSc(), article.getContent(), article.getUrl());
    }
}

The DAO assumes a local MySQL database named spider containing a huxiu_article table with the columns listed in the INSERT statement.