一、Jsoup的简单介绍
1.1 简介
Jsoup 是一款用于处理HTML文档的Java库。它提供了一组简单而强大的API,使得在Java中处理HTML变得非常容易。Jsoup 允许你从HTML文档中提取数据、操作HTML元素,以及实现类似于Web爬虫的功能。
说人话:Jsoup就是Java里用来解析HTML、做爬虫的工具。
1.2 Jsoup解析HTML
1.2.1 引入Jsoup依赖
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<!-- 爬虫相关 -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>
1.2.2 Jsoup常用函数
1)getElementById
因为在HTML中id应当是唯一的,所以getElementById返回的是单个Element;如果没有找到对应的元素,则返回null,使用前需要判空。
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class JsoupExample {
    /**
     * Parses a small inline HTML snippet, looks up the element whose id is
     * {@code myDiv}, and prints either its text or a "not found" message.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String markup = "<html><body><div id='myDiv'>This is a div with ID 'myDiv'</div></body></html>";

        // Parse the markup and resolve the element by id in a single chain.
        final Element match = Jsoup.parse(markup).getElementById("myDiv");

        // The lookup result is null-checked before its text is read.
        final String report = (match == null)
                ? "Element with ID 'myDiv' not found."
                : "Element with ID 'myDiv' found: " + match.text();
        System.out.println(report);
    }
}
2)getElementsByTag
通过getElementsByTag获取到的是一个Elements集合,需要遍历才能取出其中的每个Element
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class JsoupExample {
    /**
     * Parses a small inline HTML snippet and prints the text of every
     * {@code <p>} element, including the one nested inside the {@code <div>}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String markup = "<html><body><p>Paragraph 1</p><p>Paragraph 2</p><div><p>Paragraph 3</p></div></body></html>";

        // Collect all <p> elements of the parsed document.
        final Elements found = Jsoup.parse(markup).getElementsByTag("p");

        // Print each paragraph's text, in document order.
        found.stream()
                .map(Element::text)
                .forEach(System.out::println);
    }
}
注意这里返回的Elements类继承自ArrayList<Element>,所以可以把Elements当作一个List来使用
3)getElementsByClass
通过指定class来获取Elements,也是需要遍历展开的
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class JsoupExample {
    /**
     * Parses a small inline HTML snippet and prints the text of every element
     * carrying the CSS class {@code paragraph}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String markup = "<html><body><div class='container'><p class='paragraph'>Paragraph 1</p><p class='paragraph'>Paragraph 2</p></div></body></html>";
        final Document parsed = Jsoup.parse(markup);

        // Select by class name, then walk the result list by index.
        final Elements matches = parsed.getElementsByClass("paragraph");
        for (int i = 0; i < matches.size(); i++) {
            System.out.println(matches.get(i).text());
        }
    }
}
二、Jsoup实战案例分享
2.1 依赖准备
因为在实际使用的过程中,我们需要把图片下载到本地,所以需要通过IO流的方式下载图片,我这里使用的是Hutool工具类。
引入依赖:
<!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
<!-- hutool工具类 -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.23</version>
</dependency>
下面以获取alexanderwang网站上的服装款式为例,网站地址:https://www.alexanderwang.cn/cn-zh/
案例代码:
package test;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.RandomUtil;
import cn.hutool.http.HttpUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls product listings on alexanderwang.cn and saves each product's images locally.
 *
 * <p>For every category id in {@code CATEGORY_ITEMS} the listing is requested page by page.
 * Each product link found on a page is opened, the product name and price are printed, and
 * every image in the product gallery is downloaded into {@code SAVE_PATH}.
 */
public class AlexanderWang {
    /** Site origin, prepended to the relative product links found on listing pages. */
    private static final String PREFIX_URL = "https://www.alexanderwang.cn";
    /** Local directory the downloaded images are written to. */
    private static final String SAVE_PATH = "E:\\picture_output";
    /** Listing URL template: category id, then the offset of the first product to return. */
    private static final String RAW_URL = "https://www.alexanderwang.cn/cn-zh/search?cgid=%s&start=%d";
    /** Number of products requested per listing page. */
    private static final int PAGE_SIZE = 12;
    /** Timeout, in milliseconds, passed to Jsoup when fetching a page. */
    private static final int TIMEOUT_MILLIS = 30000;
    /** Category ids to crawl, in order. */
    private static final List<String> CATEGORY_ITEMS = new ArrayList<>();

    static {
        CATEGORY_ITEMS.add("women-newarrivals");
        CATEGORY_ITEMS.add("women-resort24");
        CATEGORY_ITEMS.add("women-essentials");
        CATEGORY_ITEMS.add("men-newarrivals");
        CATEGORY_ITEMS.add("men-giftsforhim");
    }

    /**
     * Entry point: crawls every configured category in turn.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        for (String item : CATEGORY_ITEMS) {
            crawlCategory(item);
        }
    }

    /**
     * Walks the listing pages of one category until a page yields no products
     * or the listing page itself cannot be fetched.
     *
     * @param category category id substituted into {@code RAW_URL}
     */
    private static void crawlCategory(String category) {
        // Offsets begin at 0 so the first PAGE_SIZE products are included;
        // starting at PAGE_SIZE would silently skip the first page.
        for (int start = 0; ; start += PAGE_SIZE) {
            String forUrl = String.format(RAW_URL, category, start);
            System.out.println(forUrl);

            Elements productItems;
            try {
                productItems = listProducts(Jsoup.parse(new URL(forUrl), TIMEOUT_MILLIS));
            } catch (IOException e) {
                System.out.println("Failed to fetch listing " + forUrl + ": " + e.getMessage());
                return;
            }
            if (productItems.isEmpty()) {
                return;
            }

            for (Element li : productItems) {
                String nextUrl = li.getElementsByTag("a").attr("href");
                if (nextUrl.isEmpty()) {
                    // List item without a link: nothing to open.
                    continue;
                }
                try {
                    downloadProduct(nextUrl);
                } catch (IOException e) {
                    // One unreachable product page must not abort the rest of the category.
                    System.out.println("Failed to fetch product " + nextUrl + ": " + e.getMessage());
                }
            }
        }
    }

    /**
     * Extracts the product list items from a listing page.
     *
     * @param document parsed listing page
     * @return the {@code <li>} elements of the first {@code <ul>} under {@code #main};
     *         an empty list when either node is missing
     */
    private static Elements listProducts(Document document) {
        Element main = document.getElementById("main");
        if (main == null) {
            return new Elements();
        }
        Element firstList = main.getElementsByTag("ul").first();
        if (firstList == null) {
            return new Elements();
        }
        return firstList.getElementsByTag("li");
    }

    /**
     * Opens one product page, prints its name and price, and downloads every gallery image.
     * Pages that lack the expected structure are reported and skipped.
     *
     * @param productPath site-relative link to the product page
     * @throws IOException if the product page cannot be fetched
     */
    private static void downloadProduct(String productPath) throws IOException {
        Document secDoc = Jsoup.parse(new URL(PREFIX_URL + productPath), TIMEOUT_MILLIS);
        Element main = secDoc.getElementById("main");
        if (main == null) {
            System.out.println("Skipping " + productPath + ": no #main element");
            return;
        }
        Element gallery = main.getElementsByClass("product-details_images").first();
        Element details = main.getElementsByClass("product-details-main-information_details").first();
        if (gallery == null || details == null) {
            System.out.println("Skipping " + productPath + ": product details or images not found");
            return;
        }

        // Product name and price.
        Element heading = details.getElementsByTag("h1").first();
        if (heading == null) {
            System.out.println("Skipping " + productPath + ": product name not found");
            return;
        }
        Element priceTag = details.getElementsByTag("span").first();
        String h1Name = heading.text();
        String price = priceTag == null ? "" : priceTag.text();
        System.out.println("name: " + h1Name + "; price: " + price);

        int count = 1;
        // Random tag kept in the file name so products sharing a name do not overwrite each other.
        String prefix = RandomUtil.randomString(6);
        String safeName = sanitizeFileName(h1Name);

        // Individual product images.
        for (Element picture : gallery.getElementsByTag("picture")) {
            Element source = picture.getElementsByTag("source").first();
            if (source == null) {
                continue;
            }
            String thrUrl = source.attr("data-srcset");
            if (thrUrl.isEmpty()) {
                continue;
            }
            System.out.println(thrUrl);

            byte[] imageData;
            try {
                imageData = HttpUtil.downloadBytes(getRawUrl(thrUrl));
            } catch (RuntimeException e) {
                // Best effort: a failed image download only skips that image.
                System.out.println(e.getMessage());
                continue;
            }

            // Save the image under SAVE_PATH.
            String name = safeName + "_" + prefix + "_" + (count++) + ".jpg";
            System.out.println(name);
            FileUtil.writeBytes(imageData, FileUtil.file(SAVE_PATH, name));
        }
    }

    /**
     * Replaces characters that are not allowed in Windows file names
     * ({@code \ / : * ? " < > |}) with underscores.
     *
     * @param name raw product name
     * @return the name, trimmed, with the reserved characters replaced
     */
    private static String sanitizeFileName(String name) {
        return name.replaceAll("[\\\\/:*?\"<>|]", "_").trim();
    }

    /**
     * Strips the query string from an image URL.
     *
     * @param thrUrl image URL, with or without a {@code ?...} suffix
     * @return the URL up to (not including) the first {@code ?}, or the URL unchanged
     *         when it has no query string
     */
    private static String getRawUrl(String thrUrl) {
        // The original note says this drops the trailing "1200" parameter
        // (presumably a size hint - confirm against the site).
        int queryStart = thrUrl.indexOf('?');
        return queryStart < 0 ? thrUrl : thrUrl.substring(0, queryStart);
    }

    /**
     * Single-page variant that is not called from {@link #main(String[])}: dumps the
     * women-bags listing page, then for each product downloads its gallery images into a
     * hard-coded desktop folder under random UUID file names and prints selected attributes
     * of the listing thumbnail. Any exception ends the run after printing its message.
     */
    private static void extracted() {
        try {
            String url = "https://www.alexanderwang.cn/cn-zh/women-bags";
            Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);
            System.out.println(document);
            document.getElementsByClass("product-grid-wrapper").get(0).getElementsByTag("ul").get(0).getElementsByTag("li").forEach(li -> {
                String innerUrl = li.getElementsByClass("swiper-wrapper").get(0).getElementsByTag("a").get(0).attributes().get("href");
                try {
                    Document parse = Jsoup.parse(new URL(PREFIX_URL + innerUrl), TIMEOUT_MILLIS);
                    Elements pictures = parse.getElementsByClass("product-details_images").get(0).getElementsByTag("picture");
                    for (Element picture : pictures) {
                        Element tag = picture.getElementsByTag("source").get(0);
                        System.out.println(tag.attr("data-srcset"));
                        String imageUrl = tag.attr("data-srcset");
                        String savePath = "C:\\Users\\zhangyl\\Desktop\\output";
                        String imgName = IdUtil.fastUUID() + ".jpg";
                        // Save the image to the target directory.
                        FileUtil.writeBytes(HttpUtil.downloadBytes(imageUrl), FileUtil.file(savePath, imgName));
                    }
                } catch (IOException e) {
                    // Lambdas cannot throw checked exceptions; the outer catch reports this.
                    throw new RuntimeException(e);
                }
                // Second <img> of the item's first <picture>; attribute positions are fixed by index.
                Attributes attributes = li.getElementsByTag("picture").get(0).getElementsByTag("img").get(1).attributes();
                List<Attribute> attributes1 = attributes.asList();
                System.out.println(attributes1.get(2));
                String picUrl = attributes1.get(1).toString();
                // NOTE(review): trims 5 leading and 8 trailing characters of the attribute's
                // string form - presumably to isolate the URL; confirm against real markup.
                System.out.println(picUrl.substring(5, picUrl.length() - 8));
                System.out.println("===============");
            });
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }
}