1.xpath解析之xml
from lxml import etree
"""
树:整个HTML或者xml结构
节点:HTML中的每个标签,xml中标签就是节点
根节点:树的第一个节点,HTML的根节点就是HTML标签
属性:节点属性(html中就是标签属性)
"""
# Sample XML document used by every XPath query below.
# NOTE: the root tag is spelled "supermaket" in this data, so the XPath
# expressions must use the same spelling to match it.
xml_str = """
<supermaket>
<name>永辉超市</name>
<address>肖家河大厦</address>
<goodslist>
<goods name="泡面" price="3.5" count="20"></goods>
<goods name="矿泉水" price="2" count="50"></goods>
<goods name="面包" price="5" count="15"></goods>
</goodslist>
<goods price="35" count="20">
<name>烟</name>
</goods>
<worker_list>
<cashier name="张三" pay="4000"></cashier>
<shoppingGuide name="李四" pay="3500"></shoppingGuide>
</worker_list>
</supermaket>
"""

# Parse the string; the returned object is the root element <supermaket>.
supermarket = etree.XML(xml_str)

# --- Absolute paths: start with "/" and are resolved from the document root.
cashier = supermarket.xpath('/supermaket/worker_list/cashier')
print(cashier)

# xpath() always returns a list for node-set queries; [0] picks the single
# <worker_list> element so it can be used as a context node below.
worker_list = supermarket.xpath('/supermaket/worker_list')[0]
print(worker_list)

# An absolute path ignores the node xpath() is called on: it is still
# resolved from the document root (<supermaket>), so "/worker_list/..."
# matches nothing and this prints an empty list.
result = worker_list.xpath('/worker_list/cashier')
print(result)

# --- Relative paths: resolved from the node xpath() is called on.
# "./" means "the current node".
cashier = supermarket.xpath('./worker_list/cashier')
print(cashier)

cashier = worker_list.xpath('./cashier')
print(cashier)

# The leading "./" is optional; a bare step name is also relative.
cashier = worker_list.xpath('cashier')
print(cashier)

# --- "//" selects matching nodes anywhere in the document, at any depth.
result = supermarket.xpath('//cashier')
print(result)

# All four <goods> elements: three inside <goodslist> plus the top-level one.
result = supermarket.xpath('//goods')
print(result)

# Only the three <goods> elements that are direct children of a <goodslist>.
result = supermarket.xpath('//goodslist/goods')
print(result)

# --- text() returns the text content of the selected nodes.
# Only the <name> that is a direct child of the root element.
name = supermarket.xpath('./name/text()')
print(name)

# Every <name> in the document, including the one nested inside <goods>.
names = supermarket.xpath('//name/text()')
print(names)

# --- "@attr" returns attribute values (as strings) instead of elements.
# Stored in `prices` rather than reusing `cashier`, since these are the
# price attributes of the <goods> elements.
prices = supermarket.xpath('//goods/@price')
print(prices)
2.xpath解析之html
from lxml import etree
# Read the HTML test page and parse it; etree.HTML returns the root <html>
# element. The `with` block guarantees the file handle is closed once its
# contents have been read (the bare open(...).read() form left it open).
with open('files/test.html', encoding='utf-8') as html_file:
    html = etree.HTML(html_file.read())

# Text of every <h1> anywhere in the document.
h1 = html.xpath('//h1/text()')
print(h1)

# Same element reached with a relative path from the root <html> element.
h1 = html.xpath('./body/h1/text()')
print(h1)

# --- Position predicates: [N] selects the N-th matching sibling (1-based).
# First <p> that is a direct child of <body>.
p = html.xpath('./body/p[1]/text()')
print(p)

# Text of all <p> children of every <li> that is the first <li> of its parent.
result = html.xpath('//li[1]/p/text()')
print(result)

# last() is the index of the last sibling: second-to-last <li>, its last <p>.
result = html.xpath('./body/ul/li[last()-1]/p[last()]/text()')
print(result)

# position() allows range conditions: the first two <li> elements.
result = html.xpath('./body/ul/li[position()<=2]/p/text()')
print(result)

# --- Attribute predicates.
# <p> elements (inside a <div> under <body>) that have a class attribute at all.
result = html.xpath('./body/div/p[@class]/text()')
print(result)

# <p> elements whose class attribute equals "b".
result = html.xpath('./body/div/p[@class="b"]/text()')
print(result)

# --- Predicates on child content.
# <li> elements whose second <p> has a numeric value greater than 4.
result = html.xpath('./body/ul/li[p[2]>4]/p/text()')
print(result)

# First <p> of each <li> whose third <p> has a numeric value greater than 30.
result = html.xpath('./body/ul/li[p[3]>30]/p[1]/text()')
print(result)

# <li> elements whose first <p> has the text "面包".
result = html.xpath('./body/ul/li[p[1] = "面包"]/p/text()')
print(result)

# --- Wildcards.
# "*" matches any element: all child elements of the div with id="div1".
result = html.xpath('./body/div[@id="div1"]/*')
print(result)

# Any child element of that div that carries a class attribute.
result = html.xpath('./body/div[@id="div1"]/*[@class]')
print(result)

# Any element anywhere in the document whose class equals "c1".
result = html.xpath('//*[@class="c1"]/text()')
print(result)

# "@*" matches any attribute: <p> elements in the last <div> under <body>
# that have at least one attribute.
result = html.xpath('./body/div[last()]/p[@*]/text()')
print(result)

# --- "|" is the union operator: nodes matched by either path are returned
# together (here the first and second <p> of every <li>).
result = html.xpath('./body/ul/li/p[1]/text()|./body/ul/li/p[2]/text()')
print(result)
3.html测试数据
<!DOCTYPE html>
<!-- Sample page for the HTML XPath examples; presumably saved as files/test.html (confirm against the script that loads it). -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>商店</title>
</head>
<body>
<!-- Store heading followed by two plain paragraphs that are direct children of body. -->
<h1>永辉超市</h1>
<p>肖家河大厦</p>
<p>营业中</p>
<!-- Product list: each li holds three p elements, in order: name, price, count. -->
<ul>
<li>
<p class="name">泡面</p>
<p class="price">3.5</p>
<p class="count">15</p>
</li>
<li>
<p class="name">矿泉水</p>
<p class="price">2</p>
<p class="count">120</p>
</li>
<li>
<p class="name">面包</p>
<p class="price">5</p>
<p class="count">42</p>
</li>
<li>
<p class="name">充电宝</p>
<p class="price">150</p>
<p class="count">10</p>
</li>
</ul>
<!-- First div (no id): paragraphs with a mix of id and class attributes. -->
<div>
<p id="a">p1</p>
<p class="b">p2</p>
<p class="c1">p3</p>
<p class="d">p4</p>
</div>
<!-- Second div (id="div1"): children of several different element types, some with a class attribute. -->
<div id="div1">
<p class="c1">p1</p>
<p id="p2">p2</p>
<a href="">a1</a>
<span class="c1">span1</span>
<img src="https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fbpic.588ku.com%2Felement_origin_min_pic%2F17%2F06%2F13%2F5c5a1442f0ec72e59829ee10d891f224.jpg%21r650&refer=http%3A%2F%2Fbpic.588ku.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1631690803&t=ddfb673477426b3255f364e59966b2f1">
</div>
</body>
</html>