Bootstrap

Python XPath 解析和 XML 格式总结

1.xpath解析之xml

from lxml import etree

# 1. Terminology
# The bare string below is a no-op expression used as a glossary:
#   tree      - the whole HTML or XML structure
#   node      - each tag in HTML; in XML every tag is a node
#   root node - the first node of the tree; for HTML it is the <html> tag
#   attribute - a node's attribute (in HTML, a tag attribute)

"""
树:整个HTML或者xml结构
节点:HTML中的每个标签,xml中标签就是节点
根节点:树的第一个节点,HTML的根节点就是HTML标签
属性:节点属性(html中就是标签属性)
"""

# 2. XML data format
# JSON and XML are two general-purpose data formats used to exchange data
# between different languages.

# 1) Prepare the data.
# NOTE: the root tag is spelled "supermaket"; the absolute XPath expressions
# further down use the same spelling, so the two must be kept in sync.
xml_str = """
<supermaket>
    <name>永辉超市</name>
    <address>肖家河大厦</address>
    <goodslist>
        <goods name="泡面" price="3.5" count="20"></goods>
        <goods name="矿泉水" price="2" count="50"></goods>
        <goods name="面包" price="5" count="15"></goods>
    </goodslist>
    <goods price="35" count="20">
    <name>烟</name>
    </goods>
    <worker_list>
        <cashier name="张三" pay="4000"></cashier>
        <shoppingGuide name="李四" pay="3500"></shoppingGuide>
    </worker_list>
</supermaket>
"""

# 2) Build the tree and get the root node of the data.

supermarket = etree.XML(xml_str)
# print(supermarket)

# 3) Selecting nodes
# node.xpath(path) - find the nodes matching the path; the matches come back
# in a list (see the sample outputs below).
# a. Absolute path: no matter which node xpath() is called on, the path is
#    written starting from the root node.
# Syntax: /absolute/path
cashier = supermarket.xpath('/supermaket/worker_list/cashier')
print(cashier)  # [<Element cashier at 0x26a1eb29200>]

worker_list = supermarket.xpath('/supermaket/worker_list')[0]
print(worker_list)  # <Element worker_list at 0x26a1eb29180>

# An absolute path is still resolved from the document root even when called
# on worker_list; the root tag is not "worker_list", so nothing matches.
result = worker_list.xpath('/worker_list/cashier')
print(result)   # []

# b. Relative path: "." stands for the current node.
# Note: the leading "./" can be omitted.
cashier = supermarket.xpath('./worker_list/cashier')
print(cashier)  # [<Element cashier at 0x1a7a4ab9280>]

cashier = worker_list.xpath('./cashier')
print(cashier)  # [<Element cashier at 0x2432e913f00>]

# Same query as above with the "./" prefix omitted.
cashier = worker_list.xpath('cashier')
print(cashier)  # [<Element cashier at 0x2432e913f00>]

# c. //path  -  search the whole document, starting from any position.
# How the search works does not depend on which node xpath() is called on.

result = supermarket.xpath('//cashier')
print(result)
# [<Element cashier at 0x10616ecc0>]
result = supermarket.xpath('//goods')
print(result)
# A list of 4 <Element goods ...> objects: the three inside <goodslist> plus
# the <goods> that is a direct child of the root.
result = supermarket.xpath('//goodslist/goods')
print(result)
# [<Element goods at 0x2220e5c9380>, <Element goods at 0x2220e5c93c0>, <Element goods at 0x2220e5c9400>]

# 4) Getting the text content of nodes
# Syntax: path/to/node/text()
# Only the <name> that is a direct child of the root.
name = supermarket.xpath('./name/text()')
print(name)  # ['永辉超市']

# Every <name> in the document, including the one nested inside <goods>.
names = supermarket.xpath('//name/text()')
print(names)  # ['永辉超市', '烟']

# 5) Getting attribute values of nodes
# Syntax: path/to/node/@attribute_name
# Collect the price attribute of every <goods> element in the document
# (the three inside <goodslist> and the one directly under the root).
prices = supermarket.xpath('//goods/@price')
print(prices)  # ['3.5', '2', '5', '35']

2.xpath解析之html

from lxml import etree

# Parse the test page. The context manager closes the file handle as soon as
# its content has been read instead of leaving it open.
with open('files/test.html', encoding='utf-8') as page:
    html = etree.HTML(page.read())

# Text of every <h1> anywhere in the document.
h1 = html.xpath('//h1/text()')
print(h1)

# Same lookup written as a path relative to the parsed root node.
h1 = html.xpath('./body/h1/text()')
print(h1)

# 1. Predicates (adding conditions)
# Syntax: path/to/tag[predicate]
# 1) [N]  -  the N-th tag among same-named tags on the same level
p = html.xpath('./body/p[1]/text()')
print(p)

result = html.xpath('//li[1]/p/text()')
print(result)

# 2)
# [last()]  -  the last tag on the same level
# [last()-N]  -  the (N+1)-th tag from the end on the same level
result = html.xpath('./body/ul/li[last()-1]/p[last()]/text()')
print(result)

# 3) Position comparisons
# [position()>N]
# [position()<N]
# [position()>=N]
# [position()<=N]
result = html.xpath('./body/ul/li[position()<=2]/p/text()')
print(result)


# 4) [@attr]  -  tags that have the given attribute
# p[@class]  - <p> tags that have a class attribute
result = html.xpath('./body/div/p[@class]/text()')
print(result)

# [@attr='value']  -  tags whose given attribute has the given value
result = html.xpath('./body/div/p[@class="b"]/text()')
print(result)

# 5)
# [tag >/</>=/<=/= value]  -  filter tags by the content of the given child tag

# <li> items whose second <p> compares greater than 4.
result = html.xpath('./body/ul/li[p[2]>4]/p/text()')
print(result)

# First <p> of every <li> whose third <p> compares greater than 30.
result = html.xpath('./body/ul/li[p[3]>30]/p[1]/text()')
print(result)

# <li> items whose first <p> equals the given string.
result = html.xpath('./body/ul/li[p[1] = "面包"]/p/text()')
print(result)


# 2. Wildcard: *
# 1) Stands for any tag
# All child tags of the <div> whose id is "div1".
result = html.xpath('./body/div[@id="div1"]/*')
print(result)

# Only those child tags that have a class attribute.
result = html.xpath('./body/div[@id="div1"]/*[@class]')
print(result)

# Any tag anywhere in the document whose class is "c1".
result = html.xpath('//*[@class="c1"]/text()')
print(result)

# 2) Stands for any attribute
# <p> tags of the last <div> under <body> that have at least one attribute.
result = html.xpath('./body/div[last()]/p[@*]/text()')
print(result)


# 3. Union (selecting several paths at once)  - |
# Note: the two sides of the vertical bar must each be a complete,
# independent path.
result = html.xpath('./body/ul/li/p[1]/text()|./body/ul/li/p[2]/text()')
print(result)

3.html测试数据

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>商店</title>
</head>
<body>
<h1>永辉超市</h1>
    <p>肖家河大厦</p>
    <p>营业中</p>
    <!-- Product list: every <li> holds three <p> children in the order name, price, count. -->
    <ul>
        <li>
            <p class="name">泡面</p>
            <p class="price">3.5</p>
            <p class="count">15</p>
        </li>
        <li>
            <p class="name">矿泉水</p>
            <p class="price">2</p>
            <p class="count">120</p>
        </li>
        <li>
            <p class="name">面包</p>
            <p class="price">5</p>
            <p class="count">42</p>
        </li>

        <li>
            <p class="name">充电宝</p>
            <p class="price">150</p>
            <p class="count">10</p>
        </li>
    </ul>
    <!-- Container without an id: one <p> carries only an id, the other three carry a class. -->
    <div>
        <p id="a">p1</p>
        <p class="b">p2</p>
        <p class="c1">p3</p>
        <p class="d">p4</p>
    </div>
    <!-- Container with id "div1": mixed child tags (p, a, span, img); the first <p> and the <span> share class "c1". -->
    <div id="div1">
        <p class="c1">p1</p>
        <p id="p2">p2</p>
        <a href="">a1</a>
        <span class="c1">span1</span>
        <img src="https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fbpic.588ku.com%2Felement_origin_min_pic%2F17%2F06%2F13%2F5c5a1442f0ec72e59829ee10d891f224.jpg%21r650&refer=http%3A%2F%2Fbpic.588ku.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1631690803&t=ddfb673477426b3255f364e59966b2f1">
    </div>

</body>
</html>
;