import os
import requests
import re
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import csv
# Create the Edge driver. EdgeDriver (msedgedriver) must be installed and its
# location must be on the PATH environment variable.
driver = webdriver.Edge()
# Load the movie listing page.
driver.get('https://www.dy2018.com/html/gndy/dyzz/index.html')
# NOTE(review): the original built a WebDriverWait(driver, 10) here but never
# called .until() on it, so it never waited for anything. It was removed as
# dead code; add an explicit wait.until(...) if the listing turns out to be
# filled in by script after the initial page load.
page_source = driver.page_source
html = etree.HTML(page_source)
# Collect the <a> element of every movie on the listing page. The leading
# wildcard ("//*") and the unindexed "table" step restore the expression that
# was damaged in the pasted source ("//[" and "table[]" are not valid XPath).
p = html.xpath('//*[@id="header"]/div/div[3]/div[6]/div[2]/div[2]/div[2]/ul/table/tbody/tr[2]/td[2]/b/a')
def save_to_csv(data):
    """Append one movie record to ``电影天堂信息.csv`` in the working directory.

    The header row is written only when the file is empty, so the function can
    be called once per movie and still produce a single well-formed CSV.

    Args:
        data: Indexable sequence whose first six items are, in order: movie
            title, genre, release date, score, director, cast. Items beyond
            the sixth are ignored.

    Raises:
        IndexError: If ``data`` has fewer than six items.
    """
    header = ['电影名称', '电影类型', '发布时间', '评分', '导演', '主演']
    # newline='' lets the csv module control line endings itself.
    with open('电影天堂信息.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        # In append mode the stream is positioned at the end of the file, so
        # an offset of 0 means the file is new or empty and needs a header.
        if csvfile.tell() == 0:
            writer.writeheader()
        # Build the row from the header so column names are spelled only once.
        writer.writerow({column: data[index] for index, column in enumerate(header)})
# Visit every movie detail page linked from the listing and append one CSV row
# per movie (title, genre, release date, score, director, cast).
#
# The browser session that loaded the listing is reused for every detail page
# instead of launching a new Edge instance per movie, and it is shut down with
# quit() in a finally block so the browser and driver process are released
# even when a page fails part-way through.

# Common XPath prefixes of the detail page layout.
_DETAIL_ROOT = '//*[@id="header"]/div/div[3]/div/div[6]'
_INFO_ROW = _DETAIL_ROOT + '/div[2]/ul/div[1]'

# Each field in the description starts with "◎", so a lazy capture up to the
# next "◎" isolates one field. The pasted source had "(.?)", which can capture
# at most one character; the lazy "(.*?)" is restored here.
# NOTE(review): "\s*" (matches ASCII and ideographic spaces) and re.S are
# deliberately tolerant because the exact spacing/line breaks of the live page
# could not be confirmed from this file - verify against a real page.
_DIRECTOR_RE = re.compile(r'◎导\s*演(.*?)◎', re.S)
_CAST_RE = re.compile(r'◎主\s*演(.*?)◎', re.S)

try:
    for link in p:
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be scraped; skip it.
            continue
        lo = 'https://www.dy2018.com' + href
        driver.get(lo)
        # NOTE(review): the original created a WebDriverWait here but never
        # called .until(), so it was dead code and has been dropped.
        html = etree.HTML(driver.page_source)

        # Movie title
        titles = html.xpath(_DETAIL_ROOT + '/div[1]/h1/text()')
        title = titles[0] if titles else "未知电影名称"
        print(title)

        # Genre: text of the first four links, concatenated without separator.
        genre = ''.join(html.xpath(_INFO_ROW + '/span[2]/a[position() <= 4]/text()'))
        if not genre:
            genre = "未知类型"
        print(genre)

        # Release date
        dates = html.xpath(_INFO_ROW + '/span[3]/text()')
        released = dates[0] if dates else "未知发布时间"
        print(released)

        # Score
        scores = html.xpath(_INFO_ROW + '/span[1]/strong/text()')
        score = scores[0] if scores else "无评分信息"
        print(score)

        # Free-text description block; extracted once and shared by the
        # director and cast lookups.
        zoom_text = ''.join(html.xpath('//*[@id="Zoom"]/text()'))
        # NOTE(review): this cleanup pattern is kept exactly as found; it may
        # have lost characters in the paste - confirm the intended pattern.
        cleaned = re.sub(r'\u3000\n ', '', zoom_text)

        # Director
        director_match = _DIRECTOR_RE.search(cleaned)
        director = director_match.group(1).strip() if director_match else "未知导演"
        print(director)

        # Cast. The original replaced every space with "-" before searching
        # for a pattern that itself contained a space, so it could never
        # match. Match first, then join the whitespace-separated pieces.
        cast_match = _CAST_RE.search(cleaned)
        cast = '-'.join(cast_match.group(1).split()) if cast_match else "未知主演"
        if not cast:
            cast = "未知主演"
        print(cast)

        data = [title, genre, released, score, director, cast]
        save_to_csv(data)
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    driver.quit()
print("数据保存完成!")