Python 爬取电影天堂影视数据
我的第一个比较实用的爬虫,2019年8月3日测试可用,全部复制粘贴即可运行。需要先安装第三方库 lxml 和 requests,可使用 pip 安装:pip install lxml requests
from lxml import etree
import requests,time,random
# Scheme + host of the site being scraped.
BASE_DOMAIN = "https://dytt8.net"

# HTTP headers passed to requests.get(); carries a desktop Chrome
# User-Agent string (presumably so the site serves the normal page
# rather than rejecting the default client -- TODO confirm).
HEADERS = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.169 Safari/537.36"
    ),
}
# 每个电影的详情页爬取
def parse_detail_page(movie_url):
movie = {
}
print(movie_url)
response = requests.get(movie_url,headers = HEADERS)
html = etree.HTML(response.content.decode('gbk','replace'))
movie['◎']=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
zoom = html.xpath("//div[@id='Zoom']")[0]
try:
movie['海报']=zoom.xpath(".//img/@src")[0]
except IndexError:
movie['海报']="暂无海报"
try:
movie['截图']=zoom.xpath(".//img/@src")[1]
except IndexError:
movie['截图']="暂无截图"
infos=zoom.xpath(".//text()")
# 对抓取到的文本一行行解析
def parse_info(info,rule):
if info.startswith(rule):
info = info.replace(rule, "")