Bootstrap

python爬取 电影天堂 影视数据

python爬取 电影天堂 影视数据

我的第一个比较实用的爬虫,2019年8月3日测试可用,全部复制粘贴运行即可。运行前需要安装两个第三方库:lxml(底层依赖 C 语言库)和 requests,可使用 pip 安装:pip install lxml requests

from lxml import etree
import requests,time,random

# Base URL of the target site.
BASE_DOMAIN = "https://dytt8.net"

# HTTP headers passed to requests.get(); the User-Agent is a desktop
# Chrome browser string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}

# 每个电影的详情页爬取
def parse_detail_page(movie_url):
    movie = {
   }
    print(movie_url)
    response = requests.get(movie_url,headers = HEADERS)
    html = etree.HTML(response.content.decode('gbk','replace'))
    movie['◎']=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    zoom = html.xpath("//div[@id='Zoom']")[0]
    try:
        movie['海报']=zoom.xpath(".//img/@src")[0]
    except IndexError:
        movie['海报']="暂无海报"
    try:
        movie['截图']=zoom.xpath(".//img/@src")[1]
    except IndexError:
        movie['截图']="暂无截图"
    infos=zoom.xpath(".//text()")

    # 对抓取到的文本一行行解析
    def parse_info(info,rule):
        if info.startswith(rule):
            info = info.replace(rule, "")