from urllib.parse import urljoin

import requests
from lxml import etree
# url='https://www.dytt8.net/html/gndy/dyzz/index.html'
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
'like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
# 爬取前八页的电影的链接
def Film_urls():
url='https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
Base_url='https://www.dytt8.net/'
urls=[]
for i in range(1,8):
print("正在爬取每页电影链接。。。。")
url=url.format(i)
response=requests.get(url,headers=headers)
html=etree.HTML(response.text)
urls+=(html.xpath("//table[@class='tbspan']//a/@href"))
urls=[Base_url+url for url in urls]
return urls
# 爬取每个电影的详细内容
def Film_content(url):
movie={}
response=requests.get(url,headers=headers)
response=response.content.decode(encoding='gbk',errors='ignore')
html=etree.HTML(response)
content=html.xpath("//div[@id='Zoom']")[0]
#定义筛选函数
def parse_info(info,str):
return info.replace(str,"").strip()
infos=content.xpath(".//text()")
for index,info in enumerate(infos):
if info.startswith("◎片 名"):
info=parse_info(info,"◎片 名")
movie['name']=info
elif info.startswith("◎年 代"):
info=parse_info(info,"◎年 代")
movie['year']=info
elif info.startswith("◎产 地"):
info=parse_info(info,"◎产 地")
movie['country']=info
elif info.startswith("◎类 别"):
info=parse_info(info,"◎类 别")
movie['category']=info
elif info.startswith("◎豆瓣评分"):
info=parse_info(info,"◎豆瓣评分")
movie['douban_rating']=info
elif info.startswith("◎片 长"):
info=parse_info(info,"◎片 长")
movie['duration']=info
elif info.startswith("◎导 演"):
info = parse_info(info, "◎导 演")
movie['director'] = info
elif info.startswith("◎主 演"):
info = parse_info(info, "◎主 演")
actors = [info]
for x in range(index + 1, len(infos)):
actor = infos[x].strip()
if actor.startswith("◎"):
break
actors.append(actor)
movie['actors'] = actors
elif info.startswith("◎简 介"):
info = parse_info(info, "◎简 介")
movie['profile']=infos[index+1].strip()
imgs=content.xpath(".//img/@src")
movie['img_url']=imgs[0]
download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
movie['download_url'] = download_url
return movie
def spider():
urls = Film_urls()
movies=[]
for url in urls[:10]:
print("正在爬取电影详细内容。。。")
movie=Film_content(url)
movies.append(movie)
return movies
if __name__ == '__main__':
movies=spider()
print(movies)
