# Python (requests): scraping the 电影天堂 ("Movie Heaven", dytt8.net) site
from lxml import etree
import requests
# Listing page the crawl starts from. The script entry point reads this
# value under the name `url`.
url = "https://www.dytt8.net/html/gndy/dyzz/list_23_1.html"
# Alias under the original name so existing references to `ul` keep working.
ul = url
# Request headers carrying a desktop Chrome User-Agent string; passed to the
# `requests.get` calls in this file.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
}
def get_first_url(url):
    """Collect the detail-page URLs linked from one listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list with one absolute URL per ``href`` found on an ``<a>`` inside
        a ``<table class="tbspan">``, in document order.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Imported locally so this function does not rely on a file-level import.
    from urllib.parse import urljoin

    # Without a timeout a stalled server would block the crawl forever.
    resp = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(resp.text)
    movie_href = html.xpath('//table[@class="tbspan"]//a/@href')
    basic_domain = 'https://www.dytt8.net'
    # urljoin gives the same result as plain concatenation for root-relative
    # hrefs ("/html/..."), and additionally copes with hrefs that are already
    # absolute or that lack the leading slash.
    return [urljoin(basic_domain, href) for href in movie_href]
# Prefix shared by every field-header line in the serialized <br> output.
_FIELD_MARKER = '<br />◎'

# (line prefix, result key, value continues on the following lines?)
_MOVIE_FIELDS = (
    ('<br />◎译 名', 'name', False),
    ('<br />◎片 名', 'English_name', False),
    ('<br />◎产 地', 'palace', False),
    ('<br />◎类 别', 'move_type', False),
    ('<br />◎编 剧', 'authors', True),
    ('<br />◎主 演', 'actors', True),
)


def _continuation_values(lines, start):
    """Return the cleaned lines from ``start`` up to the next field header."""
    values = []
    for line in lines[start:]:
        # The next "◎" header ends the current multi-line field. Running off
        # the end of the list simply ends the scan instead of raising.
        if line.startswith(_FIELD_MARKER):
            break
        values.append(line.replace('<br />', '').strip())
    return values


def _parse_movie_fields(lines):
    """Extract the known "◎" fields from serialized ``<br>`` lines."""
    fields = {}
    for position, line in enumerate(lines):
        for prefix, key, multi_line in _MOVIE_FIELDS:
            if not line.startswith(prefix):
                continue
            value = line.replace(prefix, '').strip()
            if multi_line:
                # First entry sits on the header line, the rest follow it.
                fields[key] = [value] + _continuation_values(lines, position + 1)
            else:
                fields[key] = value
            break
    return fields


def get_detail_html(move_list):
    """Download each movie detail page and collect its metadata.

    Each page is fetched, decoded as gb2312 (undecodable bytes are dropped)
    and parsed. The collected list is printed once all pages are processed
    and is also returned.

    Args:
        move_list: Iterable of detail-page URLs.

    Returns:
        A list with one dict per URL. Possible keys are ``title`` (text of
        the first ``<title>``), ``img`` (``src`` of the first ``<img>``),
        ``name``, ``English_name``, ``palace``, ``move_type`` (strings) and
        ``authors``, ``actors`` (lists of strings). A key is present only
        when the matching content was found on the page.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    all_movie_list = []
    for movie in move_list:
        # Without a timeout a stalled server would block the crawl forever.
        detail_html = requests.get(movie, headers=headers, timeout=10)
        html_content = detail_html.content.decode('gb2312', 'ignore')
        html_e = etree.HTML(html_content)
        movie_information = {}

        # A page lacking a title or an image should not abort the whole
        # crawl, so the key is omitted instead of indexing an empty result.
        titles = html_e.xpath('//title/text()')
        if titles:
            movie_information['title'] = titles[0]
        img_srcs = html_e.xpath('//img/@src')
        if img_srcs:
            movie_information['img'] = img_srcs[0]

        # '//br' is an absolute XPath: it selects every <br> in the document.
        # Serializing a <br> element also emits the text that follows it, so
        # each string is one visual line of the page.
        lines = [
            etree.tostring(br, encoding='utf-8').decode('utf-8')
            for br in html_e.xpath('//br')
        ]
        movie_information.update(_parse_movie_fields(lines))
        all_movie_list.append(movie_information)
    print(all_movie_list)
    return all_movie_list
if __name__ == '__main__':
    # The start URL is stored in the module-level name `ul`; referring to an
    # undefined `url` here would raise NameError before any request is made.
    move_list = get_first_url(ul)
    get_detail_html(move_list)
