源代码:
import requests
from bs4 import BeautifulSoup
def get_pages(url, timeout=10, encoding='gbk'):
    """Fetch *url* and return the page source decoded as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests gives up.
        encoding: Codec used to decode the response body (GBK by default).

    Returns:
        The decoded page source as a ``str``.

    Raises:
        requests.exceptions.RequestException: the request failed or timed out.
        UnicodeDecodeError: the body is not valid in *encoding*.
    """
    # Without a timeout requests may block indefinitely on an unresponsive
    # server.
    response = requests.get(url, timeout=timeout)
    # Decode the raw bytes directly. Round-tripping response.text through
    # ISO-8859-1 only works when requests happened to guess ISO-8859-1 as the
    # response encoding; any other guess raises or corrupts the text.
    return response.content.decode(encoding)
def get_data(html, class_name):
    """Print category, URL, title and date for each listing row.

    Rows are the ``tr`` elements found under ``<class_name> ul``. Each row is
    printed as ``category url movie date``. Rows that lack the expected two
    links, a ``font`` element, or an ``href`` are skipped instead of raising.

    Args:
        html: Page source to parse.
        class_name: CSS selector of the container (e.g. ``'.co_content8'``).

    Returns:
        None. The extracted fields are written to stdout.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'lxml')
    # select() returns a list; the first matched row is dropped
    # (presumably a non-data row -- confirm against the live page).
    rows = soup.select(class_name + ' ul tr')[1:]
    for tr in rows:
        links = tr.select('td a')
        fonts = tr.select('td font')
        # First link is read as the category, second as the movie entry.
        if len(links) < 2 or not fonts:
            continue
        href = links[1].get('href')
        if not href:
            continue
        category = links[0].text
        movie = links[1].text
        date = fonts[0].text
        # urljoin copes with absolute hrefs and hrefs without a leading '/'.
        url = urljoin('http://www.dytt8.net', href)
        print(category, url, movie, date)
def main():
    """Download the site's home page once and print both listing sections."""
    home_url = 'http://www.dytt8.net'
    page_source = get_pages(home_url)
    # Same order as before: .co_content8 first, then .co_content3.
    for selector in ('.co_content8', '.co_content3'):
        get_data(page_source, selector)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()