# Bootstrap
#
# Web crawler (7): scraping a city's historical weather data

import requests
from bs4 import BeautifulSoup
import pandas as pd
 
 
def save_a_month_data(url):
    """
    Fetch and parse one month of historical weather data (one page).

    :param url: page URL holding one year-month's daily weather records
    :return: 2-D list of daily weather rows for that month; empty list when
             the expected containers are missing from the page
    """
    # Rows collected for this month.
    a_month = []

    # Request headers (a browser user-agent, since the site serves crawlers differently).
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"}
    # Fetch the page; a timeout keeps the crawler from hanging forever on a dead connection.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    tianqi_zone = soup.find(class_='tian_three')
    # Either container may be missing (layout change, blocked request, bad month).
    # The original only guarded the .find() call, so a missing 'thrui' div made
    # find_all() below raise an uncaught AttributeError — check for None instead.
    tianqi_data = tianqi_zone.find(class_='thrui') if tianqi_zone is not None else None
    if tianqi_data is None:
        print(tianqi_zone)
        return a_month
    tianqi_data_a_month = tianqi_data.find_all('li')

    # Process each day's <li> into a list of whitespace-separated fields.
    for tianqi_data_a_day in tianqi_data_a_month:
        a_day_data = tianqi_data_a_day.text.split()
        if len(a_day_data) == 7:
            # 7 fields means the date cell carried an extra weekday token.
            # Use `del`, not list.remove(): remove() deletes the first *equal*
            # value, which may not be index 1 if an earlier field matches it.
            del a_day_data[1]
        a_month.append(a_day_data)

    return a_month
 
 
# Accumulator for every daily row across all crawled months.
all_data = []

# Crawl every month from 2020-01 through 2024-12 inclusive.
for year in range(2020, 2025):
    for month in range(1, 13):
        # Zero-pad the month (e.g. 2020-03 -> "202003") — this replaces the
        # duplicated if/else branches and produces exactly the same URLs.
        url = f"http://lishi.tianqi.com/yangzhou/{year}{month:02d}.html"
        # print(url)
        a_month_data = save_a_month_data(url)
        all_data += a_month_data

df = pd.DataFrame(all_data)
# Persist all historical weather data to a local Excel file.
df.to_excel('yangzhou-tianqi.xlsx', index=False)