Earlier, Sham tried doing the scraping with PHP and JS, but it was painfully slow, so I asked an AI for a Python version and am recording it here.
Note: Sham doesn't actually know Python yet, so this is purely a memo for future reference. Also, this is for learning only; please don't go scraping other people's sites indiscriminately.
Straight to the code. (It relies on the third-party packages aiohttp and beautifulsoup4, so install those first if they're missing.)
import asyncio
import aiohttp
import re
from bs4 import BeautifulSoup
import os
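# --- Configuration: fill these in before running ---
# book_api:   base URL of the book site to collect from (placeholder below)
# filter_key: only books in this category are collected, e.g. 完本小说 (completed novels)
# start_id:   the book id to start collecting from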
book_api = '书籍网站地址'
filter_key = '需要采集的小说类别,比如完本小说'
start_id = 1
# Create a folder
def ensure_folder_exists(folder_path):
    # Create the book folder if it does not exist yet
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"文件夹 '{folder_path}' 已创建。")
# Fetch the book's main page and its basic info
async def get_book_info(url, bookid, session):
    global book_api
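    # Send a desktop-browser User-Agent so the site is less likely to reject the request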
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        async with session.get(url, headers=headers) as response:
            soup = BeautifulSoup(await response.text(), 'html.parser')
            chapters = []
            # Book title
            booktitle = soup.find('h1').text.strip()
            # Replace characters that are not allowed in folder names, so folder creation does not fail
            book_title = re.sub(r'[\/:*?"<>|]', "_", booktitle)
            # Build the folder name; the id prefix makes it easy to tell later which books were already collected
            book_name = f"{bookid}_{book_title}"
            # Book status
            book_status = soup.find('div', class_='small').find_all('span')[1].text.replace('状态:', '').strip()
            # Book category, taken from the breadcrumb
            path_div = soup.find('div', class_='path wap_none')
            if path_div:
                contents = path_div.get_text(strip=True, separator=' ')
                parts = contents.split('>')
                book_category = parts[1].strip() if len(parts) > 1 else "未知分类"
            else:
                book_category = "未知分类"
            # Only collect books whose category matches filter_key; skip everything else.
            # You can swap this for a check on status or anything else you need.
            if book_category != filter_key:
                print(f"{book_name} 不是{filter_key},跳过")
                return None, [], book_status, None
            else:
                # Create the book folder
                ensure_folder_exists(f"{book_name}")
                # Author: first <span> inside the div with class "small", with the '作者:' prefix stripped.
                # Adjust these selectors to match the HTML of the site you are collecting from.
                author = soup.find('div', class_='small').find_all('span')[0].text.replace('作者:', '').strip()
                # Last update time, same approach
                update_time = soup.find('div', class_='small').find_all('span', class_='last')[0].text.replace('更新:', '').strip()
                # Latest chapter, same approach
                latest_chapter = soup.find('div', class_='small').find_all('span', class_='last')[1].find('a').text.strip()
                # Description, same approach; [0] takes only the first <dd> tag
                dd_tag = soup.find('div', class_='intro').find_all('dd')[0]
                # All text inside the <dd> tag
                all_text = dd_tag.get_text()
                # Find the <span class="noshow"> tag
                noshow_span = dd_tag.find('span', class_='noshow')
                # Keep only the text that comes before <span class="noshow">
                if noshow_span:
                    # Position of the noshow text inside the <dd> text
                    noshow_index = all_text.index(noshow_span.get_text())
                    # Text before the noshow span
                    desc_txt = all_text[:noshow_index].strip()
                else:
                    desc_txt = all_text.strip()
                #print(f'简介: {desc_txt}')
                # Save the book info to a txt file
                with open(f"{book_name}/book_desc.txt", 'w', encoding='utf-8') as files:
                    files.write(f'书名:{book_name}\n类别:{book_category}\n封面:cover.jpg\n作者:{author}\n状态:{book_status}\n更新时间:{update_time}\n最新章节:{latest_chapter}\n简介:{desc_txt}')
                print("已保存书本信息")
                # Grab the cover image URL and download it locally
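                # Note: depending on the site, the img 'src' may be a relative path;
                # if the cover download fails, you may need to prepend book_api to it here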
                cover_image_url = soup.find('div', class_='cover').find('img')['src']
                cover_save_path = f"{book_name}/cover.jpg"
                await download_cover_image(cover_image_url, cover_save_path, session)
                # Chapter list
                # idx deals with the "latest chapter" link this site puts at the top of the chapter list,
                # which would otherwise get saved as the first chapter file
                idx = 0
                # Loop over every <a> tag that has an href
                for chapter_link in soup.find_all('a', href=True):
                    # Links containing '.htm' and 'book/' are chapter links; everything else (js links etc.) is skipped
                    if '.htm' in chapter_link['href'] and 'book/' in chapter_link['href']:
                        # This is what idx is for: when the very first chapter link is not chapter 1,
                        # it is the "latest chapter" shortcut, so skip it and move on
                        if idx == 0 and '/1.htm' not in chapter_link['href']:
                            print(f"跳过章节: {chapter_link}(不包含 '/1.htm')")
                            idx += 1  # step past the shortcut link
                            continue  # skip this iteration
                        idx += 1  # first real chapter seen; stop checking for the shortcut after this
                        # Chapter title from the link text, with characters that are illegal in filenames replaced
                        chapter_title = re.sub(r'[\/:*?"<>|]', "_", chapter_link.text.strip())
                        # Chapter URL; the hrefs are site-relative, so prepend the site address
                        chapter_url = book_api + chapter_link['href']
                        chapters.append((chapter_title, chapter_url))
                # Save the chapter list
                # Keep only the chapter titles
                formatted_chapters = [f"{title}" for title, url in chapters]
                # Join them into a newline-separated string
                string_content = "\n".join(formatted_chapters)
                # Write the chapter list to a file, so the chapters can later be read back in order
                with open(f"{book_name}/chapter_list.txt", 'w', encoding='utf-8') as file:
                    file.write(string_content)
                print("已保存章节列表。")
                # Return the collected data
                return book_name, chapters, book_status, book_category
    # On any error, log it and return empty values
    except Exception as e:
        print(f"获取书籍信息失败: {e}")
        return None, [], None, None
# Download the cover image
async def download_cover_image(url, save_path, session, retries=3):
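    # Tries up to `retries` times; after an error it waits 2 seconds before retrying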
    for attempt in range(retries):
        try:
            async with session.get(url, timeout=10) as response:
                if response.status == 200:
                    with open(save_path, 'wb') as file:
                        file.write(await response.read())
                    print(f"封面图片已下载到 {save_path}")
                    return
                else:
                    print(f"封面图片下载失败,状态码: {response.status}")
        except Exception as e:
            print(f"下载封面图片时出错(尝试 {attempt + 1}/{retries} 次): {e}")
            await asyncio.sleep(2)
    print(f"封面图片下载失败,已尝试 {retries} 次")
# Fetch one chapter's content
async def get_chapter_content(url, session):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    async with session.get(url, headers=headers) as response:
        soup = BeautifulSoup(await response.text(), 'html.parser')
        # Same approach as above: grab the div with id="chaptercontent"
        content_div = soup.find('div', id='chaptercontent')
        if content_div:
            # Turn the <br> tags in the HTML into newlines for the txt file
            for br in content_div.find_all('br'):
                br.replace_with('\n')
            # Split the text into individual lines
            lines = content_div.get_text().splitlines()
            # Drop lines containing certain keywords, to filter out ads and similar junk
            filtered_lines = [line for line in lines if '广告' not in line and '点此报' not in line]
            return "\n".join(filtered_lines)
        else:
            return ""
# Process a single book
async def process_book(book_id, session):
    global book_api
    book_url = f'{book_api}/book/{book_id}/'
    book_name, chapters, book_status, book_category = await get_book_info(book_url, book_id, session)
    # Only continue when the book is in the category we are filtering for
    if book_category == filter_key and book_name and chapters:
        for chapter_title, chapter_url in chapters:
            chapter_path = f"{book_name}/{chapter_title}.txt"
            # If the chapter txt already exists and is not empty, it was collected before, so skip it
            if os.path.exists(chapter_path) and os.path.getsize(chapter_path) > 0:
                print(f"章节 {chapter_title} 已存在,跳过")
                continue
            print(f"正在采集章节: {chapter_title}")
            # Fetch the chapter content
            content = await get_chapter_content(chapter_url, session)
            # Make sure the book folder still exists
            ensure_folder_exists(book_name)
            # Write the current chapter to its own txt file
            with open(chapter_path, 'w', encoding='utf-8') as file:
                file.write(content)
            print(f"已保存章节: {chapter_title}")
        # Once all chapters are done, append the book to the collected-books list. This list is only a
        # convenience and can end up with missing or duplicate entries; it is better to regenerate it from
        # the book folder names once everything has been collected (a sketch for that follows after the script).
        # Opening in 'a' mode creates the file when it does not exist yet, so no separate existence check is needed.
        with open(f"已采集_in_{book_category}.txt", 'a', encoding='utf-8') as file:
            file.write(f"{book_name}\n")
# Main loop: process books in batches, keeping 100 downloads in flight at all times
async def main():
    # If the progress file exists, resume from the id stored in it. Note that it holds the id of the most
    # recently queued book (up to 100 ahead of what has actually finished), so after an interruption you
    # should move the id back by about 100 to avoid missing chapters in between.
    global start_id
    if os.path.exists("正在采集的id.txt"):
        with open("正在采集的id.txt", 'r', encoding='utf-8') as file:
            start_id = int(file.read().strip())
    # One shared aiohttp session for all requests
    async with aiohttp.ClientSession() as session:
        current_id = start_id
        tasks = []
        # Keep the pool topped up so there are always 100 books downloading
        while current_id < 180000:
            if len(tasks) < 100:
                task = asyncio.create_task(process_book(current_id, session))
                tasks.append((task, current_id))
                current_id += 1
                continue
            # Collect finished tasks and remove them from the pool
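            # asyncio.wait with FIRST_COMPLETED returns as soon as any one task finishes,
            # so the pool can be topped back up to 100 straight away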
            done, pending = await asyncio.wait([task for task, _ in tasks], return_when=asyncio.FIRST_COMPLETED)
            tasks = [(task, book_id) for task, book_id in tasks if task not in done]
            # Record the id we have queued up to, so an interrupted run can be resumed
            with open("正在采集的id.txt", 'w', encoding='utf-8') as file:
                file.write(str(current_id))
        # Wait for whatever is still running before the session closes; otherwise the last batch
        # of tasks would be cut off when main() returns
        if tasks:
            await asyncio.gather(*[task for task, _ in tasks])

# Run the async entry point
asyncio.run(main())
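As mentioned in the comment above the collected-books list, that list can end up with gaps or duplicates, so it is safer to rebuild it from the book folders once everything has been collected. Below is a minimal sketch of that idea (not part of the original script): it assumes the book folders live in the current working directory and keep the id_title naming used above, and the output filename is just an example.

import os

def rebuild_collected_list(root_dir='.', output_file='已采集_重建.txt'):  # output filename is an example
    # Treat every folder named like "<id>_<title>" as one collected book
    book_folders = [
        name for name in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, name)) and name.split('_', 1)[0].isdigit()
    ]
    # Sort by the numeric id prefix so the list follows the collection order
    book_folders.sort(key=lambda name: int(name.split('_', 1)[0]))
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(book_folders))
    print(f"Wrote {len(book_folders)} books to {output_file}")

rebuild_collected_list()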