""" 确定目标网站:https://www.wxscs.com/book/9422/ 内容页: """ #引入网页请求模块 import requests #网页主界面 url = "https://www.wxscs.com/book/9422/" #伪造亲求头部 headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0" } # 发起伪造请求 response = requests.get(url, headers=headers) # 设置响应编码 response.encoding = "UTF-8" # 查看响应数据 content = response.text #的打印html页面 print(content) import re # <a href="/book/9422/1874033.html" title="第九章 壶娱中秋节" target="_blank">第九章 壶娱中秋节</a> #写出对应正则表达式 p = r'<a href="(/book/9422/187.*?)"\s+title=".*?"\s+target="_blank">(第.*?)</a>' chs = re.findall(p,content) print(chs) chapter = {} for ch in chs: chapter_url = "https://www.wxscs.com" + ch[0] chapter_title = ch[1] chapter[chapter_title] = chapter_url # 最终链接数据 print(chapter) import json with open("chapters.txt",mode="wt",encoding="UTF-8") as file: json.dump(chapter,file) #得到一个文件 文件内是章节目录
""" 章节内数据 """ import requests,re import time,random import json #找到文件 with open("chapters.txt",encoding="UTF-8") as file: chs = json.load(file) # print(chs) headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0" } #分离标题和网页链接 for title,url in chs.items(): print(f"准备采集{title}\n") response = requests.get(url,headers=headers) response.encoding = "UTF-8" html = response.text # print(html) print("---------------------") #正则找到想要的内容 p = r'<div id="cont-body"\s+class="cont-body 187.*?">.*?<script>.*?</script>(.*?)</div>' content = re.search(p,html,re.DOTALL) content = content.group(1).strip() # 数据清晰 p2 = r'(<p>|</p>)' content = re.sub(p2, '\n', content, re.X) # content = "\n".join(content) # print(content) #将数据输出为一个文件 with open("杨戬.txt",mode="at",encoding="UTF-8") as file: file.write("\n\n---------------\n\n") file.write("\n\n"+title+"\n\n") file.write(content) #休眠伪造真人操作 time.sleep(random.randint(5,10)) print(f"{title}采集完成")