import urllib.request
import time
import random
import os
def fetch_webpage(url):
    # Set a User-Agent header for the request
    user_agent = 'Sogou web spider/4.0 (+http://www.sogou.com/docs/help/webmasters.htm#07)'
    # Build the request object with the User-Agent header
    request = urllib.request.Request(url=url, headers={'User-Agent': user_agent})
    try:
        # Send the request and read the response, closing it when done
        with urllib.request.urlopen(request) as response:
            # Decode and return the page content
            return response.read().decode('utf-8')
    except Exception as e:
        print(f"Error occurred: {e}")
        return None
# Target URL
url = "http://www.123.com"
# Output file path on the D: drive
file_path = r"D:\web_data\output.txt"
# Make sure the directory for the output file exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Number of requests and the pause between them
num_requests = 10
sleep_range = (3, 6)  # random sleep range between requests, in seconds
# Fetch the page repeatedly, pausing between requests
for i in range(num_requests):
    print(f"Fetching page {i+1}...")
    content = fetch_webpage(url)
    if content is not None:
        # Append the content to the output file
        with open(file_path, 'a', encoding='utf-8') as file:
            file.write(content)
            file.write('\n--- End of Page ---\n')
    # Sleep for a random interval to avoid sending requests too frequently
    time.sleep(random.uniform(*sleep_range))
print("Data fetching complete.")