"""Bootstrap scraper: fetch historical stories (历史故事) and store them in MySQL — 2025 edition."""

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time, datetime

from sqlalchemy import create_engine, Column, Integer, String, LargeBinary,DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Declarative base that all ORM models inherit from.
Base = declarative_base()

# MySQL connection string — adjust host/user/password for your environment.
# NOTE(review): credentials are hard-coded in source; consider environment variables.
DATABASE_URI = 'mysql+mysqlconnector://root:[email protected]:3306/pb_cms_base?charset=utf8'

engine = create_engine(DATABASE_URI, echo=True)  # echo=True logs every emitted SQL statement

# Create any tables that do not exist yet.
# NOTE(review): this call runs BEFORE the WebPage class below is defined, so
# no tables are registered on Base at this point and nothing is created here.
# An idempotent create_all after the model definition is required to actually
# create the table.
Base.metadata.create_all(engine)

# Session factory bound to the engine; used as `with Session() as session:`.
Session = sessionmaker(bind=engine)


class WebPage(Base):
    """ORM model for one scraped story page (table ``biz_zhonghua``)."""

    __tablename__ = 'biz_zhonghua'

    id = Column(Integer, primary_key=True, autoincrement=True)
    url = Column(String(255), unique=True, nullable=False)  # source page URL, deduplicated
    title = Column(String(255))
    category = Column(String(255))  # last breadcrumb item on the page
    content = Column(String(20000))
    content_en = Column(String(20000))  # not written by this scraper — presumably filled elsewhere
    audio_file = Column(LargeBinary, nullable=True)
    # BUG FIX: pass the callable, not its result. `datetime.datetime.now()`
    # was evaluated once at import time, so every row received the script's
    # start-up timestamp instead of its own insertion time.
    update_time = Column(DateTime, default=datetime.datetime.now)

    def __repr__(self):
        return f'WebPage(id={self.id}, url={self.url}, title={self.title})'

# Index page that lists all story links.
list_url = 'https://不能说的秘密.com/'
# Running count of processed links.
# NOTE(review): incremented from worker threads without a lock — may
# under-count under contention; used for progress logging only.
num = 0
# Browser-like headers so the site serves normal pages instead of blocking bots.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def fetch_url(url, timeout=15):
    """Download *url* and return ``{'url': url, 'content': html_text}``.

    Returns None on any failure (non-200 status or request exception),
    printing a diagnostic instead of raising.

    Args:
        url: page to download.
        timeout: seconds before the request is aborted. New, defaulted
            parameter — previously the call had no timeout and could hang
            a worker thread forever.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Use the detected encoding so Chinese text decodes correctly.
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return {'url': url, 'content': response.text}
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None


def save_to_db(session, new_page):
    """Stage *new_page* on *session* and commit it.

    Any exception during add/commit is printed and the transaction is
    rolled back. The session is always closed before returning.
    """
    try:
        try:
            session.add(new_page)
            session.commit()
        except Exception as err:
            print(f"Error saving to database: {err}")
            session.rollback()
    finally:
        session.close()

def process_url(url):
    """Fetch one article page, parse title/category/body, and store it.

    Runs inside a worker thread. Silently skips the page when the download
    fails; missing HTML nodes are stored as None instead of crashing.

    NOTE(review): `num` is a plain global incremented from several threads
    without a lock — it can under-count under contention, but it is only
    used for progress logging, so it is left as-is.
    """
    global num
    result = fetch_url(url)
    num += 1
    print(f'处理第{num}个链接{url}')
    if not result:
        return
    soup = BeautifulSoup(result['content'], 'html.parser')
    # BUG FIX: find()/select() return None/[] when a node is absent; the
    # original called .text unconditionally and raised AttributeError.
    title_tag = soup.find('h5', class_='lh-base')
    title = title_tag.text if title_tag else None
    crumbs = soup.select('li.breadcrumb-item')
    category = crumbs[-1].text if crumbs else None
    body_tag = soup.find('div', class_='grap')
    body = body_tag.text if body_tag else None  # renamed: original shadowed `content`
    new_page = WebPage(url=result['url'], title=title, category=category, content=body)
    # Fresh session per page; save_to_db commits/rolls back and closes it.
    with Session() as session:
        save_to_db(session, new_page)
    time.sleep(1)  # be polite to the server between requests

def get_urls():
    """Fetch the index page and collect every story link.

    Returns:
        A list of href strings, or None when the request fails or raises.
        (Returning None rather than [] matches the existing caller's
        `if urls is None` check.)
    """
    urls = []
    try:
        # timeout added: previously the request could hang indefinitely.
        response = requests.get(list_url, headers=headers, timeout=15)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            lis = soup.find_all('li', class_='list-inline-item')
            for li in lis:
                a_tag = li.find('a')
                if a_tag:
                    url = a_tag['href']
                    urls.append(url)
            return urls
        else:
            print(f"Failed to retrieve {list_url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching {list_url}: {e}")
        return None



def main():
    """Create the table if needed, collect story URLs, and process them concurrently."""
    # Create tables now that the WebPage model is registered on Base.
    # (The module-level create_all runs before the class definition and
    # therefore registers nothing; create_all is idempotent, so calling
    # it again here is safe.)
    Base.metadata.create_all(engine)
    urls = get_urls()
    # BUG FIX: the None check must come before len() — get_urls returns
    # None on failure and the original crashed with TypeError here.
    if urls is None:
        return
    print(len(urls))
    with ThreadPoolExecutor(max_workers=5) as executor:  # cap at 5 concurrent downloads
        futures = [executor.submit(process_url, url) for url in urls]
        for future in as_completed(futures):
            future.result()  # re-raise any worker exception in the main thread


if __name__ == "__main__":
    main()