import datetime
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from sqlalchemy import (Column, DateTime, Integer, LargeBinary, String, Text,
                        create_engine)
# declarative_base moved to sqlalchemy.orm in SQLAlchemy 1.4; the
# sqlalchemy.ext.declarative import is deprecated.
from sqlalchemy.orm import declarative_base, sessionmaker
Base = declarative_base()
DATABASE_URI = 'mysql+mysqlconnector://root:[email protected]:3306/pb_cms_base?charset=utf8'
engine = create_engine(DATABASE_URI, echo=True)
Session = sessionmaker(bind=engine)


class WebPage(Base):
    """ORM model for one scraped page, stored in the biz_zhonghua table."""
    __tablename__ = 'biz_zhonghua'
    id = Column(Integer, primary_key=True, autoincrement=True)
    url = Column(String(255), unique=True, nullable=False)
    title = Column(String(255))
    category = Column(String(255))
    # Text rather than a large VARCHAR: two VARCHAR(20000) utf8 columns
    # would exceed MySQL's 65,535-byte row size limit and break create_all().
    content = Column(Text)
    content_en = Column(Text)
    audio_file = Column(LargeBinary, nullable=True)
    # Pass the callable itself, not datetime.now(): calling it here would
    # freeze a single import-time timestamp into every row.
    update_time = Column(DateTime, default=datetime.datetime.now)

    def __repr__(self):
        return f'WebPage(id={self.id}, url={self.url}, title={self.title})'


# create_all() must run after the model is declared so that WebPage is
# registered on Base's metadata; otherwise no table is created.
Base.metadata.create_all(engine)
list_url = 'https://不能说的秘密.com/'
num = 0
num_lock = threading.Lock()  # num is shared by the worker threads; guard all updates
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
def fetch_url(url):
    """Download one page; return its URL and decoded HTML, or None on failure."""
    try:
        # timeout prevents a hung connection from stalling a worker thread
        response = requests.get(url, headers=headers, timeout=10)
        # Guess the charset from the body in case the server omits one.
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return {'url': url, 'content': response.text}
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
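
# Optional sketch: a thin retry wrapper around fetch_url for transient
# failures. The attempts/backoff parameters are illustrative choices, and
# nothing in this script calls this helper.
def fetch_url_with_retries(url, attempts=3, backoff=2.0):
    for attempt in range(attempts):
        result = fetch_url(url)
        if result is not None:
            return result
        time.sleep(backoff * (attempt + 1))  # linear backoff between attempts
    return None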
def save_to_db(session, new_page):
    """Insert one WebPage row, rolling back on failure (e.g. a duplicate url)."""
    try:
        session.add(new_page)
        session.commit()
    except Exception as e:
        print(f"Error saving to database: {e}")
        session.rollback()
    # The caller's "with Session() as session" block closes the session.
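
# Optional sketch: the url column is UNIQUE, so re-running the scraper makes
# session.add() raise IntegrityError for every page already stored. A
# get-or-update helper like this hypothetical one would make re-runs
# idempotent; it is not wired into process_url.
def upsert_page(session, new_page):
    existing = session.query(WebPage).filter_by(url=new_page.url).one_or_none()
    if existing is None:
        session.add(new_page)
    else:
        existing.title = new_page.title
        existing.category = new_page.category
        existing.content = new_page.content
        existing.update_time = datetime.datetime.now()
    session.commit()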
def process_url(url):
    """Fetch one article page, extract its fields, and persist a WebPage row."""
    global num
    result = fetch_url(url)
    with num_lock:  # num += 1 is a read-modify-write and not thread-safe on its own
        num += 1
        count = num
    print(f'Processing link #{count}: {url}')
    if result:
        soup = BeautifulSoup(result['content'], 'html.parser')
        title_tag = soup.find('h5', class_='lh-base')
        title = title_tag.text if title_tag else None
        crumbs = soup.select('li.breadcrumb-item')
        category = crumbs[-1].text if crumbs else None
        body_tag = soup.find('div', class_='grap')
        content = body_tag.text if body_tag else None
        new_page = WebPage(url=result['url'], title=title, category=category, content=content)
        with Session() as session:
            save_to_db(session, new_page)
    time.sleep(1)  # crude per-thread rate limit between requests
def get_urls():
    """Collect article links from the listing page; returns None if it cannot be fetched."""
    urls = []
    try:
        response = requests.get(list_url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for li in soup.find_all('li', class_='list-inline-item'):
                a_tag = li.find('a')
                if a_tag and a_tag.get('href'):
                    # urljoin resolves site-relative hrefs and leaves absolute ones untouched
                    urls.append(urljoin(list_url, a_tag['href']))
            return urls
        print(f"Failed to retrieve {list_url}. Status code: {response.status_code}")
        return None
    except requests.RequestException as e:
        print(f"Error fetching {list_url}: {e}")
        return None
def main():
    urls = get_urls()
    if urls is None:  # check before len(): len(None) would raise TypeError
        return
    print(f'Found {len(urls)} links')
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(process_url, url) for url in urls]
        for future in as_completed(futures):
            future.result()  # surface any exception raised in a worker
if __name__ == "__main__":
    main()