概述
- 请求库:requests
- HTML 解析:BeautifulSoup
- 词云:wordcloud
- 数据可视化:pyecharts
- 数据库:MongoDB
- 数据库连接:pymongo
爬虫思路&&页面解析
先爬取房某下深圳各个板块的数据,然后存进 MongoDB 数据库,最后再进行数据分析。
右键网页,查看页面源码,找出我们要爬取的部分。
爬虫源代码实现
import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
class HouseSpider:
def __init__(self):
    """Connect to the local MongoDB server and keep a handle on the ``zfdb`` database."""
    connection = MongoClient('mongodb://localhost:27017/')
    self.client = connection
    # Item access on the client names the same database as attribute access.
    self.zfdb = connection["zfdb"]
# Class-level HTTP session (read as self.session in getOnePageData), hence
# shared by all HouseSpider instances.
session = requests.Session()
# Site root; getRegionUrl prepends it to the region paths in urlDir.
baseUrl = "http://sz.zu.fang.com"
# URL path for each region, keyed by the region's display name
urlDir = {
"不限": "/house/",
"宝安": "/house-a089/",
"龙岗": "/house-a090/",
"南山": "/house-a087/",
"福田": "/house-a085/",
"罗湖": "/house-a086/",
"盐田": "/house-a088/",
"龙华区": "/house-a013080/",
"坪山区": "/house-a013081/",
"光明新区": "/house-a013079/",
"大鹏新区": "/house-a013082/",
"惠州": "/house-a013058/",
"东莞": "/house-a013057/",
"深圳周边": "/house-a016375/",
}
# Region name that getOnePageData passes to getCollection (read via self.region).
region = "不限"
# NOTE(review): presumably the number of listing pages to crawl per region;
# it is not referenced in the visible part of the file -- confirm against callers.
page = 100
# 通过名字获取 url 地址
def getRegionUrl(self, name="宝安", page=10):
    """Build the listing-page URLs for one region.

    Args:
        name: Region name; must be a key of ``self.urlDir``.
        page: How many page URLs to generate.

    Returns:
        A list of ``page`` URLs. The first is the region's base URL; page
        ``k`` (for ``k >= 2``) is the base URL followed by ``"i3<k>/"``.

    Raises:
        KeyError: If ``name`` is not in ``self.urlDir`` and ``page`` > 0.
    """
    def url_for(page_number):
        # Looked up per page so that nothing is touched when page <= 0.
        region_root = self.baseUrl + self.urlDir[name]
        if page_number == 1:
            return region_root
        return region_root + "i3" + str(page_number) + "/"

    return [url_for(offset + 1) for offset in range(page)]
# MongoDB 存储数据结构
def getRentMsg(self, title, rooms, area, price, address, traffic, region, direction):
    """Assemble the document stored in MongoDB for one rental listing.

    Args:
        title: Listing title.
        rooms: Number of rooms.
        area: Floor area (square metres).
        price: Price.
        address: Address.
        traffic: Description of nearby transport.
        region: District, e.g. Futian or Nanshan.
        direction: Orientation of the flat, e.g. south or north-south.

    Returns:
        A dict with one key per argument, in the order listed above.
    """
    field_names = (
        "title",
        "rooms",
        "area",
        "price",
        "address",
        "traffic",
        "region",
        "direction",
    )
    field_values = (title, rooms, area, price, address, traffic, region, direction)
    return dict(zip(field_names, field_values))
# 获取数据库 collection
def getCollection(self, name):
    """Return the collection of ``self.zfdb`` that stores listings for a region.

    Args:
        name: Region name such as ``"不限"`` or ``"宝安"``.

    Returns:
        The matching collection attribute of ``self.zfdb``, or ``None`` when
        ``name`` is not one of the regions listed in the table below.
    """
    database = self.zfdb
    # (region name, collection attribute on the database)
    region_to_collection = (
        ("不限", "rent"),
        ("宝安", "baoan"),
        ("龙岗", "longgang"),
        ("南山", "nanshan"),
        ("福田", "futian"),
        ("罗湖", "luohu"),
        ("盐田", "yantian"),
        ("龙华区", "longhuaqu"),
        ("坪山区", "pingshanqu"),
        ("光明新区", "guangmingxinqu"),
        ("大鹏新区", "dapengxinqu"),
    )
    for region_name, collection_name in region_to_collection:
        if name == region_name:
            return getattr(database, collection_name)
    return None
#
def getAreaList(self):
    """Return a new list of the region names, starting with the catch-all ``"不限"``."""
    region_names = (
        "不限",
        "宝安",
        "龙岗",
        "南山",
        "福田",
        "罗湖",
        "盐田",
        "龙华区",
        "坪山区",
        "光明新区",
        "大鹏新区",
    )
    # A fresh list per call, so callers may mutate the result freely.
    return list(region_names)
def getOnePageData(self, pageUrl, reginon="不限"):
rent = self.getCollection(self.region)
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (