Bootstrap

Python爬取房天下网站深圳房租信息入库并进行数据分析可视化

概述

  • 请求库:requests
  • HTML 解析:BeautifulSoup
  • 词云:wordcloud
  • 数据可视化:pyecharts
  • 数据库:MongoDB
  • 数据库连接:pymongo

爬虫思路&&页面解析

先爬取房某下深圳各个板块的数据,然后存进 MongoDB 数据库,最后再进行数据分析。 

右键网页,查看页面源码,找出我们要爬取的部分

爬虫源代码实现

import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient

class HouseSpider:
    def __init__(self):
        """Create the MongoDB client and keep a handle to the ``zfdb`` database.

        The client is built for the server URI ``mongodb://localhost:27017/``.
        Both the client and the database handle are stored on the instance so
        the other methods can reach the collections through ``self.zfdb``.
        """
        mongoClient = MongoClient('mongodb://localhost:27017/')
        self.client = mongoClient
        self.zfdb = mongoClient.zfdb

    # HTTP session created once in the class body, so every HouseSpider
    # instance shares the same session object (and its headers).
    session = requests.Session()
    # Site root; the region paths from urlDir are appended to it.
    baseUrl = "http://sz.zu.fang.com"

    # URL path for each region, keyed by the region name
    urlDir = {
        "不限": "/house/",
        "宝安": "/house-a089/",
        "龙岗": "/house-a090/",
        "南山": "/house-a087/",
        "福田": "/house-a085/",
        "罗湖": "/house-a086/",
        "盐田": "/house-a088/",
        "龙华区": "/house-a013080/",
        "坪山区": "/house-a013081/",
        "光明新区": "/house-a013079/",
        "大鹏新区": "/house-a013082/",
        "惠州": "/house-a013058/",
        "东莞": "/house-a013057/",
        "深圳周边": "/house-a016375/",
    }

    # Region name that getOnePageData passes to getCollection to choose the
    # collection it works with.
    region = "不限"
    # NOTE(review): presumably the number of pages to crawl per region; no use
    # of this attribute is visible in this part of the file -- confirm.
    page = 100
    # 通过名字获取 url 地址
    def getRegionUrl(self, name="宝安", page=10):
        urlList = []
        for index in range(page):
            if index == 0:
                urlList.append(self.baseUrl + self.urlDir[name])
            else:
                urlList.append(self.baseUrl + self.urlDir[name] + "i3" + str(index + 1) + "/")
        return urlList


    # MongoDB 存储数据结构
    def getRentMsg(self, title, rooms, area, price, address, traffic, region, direction):
        """Pack the fields of one rental listing into a dict.

        Args:
            title: Listing title.
            rooms: Number of rooms.
            area: Floor area.
            price: Rent price.
            address: Address of the property.
            traffic: Description of nearby transport.
            region: District (e.g. 福田区, 南山区).
            direction: Orientation of the home (e.g. south-facing).

        Returns:
            A dict with the keys ``title``, ``rooms``, ``area``, ``price``,
            ``address``, ``traffic``, ``region`` and ``direction``, in that
            order, holding the values exactly as passed in.
        """
        fieldNames = (
            "title",
            "rooms",
            "area",
            "price",
            "address",
            "traffic",
            "region",
            "direction",
        )
        fieldValues = (title, rooms, area, price, address, traffic, region, direction)
        return dict(zip(fieldNames, fieldValues))

    # 获取数据库 collection
    def getCollection(self, name):
        zfdb = self.zfdb
        if name == "不限":
            return zfdb.rent
        if name == "宝安":
            return zfdb.baoan
        if name == "龙岗":
            return zfdb.longgang
        if name == "南山":
            return zfdb.nanshan
        if name == "福田":
            return zfdb.futian
        if name == "罗湖":
            return zfdb.luohu
        if name == "盐田":
            return zfdb.yantian
        if name == "龙华区":
            return zfdb.longhuaqu
        if name == "坪山区":
            return zfdb.pingshanqu
        if name == "光明新区":
            return zfdb.guangmingxinqu
        if name == "大鹏新区":
            return zfdb.dapengxinqu

    #
    def getAreaList(self):
        """Return the region names handled by the spider, as a new list.

        The list starts with the catch-all entry "不限" followed by ten
        district names; a fresh list object is returned on every call.
        """
        areaNames = (
            "不限",
            "宝安",
            "龙岗",
            "南山",
            "福田",
            "罗湖",
            "盐田",
            "龙华区",
            "坪山区",
            "光明新区",
            "大鹏新区",
        )
        return list(areaNames)

    def getOnePageData(self, pageUrl, reginon="不限"):
        rent = self.getCollection(self.region)
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (
;