Bootstrap

php操作selenium+webdriver爬取基金历史数据

本文教大家怎么用php操作selenium的webdriver来爬取基金数据

2022-08月更新,selenium4.4 版本的启动(把java跟chromedriver放在一起):

./software/phantomjs --port=4444

先搭建selenium环境:selenium需要jdk环境,自己安装上去就行了,java -version可以查看你是否安装了,我下载的selenium是

目前的稳定版本 selenium-server-standalone-3.141.59.jar,可以在selenium的官网下载:Downloads | Selenium(https://www.selenium.dev/downloads/)

我用的是chrome浏览器,版本 79.0.3945.36(正式版本) (64 位)然后找到对应的webdriver版本,找到你的:

https://chromedriver.storage.googleapis.com/index.html

注意:你的webdriver一定要跟浏览器的版本一样哦!!!

把你下载的chromedriver.exe跟selenium-server-standalone-3.141.59.jar 放在同一个文件夹

然后启动selenium : 

java -jar selenium-server-standalone-3.141.59.jar 

爬取基金数据思路:

1.先爬目前市面上所有的基金,把数据存到数据库

2.遍历所有基金,然后分别爬取基金的历史数据

3.工具:yaf框架+selenium+webdriver爬取基金历史数据。

4.关于selenium和webdriver怎么使用的可以参考https://github.com/facebook/php-webdriver

5.注意事项,不要爬太快,慢慢爬,我是每翻一页就休眠0.1秒,后来我取消了休眠,爬几百个基金后就报错了,又重新启动他。

然后又爬几百个又报错了,还是翻一页休眠0.1秒吧,这样慢慢爬就没有问题了。

基金历史数据:

基金数量:

Talk is cheap, show you my code.

/**
 * Crawl the complete fund list (10,000+ funds when this was written) and
 * upsert every fund into the `fund_total` table.
 *
 * The list is downloaded from eastmoney's fundcode_search.js. The `[[...]]`
 * literal inside the response is decoded as JSON; each row must have at
 * least five elements or it is skipped.
 *
 * CLI: php index.php "index/juhe/fundTotal"
 * NOTE(review): the original comment listed the "index/juhe/fundHistory"
 * route here, presumably a copy-paste slip from the sibling action — confirm
 * the route name against the router configuration.
 *
 * @return void
 */
public function fundTotalAction(){
    $url = "http://fund.eastmoney.com/js/fundcode_search.js";

    $res = https_request($url);

    // Grab the outermost [[...]] literal. The `s` modifier lets `.` cross
    // line breaks so a pretty-printed response still matches.
    $out = array();
    preg_match("/\[\[(.*)\]\]/s",(string)$res,$out);
    if(!isset($out[0])||!$out[0]){
        dd("spide数据有问题");
        return;
    }

    // json_decode() returns null on malformed input; never feed that to foreach.
    $funds = json_decode($out[0],true);
    if(!is_array($funds)){
        dd("spide数据有问题");
        return;
    }

    foreach($funds as $v){
        // Skip rows that do not carry all five expected columns.
        if(!isset($v[0],$v[1],$v[2],$v[3],$v[4])){
            continue;
        }
        $insert = array(
            "fund_id" =>$v[0],
            "fund_code" =>$v[1],
            "fund_name" =>$v[2],
            "fund_type" =>$v[3],
            // Was $v[3], which duplicated fund_type and left the validated
            // fifth column unused.
            "pingyin" =>$v[4],
            // NOTE(review): both columns below receive the fund id, which
            // looks like a copy-paste placeholder. The intended values
            // (crawl timestamp / crawl state?) are not visible here, so they
            // are left unchanged — confirm against the table schema.
            "spide_time" =>$v[0],
            "spider_status" =>$v[0],
        );

        // Upsert: update the row when the fund already exists, insert otherwise.
        // The condition is passed as field/value so the query builder binds
        // the remote value instead of splicing it into the SQL text.
        $existingId = Db::name("fund_total")->where("fund_id",$v[0])->value("fund_id");
        if($existingId){
            Db::name("fund_total")->where("fund_id",$v[0])->update($insert);
        }else{
            Db::name("fund_total")->insert($insert);
        }
    }
}
    /**
     * Crawl the net-asset-value history of every fund in `fund_total` and
     * upsert the rows into `fund_history`.
     *
     * For each fund, page 1 of the Sina getNav endpoint is loaded through a
     * Selenium-driven Chrome session; its `total_num` decides how many more
     * pages (20 rows each) are fetched. A fund whose first page cannot be
     * parsed is logged and skipped; a later page that cannot be parsed is
     * logged and the crawl moves on to the next page.
     *
     * The original author reports a full run took more than two days and
     * produced 8M+ rows. Run it from the CLI, without a time limit:
     * php index.php "index/juhe/fundHistory"
     *
     * @return void This method echoes the elapsed time and then exits.
     */
    public function fundHistoryAction(){
        $source = Db::name("fund_total")->select();
        if(!$source){
            dd("没有需要pa的数据");
        }
        $fundids = Select2array($source,'fund_id');
        header("Content-Type: text/html; charset=UTF-8");
        $host = 'http://localhost:4444/wd/hub'; // this is the default
        $driver = RemoteWebDriver::create($host, \Facebook\WebDriver\Remote\DesiredCapabilities::chrome(), 5000);

        $start = time();

        // try/finally guarantees the browser session is closed even when a
        // WebDriver or database call throws halfway through the crawl.
        try{
            foreach($fundids as $fund_id){
                $firstPage = $this->fetchFundNavPage($driver,$fund_id,1);
                if($firstPage === null){
                    continue;
                }
                $this->saveFundNavRows($fund_id,$firstPage['data'],1);

                // A missing total_num means "no further pages". Defaulting to
                // 0 avoids dividing an empty string, which PHP 8 rejects.
                $totalCount = isset($firstPage['total_num']) ? (int)$firstPage['total_num'] : 0;
                $totalPage = (int)ceil($totalCount/20);

                for($i=2;$i<=$totalPage;$i++){
                    $nextPage = $this->fetchFundNavPage($driver,$fund_id,$i);
                    if($nextPage === null){
                        continue;
                    }
                    $this->saveFundNavRows($fund_id,$nextPage['data'],$i);
                }
            }
        }finally{
            $driver->quit();
        }

        $useTime = time()-$start;
        echo "消耗时间{$useTime}秒";exit();
    }

    /**
     * Load one page of a fund's NAV history in the browser and decode the
     * JSONP payload.
     *
     * Sleeps 0.1 s before every request so all pages are throttled, not only
     * the first one. Failures are written to the "fund_history" log.
     *
     * @param object     $driver  WebDriver session providing get() and getPageSource().
     * @param string|int $fund_id Fund identifier sent as the `symbol` parameter.
     * @param int        $page    1-based page number.
     * @return array|null The decoded `result.data` node (non-empty `data`
     *                    list guaranteed), or null when the page could not
     *                    be parsed.
     */
    private function fetchFundNavPage($driver,$fund_id,$page){
        // Throttle: the author observed failures after a few hundred funds
        // when crawling without a delay.
        usleep(100000);

        // Keeps the log messages of the original two code paths:
        // suffix "1" for the first page, "2" for every later page.
        $logTag = $page === 1 ? "1" : "2";

        $rand = mt_rand(12372573208,42372573208);
        $time = time();
        $url = "https://stock.finance.sina.com.cn/fundInfo/api/openapi.php/CaihuiFundInfoService.getNav?callback=jQuery1112051806{$rand}_{$time}111&symbol={$fund_id}&datefrom=&dateto=&page={$page}&_={$time}254";

        // The former `$driver->wait(60)` call was dropped: without a chained
        // until() it only constructs a wait object and never blocks.
        $driver->get($url);
        $html = $driver->getPageSource();

        // The response is JSONP, `callbackName(<json>)`; capture the JSON
        // between the parentheses that follow the trailing digit of the name.
        $output = array();
        preg_match("/\d\((.*)\)/",$html,$output);
        if(!isset($output[1])||!$output[1]){
            file_log("fund_history",$fund_id." 没有匹配到数据".$logTag);
            return null;
        }

        $detail = json_decode($output[1],true);
        if(!isset($detail['result']['data']['data'])||!$detail['result']['data']['data']){
            file_log("fund_history",$fund_id." 没有获取到数据".$logTag);
            return null;
        }

        return $detail['result']['data'];
    }

    /**
     * Upsert one page of NAV rows into `fund_history`, keyed by fund and date.
     *
     * @param string|int $fund_id Fund the rows belong to.
     * @param mixed      $rows    Row list; each row needs `fbrq`, `jjjz`, `ljjz`.
     * @param int        $page    1-based page number, used only for the log suffix.
     * @return void
     */
    private function saveFundNavRows($fund_id,$rows,$page){
        if(!is_array($rows)){
            return;
        }
        $logTag = $page === 1 ? "1" : "2";

        foreach($rows as $row){
            // Incomplete rows are logged and skipped on every page; the
            // first page used to be written without this check.
            if(!isset($row['fbrq'],$row['jjjz'],$row['ljjz'])){
                file_log("fund_history",$fund_id." 跳出了循环".$logTag);
                continue;
            }

            $fundDay = date("Y-m-d",strtotime($row['fbrq']));
            $record = array(
                "fund_id"=>$fund_id,
                "fbrq"=>$fundDay,
                "jjjz"=>$row['jjjz'],
                "ljjz"=>$row['ljjz'],
            );

            // Array condition: the query builder binds both values, so the
            // fund id is compared as given rather than as an unquoted literal.
            $where = array("fund_id"=>$fund_id,"fbrq"=>$fundDay);

            // Update the row when it already exists, insert it otherwise.
            $history_id = Db::name("fund_history")->where($where)->value("history_id");
            if($history_id){
                Db::name("fund_history")->where($where)->update($record);
            }else{
                Db::name("fund_history")->insert($record);
            }
        }
    }

;