本文教大家怎么用php操作selenium的webdriver来爬取基金数据
2022-08月更新,selenium4.4 版本的启动(把java跟chromedriver放在一起):
./software/phantomjs --port=4444
先搭建selenium环境:selenium需要jdk环境,自己安装上去就行了,java -version可以查看你是否安装了,我下载的selenium是
目前的稳定版本 selenium-server-standalone-3.141.59.jar,可以在selenium的官网下载:Downloads | Selenium
我用的是chrome浏览器,版本 79.0.3945.36(正式版本) (64 位)然后找到对应的webdriver版本,找到你的:
https://chromedriver.storage.googleapis.com/index.html
注意:你的webdriver一定要跟浏览器的版本一样哦!!!
把你下载的chromedriver.exe跟selenium-server-standalone-3.141.59.jar 放在同一个文件夹
然后启动selenium :
java -jar selenium-server-standalone-3.141.59.jar
爬取基金数据思路:
1.先爬目前市面上所有的基金,把数据存到数据库
2.遍历所有基金,然后分别爬取基金的历史数据
3.工具:yaf框架+selenium+webdriver爬取基金历史数据。
4.关于selenium和webdriver怎么使用的可以参考https://github.com/facebook/php-webdriver
5.注意事项:不要爬太快,慢慢爬。我是每翻一页就休眠0.1秒,后来我取消了休眠,爬几百个基金后就报错了,只好重新启动它;
然后又爬几百个又报错了。还是翻一页休眠0.1秒吧,这样慢慢爬就没有问题了。
基金历史数据:
基金数量:
Talk is cheap, show you my code.
/**
 * Crawl the master list of all funds (a bit over ten thousand at the time of
 * writing) and upsert each one into the `fund_total` table.
 *
 * Downloads fundcode_search.js, extracts the embedded `[[...],...]` array
 * literal, decodes it as JSON and, for every row that has at least five
 * columns, updates the existing `fund_total` row with the same fund_id or
 * inserts a new one.
 *
 * CLI usage: presumably `php index.php "index/juhe/fundTotal"` -- the original
 * comment listed "index/juhe/fundHistory", which is the route documented on
 * the sibling fundHistoryAction(). TODO confirm.
 *
 * @return void
 */
public function fundTotalAction(){
    $url = "http://fund.eastmoney.com/js/fundcode_search.js";
    $res = https_request($url);
    $out = array();
    preg_match("/\[\[(.*)\]\]/",$res,$out);
    if(!isset($out[0])||!$out[0]){
        dd("spide数据有问题");
        return; // dd() is assumed to halt; never fall through with no payload
    }
    $funds = json_decode($out[0],true);
    // A payload that matched the regex but is not valid JSON decodes to null;
    // iterating that would only raise a warning and silently import nothing.
    if(!is_array($funds)){
        dd("spide数据有问题");
        return;
    }
    foreach($funds as $v){
        // Skip malformed rows: all five columns are required.
        if(!isset($v[0])||!isset($v[1])||!isset($v[2])||!isset($v[3])||!isset($v[4])){
            continue;
        }
        $insert = array(
            "fund_id"       =>$v[0],
            "fund_code"     =>$v[1],
            "fund_name"     =>$v[2],
            "fund_type"     =>$v[3],
            // Was $v[3], which duplicated fund_type and left the required
            // fifth column unused.
            "pingyin"       =>$v[4],
            // NOTE(review): both fields below receive the fund id, which looks
            // unintended, but the intended values are not visible here.
            "spide_time"    =>$v[0],
            "spider_status" =>$v[0],
        );
        // Upsert keyed by fund_id. The value comes from a remote site, so it
        // is passed as a bound condition instead of being spliced into SQL.
        $fund_id = Db::name("fund_total")->where("fund_id",$v[0])->value("fund_id");
        if($fund_id){
            Db::name("fund_total")->where("fund_id",$v[0])->update($insert);
        }else{
            Db::name("fund_total")->insert($insert);
        }
    }
}
/**
 * Walk every fund in `fund_total` and crawl its complete net-value history
 * into `fund_history`. (The author reports the full run took a bit over two
 * days and produced more than 8 million rows.)
 *
 * For each fund, page 1 of the remote API is loaded through the Selenium
 * driver; its `total_num` determines how many further pages (20 rows each)
 * are fetched. Every row is upserted keyed by (fund_id, fbrq).
 *
 * CLI usage: php index.php "index/juhe/fundHistory"
 * Run it from the CLI and do not set a time limit.
 *
 * @return void
 */
public function fundHistoryAction(){
    // To resume an interrupted crawl, narrow this query,
    // e.g. ->where("id",">",8830).
    $source = Db::name("fund_total")->select();
    if(!$source){
        dd("没有需要pa的数据");
        return; // dd() is assumed to halt; never continue without funds
    }
    $fundids = Select2array($source,'fund_id');
    header("Content-Type: text/html; charset=UTF-8");
    $host = 'http://localhost:4444/wd/hub'; // this is the default
    $driver = RemoteWebDriver::create($host, \Facebook\WebDriver\Remote\DesiredCapabilities::chrome(), 5000);
    $start = time();
    try{
        foreach($fundids as $fund_id){
            $first = $this->fetchFundNavPage($driver,$fund_id,1);
            if($first === null){
                continue;
            }
            $this->saveFundNavRows($fund_id,$first['data'],1);
            // Default to 0 rather than '' so the division below stays
            // numeric ('' / 20 warns on PHP 7 and throws on PHP 8).
            $totalCount = isset($first['total_num']) ? (int)$first['total_num'] : 0;
            // The API is read in pages of 20 rows.
            $totalPage = (int)ceil($totalCount/20);
            for($i=2;$i<=$totalPage;$i++){
                $page = $this->fetchFundNavPage($driver,$fund_id,$i);
                if($page === null){
                    continue;
                }
                $this->saveFundNavRows($fund_id,$page['data'],$i);
            }
        }
    }finally{
        // Always release the browser session, even when the crawl aborts
        // with an exception part-way through.
        $driver->quit();
    }
    $useTime = time()-$start;
    echo "消耗时间{$useTime}秒";exit();
}

/**
 * Load one page of a fund's history through the browser and decode the
 * JSONP payload found in the page source.
 *
 * Sleeps 0.1s after every request: without throttling the crawl starts
 * failing after a few hundred funds.
 *
 * @param object $driver  the RemoteWebDriver session created by the caller
 * @param mixed  $fund_id fund identifier, sent as the `symbol` query parameter
 * @param int    $page    1-based page number
 * @return array|null the decoded `result.data` node (its `data` key is a
 *                    non-empty array), or null when nothing usable came
 *                    back; the failure is written to the "fund_history" log
 */
private function fetchFundNavPage($driver,$fund_id,$page){
    // Log messages carry "1" for the first page and "2" for later pages.
    $tag = $page == 1 ? "1" : "2";
    $rand = mt_rand(12372573208,42372573208);
    $time = time();
    $url = "https://stock.finance.sina.com.cn/fundInfo/api/openapi.php/CaihuiFundInfoService.getNav?callback=jQuery1112051806{$rand}_{$time}111&symbol={$fund_id}&datefrom=&dateto=&page={$page}&_={$time}254";
    // get() is followed directly by getPageSource(); the former
    // $driver->wait(60) only built a wait object and never waited.
    $driver->get($url);
    usleep(100000);
    $html = $driver->getPageSource();
    $output = array();
    // Capture everything between "<digit>(" and the last ")", i.e. the
    // argument of the JSONP callback.
    preg_match("/\d\((.*)\)/",$html,$output);
    if(!isset($output[1])||!$output[1]){
        file_log("fund_history",$fund_id." 没有匹配到数据".$tag);
        return null;
    }
    $detail = json_decode($output[1],true);
    if(!isset($detail['result']['data']['data'])
        ||!is_array($detail['result']['data']['data'])
        ||!$detail['result']['data']['data']){
        file_log("fund_history",$fund_id." 没有获取到数据".$tag);
        return null;
    }
    return $detail['result']['data'];
}

/**
 * Upsert one page of history rows into `fund_history`.
 *
 * A row is skipped (and logged) when any of `fbrq`, `jjjz`, `ljjz` is missing
 * or when `fbrq` cannot be parsed as a date.
 *
 * @param mixed $fund_id fund identifier the rows belong to
 * @param array $rows    rows as returned by the API
 * @param int   $page    1-based page number, used only for the log suffix
 * @return void
 */
private function saveFundNavRows($fund_id,$rows,$page){
    $tag = $page == 1 ? "1" : "2";
    foreach($rows as $row){
        if(!isset($row['fbrq'])||!isset($row['jjjz'])||!isset($row['ljjz'])){
            file_log("fund_history",$fund_id." 跳出了循环".$tag);
            continue;
        }
        $stamp = strtotime($row['fbrq']);
        if($stamp === false){
            // An unparseable date would otherwise be stored as 1970-01-01.
            file_log("fund_history",$fund_id." 跳出了循环".$tag);
            continue;
        }
        $fundDay = date("Y-m-d",$stamp);
        $record = array(
            "fund_id"=>$fund_id,
            "fbrq"=>$fundDay,
            "jjjz"=>$row['jjjz'],
            "ljjz"=>$row['ljjz'],
        );
        // Update the row if it exists, insert it otherwise. Conditions are
        // bound values; the old string form left the fund id unquoted.
        $history_id = Db::name("fund_history")
            ->where("fund_id",$fund_id)
            ->where("fbrq",$fundDay)
            ->value("history_id");
        if($history_id){
            Db::name("fund_history")
                ->where("fund_id",$fund_id)
                ->where("fbrq",$fundDay)
                ->update($record);
        }else{
            Db::name("fund_history")->insert($record);
        }
    }
}