#!/usr/env python
#-*- coding: utf-8 -*-
import urllib
import urllib2
import random
import requests
import os,sys
import MySQLdb
from sgmllib import SGMLParser
from BeautifulSoup import BeautifulSoup
import re
num=0
def main():
#try:
#conn=MySQLdb.connect(host='localhost',user='root',passwd='123456',db='googlemarket',charset="utf8")
#conn.query("set names utf8")
#except Exception,e:
#print e
#sys.exit()
#cursor=conn.cursor()
#for k in range(0,34):
url="https://play.google.com/store/apps/details?id=com.androidesk&hl=zh_CNhttps%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fdetails%3Fid%3Dcom.androidesk"
#print k
html=requests.get(url)
preresult=html.content
soup=BeautifulSoup(preresult)
result=soup.prettify("utf-8")
#名称
pattern=re.compile('itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
data0=re.findall(pattern,result)
for items in data0:
print itemsunbalanced parenthesis
#制造商
pattern=re.compile('itemprop="name">([\s\S]*?)</a>')
#data1=re.findall(pattern,result)
#for items in data1
#print items
#版本
pattern=re.compile('itemprop="softwareVersion">([\s\S]*?)</div>')
data2=re.findall(pattern,result)
print data2[0]
#更新时间
pattern=re.compile('itemprop="datePublished">([\s\S]*?)</div>')
data3=re.findall(pattern,result)
#print data3[0]
#文件大小
pattern=re.compile('itemprop="fileSize">([\s\S]*?)</div>')
data4=re.findall(pattern,result)
#print data4[0]
#支持固件
pattern=re.compile('itemprop="operatingSystems">([\s\S]*?)</div>')
data5=re.findall(pattern,result)
#print data5[0]
#说明
pattern=re.compile('itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
data6=re.findall(pattern,result)
for items in data6:
pass#print re.sub('<br />',' ',items)
#sql="insert into address(name,version,developer,pubtime,filesize,support,introduction) values(%s,%s,%s,%s,%s,%s,%s)"
#for items in data6:
#if(data5):
#values=(data0[0],data1[0],data2[0],data3[0],data4[0],data5[0],re.sub('<br />',' ',items))
#else:
#values=(data0[0],data1[0],data2[0],data3[0],data4[0],'NULL',re.sub('<br />',' ',items))
#print values
#print sql % values
#cursor.execute(sql,values)
#conn.commit()
pattern=re.compile('<img class="cover-image" src="(.+?)" alt')
data=re.findall(pattern,result)
global num
for j in data:
temp=requests.get(j[1:-2])
f=file("googlemarket/"+str(num),"w+")
f.write(temp.content)
if __name__=="__main__":
main()
Traceback (most recent call last):
File "crawler0729.py", line 85, in <module>
main()
File "crawler0729.py", line 56, in main
pattern=re.compile('itemprop="description">[\s\S]*?<div>"([\s\S]*?)"</div>')
File "/usr/lib/python2.7/re.py", line 190, in compile
return _compile(pattern, flags)
File "/usr/lib/python2.7/re.py", line 242, in _compile
raise error, v # invalid expression
sre_constants.error: unbalanced parenthesis