用Python抓取大衆點評的用戶評論

大衆點評的知识产权声明可真是霸道啊!還是自己先保存一份。下面代碼先將評論及商戶保存到sqlite數據庫,如果需要還可以導出成CSV,這樣辦公軟件就能直接打開查看了。

import csv
import http.cookiejar
import random
import socket
import sqlite3
import sys
import time
import urllib.error
import urllib.request

from bs4 import BeautifulSoup


goOn=1
stopDate=''
UserID=''
review={'shopName':'','shopAddr':'','shopURL':'','reviewURL':'','star':'',
'starDetail':'','costPerPeople':'','reviewText':'','dishes':'','reviewTime':''}

def getHTML(url):
  print("Fetching "+url)
  request = urllib.request.Request(url)
  request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0")
  try:
    response = urllib.request.urlopen(request)
  except (urllib.error.HTTPError, socket.error,urllib.error.URLError) as e:
    print('Connection error occurred when inserting data.'+str(e))
  else:
    if response.code != 200:
      print("Error code:"+response.code)  
    else:
      html = response.read().decode('utf-8')
      return html

def getList(url):
  global review,goOn
  reviewList=getHTML(url)
  soupAll = BeautifulSoup(reviewList).find_all("div",{"class":"txt J_rptlist"})

  for soup in soupAll:
    shopLink = soup.find("a",{"class":"J_rpttitle"})
    review['shopName']=shopLink.text
    review['shopURL']=shopLink.get("href")
    
    shopAddr = soup.find("p",{"class":"col-exp"})
    review['shopAddr']=shopAddr.text
    
    reviewID = soup.find("a",{"class":"J_flower aheart"})
    review['reviewURL']="http://www.dianping.com/review/"+reviewID.get("data-id")
    
    reviewDateDiv = soup.find("div",{"class":"mode-tc info"})
    reviewDateSpan=reviewDateDiv.find("span",{"class":"col-exp"})
    reviewDate=str(reviewDateSpan.text)[3:]
    if(len(reviewDate)==8 and reviewDate>stopDate):
      getReview(review['reviewURL'])
      #抓取頻率
      time.sleep(random.randrange(5,10))
    else:
      goOn=0
  if(goOn==0):
    print("Finished.")
    exit()
    
      
    

def save():
  global review,UserID
  conn = sqlite3.connect('DZDB_'+UserID+'_Reviews.db')
  c = conn.cursor()
  c.execute("""create table if not exists reviews (ID integer primary key not NULL,shopName char(50),shopAddr char(100),shopURL char(100),reviewURL char(100),star char(1),starDetail char(15),costPerPeople char(15),reviewText TEXT,dishes char(100),reviewTime char(20))""")
  s="""insert into reviews (ID,shopName,shopAddr,shopURL,reviewURL,star,starDetail,costPerPeople,reviewText,dishes,reviewTime) VALUES (NULL,\'"""+review['shopName']+'\',\''+review['shopAddr']+'\',\''+review['shopURL']+'\',\''+review['reviewURL']+'\',\''+str(review['star'])+'\',\''+review['starDetail']+'\',\''+review['costPerPeople']+'\',\''+review['reviewText']+'\',\''+review['dishes']+'\',\''+review['reviewTime']+'\')'
  c.execute(s)
  conn.commit()
  c.close
  print("Record at "+review['shopName']+" saved to Datebase.")
  review={'shopName':'','shopAddr':'','shopURL':'','reviewURL':'','star':'',
'starDetail':'','costPerPeople':'','reviewText':'','dishes':'','reviewTime':''}

def getReview(url):
  global review
  reviewHTML=getHTML(url)
  reviewAll=BeautifulSoup(reviewHTML)
  shopInfo= reviewAll.find("ul",{"class":"contList-info"})
  star=str(shopInfo.find("li"))
  if("msstar50" in star):
    review['star']=5
  elif ("msstar40" in star):
    review['star']=4
  elif ("msstar30" in star):
    review['star']=3
  elif ("msstar20" in star):
    review['star']=2
  elif ("msstar10" in star):
    review['star']=1
  else:
    review['star']=0
  starDetails=shopInfo.find_all("span",{"class":"rst"})
  starDetail=""
  for s in starDetails:
    s1=s.text[0:3]
    starDetail=starDetail+s1
  review['starDetail']=starDetail
  
  reviewText= reviewAll.find("div",{"class":"contList-con"})
  review['reviewText']=reviewText.text
  units= reviewAll.find_all("div",{"class":"comment-unit"})
  for unit in units:
    unit=str(unit.text).replace('\n','')
    if("人均:" in unit):    
      review['costPerPeople']=unit[4:]
    elif("喜欢的菜:" in unit): 
      unit=unit.replace(' ','')
      unit=unit.replace('\xa0',' ')
      review['dishes']=unit[7:]
    
  reviewInfo= reviewAll.find("ul",{"class":"contList-fn"})  
  reviewTime=reviewInfo.find("li")
  review['reviewTime']=reviewTime.text
  save() 

def main():
  fun=int(input("请输入数字选择功能:\n[1]抓取数据,[2]导出数据: \n"))
  if(fun==1):
    fetchReview()
  elif(fun==2):
    sqliteToCSV()
  else:
    print("请输入1或2。")

    
def sqliteToCSV():
  dbFile=str(input("请输入数据库文件名:\n"))
  with open(dbFile+'.csv','w+',newline='') as csvfile:
    spamwriter = csv.writer(csvfile)
    conn=sqlite3.connect(dbFile)
    c = conn.cursor()
    spamwriter.writerow(['ID','shopName','shopAddr','shopURL','reviewURL','star',
'starDetail','costPerPeople','reviewText','dishes','reviewTime'])
    for row in c.execute('SELECT * FROM reviews'):
      spamwriter.writerow(row)
    c.close()
    print("CSV文件成功導出。")
    
def fetchReview():
  #抓取参数:用户ID,起始页,结束日期
  global stopDate,UserID
  UserID=str(input("请输入您的大众点评ID,可以在您大众点评主页的网址中看到,如23262500:\n"))
  startPageNo=int(input("开始的页码,如1:\n"))
  stopDate=str(input("请输入评论结束日期(yy-mm-dd),如00-00-00:\n"))
  
  urlBase="http://www.dianping.com/member/"+UserID+"/reviews?pg="
  startPageNo=startPageNo-1
  while(goOn==1):
    startPageNo=startPageNo+1
    getList(urlBase+str(startPageNo))
    
if __name__ == "__main__":
    main()
幾點說明
  • 抓取頻率不要過大,否則大衆點評會屏蔽IP。我在抓取到20頁左右的時候碰到過一次屏蔽IP。如果意外中斷,你可以設置參數繼續下載,附w3school的SQL基礎教程
  • BeautifulSoup真是個好工具,連Qpython3都自帶了,但是遺憾的是這個代碼在Qpython3上跑報NoneType錯誤。
  • 我用了幾次都沒問題。

quick cocos2d win7下环境搭建

所需软件:

  • Win7 x64
  • Quick-Cocos2d-x v3.2-RC1,正常安装即可。
  • JDK,我用的是1.7.0_65,这个最新版应该也可以。安装后设置环境变量,可参考Ubuntu安装SunJava
  • adt-bundle-windows-x86_64-20140702,打开eclipse,帮助-安装新文件填入http://download.eclipse.org/koneki/releases/stable,我是把搜到的都安装了。另外首次创建虚拟机的时候可能要连外网,我用的Psiphon,顺利下载了4.4.4的镜像。有一个很好用的代理mirrors.neusoft.edu.cn:80,速度超快,推荐用这个!记得勾上强制使用http的选项然后重启Android SDK Manager。
  • android-ndk-r9d-windows-x86_64,必须是r9d,因为还不支持r10。SDK 和 NDK 不能放在包含中文和空格的目录中。 SDK/NDK 必须和 quick 的文件放在同一个分区中。请参考编译 Android 工程

安装完成后,打开桌面上的player3,新建项目.选择位置输入包名,选择方向即可。将X:\\cocos\quick-cocos2d-x-3.2rc0\cocos\platform\android\java\src下的org文件夹复制到你项目的src下。也可先将这个导入工作区,然后在自己项目中引用,详细参见Quick-Coco2d-x开发环境搭建。然后运行proj.android文件夹中的build_native.bat。最后打开adt中的eclipse,导入安卓项目,选中proj.android导入。点菜单栏中的运行-运行就可以在虚拟机中调试了。

关于导出apk可参考在eclipse中将android项目生成apk并且给apk签名。需要一提的是,要在项目文件上右键,导出。

把照片重命名为拍摄时间

主要问题就是获取照片的EXIF信息中的拍摄时间。Java的话推荐Drew Noakes的metadata-extractor。python3用Pillow就行了。另外还有一个好用的python3库可以获取exif信息,EXIF.py Python3 port,短小精悍可以用在qpython3上。

实现了下java,成品和源码都在https://github.com/pggdt/rename-JPG-to-Date。先基于后缀名判断下是否是jpg文件,然后有exif信息的重命名为“年年月月日日-时时分分秒秒”格式,如果同一秒还有照片就在文件名后加1。如果勾选了重命名为最后修改时间,则会继续重命名没有exif信息的照片,将它们的名字改为图片的最后修改日期。

Eclipse用法

我是在ubuntu12.04上用的adt-bundle,安装好Sun Java后解压就可以使用了。关于ubuntu12.04上安装Sun Java,参考Ubuntu安装SunJava

为Eclipse安装WindowBuilder

In order to install WindowBuilder on Eclipse 4.3 (Kepler) Modeling let’s click on Help -> Install New Software… and choose (into “Work with:” field ) the main Kepler Update Site (http://download.eclipse.org/releases/kepler), expand the Category “General Purpose Tools” (make sure that “Group items by Category” flag be selected) and choose the following items:

  • SWT Designer
  • SWT Designer Core
  • WindowBuilder Core
  • WindowBuilder Core UI
  • WindowBuilder GroupLayout Support
  • WindowBuilder Java Core

Then press the Next button and follow the wizard until it asks to restart Eclipse. Let’s accept pressing Restart Now button. If you download instead the package Eclipse 4.3 (Kepler) for RCP and RAP Developers WindowBuilder is included.

github用法小记

github还是挺复杂的,记录下免得下次还得搜。首先按照官方文档在网页上建好项目,本地进入对应目录初始化好帐号。然后添加remote:git remote add origin_loc https://github.com/pggdt/项目名.git。然后git pull origin_loc master同步一下。最后将本地的修改提交到github:git status可以查看本地修改过的文件。git add 文件名,可以添加等下要上传的文件。添加后再用git commit为修改添加说明。最后git push origin_loc master就提交完成了。

我就是这么做的,不保证是最正确的做法,有更好意见的请提出来。

python3批量下图

又发现一个图集,用python3抓下:

import urllib.request def main(): url='http://cdn.test.com/downloads/character' path='/home/me/Pictures/paper/paper-' for i in range(1,400): ii='' if i<10: ii="00"+str(i) elif i<100: ii="0"+str(i) else: ii=str(i) url=url+ii+".jpg" print (url) h='' try: doc=urllib.request.urlopen(url) h=str(doc.info()) except urllib.error.HTTPError: print(ii+'is not exist') if('jpeg' in h): path=path+ii+".jpg" data = urllib.request.urlopen(url).read() f = open(path,"wb") f.write(data) f.close() path='/home/me/Pictures/paper/paper-' print (str(ii)+"OK") url='http://cdn.test.com/downloads/character' if __name__ == "__main__": main()[/code]

有两点改变,python2 import的是urllib,这里用到urlopen,需要import urllib.request。还有就是file已经不用了,换成open就好了。getheader函数也没有了。可以对比下python2版本的另一篇python批量下图

python批量下图

有个网站图片很好看,网址也很有规律,但是有的是空链接。用python都下下来了。

import urllib
def main():
  url='http://static.host.com/wallpapers/picture-'
  path='/home/me/Pictures/wallpaper/picture-'
  for i in range(0,400):
    url=url+str(i)+".jpg"
    print url
    doc=urllib.urlopen(url)
    type=doc.info().getheader('Content-Type')
    if(type.find('jpeg')!=-1):
      path=path+str(i)+".jpg"
      data=urllib.urlopen(url).read()
      f=file(path,"wb")  
      f.write(data)  
      f.close()
      path='/home/me/Pictures/wallpaper/picture-'
      print str(i)+"OK"
    url='http://static.host.com/wallpapers/picture-'

if __name__ == "__main__":
  main()