第一个模块:模拟登录新浪(Sina)微博。创建 weiboLogin.py 文件,输入以下代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Module header for weiboLogin.py (Python 2: urllib2 / cookielib).
import sys
import urllib
import urllib2
import cookielib
import base64
import re
import json
import hashlib
class weiboLogin(object):
    """Simulate a browser login to Sina Weibo through its SSO endpoints.

    Defining the class builds a cookie-aware opener and installs it as the
    process-wide ``urllib2`` default, so every later ``urllib2.urlopen`` call
    in the program shares the cookies collected by :meth:`login`.
    """

    # NOTE(review): the scheme/host part of the URLs below (and the date part
    # of the Gecko token in USER_AGENT) was missing from the listing and has
    # been restored from the historical Sina SSO endpoints -- confirm before
    # relying on them.
    PRELOGIN_URL = ('http://login.sina.com.cn/sso/prelogin.php?entry=weibo'
                    '&callback=sinaSSOController.preloginCallBack'
                    '&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)'
                    '&_=1329806375939')
    LOGIN_URL = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.18)'
    USER_AGENT = ('Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 '
                  'Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11')

    # The pre-login answer is JSONP: callbackName({...}); capture the JSON.
    _JSONP_PAYLOAD = re.compile(r'\((.*)\)')
    # The login answer redirects through location.replace('<url>').
    _REDIRECT = re.compile(r"location\.replace\(['\"](.*?)['\"]\)")

    # Module-wide HTTP session: cookies set during login are replayed on
    # every subsequent urllib2 request.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    # Template of the login form. login() fills in a *copy*, so this dict is
    # never modified and login() can be called more than once.
    postdata = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': '',           # encoded user name, see get_user()
        'service': 'miniblog',
        'servertime': '',   # from get_servertime()
        'nonce': '',        # from get_servertime()
        'pwencode': 'wsse',
        'sp': '',           # hashed password, see get_pwd()
        'encoding': 'UTF-8',
        'url': ('http://weibo.com/ajaxlogin.php?framelogin=1'
                '&callback=parent.sinaSSOController.feedBackUrlCallBack'),
        'returntype': 'META',
    }

    @staticmethod
    def _to_native_str(raw):
        """Return *raw* as ``str``, decoding it as UTF-8 if it is another type.

        Keeps regex and JSON parsing working whether the HTTP layer hands
        back ``str`` or a distinct bytes type.
        """
        if isinstance(raw, str):
            return raw
        return raw.decode('utf-8', 'replace')

    @staticmethod
    def _sha1_hex(text):
        """Return the hex SHA-1 digest of *text* (UTF-8 encoded if not bytes)."""
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        return hashlib.sha1(text).hexdigest()

    def _parse_prelogin(self, raw):
        """Extract ``(servertime, nonce)`` from a pre-login body, else None."""
        match = self._JSONP_PAYLOAD.search(self._to_native_str(raw))
        if match is not None:
            try:
                payload = json.loads(match.group(1))
                return str(payload['servertime']), payload['nonce']
            except (KeyError, TypeError, ValueError):
                # Malformed JSON or missing fields: report it below.
                pass
        print('Get severtime error!')
        return None

    def get_servertime(self):
        """Fetch the one-time ``servertime`` and ``nonce`` login tokens.

        Returns:
            A ``(servertime, nonce)`` tuple with ``servertime`` as ``str``,
            or ``None`` (after printing a message) when the response cannot
            be parsed.

        Raises:
            Whatever ``urllib2.urlopen`` raises on a network failure.
        """
        raw = urllib2.urlopen(self.PRELOGIN_URL).read()
        return self._parse_prelogin(raw)

    def get_pwd(self, pwd, servertime, nonce):
        """Return the hashed password sent in the ``sp`` form field.

        The digest is ``sha1(sha1(sha1(pwd)) + servertime + nonce)``, each
        round being fed the previous round's lowercase hex text.
        """
        twice_hashed = self._sha1_hex(self._sha1_hex(pwd))
        return self._sha1_hex(twice_hashed + servertime + nonce)

    def get_user(self, username):
        """Return the user name URL-quoted and then Base64-encoded (``su``)."""
        quoted = urllib.quote(username)
        if not isinstance(quoted, bytes):
            quoted = quoted.encode('ascii')
        # b64encode never inserts line breaks, unlike encodestring(), whose
        # output only lost its *final* newline when sliced with [:-1].
        return self._to_native_str(base64.b64encode(quoted))

    def login(self, username, pwd):
        """Log in with *username* / *pwd* and print the outcome.

        On success the session cookies are left in the shared cookie jar.
        Always returns ``None``. "Login success!" only means the redirect
        URL was found and fetched; the server's result code is not checked.

        Raises:
            Whatever ``urllib2.urlopen`` raises if the credential POST
            itself fails at the network level.
        """
        try:
            tokens = self.get_servertime()
        except Exception:
            # Best-effort boundary: any pre-login failure aborts the login.
            tokens = None
        if tokens is None:
            print('get servertime error!')
            return
        servertime, nonce = tokens

        form = dict(self.postdata)
        form['servertime'] = servertime
        form['nonce'] = nonce
        form['su'] = self.get_user(username)
        form['sp'] = self.get_pwd(pwd, servertime, nonce)
        body = urllib.urlencode(form)
        if not isinstance(body, bytes):
            body = body.encode('ascii')

        request = urllib2.Request(
            url=self.LOGIN_URL,
            data=body,
            headers={'User-Agent': self.USER_AGENT},
        )
        text = self._to_native_str(urllib2.urlopen(request).read())

        match = self._REDIRECT.search(text)
        if match is None:
            print('Login error!')
            return
        try:
            # Following the redirect is what actually sets the login cookies.
            urllib2.urlopen(match.group(1))
        except Exception:
            print('Login error!')
            return
        print('Login success!')
然后创建main.py文件,输入以下代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# main.py: log in to Weibo with the credentials below.
import weiboLogin
import urllib
import urllib2

# Replace the two placeholders with your own Weibo credentials.
username = '你的微博用户名'
pwd = '你的微博密码'

WBLogin = weiboLogin.weiboLogin()
WBLogin.login(username, pwd)
注意:若登录失败,可能是你的账号在登录时需要输入验证码。可以先在网页上登录该账号确认一下;在账号设置中,可以设置在某些地区登录时不需要输入验证码。
接下来,考虑实现抓取微博的内容。
此时遇到一个困难:抓取指定 URL 的微博页面时,初始只显示 15 条微博,其余内容是延迟加载的(即 Ajax 中所说的 lazy load)。也就是说,滚动条第一次拖到最底部时会加载第二部分,再次拖到最底部时会加载第三部分,此时该页面的微博才算完整。因此,要获取一个微博页面的全部微博,需要请求该页面三次。创建 getWeiboPage.py 文件,相应代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Module header for getWeiboPage.py (Python 2: urllib2).
import urllib
import urllib2
import sys
import time

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that implicit str <-> unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
class getWeiboPage(object):
    """Download one Weibo profile page in the three chunks the site serves.

    The first request returns the initially visible posts; the second and
    third requests return the parts normally loaded on scrolling. Each chunk
    is saved twice under ``./output``: raw (``textN``) and with backslash
    escapes expanded (``resultN``).
    """

    # Template of the query parameters. The page methods work on a copy, so
    # the overrides of one request never leak into the next one.
    body = {
        '__rnd': '',
        '_k': '',
        '_t': '0',
        'count': '50',
        'end_id': '',
        'max_id': '',
        'page': 1,
        'pagebar': '',
        'pre_page': '0',
        'uid': '',
    }
    uid_list = []      # user ids collected by get_uid(); shared by instances
    charset = 'utf8'   # encoding used when text is written to disk

    def get_msg(self, uid):
        """Fetch and store all three chunks of the page belonging to *uid*."""
        self.body['uid'] = uid
        url = self.get_url(uid)
        self.get_firstpage(url)
        self.get_secondpage(url)
        self.get_thirdpage(url)

    def get_firstpage(self, url):
        """Fetch the initially visible chunk into ``text1`` / ``result1``."""
        self._fetch_chunk(url, 1, {'pre_page': self.body['page'] - 1})

    def get_secondpage(self, url):
        """Fetch the first lazily loaded chunk into ``text2`` / ``result2``."""
        self._fetch_chunk(url, 2, {
            'count': '15',
            'pagebar': '0',
            'pre_page': self.body['page'],
        })

    def get_thirdpage(self, url):
        """Fetch the second lazily loaded chunk into ``text3`` / ``result3``."""
        self._fetch_chunk(url, 3, {
            'count': '15',
            'pagebar': '1',
            'pre_page': self.body['page'],
        })

    def _fetch_chunk(self, url, index, overrides):
        """Request *url* with the template parameters plus *overrides*.

        The raw response and its unescaped form are written to
        ``./output/text<index>`` and ``./output/result<index>``.
        """
        params = dict(self.body)
        params.update(overrides)
        request = urllib2.Request(self._build_url(url, params))
        text = urllib2.urlopen(request).read()
        self.writefile('./output/text%d' % index, text)
        self.writefile('./output/result%d' % index, self._unescape(text))

    @staticmethod
    def _build_url(url, params):
        """Append the url-encoded *params* to *url* with a valid separator."""
        if url.endswith(('?', '&')):
            separator = ''
        elif '?' in url:
            separator = '&'
        else:
            separator = '?'
        return url + separator + urllib.urlencode(params)

    def _unescape(self, text):
        """Return *text* with its backslash escapes expanded.

        SECURITY: the response used to be spliced into a string literal and
        passed to eval(), which lets a crafted response run arbitrary code.
        The codec below expands the same escapes without executing anything.
        Malformed escapes raise UnicodeDecodeError.
        """
        if isinstance(text, bytes):
            text = text.decode(self.charset, 'replace')
        # Turn every non-ASCII character into an escape first, so that one
        # unicode_escape pass restores both those and the original escapes.
        return text.encode('ascii', 'backslashreplace').decode('unicode_escape')

    def get_url(self, uid):
        """Return the profile URL for the user id *uid* (a string)."""
        # NOTE(review): the scheme and host were missing from the listing and
        # have been restored as http://weibo.com -- confirm.
        url = 'http://weibo.com/' + uid + '?from=otherprofile&wvr=3.6&loc=tagweibo'
        return url

    def get_uid(self, filename):
        """Read one user id per line from *filename* into ``uid_list``.

        Surrounding whitespace (including the line break) is stripped and
        blank lines are skipped. Each id is printed as it is read.
        """
        with open(filename) as fread:
            for line in fread:
                uid = line.strip()
                if not uid:
                    continue
                getWeiboPage.uid_list.append(uid)
                print(uid)
                # NOTE(review): one-second pause kept from the original;
                # its purpose is not evident from this file.
                time.sleep(1)

    def writefile(self, filename, content):
        """Write *content* to *filename*, replacing any existing file.

        Text is encoded with ``charset``; bytes are written unchanged. The
        parent directory is created when it does not exist yet.
        """
        import os

        if not isinstance(content, bytes):
            content = content.encode(self.charset)
        folder = os.path.dirname(filename)
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)
        with open(filename, 'wb') as fw:
            fw.write(content)
在刚刚的main.py中加入相应内容,完整内容为:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# main.py: log in to Weibo, then download the three chunks of one profile.
import weiboLogin
import getWeiboPage
import urllib
import urllib2

# Replace the two placeholders with your own Weibo credentials.
username = '你的微博用户名'
pwd = '你的微博密码'

WBLogin = weiboLogin.weiboLogin()
WBLogin.login(username, pwd)

# The page fetcher lives in getWeiboPage.py (class getWeiboPage).
WBmsg = getWeiboPage.getWeiboPage()
# NOTE(review): scheme and host were missing from the listing; restored as
# http://weibo.com -- confirm.
url = 'http://weibo.com/1624087025?from=otherprofile&wvr=3.6&loc=tagweibo'

WBmsg.get_firstpage(url)
WBmsg.get_secondpage(url)
WBmsg.get_thirdpage(url)