Thanks for the invite. First, a screenshot of the results. These are comments I crawled for a friend: given a username, you can pull out every comment that user has written, provided they've written any. Below I'll walk through the development approach, and everyone is welcome to discuss. If you're not a developer, feel free to contact me directly.
Intro
I'd been wanting to build a search engine on top of a framework myself for a while, but never quite knew where to start.
Recently I've been listening to a lot of music on NetEase Cloud Music, so I decided to crawl all of its comments and build a vertical search engine over them.
Path
So let's get to it. Step one is to crawl the comments off NetEase Cloud Music; without the comment data there's nothing to build on!
Let's use my favorite singer, Eason Chan (陈奕迅), as the example.
First, open the NetEase Cloud Music web player and search for the song 十年 (Ten Years).
十年-Eason
After some digging through the network requests, I did find the API.
It's a rather odd-looking one. You can tell that R_SO_4_66842 encodes the song's ID (66842), and checking a few other songs confirmed that comments are always fetched through this same endpoint.
The trailing csrf_token is presumably there to guard against cross-site request forgery attacks, which is outside the scope of this post.
Looking closely at what this API requires, it turns out to take two cryptic parameters: one called params and the other encSecKey.
Clearly NetEase has already put a fair amount of encryption in place to deter crawlers, which is painful.
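For reference, the full request ends up looking roughly like this (the music.163.com host is my assumption based on the endpoint path in the code below; both fields are opaque blobs as seen in devtools):

POST http://music.163.com/weapi/v1/resource/comments/R_SO_4_66842/?csrf_token=
Content-Type: application/x-www-form-urlencoded

params=<base64 blob>&encSecKey=<256 hex digits>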
So I did a quick "read and adapt" pass over some existing code.
EncryptUtil.py:
# -*- coding:utf-8 -*-
# Note: this code targets Python 2 (str.encode('hex')) with PyCrypto installed.
import os
import base64
import time
from Crypto.Cipher import AES

def createSecretKey(size):
    # Random key of `size` hex characters, built from os.urandom bytes.
    return (''.join(map(lambda xx: (hex(ord(xx))[2:]), os.urandom(size))))[0:size]

def aesEncrypt(text, secKey):
    # PKCS#7-style padding, then AES-CBC with a fixed IV, base64-encoded.
    pad = 16 - len(text) % 16
    text = text + pad * chr(pad)
    encryptor = AES.new(secKey, AES.MODE_CBC, '0102030405060708')
    ciphertext = encryptor.encrypt(text)
    ciphertext = base64.b64encode(ciphertext)
    return ciphertext

def rsaEncrypt(text, pubKey, modulus):
    # Textbook RSA (no padding) on the reversed key string; hex output
    # zero-filled to 256 digits to match the 1024-bit modulus.
    text = text[::-1]
    rs = int(text.encode('hex'), 16) ** int(pubKey, 16) % int(modulus, 16)
    return format(rs, 'x').zfill(256)

def timeStamp(timeNum):
    # Convert a millisecond timestamp to a human-readable local time string.
    timeStamp = timeNum / 1000.0
    timeArray = time.localtime(timeStamp)
    reTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return reTime
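To see how these helpers fit together, here's a minimal sanity-check sketch. The nonce, pubKey, and modulus are the same weapi constants used in Crawler.__init__ below, and the payload mirrors getComment; the final print just shows that both fields come out as opaque blobs:

# -*- coding:utf-8 -*-
# Sketch: build the two fields (params / encSecKey) the weapi endpoint expects.
import json
import EncryptUtil

modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
nonce = '0CoJUm6Qyw8W8jud'
pubKey = '010001'

text = json.dumps({'username': '', 'password': '', 'rememberLogin': 'true', 'offset': 0})
secKey = EncryptUtil.createSecretKey(16)      # fresh random 16-char key
params = EncryptUtil.aesEncrypt(EncryptUtil.aesEncrypt(text, nonce), secKey)
encSecKey = EncryptUtil.rsaEncrypt(secKey, pubKey, modulus)
print params[:32], encSecKey[:32]             # opaque blobs, as seen in devtools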
CrawlerComments.py:
# -*- coding:utf-8 -*-
import EncryptUtil
import json
import requests
import time
import DataBase
import Logger

logger = Logger.Log()

class Crawler(object):
    def __init__(self, id):
        # RSA public-key parameters and the fixed AES nonce used by the weapi.
        modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        self.nonce = '0CoJUm6Qyw8W8jud'
        pubKey = '010001'
        # Fresh random AES key; its RSA-encrypted form is sent as encSecKey.
        self.secKey = EncryptUtil.createSecretKey(16)
        self.encSecKey = EncryptUtil.rsaEncrypt(self.secKey, pubKey, modulus)
        self.mysql = DataBase.Mysql()
        self.musicId = id
        # Host assumed to be music.163.com; the original post left it blank.
        self.requestUrl = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_%d/" % int(id)
        self.headers = {
            # Host/Origin were blank in the original post; music.163.com assumed.
            'Host': 'music.163.com',
            'Connection': 'keep-alive',
            # Content-Length is computed by requests; no need to hardcode it.
            'Cache-Control': 'max-age=0',
            'Origin': 'http://music.163.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': '*/*',
            'DNT': '1',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
            # Session cookie captured from a browser session.
            'Cookie': 'JSESSIONID-WYYY=b66d89ed74ae9e94ead89b16e475556e763dd34f95e6ca357d06830a210abc7b685e82318b9d1d5b52ac4f4b9a55024c7a34024fddaee852404ed410933db994dcc0e398f61e670bfeea81105cbe098294e39ac566e1d5aa7232df741870ba1fe96e5cede8372ca587275d35c1a5d1b23a11e274a4c249afba03e20fa2dafb7a16eebdf6%3A1476373826753; _iuqxldmzr_=25; _ntes_nnid=7fa73e96706f26f3ada99abba6c4a6b2,1476372027128; _ntes_nuid=7fa73e96706f26f3ada99abba6c4a6b2; __utma=94650624.748605760.1476372027.1476372027.1476372027.1; __utmb=94650624.4.10.1476372027; __utmc=94650624; __utmz=94650624.1476372027.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
        }

    def getComment(self, offset):
        # Plaintext payload; offset selects the comment page.
        text = {
            'username': "",
            'password': "",
            'rememberLogin': 'true',
            'offset': offset
        }
        text = json.dumps(text)
        # Double AES: first with the fixed nonce, then with our random secKey.
        encText = EncryptUtil.aesEncrypt(EncryptUtil.aesEncrypt(text, self.nonce), self.secKey)
        data = {
            'params': encText,
            'encSecKey': self.encSecKey
        }
        res = requests.post(self.requestUrl, headers=self.headers, data=data)
        jsonData = res.json()
        self.databaseSave(jsonData)
        return int(jsonData["total"])

    def databaseSave(self, jsonData):
        for comment in jsonData["comments"]:
            commentData = {
                'id': str(comment["commentId"]),
                'user': str(comment["user"]["userId"]),
                'content': comment["content"].encode('utf-8'),
                'likeCount': str(comment["likedCount"]),
                'commentTime': str(EncryptUtil.timeStamp(comment["time"])),
                'musicId': str(self.musicId)
            }
            # If this comment is a reply, record whom it replies to.
            if comment["beReplied"]:
                commentData["reComment"] = str(comment["beReplied"][0]["user"]["userId"])
            if self.mysql.insertData("comment", commentData) >= 0:
                logger.info("Comment %s Saved." % commentData["id"])
            userData = {
                'id': str(comment["user"]["userId"]),
                'username': comment["user"]["nickname"].encode('utf-8'),
                'avatarUrl': comment["user"]["avatarUrl"].encode('utf-8')
            }
            if self.mysql.insertData("user", userData) >= 0:
                logger.info("User %s Saved." % userData["id"])

    def process(self, offset):
        if offset == -1:
            return
        off = offset
        total = self.getComment(off)
        # Each page holds 10 comments, so advance the offset by 10 up to total.
        while off < total:
            off += 10
            self.getComment(off)
        # Planned TaskSchedule hook (see the notes at the end of this post);
        # the module doesn't exist yet, so the call stays commented out.
        # self.taskSchedule.trigger(self.musicId, "-1")

def main():
    c = Crawler(66842)
    c.process(1)

if __name__ == '__main__':
    main()
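Both files above import a Logger module that the post never shows. A minimal stand-in, assuming Log() simply hands back a configured standard-library logger (this is my placeholder, not the author's code):

# -*- coding: utf-8 -*-
# Logger.py (sketch): the original module isn't shown in the post.
# Assumption: Log() returns a logging.Logger with info/warning/error methods.
import logging

def Log():
    logger = logging.getLogger('crawler')
    if not logger.handlers:  # avoid stacking handlers on repeated imports
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s %(message)s'))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger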
DataBase.py:
# -*- coding: utf-8 -*-
import MySQLdb
import setting
import time
import Logger

logger = Logger.Log()

class Mysql:
    # Current time, formatted for logs
    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

    # Initialize the database connection
    def __init__(self):
        try:
            self.db = MySQLdb.connect(
                host=setting.MYSQL_HOST,
                port=int(setting.MYSQL_PORT),
                user=setting.MYSQL_USER,
                passwd=setting.MYSQL_PASSWD,
                db=setting.MYSQL_DBNAME,
                charset="utf8mb4"
            )
            self.cur = self.db.cursor()
        except MySQLdb.Error as e:
            logger.error("Failed to connect to the database")

    # Insert a row built from a dict of column -> value
    def insertData(self, table, my_dict):
        try:
            self.db.set_character_set('utf8mb4')
            cols = ', '.join(my_dict.keys())
            # Parameterized values avoid the quoting bugs (and SQL injection)
            # of string concatenation.
            placeholders = ', '.join(['%s'] * len(my_dict))
            sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, placeholders)
            try:
                result = self.cur.execute(sql, list(my_dict.values()))
                insert_id = self.db.insert_id()
                self.db.commit()
                # Did the insert actually happen?
                if result:
                    return insert_id
                else:
                    return 0
            except MySQLdb.Error as e:
                # Roll back on error
                self.db.rollback()
                # Duplicate primary key: the row is already there
                if "key 'PRIMARY'" in e.args[1]:
                    logger.warning("Row already exists, skipped insert")
                else:
                    logger.error("Insert failed: " + str(my_dict))
                return -1
        except MySQLdb.Error as e:
            logger.error("Database error")
            return -1

if __name__ == '__main__':
    # Quick utf8mb4 smoke test: the content is a 4-byte emoji (a microphone).
    d = Mysql()
    commentData = {
        'id': '1',
        'user': '1',
        'content': '\xF0\x9F\x8E\xA4',
        'likeCount': '1'
    }
    d.insertData('comment', commentData)
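Two pieces the post doesn't include: the setting module DataBase.py imports, and the table schema. The names below are the ones the code actually references; the values and column types are my own placeholders, inferred from the dicts passed to insertData (the duplicate-PRIMARY handling implies id is the primary key):

# -*- coding: utf-8 -*-
# setting.py (placeholder values; only the names DataBase.py imports)
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = 'password'
MYSQL_DBNAME = 'netease'

# -*- coding: utf-8 -*-
# createTables.py (sketch): schema inferred from commentData/userData above.
import MySQLdb
import setting

db = MySQLdb.connect(host=setting.MYSQL_HOST, port=int(setting.MYSQL_PORT),
                     user=setting.MYSQL_USER, passwd=setting.MYSQL_PASSWD,
                     db=setting.MYSQL_DBNAME, charset="utf8mb4")
cur = db.cursor()
cur.execute("""CREATE TABLE IF NOT EXISTS comment (
    id VARCHAR(32) PRIMARY KEY, user VARCHAR(32), content TEXT,
    likeCount VARCHAR(16), commentTime VARCHAR(32), musicId VARCHAR(32),
    reComment VARCHAR(32)) DEFAULT CHARSET=utf8mb4""")
cur.execute("""CREATE TABLE IF NOT EXISTS user (
    id VARCHAR(32) PRIMARY KEY, username VARCHAR(255),
    avatarUrl VARCHAR(255)) DEFAULT CHARSET=utf8mb4""")
db.commit()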
With that, the initial crawl works well enough. The next steps are to use a proxy pool so the crawler rotates through different proxies and avoids getting its IP banned,
to use a thread pool for multi-threaded crawling and better throughput,
and to use a TaskSchedule module to record the crawler's progress (a rough sketch of the first two follows).
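As an illustration of those next steps, here's how a thread pool could drive getComment page by page. This is a sketch under assumptions: getProxy() is a hypothetical stand-in for whatever proxy pool gets built, and the single MySQL connection shared inside Crawler is not thread-safe as written, so a real version needs one connection per worker.

# -*- coding:utf-8 -*-
# Sketch only: thread-pooled paging around the existing Crawler.
from multiprocessing.dummy import Pool  # a thread pool, despite the module name
import CrawlerComments

def getProxy():
    # Hypothetical helper: return a requests-style proxies dict from the pool,
    # e.g. {'http': 'http://1.2.3.4:8080'}. It would be plugged in via
    # requests.post(..., proxies=getProxy()) inside getComment.
    return {}

def crawlSong(musicId, workers=4):
    c = CrawlerComments.Crawler(musicId)
    total = c.getComment(0)                       # first page also reports the total
    pool = Pool(workers)
    pool.map(c.getComment, range(10, total, 10))  # remaining pages, 10 comments each
    pool.close()
    pool.join()

if __name__ == '__main__':
    crawlSong(66842)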