Paginated Crawling of JD Data
The previous post implemented paginated crawling of comments. But a JD search usually returns more than one page of products, so we also need to page through the product list, and then page through the comments of every product on it.
Practice
- index_page: decide here how many search result pages there are; if there is more than one, loop over them with for i in range(1, int(page) + 1) (each display page i maps to JD's url parameter page=2*i-1)
- detail_page: the product detail page; compared with the previous version, the detection of the comment page count is improved, because once a product has 6 or more comment pages JD's pager shows "…" and the full page count can no longer be read directly
- with fewer than 6 pages, crawl them in a plain loop: comment_page
- with 6 pages or more, crawl recursively until the returned comment list is empty or the maximum of 100 pages is reached (JD shows at most 100 pages): comment_circle_page (the api url layout is sketched below)
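
Judging from the URLs assembled in the code below, a single comment-api request looks like this. The parameter meanings are my reading of the code rather than documented api behavior, and the productId is a made-up example value:

```python
# Hypothetical breakdown of the comment api url assembled in the handler below;
# parameter meanings are inferred, and 1234567 is a made-up product id.
comment_url = (
    "https://sclub.jd.com/comment/productPageComments.action"
    "?productId=1234567"  # parsed out of the detail page url
    "&score=0"            # 0 appears to mean "all comments", any rating
    "&sortType=5"         # sort order the handler always uses
    "&page=0"             # 0-based page index; JD serves at most 100 pages
    "&pageSize=10"        # 10 comments per page
    "&isShadowSku=0&rid=0&fold=1"
)
```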
Code:

```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-12-28 10:48:50
# Project: NB_U_JD
from pyspider.libs.base_handler import *
import re
import json
# stdout encoding setup so Chinese prints correctly (Python 2)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import uniout
# local helper imports
sys.path.append(unicode("F:\\Manager\\大数据\\Code\\localgit\\python", "utf-8"))
import utils.tool
import db.notebookdb
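
# utils.tool and db.notebookdb are the author's local helper modules and are
# not shown here (a hypothetical sketch of Tool appears after this listing)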
class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }
    }

    # debug flag: True saves to the local db, otherwise to the remote db
    DEBUG = True

    def __init__(self):
        # search for "华硕顽石畅玩版R414UV", sorted by price, 3000 CNY and up
        search_word = "华硕顽石畅玩版R414UV"
        price = "3000"
        self.url = 'https://search.jd.com/Search?keyword=' + search_word + '&enc=utf-8&wq=' + search_word + '&ev=exprice_' + price + 'gt%5E&psort=1&click=0'
        # comment api url
        self.comment_url = "https://sclub.jd.com/comment/productPageComments.action?productId="
        # page-turn url
        self.turnpage_url = self.url + '&page='
        # key word
        self.keyword = "顽石畅玩版R414UV"
        # product name
        self.productName = "顽石畅玩版R414UV"
        self.tool = utils.tool.Tool()
        self.mysql = db.notebookdb.MySQLDBHelp(Handler.DEBUG)

    TIME = 12 * 60  # crawl frequency / result validity, in minutes
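
    # @every drives on_start every TIME minutes; the @config(age=TIME * 60)
    # on each callback (age is in seconds) marks results valid for the same
    # window, so pages are re-crawled roughly once per cycle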
    @every(minutes=TIME)  # every 12 hours
    def on_start(self):
        self.crawl(self.url, callback=self.index_page, validate_cert=False)

    # search result list page
    @config(age=TIME * 60, priority=4)
    def index_page(self, response):
        # total number of result pages, read from the pager
        page = response.doc('em > b').text()
        if self.tool.isFloat(page) and float(page) > 1:
            print "search has multi pages ", page
            for i in range(1, int(page) + 1):
                # JD's page parameter counts half pages (30 items rendered
                # immediately plus 30 lazy-loaded), so display page i maps
                # to page=2*i-1
                num = i * 2 - 1
                turn_page = self.turnpage_url + str(num)
                print turn_page
                self.crawl(turn_page, callback=self.turn_page, validate_cert=False, fetch_type='js')
        else:
            print "search has one page"
            turn_page = self.turnpage_url + "1"
            self.crawl(turn_page, callback=self.turn_page, validate_cert=False, fetch_type='js')
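
    # fetch_type='js' has pyspider's phantomjs fetcher execute the page, so
    # the lazy-loaded half of the product list is visible to the selectors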
    # one page of search results
    @config(age=TIME * 60, priority=3)
    def turn_page(self, response):
        goods = response.doc('.J-goods-list > .clearfix > li')
        for each in goods.items():
            name = each.find('.p-name-type-2 em').text()
            price = float(each.find('div > div.p-price > strong > i').text())
            if self.keyword in name:
                detail_url = each.find('.p-commit > strong > a').attr("href")
                print "product:", price, ",", detail_url
                self.crawl(detail_url, callback=self.detail_page, validate_cert=False, fetch_type='js')

    @config(age=TIME * 60, priority=2)
    def detail_page(self, response):
        # page numbers visible in the comment pager
        pages = response.doc('div.com-table-footer > div > div > a')
        if pages:
            # largest visible page number (JD caps the pager at 100)
            num = self.getVisibleNumber(pages.items())
            productId = re.findall(r"/\d+\.html", response.url)[0]
            productId = re.findall(r"\d+", productId)[0]
            print "productId, visible num: ", productId, ",", num
            if num >= 6:  # 6+ pages: the pager shows "…", the true count is unknown, so crawl recursively
                # first comment page of the json api
                multi_url = self.comment_url + productId + "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1"
                self.crawl(multi_url, callback=self.comment_circle_page, validate_cert=False, fetch_type='js',
                           save={'page': 1})
            elif num > 0:  # fewer than 6 pages: crawl them in a plain loop
                isLastPage = False
                for i in range(0, num):  # the api's page parameter is 0-based
                    multi_url = self.comment_url + productId + "&score=0&sortType=5&page=" + str(
                        i) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                    print multi_url
                    if i == num - 1:
                        isLastPage = True
                    self.crawl(multi_url, callback=self.comment_page, validate_cert=False, fetch_type='js',
                               save={'isLastPage': isLastPage})
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    # largest page number visible in the detail-page comment pager
    def getVisibleNumber(self, pages):
        nums = [0]
        for each in pages:
            numstr = each.text()
            # pager entries such as "下一页" or "…" are not digits and are skipped
            if numstr.isdigit():
                nums.append(int(numstr))
        return max(nums)

    # fetch comments through the json api (page count known in advance)
    @config(age=TIME * 60, priority=1)
    def comment_page(self, response):
        try:
            # parse the json body
            jsonstr = response.doc("body").text()
            hjson = json.loads(jsonstr)
            comments = hjson["comments"]
            summary = hjson["productCommentSummary"]
            productId = summary['productId']
            # comment list information
            commentslist = []
            for each in comments:
                time_int = self.tool.dateTimeToInt(each["creationTime"])
                data = (productId, each["id"], each["nickname"], time_int, each["score"],
                        each["userLevelName"], each['userImgFlag'], each["content"])
                commentslist.append(data)
            # write comments to db
            self.write_comments(commentslist, productId)
            isLastPage = response.save.get('isLastPage')
            # on the last page, also write the product summary to db
            if isLastPage:
                print "is last page, write product info"
                # summary information
                dict_sum = {}
                dict_sum['productId'] = productId
                dict_sum['goodCount'] = summary['goodCount']
                dict_sum['generalCount'] = summary['generalCount']
                dict_sum['poorCount'] = summary['poorCount']
                self.write_product(dict_sum)
        except Exception, e:
            print "comment_page ex ", e
        return {
            "url": response.url,
        }
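
    # the recursive variant below cannot know the last page up front: it keeps
    # requesting the next page until the api returns an empty comment list or
    # the 100-page cap is reached, and only then writes the product summary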
    @config(age=TIME * 60, priority=1)
    def comment_circle_page(self, response):
        # 'page' counts pages fetched so far, which is also the 0-based
        # index of the next page to request
        page = response.save.get('page')
        print "it's the page ", page
        # the product id is also embedded in the request url, so a copy is
        # available even if json parsing fails below
        productId = re.findall(r"productId=(\d+)", response.url)[0]
        try:
            # parse the json body
            jsonstr = response.doc("body").text()
            hjson = json.loads(jsonstr)
            comments = hjson["comments"]
            summary = hjson["productCommentSummary"]
            productId = summary['productId']
            # comment list information
            commentslist = []
            commentsIds = []
            for each in comments:
                time_int = self.tool.dateTimeToInt(each["creationTime"])
                data = (productId, each["id"], each["nickname"], time_int, each["score"],
                        each["userLevelName"], each['userImgFlag'], each["content"])
                commentslist.append(data)
                commentsIds.append(each["id"])
            if len(commentsIds) == 0 or page == 100:
                # empty list or 100 pages fetched: this is the last page,
                # write the product summary to db
                print "it's the last page, write product in db"
                dict_sum = {}
                dict_sum['productId'] = productId
                dict_sum['goodCount'] = summary['goodCount']
                dict_sum['generalCount'] = summary['generalCount']
                dict_sum['poorCount'] = summary['poorCount']
                self.write_product(dict_sum)
            else:
                # write comments to db, then recurse into the next page
                self.write_comments(commentslist, productId)
                multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                    page) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                self.crawl(multi_url, callback=self.comment_circle_page, validate_cert=False, fetch_type='js',
                           save={'page': page + 1})
        except Exception, e:
            print "comment_circle_page ex ", e
            # skip the broken page and try the next one, keeping the 100-page
            # cap so a persistent error cannot recurse forever
            if page < 100:
                multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                    page) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                self.crawl(multi_url, callback=self.comment_circle_page, validate_cert=False, fetch_type='js',
                           save={'page': page + 1})
        return {
            "url": response.url,
        }

    # write product summary info to db
    def write_product(self, dict_sum):
        dict_sum['productName'] = self.productName
        dict_sum['updatetime'] = self.tool.getCurrentIntTime()
        self.mysql.insert_product(dict_sum)

    # write comments to db
    def write_comments(self, commentslist, productId):
        self.mysql.insert_comments(commentslist, productId)
```
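
The listing depends on the local modules utils.tool and db.notebookdb, which are not shown in this post. For reference, a minimal sketch of the Tool methods the handler calls might look like the following; this is an assumption about their behavior, not the actual implementation:

```python
# -*- encoding: utf-8 -*-
# Hypothetical sketch of utils.tool.Tool, covering only the methods the
# handler above calls; the real helper module may differ.
import time

class Tool(object):
    def isFloat(self, s):
        # True if s parses as a number (used on the pager's page count text)
        try:
            float(s)
            return True
        except (TypeError, ValueError):
            return False

    def dateTimeToInt(self, s):
        # "2017-12-28 10:48:50" -> unix timestamp in seconds
        return int(time.mktime(time.strptime(s, "%Y-%m-%d %H:%M:%S")))

    def getCurrentIntTime(self):
        # current unix timestamp in seconds
        return int(time.time())
```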