Sentiment Analysis - Paginated Scraping of JD Data

Posted by Vivian Sun

Paginated Scraping of JD Data

Previously we implemented paginated scraping of the comments themselves. But a JD search usually returns more than one page of products, so the product list also has to be fetched page by page.

Then, for each matching product, the comments are scraped page by page as well.
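
The resulting crawl chain, using the handler names from the code below:

    on_start -> index_page (search page) -> turn_page (each result page) -> detail_page (product page) -> comment_page / comment_circle_page (comment JSON)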

Implementation

  • index_page: checks whether the search results span multiple pages; if so, it loops for i in range(1, int(page) + 1) to crawl every result page
  • detail_page: the product detail page. Compared with the previous version, the comment page-count check is improved, because once a product has more than 6 comment pages JD collapses the pager with "…" and the total count can no longer be read directly.
    • fewer than 6 visible page numbers: fetch the pages in a loop: comment_page
    • 6 or more: fetch recursively until a page returns an empty comment list or page 100 is reached (JD shows at most 100 pages): comment_circle_page (the JSON both comment handlers parse is sketched below)
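
Both comment handlers parse the JSON returned by JD's productPageComments.action api. For reference, here is an abridged sketch of the response, limited to the fields the code below actually reads; the sample values are illustrative only:

    {
        "comments": [
            {
                "id": 10000000001,
                "nickname": "j***n",
                "creationTime": "2017-12-28 10:48:50",
                "score": 5,
                "userLevelName": "...",
                "userImgFlag": 0,
                "content": "..."
            }
        ],
        "productCommentSummary": {
            "productId": 123456,
            "goodCount": 100,
            "generalCount": 10,
            "poorCount": 1
        }
    }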

Code:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-12-28 10:48:50
# Project: NB_U_JD

from pyspider.libs.base_handler import *
import re
import json

# sys print config
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
import uniout

# local module imports
sys.path.append(unicode("F:\\Manager\\大数据\\Code\\localgit\\python", "utf-8"))
import utils.tool
import db.notebookdb


class Handler(BaseHandler):
    crawl_config = {
        "headers": {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }
    }

    # debug flag: True saves to the local db, False to the remote db
    DEBUG = True

    def __init__(self):
        # search: 华硕顽石畅玩版R414UV, sorted by price, 3000 and above
        search_word = "华硕顽石畅玩版R414UV"
        price = "3000"
        self.url = 'https://search.jd.com/Search?keyword=' + search_word + '&enc=utf-8&wq=' + search_word + '&ev=exprice_' + price + 'gt%5E&psort=1&click=0'
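        # per the search above: ev=exprice_<price>gt^ keeps items priced above
        # <price> and psort=1 applies the price sort (an interpretation of JD's
        # query params, based on the search comment above)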
        # comment url
        self.comment_url = "https://sclub.jd.com/comment/productPageComments.action?productId="
        # search paging url
        self.turnpage_url = self.url + '&page='
        # keyword used to filter the search results
        self.keyword = "顽石畅玩版R414UV"
        # product name
        self.productName = "顽石畅玩版R414UV"
        self.tool = utils.tool.Tool()
        self.mysql = db.notebookdb.MySQLDBHelp(Handler.DEBUG)

    TIME = 12 * 60  # crawl interval / task validity, in minutes

    @every(minutes=TIME)  # every 12 hours
    def on_start(self):
        self.crawl(self.url, callback=self.index_page, validate_cert=False)

    # search result list page
    @config(age=TIME * 60)
    @config(priority=4)
    def index_page(self, response):
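        # the <b> inside <em> in the search pager holds the total number of result pages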
        page = response.doc('em > b').text()
        if self.tool.isFloat(page) and float(page) > 1:
            print "search has multiple pages:", page
            # range upper bound is int(page) + 1 so the last page is included too
            for i in range(1, int(page) + 1):
                turn_page = self.turnpage_url + str(i)
                print turn_page
                self.crawl(turn_page, callback=self.turn_page, validate_cert=False, fetch_type='js')
        else:
            print "search has one page"
            turn_page = self.turnpage_url + "1"
            self.crawl(turn_page, callback=self.turn_page, validate_cert=False, fetch_type='js')

    # one page of search results
    @config(age=TIME * 60)
    @config(priority=3)
    def turn_page(self, response):
        goods = response.doc('.J-goods-list > .clearfix > li')
        for each in goods.items():
            name = each.find('.p-name-type-2 em').text()
            price = float(each.find('div > div.p-price > strong > i').text())
            if self.keyword in name:
                detail_url = each.find('.p-commit > strong > a').attr("href")
                print "product:", price, ",", detail_url
                self.crawl(detail_url, callback=self.detail_page, validate_cert=False, fetch_type='js')

    @config(age=TIME * 60)
    @config(priority=2)
    def detail_page(self, response):
        # get the number of comment pages
        pages = response.doc('div.com-table-footer > div > div > a')
        if pages:
            # take the largest page number visible in the comment pager
            num = self.getVisibleNumber(pages.items())
            productId = re.findall(r"/\d+\.html", response.url)[0]
            productId = re.findall(r"\d+", productId)[0]
            print "productId, visible num:", productId, ",", num
            if num >= 6:  # 6+ visible pages: the pager collapses to '…', so recurse instead
                # first comment page; the api pages are 0-indexed, so the saved
                # counter starts at 0 to stay in sync with the url
                multi_url = self.comment_url + productId + "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1"
                self.crawl(multi_url, callback=self.comment_circle_page, validate_cert=False, fetch_type='js',
                           save={'page': 0})
            elif num > 0:  # fewer than 6 pages: fetch them in a loop
                isLastPage = False
                for i in range(0, num):
                    multi_url = self.comment_url + productId + "&score=0&sortType=5&page=" + str(
                        i) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                    print multi_url
                    if i == num - 1:
                        isLastPage = True
                    self.crawl(multi_url, callback=self.comment_page, validate_cert=False, fetch_type='js',
                               save={'isLastPage': isLastPage})

        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    # get the largest page number visible in the comment pager of the detail page
    def getVisibleNumber(self, pages):
        nums = [0]
        for each in pages:
            numstr = each.text()
            if numstr.isdigit():
                nums.append(int(numstr))
        return max(nums)
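
    # example: for pager labels "1 2 3 4 5 6 … 下一页" getVisibleNumber returns 6;
    # non-digit labels fail the isdigit() check and are skipped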

    # we get comments through json api
    @config(age=TIME * 60)
    @config(priority=1)
    def comment_page(self, response):
        try:
            # json parse
            jsonstr = response.doc("body").text()
            hjson = json.loads(jsonstr)
            comments = hjson["comments"]
            summary = hjson["productCommentSummary"]
            productId = summary['productId']
            # comment list information
            commentslist = []
            for each in comments:
                time_int = self.tool.dateTimeToInt(each["creationTime"])
                data = (productId, each["id"], each["nickname"], time_int, each["score"],
                        each["userLevelName"], each['userImgFlag'], each["content"])
                commentslist.append(data)

            # write comments to db
            self.write_comments(commentslist, productId)

            isLastPage = response.save.get('isLastPage')
            # if last page, write product info summary to db
            if isLastPage:
                print "is last page, write product info"
                # summary information
                dict_sum = {}
                dict_sum['productId'] = productId
                dict_sum['goodCount'] = summary['goodCount']
                dict_sum['generalCount'] = summary['generalCount']
                dict_sum['poorCount'] = summary['poorCount']
                self.write_product(dict_sum)
        except Exception, e:
            print "comment_page ex ", e

        return {
            "url": response.url,
        }

    # we get comments through json api
    @config(age=TIME * 60)
    @config(priority=1)
    def comment_circle_page(self, response):
        try:
            page = response.save.get('page')
            print "it's the page ", page
            # json parse
            jsonstr = response.doc("body").text()
            hjson = json.loads(jsonstr)
            comments = hjson["comments"]
            summary = hjson["productCommentSummary"]
            # comment list information
            productId = summary['productId']
            commentslist = []
            commentsIds = []
            for each in comments:
                time_int = self.tool.dateTimeToInt(each["creationTime"])
                data = (productId, each["id"], each["nickname"], time_int, each["score"],
                        each["userLevelName"], each['userImgFlag'], each["content"])
                commentslist.append(data)
                commentsIds.append(each["id"])

            # save comments
            if len(commentsIds) == 0 or page == 100:  # it's the last page, write product in db
                print "it's the last page, write product in db"
                dict_sum = {}
                dict_sum['productId'] = productId
                dict_sum['goodCount'] = summary['goodCount']
                dict_sum['generalCount'] = summary['generalCount']
                dict_sum['poorCount'] = summary['poorCount']
                self.write_product(dict_sum)
            else:
                # write comments to db
                self.write_comments(commentslist, productId)
                # goto next page
                page = page + 1
                multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                    page) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                self.crawl(multi_url, callback=self.comment_circle_page, validate_cert=False, fetch_type='js',
                           save={'page': page})
        except Exception, e:
            print "comment_circle_page ex ", e
            # on failure, recover state from the request itself (page and productId
            # may be unset if parsing failed) and move on to the next page; the
            # failed page is skipped
            page = response.save.get('page', 0) + 1
            productId = re.findall(r"productId=(\d+)", response.url)[0]
            multi_url = self.comment_url + productId + "&score=0&sortType=5&page=" + str(
                page) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
            self.crawl(multi_url, callback=self.comment_circle_page, validate_cert=False, fetch_type='js',
                       save={'page': page})

        return {
            "url": response.url,
        }

    # write product info to db
    def write_product(self, dict_sum):
        dict_sum['productName'] = self.productName
        dict_sum['updatetime'] = self.tool.getCurrentIntTime()
        self.mysql.insert_product(dict_sum)

    # write comments to db
    def write_comments(self, commentslist, productId):
        self.mysql.insert_comments(commentslist, productId)
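
A note on the local imports: utils.tool.Tool and db.notebookdb.MySQLDBHelp are helper modules from my local project and are not shown in this post. If you want to run the crawler without them, a minimal stand-in could look like the sketch below. It only mirrors the calls the crawler makes, so the method bodies are assumptions rather than the real implementation; in particular, dateTimeToInt assumes JD's "YYYY-MM-DD HH:MM:SS" timestamp format.

import time


class Tool(object):
    # True if s parses as a float (used for the search page-count check)
    def isFloat(self, s):
        try:
            float(s)
            return True
        except (TypeError, ValueError):
            return False

    # "2017-12-28 10:48:50" -> unix timestamp; the format string is an
    # assumption about JD's creationTime field
    def dateTimeToInt(self, s):
        return int(time.mktime(time.strptime(s, "%Y-%m-%d %H:%M:%S")))

    def getCurrentIntTime(self):
        return int(time.time())


class MySQLDBHelp(object):
    def __init__(self, debug):
        # True: local db; False: remote db (this stub only prints)
        self.debug = debug

    def insert_product(self, dict_sum):
        print "insert product:", dict_sum

    def insert_comments(self, commentslist, productId):
        print "insert", len(commentslist), "comments for product", productId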