Optimizing JD Data Scraping
The goal is to specify product information and table names in a config file, so that the program can automatically scrape the data into the corresponding tables based on that information.
Approach
The idea is:
- Define a config.json file that holds the product information and table names
- A PySpider project file handles the scheduled crawling
- An encapsulated Operation base class handles page parsing and storage
1. config.json
The config file looks like this:
{
    "configs": [
        {
            "index": 0,
            "search_word": "小米air13.3",
            "price": "4000",
            "productName": "小米(MI)Air 13.3英寸全金属轻薄笔记本",
            "keyword": "13.3",
            "tables": [
                "compete_info",
                "compete_comments",
                "compete_comments_sub",
                "compete_comments_words"
            ]
        }
    ]
}
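Loading this file and picking out a single entry only needs the standard json module. The sketch below is illustrative (the project itself wraps this in tool.readConfigFromFile, used by the Handler in section 3); the read_config name is hypothetical, not part of the project:

# Illustrative only: a minimal config loader; read_config is a hypothetical name.
import io
import json

def read_config(path, index=0):
    # Load config.json and return the entry at the given position in "configs".
    with io.open(path, encoding='utf-8') as f:
        cfg = json.load(f)
    return cfg['configs'][index]

# entry = read_config('config.json', 0)
# entry['search_word']   # -> u'小米air13.3'
# entry['tables']        # -> the table names the scraped data is written to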
2. Operation base class
The core code is as follows:
import json
import re
import tool  # project utility module (dateTimeToInt, readConfigFromFile)


class BaseOperation(object):

    def index_page(self, client, response):
        # total number of search result pages
        page = response.doc('.fp-text > i').text()
        print("search total page: %s" % page)
        for i in range(1, int(page) + 1):
            num = i * 2 - 1
            turn_page = self.turnpage_url + str(i)
            print("search page index: %d, %s" % (i, turn_page))
            client.crawl(turn_page, callback=client.turn_page, validate_cert=False, fetch_type='js')
    def turn_page(self, client, response):
        # iterate over the product list on one search result page
        plist = response.doc('.J-goods-list > .clearfix > li')
        index = 1
        for each in plist.items():
            name = each.find('.p-name-type-2 em').text()
            if self.keyword in name:
                detail_url = each.find('.p-commit > strong > a').attr("href")
                print("product index: %d, url %s" % (index, detail_url))
                index += 1
                client.crawl(detail_url, callback=client.detail_page, validate_cert=False, fetch_type='js')
    def detail_page(self, client, response):
        # number of comment pages
        pages = response.doc('div.com-table-footer > div > div > a')
        productId = re.findall(r"/\d+\.html", response.url)[0]
        productId = re.findall(r"\d+", productId)[0]
        if pages:
            # count the visible pagination links (capped at 100 pages)
            page = self.getVisiableNumber(pages.items())
            print("productId: %s, visible comment page: %s" % (productId, page))
            # write the basic product info to the db
            product_info = {}
            product_info['productName'] = self.productName
            product_info['productId'] = productId
            product_info['displayName'] = response.doc('.sku-name').text()
            product_info['detailUrl'] = response.url
            product_info['price'] = response.doc('.p-price > span').eq(1).text()
            self.add_product(product_info)
            if page >= 6:  # 6 pages or more: pagination shows "...", total unknown, crawl page by page until empty
                # first comment page url
                multi_url = self.comment_url + productId + "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1"
                client.crawl(multi_url, callback=client.comment_circle_page, validate_cert=False, fetch_type='js',
                             save={'page': 0, 'productId': productId})
            elif page > 0:  # fewer than 6 pages: the exact count is known, crawl that many pages recursively
                multi_url = self.comment_url + productId + "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1"
                client.crawl(multi_url, callback=client.comment_page, validate_cert=False, fetch_type='js',
                             save={'page': page, 'pos': 0, 'productId': productId})
        else:
            print("productId: %s, no comments" % productId)
    def comment_page(self, client, response):
        try:
            # page position information
            page = response.save.get('page')
            productId = response.save.get('productId')
            pos = int(response.save.get('pos'))
            print("total page: %s, current page index: %s, %s" % (page, pos, response.url))
        except Exception as e:
            print("comment_page params ex %s" % e)
            return
        try:
            # parse the json body
            jsonstr = response.doc("body").text()
            hjson = json.loads(jsonstr)
            comments = hjson["comments"]
            summary = hjson["productCommentSummary"]
            # comment list information
            commentslist = []
            for each in comments:
                time_int = tool.dateTimeToInt(each["creationTime"])
                data = (productId, each["id"], each["nickname"], time_int, each["score"],
                        each["userLevelName"], each['userImgFlag'], each["content"])
                commentslist.append(data)
            # write comments to db
            self.add_comments(commentslist, productId)
            # if this is the last page, write the product summary to db
            if len(commentslist) == 0 or pos >= page - 1:
                print("is last page, supplement product info")
                # summary information
                dict_sum = {}
                dict_sum['productId'] = productId
                dict_sum['goodCount'] = summary['goodCount']
                dict_sum['generalCount'] = summary['generalCount']
                dict_sum['poorCount'] = summary['poorCount']
                self.supplement_product(dict_sum)
            else:
                multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                    pos + 1) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                client.crawl(multi_url, callback=client.comment_page, validate_cert=False, fetch_type='js',
                             save={'page': page, 'pos': pos + 1, 'productId': productId})
        except Exception as e:
            print("comment_page ex %s" % e)
            # go to the next page anyway
            multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                pos + 1) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
            client.crawl(multi_url, callback=client.comment_page, validate_cert=False, fetch_type='js',
                         save={'page': page, 'pos': pos + 1, 'productId': productId})
    def comment_circle_page(self, client, response):
        # page position information (read outside the try so the except branch can still page forward)
        page = response.save.get('page')
        productId = response.save.get('productId')
        try:
            print("current page index: %s, productId %s, %s" % (page, productId, response.url))
            # parse the json body
            jsonstr = response.doc("body").text()
            hjson = json.loads(jsonstr)
            comments = hjson["comments"]
            summary = hjson["productCommentSummary"]
            # comment list information
            commentslist = []
            commentsIds = []
            for each in comments:
                time_int = tool.dateTimeToInt(each["creationTime"])  # convert comment time to timestamp
                data = (productId, each["id"], each["nickname"], time_int, each["score"],
                        each["userLevelName"], each['userImgFlag'], each["content"])
                commentslist.append(data)
                commentsIds.append(each["id"])
            # save comments
            if len(commentsIds) == 0 or page == 100:  # last page (or 100-page cap): write the product summary to db
                print("it's the last page, write product in db")
                dict_sum = {}
                dict_sum['productId'] = productId
                dict_sum['goodCount'] = summary['goodCount']
                dict_sum['generalCount'] = summary['generalCount']
                dict_sum['poorCount'] = summary['poorCount']
                self.supplement_product(dict_sum)
            else:
                # write comments to db
                self.add_comments(commentslist, productId)
                # go to the next page
                page = page + 1
                multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                    page) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                client.crawl(multi_url, callback=client.comment_circle_page, validate_cert=False, fetch_type='js',
                             save={'page': page, 'productId': productId})
        except Exception as e:
            print("comment_circle_page ex %s" % e)
            # go to the next page anyway
            page = page + 1
            multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                page) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
            client.crawl(multi_url, callback=client.comment_circle_page, validate_cert=False, fetch_type='js',
                         save={'page': page, 'productId': productId})
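The code above calls two helpers that belong to the base class but are not shown here: getVisiableNumber (counts the visible comment pagination links) and add_comments (writes one page of comment tuples to the database). The sketches below are assumptions about how they might look; the self.comment_table, self.cursor and self.conn attributes, the column names, and the MySQL-style SQL are illustrative rather than taken from the project:

    def getVisiableNumber(self, items):
        # Assumed implementation: take the pagination <a> elements and return the
        # largest visible page number, capped at 100 as noted in detail_page.
        page = 0
        for a in items:
            text = a.text().strip()
            if text.isdigit():
                page = max(page, int(text))
        return min(page, 100)

    def add_comments(self, commentslist, productId):
        # Assumed implementation: batch-insert one page of comment tuples into the
        # comments table configured in config.json (e.g. compete_comments).
        # productId is already the first element of each tuple; the parameter is
        # kept only to match the call signature used above.
        sql = ("INSERT IGNORE INTO " + self.comment_table +
               " (product_id, comment_id, nickname, create_time, score,"
               " user_level, user_img_flag, content)"
               " VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.executemany(sql, commentslist)
        self.conn.commit()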
3. PySpider project
Core code:
from pyspider.libs.base_handler import *  # provides BaseHandler, every, config


class Handler(BaseHandler):
    crawl_config = BaseOperation.crawl_config

    def __init__(self):
        configJson = tool.readConfigFromFile(CONFIG_PATH)
        cfg = configJson[cm.CONFIGS][POS]
        self.baseOperation = BaseOperation()
        self.baseOperation.initConfig(self, cfg, DEBUG)

    @every(minutes=TIME)  # runs every TIME minutes (hourly in this setup)
    def on_start(self):
        self.crawl(self.baseOperation.url, callback=self.index_page, validate_cert=False, fetch_type='js')

    # search result list page
    @config(age=TIME * 60, priority=4)
    def index_page(self, response):
        self.baseOperation.index_page(self, response)

    # search pagination pages
    @config(age=TIME * 60, priority=3)
    def turn_page(self, response):
        self.baseOperation.turn_page(self, response)

    @config(age=TIME * 60, priority=2)
    def detail_page(self, response):
        self.baseOperation.detail_page(self, response)
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    # comments are fetched through the json api
    @config(age=TIME * 60, priority=1)
    def comment_page(self, response):
        self.baseOperation.comment_page(self, response)

    # comments are fetched through the json api
    @config(age=TIME * 60, priority=1)
    def comment_circle_page(self, response):
        self.baseOperation.comment_circle_page(self, response)
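The Handler also depends on a few module-level names defined elsewhere in the project: CONFIG_PATH, POS, TIME, DEBUG, the cm constants holder, and the import of BaseOperation. Their real definitions live in the source repository; the sketch below only illustrates plausible values and import paths, all of which are assumptions:

# Assumed module-level setup for the Handler above; the names mirror the code,
# but the concrete values and import paths are guesses, not the project's actual ones.
from operation import BaseOperation   # hypothetical module path for the Operation base class
import tool                            # project utility module (readConfigFromFile, ...)

class cm(object):                      # assumed constants holder; may be an imported module in the project
    CONFIGS = 'configs'                # key of the config entry list in config.json

CONFIG_PATH = './config.json'          # path to the config file from section 1
POS = 0                                # which entry in "configs" this spider instance handles
TIME = 60                              # crawl interval in minutes; page cache age is TIME * 60 seconds
DEBUG = False                          # debug switch passed to initConfig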
4. Flow description
The source code can be downloaded from GitHub: click here.