Optimizing JD Data Scraping
The goal is to specify product information and table names in a config file, so that the program can automatically scrape the data into the corresponding tables based on that information.
Approach
The idea is:
- Define a config.json file that holds the product information and table names
- A PySpider project file handles the scheduled crawling
- An encapsulated Operation base class handles page parsing and storage
1. config.json
The config file looks like this:
{
    "configs": [
        {
            "index": 0,
            "search_word": "小米air13.3",
            "price": "4000",
            "productName": "小米(MI)Air 13.3英寸全金属轻薄笔记本",
            "keyword": "13.3",
            "tables": [
                "compete_info",
                "compete_comments",
                "compete_comments_sub",
                "compete_comments_words"
            ]
        }
    ]
}
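Loading this file and picking out a single entry only needs the standard json module. The sketch below is illustrative (the project itself wraps this in tool.readConfigFromFile, used by the Handler in section 3); the read_config name is hypothetical, not part of the project:

# Illustrative only: a minimal config loader; read_config is a hypothetical name.
import io
import json

def read_config(path, index=0):
    # Load config.json and return the entry at the given position in "configs".
    with io.open(path, encoding='utf-8') as f:
        cfg = json.load(f)
    return cfg['configs'][index]

# entry = read_config('config.json', 0)
# entry['search_word']   # -> u'小米air13.3'
# entry['tables']        # -> the table names the scraped data is written to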
2. Operation base class
The core code is as follows:
import json
import re
import tool  # project utility module (dateTimeToInt, readConfigFromFile)


class BaseOperation(object):

    def index_page(self, client, response):
        # total number of search result pages
        page = response.doc('.fp-text > i').text()
        print("search total page: %s" % page)
        for i in range(1, int(page) + 1):
            num = i * 2 - 1
            turn_page = self.turnpage_url + str(i)
            print("search page index: %d, %s" % (i, turn_page))
            client.crawl(turn_page, callback=client.turn_page, validate_cert=False, fetch_type='js')
    def turn_page(self, client, response):
        # iterate over the product list on one search result page
        plist = response.doc('.J-goods-list > .clearfix > li')
        index = 1
        for each in plist.items():
            name = each.find('.p-name-type-2 em').text()
            if self.keyword in name:
                detail_url = each.find('.p-commit > strong > a').attr("href")
                print("product index: %d, url %s" % (index, detail_url))
                index += 1
                client.crawl(detail_url, callback=client.detail_page, validate_cert=False, fetch_type='js')
    def detail_page(self, client, response):
        # number of comment pages
        pages = response.doc('div.com-table-footer > div > div > a')
        productId = re.findall(r"/\d+\.html", response.url)[0]
        productId = re.findall(r"\d+", productId)[0]
        if pages:
            # count the visible pagination links (capped at 100 pages)
            page = self.getVisiableNumber(pages.items())
            print("productId: %s, visible comment page: %s" % (productId, page))
            # write the basic product info to the db
            product_info = {}
            product_info['productName'] = self.productName
            product_info['productId'] = productId
            product_info['displayName'] = response.doc('.sku-name').text()
            product_info['detailUrl'] = response.url
            product_info['price'] = response.doc('.p-price > span').eq(1).text()
            self.add_product(product_info)
            if page >= 6:  # 6 pages or more: pagination shows "...", total unknown, crawl page by page until empty
                # first comment page url
                multi_url = self.comment_url + productId + "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1"
                client.crawl(multi_url, callback=client.comment_circle_page, validate_cert=False, fetch_type='js',
                             save={'page': 0, 'productId': productId})
            elif page > 0:  # fewer than 6 pages: the exact count is known, crawl that many pages recursively
                multi_url = self.comment_url + productId + "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1"
                client.crawl(multi_url, callback=client.comment_page, validate_cert=False, fetch_type='js',
                             save={'page': page, 'pos': 0, 'productId': productId})
        else:
            print("productId: %s, no comments" % productId)
    def comment_page(self, client, response):
        try:
            # page position information
            page = response.save.get('page')
            productId = response.save.get('productId')
            pos = int(response.save.get('pos'))
            print("total page: %s, current page index: %s, %s" % (page, pos, response.url))
        except Exception as e:
            print("comment_page params ex %s" % e)
            return
        try:
            # parse the json body
            jsonstr = response.doc("body").text()
            hjson = json.loads(jsonstr)
            comments = hjson["comments"]
            summary = hjson["productCommentSummary"]
            # comment list information
            commentslist = []
            for each in comments:
                time_int = tool.dateTimeToInt(each["creationTime"])
                data = (productId, each["id"], each["nickname"], time_int, each["score"],
                        each["userLevelName"], each['userImgFlag'], each["content"])
                commentslist.append(data)
            # write comments to db
            self.add_comments(commentslist, productId)
            # if this is the last page, write the product summary to db
            if len(commentslist) == 0 or pos >= page - 1:
                print("is last page, supplement product info")
                # summary information
                dict_sum = {}
                dict_sum['productId'] = productId
                dict_sum['goodCount'] = summary['goodCount']
                dict_sum['generalCount'] = summary['generalCount']
                dict_sum['poorCount'] = summary['poorCount']
                self.supplement_product(dict_sum)
            else:
                multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                    pos + 1) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                client.crawl(multi_url, callback=client.comment_page, validate_cert=False, fetch_type='js',
                             save={'page': page, 'pos': pos + 1, 'productId': productId})
        except Exception as e:
            print("comment_page ex %s" % e)
            # go to the next page anyway
            multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                pos + 1) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
            client.crawl(multi_url, callback=client.comment_page, validate_cert=False, fetch_type='js',
                         save={'page': page, 'pos': pos + 1, 'productId': productId})
    def comment_circle_page(self, client, response):
        # page position information (read outside the try so the except branch can still page forward)
        page = response.save.get('page')
        productId = response.save.get('productId')
        try:
            print("current page index: %s, productId %s, %s" % (page, productId, response.url))
            # parse the json body
            jsonstr = response.doc("body").text()
            hjson = json.loads(jsonstr)
            comments = hjson["comments"]
            summary = hjson["productCommentSummary"]
            # comment list information
            commentslist = []
            commentsIds = []
            for each in comments:
                time_int = tool.dateTimeToInt(each["creationTime"])  # convert comment time to timestamp
                data = (productId, each["id"], each["nickname"], time_int, each["score"],
                        each["userLevelName"], each['userImgFlag'], each["content"])
                commentslist.append(data)
                commentsIds.append(each["id"])
            # save comments
            if len(commentsIds) == 0 or page == 100:  # last page (or 100-page cap): write the product summary to db
                print("it's the last page, write product in db")
                dict_sum = {}
                dict_sum['productId'] = productId
                dict_sum['goodCount'] = summary['goodCount']
                dict_sum['generalCount'] = summary['generalCount']
                dict_sum['poorCount'] = summary['poorCount']
                self.supplement_product(dict_sum)
            else:
                # write comments to db
                self.add_comments(commentslist, productId)
                # go to the next page
                page = page + 1
                multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                    page) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
                client.crawl(multi_url, callback=client.comment_circle_page, validate_cert=False, fetch_type='js',
                             save={'page': page, 'productId': productId})
        except Exception as e:
            print("comment_circle_page ex %s" % e)
            # go to the next page anyway
            page = page + 1
            multi_url = self.comment_url + str(productId) + "&score=0&sortType=5&page=" + str(
                page) + "&pageSize=10&isShadowSku=0&rid=0&fold=1"
            client.crawl(multi_url, callback=client.comment_circle_page, validate_cert=False, fetch_type='js',
                         save={'page': page, 'productId': productId})
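The code above calls two helpers that belong to the base class but are not shown here: getVisiableNumber (counts the visible comment pagination links) and add_comments (writes one page of comment tuples to the database). The sketches below are assumptions about how they might look; the self.comment_table, self.cursor and self.conn attributes, the column names, and the MySQL-style SQL are illustrative rather than taken from the project:

    def getVisiableNumber(self, items):
        # Assumed implementation: take the pagination <a> elements and return the
        # largest visible page number, capped at 100 as noted in detail_page.
        page = 0
        for a in items:
            text = a.text().strip()
            if text.isdigit():
                page = max(page, int(text))
        return min(page, 100)

    def add_comments(self, commentslist, productId):
        # Assumed implementation: batch-insert one page of comment tuples into the
        # comments table configured in config.json (e.g. compete_comments).
        # productId is already the first element of each tuple; the parameter is
        # kept only to match the call signature used above.
        sql = ("INSERT IGNORE INTO " + self.comment_table +
               " (product_id, comment_id, nickname, create_time, score,"
               " user_level, user_img_flag, content)"
               " VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.executemany(sql, commentslist)
        self.conn.commit()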
3. PySpider project
Core code:
from pyspider.libs.base_handler import *  # provides BaseHandler, every, config


class Handler(BaseHandler):
    crawl_config = BaseOperation.crawl_config

    def __init__(self):
        configJson = tool.readConfigFromFile(CONFIG_PATH)
        cfg = configJson[cm.CONFIGS][POS]
        self.baseOperation = BaseOperation()
        self.baseOperation.initConfig(self, cfg, DEBUG)

    @every(minutes=TIME)  # runs every TIME minutes (hourly in this setup)
    def on_start(self):
        self.crawl(self.baseOperation.url, callback=self.index_page, validate_cert=False, fetch_type='js')

    # search result list page
    @config(age=TIME * 60, priority=4)
    def index_page(self, response):
        self.baseOperation.index_page(self, response)

    # search pagination pages
    @config(age=TIME * 60, priority=3)
    def turn_page(self, response):
        self.baseOperation.turn_page(self, response)

    @config(age=TIME * 60, priority=2)
    def detail_page(self, response):
        self.baseOperation.detail_page(self, response)
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    # comments are fetched through the json api
    @config(age=TIME * 60, priority=1)
    def comment_page(self, response):
        self.baseOperation.comment_page(self, response)

    # comments are fetched through the json api
    @config(age=TIME * 60, priority=1)
    def comment_circle_page(self, response):
        self.baseOperation.comment_circle_page(self, response)
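The Handler also depends on a few module-level names defined elsewhere in the project: CONFIG_PATH, POS, TIME, DEBUG, the cm constants holder, and the import of BaseOperation. Their real definitions live in the source repository; the sketch below only illustrates plausible values and import paths, all of which are assumptions:

# Assumed module-level setup for the Handler above; the names mirror the code,
# but the concrete values and import paths are guesses, not the project's actual ones.
from operation import BaseOperation   # hypothetical module path for the Operation base class
import tool                            # project utility module (readConfigFromFile, ...)

class cm(object):                      # assumed constants holder; may be an imported module in the project
    CONFIGS = 'configs'                # key of the config entry list in config.json

CONFIG_PATH = './config.json'          # path to the config file from section 1
POS = 0                                # which entry in "configs" this spider instance handles
TIME = 60                              # crawl interval in minutes; page cache age is TIME * 60 seconds
DEBUG = False                          # debug switch passed to initConfig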
4. Flow description
The source code can be downloaded from GitHub: click here.