
Novel chapters are scraped out of order

Scrapy issues requests concurrently, so responses (and therefore scraped items) come back in no particular order. To save the chapters in reading order, tag each request with an auto-incrementing chapter id, store each downloaded chapter in a dictionary keyed by that id, and write the dictionary out sorted by key in the spider's closed() method.
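The core idea, stripped of Scrapy (the sample data below is made up purely for illustration): no matter what order the chapters arrive in, writing the dictionary's items sorted by id restores the correct sequence.

# Chapters arrive in an arbitrary, network-dependent order...
memory_dict = {}
for chapter_id, text in [(3, "Chapter 3"), (1, "Chapter 1"), (2, "Chapter 2")]:
    memory_dict[chapter_id] = text

# ...but sorting by key when writing restores reading order.
with open("book.md", "w", encoding="utf-8") as fp:
    for _, text in sorted(memory_dict.items()):
        fp.write(text + "\n")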

import io

import scrapy


class NovelspiderSpider(scrapy.Spider):
    name = "novelSpider"
    allowed_domains = ["bqg34.com"]
    start_urls = ['https://www.bqg34.com/book_114717/']
    title = ""
    memory_dict = dict()  # chapter_id -> chapter text

    def parse(self, response):
        self.title = response.xpath("//h1/text()").extract_first()
        # Number the chapters in table-of-contents order; the id travels
        # with each request via meta so the callback can file the chapter.
        for chapter_id, item in enumerate(
                response.xpath("//ul[@class='mulu_list']/li/a"), start=1):
            link = item.xpath("@href").extract_first()
            yield scrapy.Request(url=response.urljoin(link),
                                 meta={'chapter_id': chapter_id},
                                 callback=self.parse_item)

    def parse_item(self, response):
        # Strip the site's advertising banner from the chapter body.
        replace_str = u"一秒记住【笔趣阁小说网 www.bqg34.com】,精彩小说无弹窗免费阅读!"
        title = response.xpath("//h1/text()").extract_first()
        content = response.xpath(
            "string(//div[@id='htmlContent'])").extract_first().replace(replace_str, "")
        # Responses arrive out of order, so index by chapter_id instead of appending.
        self.memory_dict[int(response.meta["chapter_id"])] = "## %s\n%s" % (title, content)

    def closed(self, reason):
        # Called once when the crawl finishes: write the chapters sorted
        # by id so the output file is in reading order.
        with io.open('%s.md' % self.title, 'w', encoding='utf-8') as fp:
            for _, chapter in sorted(self.memory_dict.items()):
                fp.write(chapter)
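Run it with scrapy crawl novelSpider; when the crawl finishes, Scrapy calls closed() and the whole novel is written to <title>.md in chapter order. Note that the sorted(self.memory_dict.items()) call is what guarantees the order: iterating the dictionary's values directly would yield insertion order, i.e. whatever order the responses happened to come back in.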