Data scraped with Scrapy comes back out of order, because requests are handled concurrently. To save the content in its original sequence, store each item in a dict keyed by an auto-incrementing id, then write everything out in the spider's `closed` method.
```python
import scrapy
import io


class NovelspiderSpider(scrapy.Spider):
    name = "novelSpider"
    allowed_domains = ["bqg34.com"]
    start_urls = ['https://www.bqg34.com/book_114717/']
    title = ""
    memory_dict = dict()

    def parse(self, response):
        url = response.url
        chapter_id = 0
        self.title = response.xpath("//h1/text()").extract_first()
        for item in response.xpath("//ul[@class='mulu_list']/li/a"):
            # Assign an auto-incrementing id to each chapter link and pass it
            # along in the request meta so the callback knows its position.
            chapter_id = chapter_id + 1
            link = item.xpath("@href").extract_first()
            yield scrapy.Request(url="%s%s" % (url, link),
                                 meta={'chapter_id': chapter_id},
                                 callback=self.parse_item)

    def parse_item(self, response):
        replace_str = u"一秒记住【笔趣阁小说网 www.bqg34.com】,精彩小说无弹窗免费阅读!"
        title = response.xpath("//h1/text()").extract_first()
        content = response.xpath("string(//div[@id='htmlContent'])").extract_first().replace(replace_str, "")
        # Store the chapter under its id; responses may arrive in any order.
        self.memory_dict[int(response.meta["chapter_id"])] = "##%s\n%s" % (title, content)

    def closed(self, reason):
        # Called once the spider finishes: write the chapters out sorted by id
        # so the file follows the original chapter order.
        with io.open('%s.md' % (self.title), 'a+', encoding='utf-8') as fp:
            for chapter_id in sorted(self.memory_dict.keys()):
                fp.write(self.memory_dict[chapter_id])
```
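Because Scrapy schedules requests concurrently, `parse_item` fills `memory_dict` in whatever order the server happens to answer; sorting the keys when writing is what restores the chapter sequence. Below is a minimal sketch of that pattern on its own, outside of Scrapy; the ids, chapter text, and the `example.md` file name are made up for illustration.

```python
import io

memory_dict = {}

# Simulate responses arriving out of order, as concurrent requests would.
for chapter_id, text in [(3, "Chapter 3 body"), (1, "Chapter 1 body"), (2, "Chapter 2 body")]:
    memory_dict[chapter_id] = "##Chapter %d\n%s\n" % (chapter_id, text)

# Writing by sorted key restores the original chapter order.
with io.open('example.md', 'w', encoding='utf-8') as fp:
    for chapter_id in sorted(memory_dict):
        fp.write(memory_dict[chapter_id])
```

After running `scrapy crawl novelSpider`, the spider's `closed` method writes a single `<title>.md` whose chapters appear in their original order.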