```python
# Imports assumed for a runnable file (Scrapy 1.x import paths):
import scrapy
from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity

from ..items import MeizituItem  # adjust to wherever MeizituItem is defined


class MeizituSpider(scrapy.Spider):
    name = "meizitu"
    allowed_domains = ["27270.com"]

    # Build the list of listing-page URLs up front.
    start_urls = []
    for pn in range(2, 3):
        url = 'http://www.27270.com/ent/meinvtupian/list_11_%s.html' % pn
        start_urls.append(url)

    def parse(self, response):
        sel = Selector(response)
        for link in sel.xpath('//html/body/div[2]/div[10]/ul/li/a[2]/@href').extract():
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

    def parse_item(self, response):
        l = ItemLoader(item=MeizituItem(), response=response)
        l.add_xpath('name', '//html/head/title/text()')  # was '///html/...', an invalid XPath
        l.add_xpath('tags', '//*[@id="body"]/div[1]/div[4]/div[3]/a/text()')
        l.add_xpath('image_urls', '//*[@id="RightUrl"]/img/@src', Identity())
        l.add_value('url', response.url)
        return l.load_item()
```
That's my current code. Should the pagination inside a gallery be scraped in parse_item, or should I set up a separate class for it?
I haven't found any ready-made code for crawling the pagination of an item's inner pages. Help appreciated.
1
xiaoyu9527 OP
UPUP (bump)
2
leopku 2016-08-22 21:54:12 +08:00
Just handle them separately:

```python
def parse(self, response):
    sel = scrapy.Selector(response)
    # Extract the links to the individual item pages.
    for item_link in sel.xpath('XPath for the individual item links goes here <----').extract():
        yield scrapy.Request(item_link, callback=self.parse_item)
    # Extract the links to further listing pages and feed them back into parse().
    for next_page in sel.xpath('//html/body/div[2]/div[10]/ul/li/a[2]/@href').extract():
        yield scrapy.Request(next_page, callback=self.parse)
```
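The point of the second loop is that its requests use callback=self.parse, so every newly discovered listing page is run through this same method again and its item links get extracted as well; the recursion stops once no further next-page link matches, and Scrapy's built-in duplicate filter drops listing pages it has already scheduled.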
3
xiaoyu9527 OP
@leopku With that approach, the `for next_page` loop won't feed anything back to re-crawl the items, will it?
4
xiaoyu9527 OP
Solved it.
I'm pasting the code here in the hope that it helps someone else:

```python
# (same imports as in the first post)

class MeizituSpider(scrapy.Spider):
    name = "meizitu"
    allowed_domains = ["27270.com"]

    # Build the list of listing-page URLs up front.
    start_urls = []
    for pn in range(2, 3):
        url = 'http://www.27270.com/ent/meinvtupian/list_11_%s.html' % pn
        start_urls.append(url)

    def parse(self, response):
        sel = Selector(response)
        for link in sel.xpath('//html/body/div[2]/div[10]/ul/li/a[2]/@href').extract():
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

    def parse_item(self, response):
        sel = Selector(response)
        l = ItemLoader(item=MeizituItem(), response=response)
        l.add_xpath('name', '//html/body/div[3]/div[4]/div[1]/h1/text()')
        l.add_xpath('tags', '//html/body/div[3]/div[5]/dl/dd/a/text()')
        l.add_xpath('image_urls', '//*[@id="RightUrl"]/img/@src', Identity())
        l.add_value('url', response.url)
        yield l.load_item()

        # Follow the pagination inside the gallery by recursing into parse_item.
        next_pages = sel.xpath('//*[@id="nl"]/a/@href').extract()
        if next_pages:
            full_url = response.urljoin(next_pages[0])
            print 'full url:', full_url  # Python 2 print statement
            yield scrapy.Request(full_url, callback=self.parse_item)
```

Once this is all done I'll write up a short tutorial on my blog.
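For reference, a sketch of the items.py this spider presumes; the field names come from the loader calls above, while the `images` field and the commented settings are only needed if Scrapy's ImagesPipeline should download the pictures (class path as in Scrapy 1.x):

```python
import scrapy


class MeizituItem(scrapy.Item):
    name = scrapy.Field()
    tags = scrapy.Field()
    image_urls = scrapy.Field()  # list of picture URLs; read by ImagesPipeline
    images = scrapy.Field()      # filled by ImagesPipeline with download results
    url = scrapy.Field()

# In settings.py, if the images should be downloaded automatically:
# ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# IMAGES_STORE = '/path/to/image/store'
```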
5
xiaoyu9527 OP
In the end, the real difference came down to yield vs. return.
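To spell that out: `return l.load_item()` ends the callback at the first item, so any pagination request after it is never reached, whereas `yield` turns the callback into a generator that can hand Scrapy both the item and the follow-up request. A minimal sketch (the spider name and start URL are made up for illustration):

```python
import scrapy


class YieldDemoSpider(scrapy.Spider):
    name = "yield_demo"  # hypothetical name, for illustration only
    start_urls = ['http://example.com/gallery/1.html']

    def parse(self, response):
        # With `return {...}` the method would stop here and the
        # next-page request below would never be scheduled.
        yield {'url': response.url}

        # Because this is a generator, we can keep going and also
        # emit a request for the next page of the same gallery.
        next_page = response.xpath('//*[@id="nl"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
```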