In [3]: len(response.xpath('//*[@class="quote"]') )
Out[3]: 10
In [4]: quotes = response.xpath('//*[@class="quote"]')
In [5]: quote = quotes[0]
In [6]: quote.xpath('.//*[@class="text"]/text()').get()
Out[6]: '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'
In [7]: quote.xpath('.//*[@class="author"]/text()').get()
Out[7]: 'Albert Einstein'
In [8]: quote.xpath('.//*[@class="tag"]/text()').getall()
Out[8]: ['change', 'deep-thoughts', 'thinking', 'world']
quotes = response.xpath('//*[@class="quote"]')
for quote in quotes:
text = quote.xpath('.//*[@class="text"]/text()').get()
author = quote.xpath('.//*[@class="author"]/text()').get()
tags = quote.xpath('.//*[@class="tag"]/text()').getall()
yield {
"text": text,
"author": author,
"tags": tags
}
In [9]: response.xpath('//*[@class="next"]/a/@href').get()
Out[9]: '/page/2/'
next_page_url = response.xpath('//*[@class="next"]/a/@href').get()
abs_next_page_url = response.urljoin(next_page_url)
if abs_next_page_url is not None:
yield Request(abs_next_page_url, callback=self.parse)
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy import Request
class QuotesSpiderSpider(Spider):
    """Spider that collects quotes from quotes.toscrape.com, page by page."""

    name = 'quotes_spider'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Yield one item per quote on the page, then follow the next page.

        Each item is a dict with the keys ``"text"``, ``"author"`` and
        ``"tags"`` (a list of tag strings). If the page has a "next" link,
        a request for it is yielded with this method as its callback.
        """
        quotes = response.xpath('//*[@class="quote"]')
        for quote in quotes:
            # ".//" keeps each lookup scoped to the current quote node.
            text = quote.xpath('.//*[@class="text"]/text()').get()
            author = quote.xpath('.//*[@class="author"]/text()').get()
            tags = quote.xpath('.//*[@class="tag"]/text()').getall()
            yield {
                "text": text,
                "author": author,
                "tags": tags,
            }

        next_page_url = response.xpath('//*[@class="next"]/a/@href').get()
        # Check the raw href, not the joined URL: urljoin() always returns a
        # string (the current page URL when the href is missing), so testing
        # its result against None could never stop the crawl on the last page.
        if next_page_url is not None:
            abs_next_page_url = response.urljoin(next_page_url)
            yield Request(abs_next_page_url, callback=self.parse)
$ scrapy crawl quotes_spider -o result.json
# brew install jq
$ cat result.json | jq | head -n 30
[
{
"text": "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”",
"author": "Albert Einstein",
"tags": [
"change",
"deep-thoughts",
"thinking",
"world"
]
},
{
"text": "“It is our choices, Harry, that show what we truly are, far more than our abilities.”",
"author": "J.K. Rowling",
"tags": [
"abilities",
"choices"
]
},
{
"text": "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”",
"author": "Albert Einstein",
"tags": [
"inspirational",
"life",
"live",
"miracle",
"miracles"
]
},