books = response.xpath('//h3/a/@href').getall()
for book in books:
abs_url = response.urljoin(book)
yield Request(abs_url, callback=self.parse_book)
In [15]: response.xpath('//a[text()="next"]/@href').get()
Out[15]: 'catalogue/page-2.html'
next_page_url = response.xpath('//a[text()="next"]/@href').get()
abs_next_page_url = response.urljoin(next_page_url)
if abs_next_page_url is not None:
yield Request(abs_next_page_url, callback=self.parse)
def parse(self, response):
    """Yield a request per book on this listing page, then follow "next".

    Args:
        response: The downloaded catalogue listing page.

    Yields:
        One ``Request`` per book link (handled by ``self.parse_book``) and,
        when the page has a "next" link, one ``Request`` for the following
        listing page (handled by ``self.parse``).
    """
    for book in response.xpath('//h3/a/@href').getall():
        yield Request(response.urljoin(book), callback=self.parse_book)

    # If there is a next button on this page, move the crawler
    next_page_url = response.xpath('//a[text()="next"]/@href').get()
    # Test the raw href, not the joined URL: urljoin() falls back to the
    # current page's URL when given None, so the joined value is never None
    # and the crawler would re-request the last page.
    if next_page_url is not None:
        yield Request(response.urljoin(next_page_url), callback=self.parse)
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.http import Request
class BooksSpiderSpider(Spider):
    """Crawl books.toscrape.com and emit one item per book.

    ``parse`` walks the paginated catalogue listing and schedules a request
    for every book; ``parse_book`` extracts the fields of a single book from
    its detail page.
    """

    name = 'books_spider'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    def parse(self, response):
        """Yield a request per book on this listing page, then follow "next".

        Args:
            response: The downloaded catalogue listing page.

        Yields:
            One ``Request`` per book link (handled by ``parse_book``) and,
            when the page has a "next" link, one ``Request`` for the
            following listing page (handled by ``parse``).
        """
        for book in response.xpath('//h3/a/@href').getall():
            yield Request(response.urljoin(book), callback=self.parse_book)

        # If there is a next button on this page, move the crawler
        next_page_url = response.xpath('//a[text()="next"]/@href').get()
        # Test the raw href, not the joined URL: urljoin() falls back to the
        # current page's URL when given None, so the joined value is never
        # None and the crawler would re-request the last page.
        if next_page_url is not None:
            yield Request(response.urljoin(next_page_url), callback=self.parse)

    @staticmethod
    def _table_value(response, label):
        """Return the text of the table cell next to the ``<th>`` reading *label*.

        Returns ``None`` when no such row exists on the page.
        """
        return response.xpath(
            '//th[text()="%s"]/following-sibling::td/text()' % label
        ).get()

    def parse_book(self, response):
        """Extract one book's details from its product page.

        Args:
            response: The downloaded book detail page.

        Yields:
            A single dict of the scraped fields. A field is ``None`` when
            its selector matches nothing on the page.
        """
        title = response.xpath('//h1/text()').get()
        price = response.xpath('//*[@class="price_color"]/text()').get()

        img_url = response.xpath('.//img/@src').get()
        if img_url is not None:
            # NOTE(review): the replacement ends in "/" and the remaining
            # path starts with "/", so the result contains "//media/...".
            # Kept as-is so output matches existing exports; consider
            # response.urljoin(img_url) instead.
            img_url = img_url.replace("../..", "http://books.toscrape.com/")

        rating = response.xpath(
            '//*[contains(@class, "star-rating")]/@class'
        ).get()
        if rating is not None:
            # The class attribute looks like "star-rating Three"; keep the word.
            rating = rating.replace("star-rating ", "")

        description = response.xpath(
            '//*[@id="product_description"]/following-sibling::p/text()'
        ).get()

        yield {
            # The "titie" key spelling is kept so items match existing exports.
            "titie": title,
            "price": price,
            "img_url": img_url,
            "rating": rating,
            "description": description,
            "upc": self._table_value(response, "UPC"),
            "product_type": self._table_value(response, "Product Type"),
            "price_without_tax": self._table_value(response, "Price (excl. tax)"),
            "price_with_tax": self._table_value(response, "Price (incl. tax)"),
            "tax": self._table_value(response, "Tax"),
            "availability": self._table_value(response, "Availability"),
            "number_of_reviews": self._table_value(response, "Number of reviews"),
        }
$ cat result.json | jq | head -n 30
[
{
"titie": "Olio",
"price": "£23.88",
"img_url": "http://books.toscrape.com//media/cache/b1/0e/b10eabab1e1c811a6d47969904fd5755.jpg",
"rating": "One",
"description": "Part fact, part fiction, Tyehimba Jess's much anticipated second book weaves sonnet, song, and narrative to examine the lives of mostly unrecorded African American performers directly before and after the Civil War up to World War I. Olio is an effort to understand how they met, resisted, complicated, co-opted, and sometimes defeated attempts to minstrelize them.So, while Part fact, part fiction, Tyehimba Jess's much anticipated second book weaves sonnet, song, and narrative to examine the lives of mostly unrecorded African American performers directly before and after the Civil War up to World War I. Olio is an effort to understand how they met, resisted, complicated, co-opted, and sometimes defeated attempts to minstrelize them.So, while I lead this choir, I still find thatI'm being led…I'm a missionarymending my faith in the midst of this flock…I toil in their fields of praise. When folks seethese freedmen stand and sing, they hear their Godspeak in tongues. These nine dark mouths sing shelter;they echo a hymn's haven from slavery's weather.Detroit native Tyehimba Jess' first book of poetry, leadbelly, was a winner of the 2004 National Poetry Series. Jess, a Cave Canem and NYU Alumni, has received fellowships from the Whiting Foundation, National Endowment for the Arts, Illinois Arts Council, and the Provincetown Fine Arts Work Center. Jess is also a veteran of the 2000 and 2001 Green Mill Poetry Slam Team. He exhibited his poetry at the 2011 TEDxNashville Conference. Jess is an Associate Professor of English at College of Staten Island. ...more",
"upc": "feb7cc7701ecf901",
"product_type": "Books",
"price_without_tax": "£23.88",
"price_with_tax": "£23.88",
"tax": "£0.00",
"availability": "In stock (19 available)",
"number_of_reviews": "0"
},
{
"titie": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"price": "£22.60",
"img_url": "http://books.toscrape.com//media/cache/d1/2d/d12d26739b5369a6b5b3024e4d08f907.jpg",
"rating": "Four",
"description": "For readers of Laura Hillenbrand's Seabiscuit and Unbroken, the dramatic story of the American rowing team that stunned the world at Hitler's 1936 Berlin Olympics Daniel James Brown’s robust book tells the story of the University of Washington’s 1936 eight-oar crew and their epic quest for an Olympic gold medal, a team that transformed the sport and grabbed the attention o For readers of Laura Hillenbrand's Seabiscuit and Unbroken, the dramatic story of the American rowing team that stunned the world at Hitler's 1936 Berlin Olympics Daniel James Brown’s robust book tells the story of the University of Washington’s 1936 eight-oar crew and their epic quest for an Olympic gold medal, a team that transformed the sport and grabbed the attention of millions of Americans. The sons of loggers, shipyard workers, and farmers, the boys defeated elite rivals first from eastern and British universities and finally the German crew rowing for Adolf Hitler in the Olympic games in Berlin, 1936. The emotional heart of the story lies with one rower, Joe Rantz, a teenager without family or prospects, who rows not for glory, but to regain his shattered self-regard and to find a place he can call home. The crew is assembled by an enigmatic coach and mentored by a visionary, eccentric British boat builder, but it is their trust in each other that makes them a victorious team. They remind the country of what can be done when everyone quite literally pulls together—a perfect melding of commitment, determination, and optimism. Drawing on the boys’ own diaries and journals, their photos and memories of a once-in-a-lifetime shared dream, The Boys in the Boat is an irresistible story about beating the odds and finding hope in the most desperate of times—the improbable, intimate story of nine working-class boys from the American west who, in the depths of the Great Depression, showed the world what true grit really meant. 
It will appeal to readers of Erik Larson, Timothy Egan, James Bradley, and David Halberstam's The Amateurs. ...more",
"upc": "e10e1e165dc8be4a",
"product_type": "Books",
"price_without_tax": "£22.60",
"price_with_tax": "£22.60",
"tax": "£0.00",
"availability": "In stock (19 available)",
"number_of_reviews": "0"
},