from scrapy import Spider
from scrapy.http import Request
from sample_books_mysql.items import BooksMysqlItem  # don't forget to import this
class BooksSpiderMysqlSpider(Spider):
    """Crawl books.toscrape.com and yield one BooksMysqlItem per book."""

    name = 'books_spider_mysql'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    def parse(self, response):
        """Extract title, price and detail-page URL for every book on the
        listing page, then follow the "next" pagination link if present.

        Yields:
            BooksMysqlItem: one per product on the page.
            Request: for the next listing page, when one exists.
        """
        for book in response.xpath('.//*[@class="product_pod"]'):
            item = BooksMysqlItem()
            item["title"] = book.xpath('.//h3/a/@title').get()
            item["price"] = book.xpath('.//*[@class="price_color"]/text()').get()
            # The href is relative to the *current* page (on paginated pages it
            # lacks the "catalogue/" prefix), so resolve it with urljoin instead
            # of concatenating a hard-coded base URL — the old concatenation
            # produced broken links beyond page 1.
            detail_href = book.xpath('.//*[@class="image_container"]/a/@href').get()
            item["detail_page_url"] = response.urljoin(detail_href)
            yield item

        # If there is a next button on this page, move the crawler.
        # Check the RAW href for None before urljoin: the original checked the
        # joined URL, which is never None (urljoin falls back to the current
        # page URL), so the guard could never stop the crawl.
        next_page_url = response.xpath('//a[text()="next"]/@href').get()
        if next_page_url is not None:
            yield Request(response.urljoin(next_page_url), callback=self.parse)
import pymysql
class BooksMysqlPipeline:
    """Scrapy item pipeline that persists scraped books into a MySQL table.

    One connection/cursor pair is opened per spider run; each item is
    inserted into the `books` table unless a row with the same title
    already exists.
    """

    def open_spider(self, spider):
        """Open the MySQL connection and cursor when the spider starts."""
        self.connection = pymysql.connect(
            host="localhost",
            user="****",    # change to match your DB
            passwd="****",  # change to match your DB
            database="book_online",
            charset="utf8mb4",
        )
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        """Insert *item* into `books` unless its title is already stored.

        Returns the item unchanged so downstream pipeline stages still
        receive it.
        """
        # Duplication check: execute() returns the number of matching rows,
        # so 0 means the title has not been stored yet. Parameters are passed
        # as a tuple — the documented form — so the driver escapes them
        # (no string-built SQL).
        find_qry = "SELECT `title` FROM `books` WHERE `title` = %s"
        already_stored = self.cursor.execute(find_qry, (item["title"],))
        if already_stored == 0:
            insert_qry = (
                "INSERT INTO `books` (`title`, `price`, `detail_page_url`) "
                "VALUES (%s, %s, %s)"
            )
            self.cursor.execute(
                insert_qry,
                (item["title"], item["price"], item["detail_page_url"]),
            )
            self.connection.commit()
        return item

    def close_spider(self, spider):
        """Release DB resources when the spider finishes."""
        # Close the cursor as well — the original leaked it and closed
        # only the connection.
        self.cursor.close()
        self.connection.close()