import pprint
from urllib.parse import urljoin

import lxml.html
import requests
def main(start_url='http://books.toscrape.com/catalogue/page-49.html'):
    """Crawl the catalogue starting at ``start_url`` and print the book titles.

    Args:
        start_url: Listing page at which the crawl begins. Defaults to the
            page the script originally hard-coded, so ``main()`` behaves as
            before.
    """
    # First gather every book detail-page URL, then visit each of them.
    books_urls = book_link_extractor(start_url)
    scrape(books_urls)
def fetch(url, timeout=10):
    """Download ``url`` and return its body parsed into an lxml HTML tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page as if
    # it were a catalogue or book page.
    response.raise_for_status()
    return lxml.html.fromstring(response.text)
def nextpage_link_extractor(html, base_url='http://books.toscrape.com/catalogue/'):
    """Return the absolute URL of the next listing page, or ``None``.

    Args:
        html: Parsed page; only its ``find`` method is used, to locate the
            ``<a>`` directly inside the element whose class is ``next``.
        base_url: URL against which the link's ``href`` is resolved.

    Returns:
        The absolute URL of the next page, or ``None`` when the page has no
        "next" link or that link carries no ``href``.
    """
    next_anchor = html.find('.//*[@class="next"]/a')
    if next_anchor is None:
        return None

    href = next_anchor.get('href')
    if not href:
        # An anchor without a target cannot be followed; treat it like the
        # last page instead of failing on ``str + None``.
        return None

    # urljoin copes with absolute and ``../`` hrefs, which plain string
    # concatenation would turn into invalid URLs.
    return urljoin(base_url, href)
def book_link_extractor(start_url):
    """Collect the detail-page URL of every book from ``start_url`` onwards.

    Starting at ``start_url``, each listing page is fetched, the links found
    under ``product_pod`` headings are gathered, and the "next" link is
    followed until ``nextpage_link_extractor`` reports no further page.

    Args:
        start_url: URL of the first listing page, or ``None`` to crawl
            nothing.

    Returns:
        A list of absolute book URLs in the order they were encountered.
    """
    urls = []
    page_url = start_url
    while page_url is not None:
        html = fetch(page_url)
        for anchor in html.findall('.//*[@class="product_pod"]/h3/a'):
            href = anchor.get('href')
            if not href:
                # Nothing to follow; skip rather than fail on ``str + None``.
                continue
            # Resolve against the page the link came from so relative and
            # ``../`` hrefs yield valid URLs wherever the crawl started.
            urls.append(urljoin(page_url, href))
        page_url = nextpage_link_extractor(html)
    return urls
def scrape(books_urls):
    """Fetch each book page, collect its title, and pretty-print the result.

    Args:
        books_urls: Iterable of book detail-page URLs.

    Returns:
        A list of ``{"title": ...}`` dicts, one per page that contains an
        ``<h1>`` element, in input order. The same list is printed with
        ``pprint`` before being returned.
    """
    books_info = []
    for book_url in books_urls:
        heading = fetch(book_url).find('.//h1')
        if heading is None:
            # A page without a heading has no title to record; skip it so a
            # single odd page does not abort the crawl with AttributeError
            # and discard everything gathered so far.
            continue
        books_info.append({"title": heading.text})
    pprint.pprint(books_info)
    return books_info
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()
実行すると下記のように各書籍のタイトルが返されます。
$ python3 books_crawler3.py
【略】
{'title': "A Spy's Devotion (The Regency Spies of London #1)"},
{'title': "1st to Die (Women's Murder Club #1)"},
{'title': '1,000 Places to See Before You Die'}]
続いて、取得したタイトルをMySQLに登録する books_crawler.py を実行すると、下記のように開始・終了時刻と登録したタイトルが出力されます。
$ python3 books_crawler.py
[START]: 2020年06月13日 17:46:49
[Insert]:A Light in the Attic
[Insert]:Tipping the Velvet
[Insert]:Soumission
【略】
[Insert]:A Spy's Devotion (The Regency Spies of London #1)
[Insert]:1st to Die (Women's Murder Club #1)
[Insert]:1,000 Places to See Before You Die
[END]: 2020年06月13日 17:58:14
MySQLのテーブルも確認しておきます。1000件全ての書籍タイトルが取得できています。
mysql> select * from books limit 10;
+----+------------------------------------------------------------------------------------------------+
| id | title |
+----+------------------------------------------------------------------------------------------------+
| 1 | A Light in the Attic |
| 2 | Tipping the Velvet |
| 3 | Soumission |
| 4 | Sharp Objects |
| 5 | Sapiens: A Brief History of Humankind |
| 6 | The Requiem Red |
| 7 | The Dirty Little Secrets of Getting Your Dream Job |
| 8 | The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull |
| 9 | The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics |
| 10 | The Black Maria |
+----+------------------------------------------------------------------------------------------------+
10 rows in set (0.00 sec)
mysql> select count(1) from books;
+----------+
| count(1) |
+----------+
| 1000 |
+----------+
1 row in set (0.00 sec)