import requests
import lxml.html
import pprint


def main():
    start_url = 'http://books.toscrape.com/catalogue/page-49.html'
    books_urls = book_link_extractor(start_url)
    scrape(books_urls)


def fetch(url):
    # Download the page and parse it into an lxml HTML tree.
    req = requests.get(url)
    html = lxml.html.fromstring(req.text)
    return html


def nextpage_link_extractor(html):
    # Return the absolute URL of the "next" link, or None on the last page.
    res = html.find('.//*[@class="next"]/a')
    if res is None:
        return None
    else:
        next_page_url = 'http://books.toscrape.com/catalogue/' + res.get('href')
        return next_page_url


def book_link_extractor(start_url):
    # Walk the pagination from start_url and collect every book detail URL.
    urls = []
    while start_url is not None:
        html = fetch(start_url)
        for book in html.findall('.//*[@class="product_pod"]/h3/a'):
            url = 'http://books.toscrape.com/catalogue/' + book.get('href')
            urls.append(url)
        start_url = nextpage_link_extractor(html)
    return urls


def scrape(books_urls):
    # Visit each book page and pull the title out of its <h1> element.
    books_info = []
    for book_url in books_urls:
        html = fetch(book_url)
        title = html.find('.//h1').text
        books_info.append(
            {"title": title}
        )
    pprint.pprint(books_info)
    return books_info


if __name__ == '__main__':
    main()
$ python3 books_crawler3.py
(omitted)
{'title': "A Spy's Devotion (The Regency Spies of London #1)"},
{'title': "1st to Die (Women's Murder Club #1)"},
{'title': '1,000 Places to See Before You Die'}]
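The fetch() helper above assumes every request succeeds; if the server returns an error page, lxml will happily parse it anyway. A slightly more defensive variant (a sketch, not part of the original script) raises on HTTP errors and caps how long a single request may hang:

import requests
import lxml.html

def fetch(url):
    # Fail loudly on 4xx/5xx responses instead of parsing an error page,
    # and give up after 10 seconds rather than blocking the whole crawl.
    req = requests.get(url, timeout=10)
    req.raise_for_status()
    return lxml.html.fromstring(req.text)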
mysql> create database scraping default character set utf8mb4;
Query OK, 1 row affected (0.01 sec)
mysql> CREATE TABLE books(id INT NOT NULL AUTO_INCREMENT, title TEXT, PRIMARY KEY(id));
Query OK, 0 rows affected (0.06 sec)
mysql> select * from books;
Empty set (0.00 sec)
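The next version of the crawler writes into this table through the PyMySQL driver; if it is not installed yet, it is available from PyPI:

$ pip3 install pymysql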
import requests
import lxml.html
import pymysql
import datetime


def main():
    print('[START]:', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    start_url = 'http://books.toscrape.com/catalogue/page-1.html'
    books_urls = book_link_extractor(start_url)
    books = scrape(books_urls)
    save(books)
    print('[END]:', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def fetch(url):
    # Download the page and parse it into an lxml HTML tree.
    req = requests.get(url)
    html = lxml.html.fromstring(req.text)
    return html


def nextpage_link_extractor(html):
    # Return the absolute URL of the "next" link, or None on the last page.
    res = html.find('.//*[@class="next"]/a')
    if res is None:
        return None
    else:
        next_page_url = 'http://books.toscrape.com/catalogue/' + res.get('href')
        return next_page_url


def book_link_extractor(start_url):
    # Walk the pagination from start_url and collect every book detail URL.
    urls = []
    while start_url is not None:
        html = fetch(start_url)
        for book in html.findall('.//*[@class="product_pod"]/h3/a'):
            url = 'http://books.toscrape.com/catalogue/' + book.get('href')
            urls.append(url)
        start_url = nextpage_link_extractor(html)
    return urls


def scrape(books_urls):
    # Visit each book page and pull the title out of its <h1> element.
    books_info = []
    for book_url in books_urls:
        html = fetch(book_url)
        title = html.find('.//h1').text
        books_info.append(
            {"title": title}
        )
    return books_info


def save(books):
    # Use utf8mb4 so the connection charset matches the database created above.
    con = pymysql.connect(host="localhost",
                          db="scraping",
                          user="****",
                          passwd="****",
                          charset="utf8mb4")
    cur = con.cursor()
    for book in books:
        query = 'INSERT INTO books (title) VALUES (%s);'
        title = book.get("title")
        record = (title,)  # one-element tuple; (title) without the comma is just a string
        print('[Insert]:{}'.format(title))
        cur.execute(query, record)
    con.commit()
    con.close()


if __name__ == '__main__':
    main()
$ python3 books_crawler.py
[START]: 2020-06-13 17:46:49
[Insert]:A Light in the Attic
[Insert]:Tipping the Velvet
[Insert]:Soumission
(omitted)
[Insert]:A Spy's Devotion (The Regency Spies of London #1)
[Insert]:1st to Die (Women's Murder Club #1)
[Insert]:1,000 Places to See Before You Die
[END]: 2020-06-13 17:58:14
mysql> select * from books limit 10;
+----+------------------------------------------------------------------------------------------------+
| id | title |
+----+------------------------------------------------------------------------------------------------+
| 1 | A Light in the Attic |
| 2 | Tipping the Velvet |
| 3 | Soumission |
| 4 | Sharp Objects |
| 5 | Sapiens: A Brief History of Humankind |
| 6 | The Requiem Red |
| 7 | The Dirty Little Secrets of Getting Your Dream Job |
| 8 | The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull |
| 9 | The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics |
| 10 | The Black Maria |
+----+------------------------------------------------------------------------------------------------+
10 rows in set (0.00 sec)
mysql> select count(1) from books;
+----------+
| count(1) |
+----------+
| 1000 |
+----------+
1 row in set (0.00 sec)
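Inserting 1,000 rows with one execute() per row works, but PyMySQL also offers executemany(), which batches the parameter tuples into fewer statements for INSERTs. A minimal sketch of an alternative save(), assuming the same scraping database and books table as above:

import pymysql

def save(books):
    con = pymysql.connect(host="localhost",
                          db="scraping",
                          user="****",
                          passwd="****",
                          charset="utf8mb4")
    try:
        with con.cursor() as cur:
            # One parameter tuple per row; the driver expands them in batches.
            records = [(book["title"],) for book in books]
            cur.executemany('INSERT INTO books (title) VALUES (%s);', records)
        con.commit()
    finally:
        con.close()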