Foreword
When it comes to crawler frameworks, besides the in-house crawler systems each company builds, the best-known public framework is Scrapy, a very powerful asynchronous crawler framework that can also be extended to run in a distributed setup.
This article walks through the basic usage of Scrapy.
Hands-on
Install dependencies
# Install dependencies
pip3 install Scrapy

# MySQL client
pip3 install mysqlclient
Create the project and spider
Analyze the target address to obtain the site HOST and the crawl URL, then use the command line to create a crawler project and a spider inside a folder of your choice.
# Create a crawler project
scrapy startproject cqmmgo

# Enter the project folder
cd cqmmgo

# Create a spider (HOST stands for the site's domain, anonymized here)
scrapy genspider talk HOST
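For reference, the commands above produce the standard Scrapy project skeleton; the annotations below point out the files used in the rest of this article:

cqmmgo/
    scrapy.cfg              # deployment configuration
    cqmmgo/
        __init__.py
        items.py            # Item definitions
        middlewares.py      # downloader / spider middlewares
        pipelines.py        # data pipelines
        settings.py         # project settings
        spiders/
            __init__.py
            talk.py         # the spider created by genspider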
Define the Item entity object
In the items.py file, define the data to be crawled as an Item.
For example, here we need to crawl the post title, the author, the number of views, the number of comments, the post URL, and the release time.
# items.py
import scrapy


# Gossip board post
class CqTalkItem(scrapy.Item):
    # title
    title = scrapy.Field()
    # author
    author = scrapy.Field()
    # number of views
    watch_num = scrapy.Field()
    # number of comments
    comment_num = scrapy.Field()
    # post URL
    address_url = scrapy.Field()
    # release time
    create_time = scrapy.Field()
Write a crawler
Write the concrete crawling logic in the spider file under the spiders folder.
Analysis shows that the post data is rendered directly in the page template rather than loaded dynamically, so we can parse the response directly.
PS: XPath is recommended for parsing.
The parsed data is assembled into the Item entity defined above and yielded from the parse generator.
# spiders/talk.py
import scrapy

from cqmmgo.items import CqTalkItem
from cqmmgo.settings import talk_hour_before
from cqmmgo.utils import calc_interval_hour


class TalkSpider(scrapy.Spider):
    name = 'talk'
    allowed_domains = ['HOST']

    # Pages 1-5
    start_urls = ['https://HOST/forum-233-{}.html'.format(i + 1) for i in range(5)]

    def parse(self, response):
        # Parse the elements directly with XPath
        elements = response.xpath('//div[contains(@class,"list-data-item")]')
        for element in elements:
            item = CqTalkItem()
            title = element.xpath('.//*[@class="subject"]/a/@title').extract_first()
            author = element.xpath(".//span[@itemprop='post author']/text()").extract_first()
            watch_num = element.xpath(".//span[@class='num-read']/text()").extract_first()
            comment_num = element.xpath(".//span[@itemprop='replies']/text()").extract_first()
            address_url = "https:" + element.xpath('.//*[@class="subject"]/a/@href').extract_first()
            create_time = element.xpath('.//span[@class="author-time"]/text()').extract_first().strip()

            # Filter out posts older than the configured number of hours
            if calc_interval_hour(create_time) > talk_hour_before:
                continue

            print(f"title:{title}, author:{author}, views:{watch_num}, comments:{comment_num}, "
                  f"address:{address_url}, release time:{create_time}")

            item['title'] = title
            item['author'] = author
            item['watch_num'] = watch_num
            item['comment_num'] = comment_num
            item['address_url'] = address_url
            item['create_time'] = create_time
            yield item
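The spider above imports calc_interval_hour from cqmmgo.utils, and the MySQL pipeline later uses a current_date helper from the same module; neither is shown in the original post. A minimal sketch of cqmmgo/utils.py, assuming the page renders absolute timestamps such as 2022-11-18 10:30 (adjust TIME_FORMAT to whatever the site actually outputs):

# utils.py (sketch only; not shown in the original post)
from datetime import datetime

# Assumed timestamp format of the post's "author-time" field
TIME_FORMAT = '%Y-%m-%d %H:%M'


def calc_interval_hour(create_time):
    """Return how many hours have passed since the given timestamp string."""
    post_time = datetime.strptime(create_time, TIME_FORMAT)
    return (datetime.now() - post_time).total_seconds() / 3600


def current_date():
    """Current time as a string, used for the insert_time column."""
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')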
Custom random UA downloader middleware
Customize a random User-Agent downloader middleware in the middlewares.py file.
# middlewares.py
import random


class RandomUADownloaderMiddleware(object):
    def process_request(self, request, spider):
        # UA list
        USER_AGENT_LIST = [
            'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
            'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
            'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
            'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
            'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
            'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
            'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
        ]
        # Pick a random UA
        agent = random.choice(USER_AGENT_LIST)
        # Set it on the request headers (note the header name is "User-Agent")
        request.headers['User-Agent'] = agent
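Note that this middleware only takes effect once it is registered under DOWNLOADER_MIDDLEWARES (with the built-in UserAgentMiddleware disabled), which is done in the settings.py configuration shown later.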
Custom data pipelines (Pipeline)
In the pipelines.py file, define two data pipelines that write the data to a local CSV file and to a MySQL database respectively.
PS: For ease of demonstration, only the synchronous way of writing to the MySQL database is shown here.
# pipelines.py
import MySQLdb
from scrapy.exporters import CsvItemExporter

from cqmmgo.items import CqTalkItem
# current_date() is assumed to live in cqmmgo.utils alongside calc_interval_hour
from cqmmgo.utils import current_date


class TalkPipeline(object):
    """Gossip board: export items to a local CSV file"""

    def __init__(self):
        self.file = open("./result/talk.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, fields_to_export=[
            'title', 'author', 'watch_num', 'comment_num', 'create_time', 'address_url'
        ])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, CqTalkItem):
            self.exporter.export_item(item)
        return item

    # Close resources
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()


# Store data in the database (synchronously)
class MysqlPipeline(object):
    def __init__(self):
        # Connect to the MySQL database (host, user, password, database are placeholders)
        self.conn = MySQLdb.connect("host", "root", "pwd", "cq", charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        table_name = 'talk'
        # SQL statement
        insert_sql = """
            insert into {}(title,author,watch_num,comment_num,address_url,create_time,insert_time)
            values(%s,%s,%s,%s,%s,%s,%s)
        """.format(table_name)

        # Collect the item fields into a tuple and insert them into the database
        params = list()
        params.append(item.get("title", ""))
        params.append(item.get("author", ""))
        params.append(item.get("watch_num", 0))
        params.append(item.get("comment_num", 0))
        params.append(item.get("address_url", ""))
        params.append(item.get("create_time", ""))
        params.append(current_date())

        # Execute the insert
        self.cursor.execute(insert_sql, tuple(params))
        # Commit so the data is persisted
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release database resources"""
        self.cursor.close()
        self.conn.close()
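As noted above, only the synchronous write is shown. For reference, a non-blocking variant could hand the insert off to Twisted's adbapi connection pool so database I/O does not stall the crawl. This is only a sketch (not part of the original project), reusing the same placeholder credentials and the assumed current_date helper:

# pipelines.py (non-blocking sketch, not from the original post)
from twisted.enterprise import adbapi

from cqmmgo.utils import current_date  # assumed helper, see the utils.py sketch above


class AsyncMysqlPipeline(object):
    def __init__(self):
        # Connection pool that runs MySQLdb calls in threads
        self.dbpool = adbapi.ConnectionPool(
            "MySQLdb",
            host="host", user="root", passwd="pwd", db="cq",
            charset="utf8", use_unicode=True,
        )

    def process_item(self, item, spider):
        # Schedule the insert on the pool and log any failure
        deferred = self.dbpool.runInteraction(self.do_insert, item)
        deferred.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into talk(title,author,watch_num,comment_num,address_url,create_time,insert_time)
            values(%s,%s,%s,%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (
            item.get("title", ""), item.get("author", ""),
            item.get("watch_num", 0), item.get("comment_num", 0),
            item.get("address_url", ""), item.get("create_time", ""),
            current_date(),
        ))

    def handle_error(self, failure, item, spider):
        spider.logger.error(failure)

    def close_spider(self, spider):
        self.dbpool.close()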
Of course, a data pipeline for deduplication can also be defined here: items whose post title has already been seen are dropped instead of being processed again.
# pipelines.py
from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):
    """Deduplication pipeline"""

    def __init__(self):
        self.talk_set = set()

    def process_item(self, item, spider):
        name = item['title']
        if name in self.talk_set:
            raise DropItem("duplicate data, discarding: %s" % item)
        self.talk_set.add(name)
        return item
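Since the set only lives in memory, this removes duplicates within a single run; deduplication across runs would need external state, for example a unique index on the title column in MySQL.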
Configure the crawler settings
Open the settings.py file and configure the download delay, default request headers, downloader middlewares, and data pipelines.
# settings.py

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Host': 'HOST',
    'Referer': 'https://HOST/forum-233-1.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}

DOWNLOADER_MIDDLEWARES = {
    'cqmmgo.middlewares.RandomUADownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

ITEM_PIPELINES = {
    'cqmmgo.pipelines.TalkPipeline': 1,
    'cqmmgo.pipelines.MysqlPipeline': 6,
    'cqmmgo.pipelines.DuplicatesPipeline': 200,
    'cqmmgo.pipelines.CqmmgoPipeline': 300,
}

# Crawl time limit (hours)
talk_hour_before = 24
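Two things worth noting: talk_hour_before is not a built-in Scrapy setting but a custom module-level constant, which is why the spider imports it directly from settings.py; and the CqmmgoPipeline entry is the placeholder pipeline that scrapy startproject generates by default, which can be removed if it is not used.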
Crawler main entry
Create a main.py file in the root directory of the crawler project and run a single spider as follows:
# main.py
import os
import sys

from scrapy.cmdline import execute


def start():
    sys.path.append(os.path.dirname(__file__))
    # Run a single spider
    execute(["scrapy", "crawl", "talk"])


if __name__ == '__main__':
    start()
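This is equivalent to running scrapy crawl talk from the project root; the main.py entry point simply makes it convenient to start and debug the spider from an IDE.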
Finally
If the Scrapy project contains multiple spiders, we can use the CrawlerProcess class to run several spiders concurrently.
# main.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


# Run multiple spiders in the project at the same time
def start():
    settings = get_project_settings()
    process = CrawlerProcess(settings)

    # Spiders that should not be run
    spider_besides = ['other']

    # Schedule every spider in the project except the excluded ones
    for spider_name in process.spider_loader.list():
        if spider_name in spider_besides:
            continue
        print("Now executing spider: %s" % spider_name)
        process.crawl(spider_name)

    process.start()


if __name__ == '__main__':
    start()
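Note that process.start() starts the Twisted reactor and blocks until all the scheduled crawls finish, so it is called once, after every spider has been added with process.crawl().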