Scrapy, Easy to Understand

Foreword

When it comes to crawler frameworks, beyond the in-house crawler systems that individual companies build, the best-known public option is Scrapy, a very powerful asynchronous crawler framework.

This article walks through basic usage of Scrapy.

Hands-on

Install dependencies

# install dependencies
pip3 install Scrapy

# MySQL driver
pip3 install mysqlclient
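
A quick sanity check that both packages are importable (the version number will vary):

# verify the installation from Python
import scrapy
import MySQLdb

print(scrapy.__version__)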

Create the project and spider

Analyze the target site to get its HOST and the URLs to crawl, then use the command line to create a crawler project and a spider inside a folder of your choice (HOST below stands in for the site's real domain)

# Create a crawler project
scrapy startproject cqmmgo

# enter the project folder
cd cqmmgo

# Create a spider (HOST is the site's domain)
scrapy genspider talk HOST
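
After genspider runs, Scrapy drops a skeleton into spiders/talk.py that looks roughly like the following (the exact template varies slightly between Scrapy versions; HOST stands in for the real domain):

# spiders/talk.py (generated skeleton)

import scrapy

class TalkSpider(scrapy.Spider):
    name = 'talk'
    allowed_domains = ['HOST']
    start_urls = ['http://HOST/']

    def parse(self, response):
        pass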

Define the Item entity object

In the items.py file, define the data to be crawled as an Item

For example, here we need the post title, the author, the number of views, the number of comments, the post URL, and the release time

# items.py

import scrapy

# Item for a post on the gossip board
class CqTalkItem(scrapy.Item):
    # title
    title = scrapy.Field()

    # author
    author = scrapy.Field()

    # number of views
    watch_num = scrapy.Field()

    # Number of comments
    comment_num = scrapy.Field()

    # address
    address_url = scrapy.Field()

    # release time
    create_time = scrapy.Field()
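
scrapy.Field() is only a declaration; the Item itself behaves like a dict, so the spider can fill and read fields with subscript syntax. A tiny usage sketch:

# Item usage sketch
from cqmmgo.items import CqTalkItem

item = CqTalkItem()
item['title'] = 'example post'
item['author'] = 'someone'
print(dict(item))  # {'title': 'example post', 'author': 'someone'}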

Write the spider

Write the actual crawling logic in the spider file under the spiders folder

Analysis shows that the post data is rendered server-side in the page template rather than loaded dynamically, so we can parse it straight from the response

PS: XPath is the recommended way to parse it

The parsed data is packed into the Item defined above and yielded from the parse generator

# spiders/talk.py

import scrapy
from cqmmgo.items import CqTalkItem
from cqmmgo.settings import talk_hour_before
from cqmmgo.utils import calc_interval_hour

class TalkSpider(scrapy.Spider):
    name = 'talk'
    allowed_domains = ['HOST']

    # Page 1-5 data
    start_urls = ['https://HOST/forum-233-{}.html'.format(i + 1) for i in range(5)]

    def parse(self, response):
        # Direct Xpath parsing
        elements = response.xpath('//div[contains(@class,"list-data-item")]')

        for element in elements:
            item = CqTalkItem()
            title = element.xpath('.//*[@class="subject"]/a/@title').extract_first()
            author = element.xpath(".//span[@itemprop='post author']/text()").extract_first()
            watch_num = element.xpath(".//span[@class='num-read']/text()").extract_first()
            comment_num = element.xpath(".//span[@itemprop='replies']/text()").extract_first()
            address_url = "https:" + element.xpath('.//*[@class="subject"]/a/@href').extract_first()
            create_time = element.xpath('.//span[@class="author-time"]/text()').extract_first().strip()

            # Filter data older than a set hour
            if calc_interval_hour(create_time) > talk_hour_before:
                continue

            print(
                f"title:{title},author:{author},watch:{watch_num},Comment:{comment_num},address:{address_url},release time:{create_time}")

            item['title'] = title
            item['author'] = author
            item['watch_num'] = watch_num
            item['comment_num'] = comment_num
            item['address_url'] = address_url
            item['create_time'] = create_time

            yield item
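
The spider imports calc_interval_hour from cqmmgo/utils.py, which the article never shows. Below is a minimal sketch, assuming the post time on the page looks like 2022-12-31 16:42 (the real format may differ). The current_date helper used later by the MySQL pipeline is sketched here as well.

# utils.py (sketch; the timestamp format is an assumption)

from datetime import datetime

TIME_FORMAT = '%Y-%m-%d %H:%M'

def calc_interval_hour(create_time):
    # How many hours ago create_time was, relative to now
    post_time = datetime.strptime(create_time.strip(), TIME_FORMAT)
    return (datetime.now() - post_time).total_seconds() / 3600

def current_date():
    # Current timestamp string, used as insert_time by the MySQL pipeline
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')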

Customize a random UA downloader middleware

Define a random User-Agent downloader middleware in the middlewares.py file

# middlewares.py

import random

class RandomUADownloaderMiddleware(object):
    def process_request(self, request, spider):
        # UA list
        USER_AGENT_LIST = [
            'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
            'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
            'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
            'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
            'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
            'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
            'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
        ]

        # Randomly pick a UA
        agent = random.choice(USER_AGENT_LIST)

        # set it on the request header (note the hyphen: 'User-Agent', not 'User_Agent')
        request.headers['User-Agent'] = agent
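
As a variation, the UA list can live in settings.py and be read through the crawler settings, which keeps the middleware free of hard-coded data. A minimal sketch, assuming a custom USER_AGENT_LIST setting is added to settings.py:

# middlewares.py (alternative sketch; USER_AGENT_LIST is an assumed custom setting)

import random

class RandomUADownloaderMiddleware(object):
    def __init__(self, ua_list):
        self.ua_list = ua_list

    @classmethod
    def from_crawler(cls, crawler):
        # read the custom USER_AGENT_LIST setting, with a single fallback UA
        return cls(crawler.settings.getlist('USER_AGENT_LIST') or ['Mozilla/5.0'])

    def process_request(self, request, spider):
        # pick a random UA for every outgoing request
        request.headers['User-Agent'] = random.choice(self.ua_list)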

Customize the item pipeline (Pipeline)

In the pipelines.py file, define two item pipelines that write the data to a local CSV file and to a MySQL database, respectively

PS: For the convenience of demonstration, only the synchronous way of writing to the MySQL database is shown here; a rough asynchronous sketch follows the pipeline code below

# pipelines.py

from scrapy.exporters import CsvItemExporter
from cqmmgo.items import CqTalkItem
from cqmmgo.utils import current_date  # assumed helper returning the current timestamp string
import MySQLdb  # MySQL driver provided by mysqlclient

class TalkPipeline(object):
    """Export gossip-board posts to a CSV file"""

    def __init__(self):
        self.file = open("./result/talk.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, fields_to_export=[
            'title', 'author', 'watch_num', 'comment_num', 'create_time', 'address_url'
        ])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, CqTalkItem):
            self.exporter.export_item(item)
        return item

    # close resource
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

# Data is stored in the database (synchronous)
class MysqlPipeline(object):
    def __init__(self):
        # connect to the MySQL database
        self.conn = MySQLdb.connect("host", "root", "pwd", "cq", charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        table_name = 'talk'

        # sql statement
        insert_sql = """
            insert into  {}(title,author,watch_num,comment_num,address_url,create_time,insert_time) values(%s,%s,%s,%s,%s,%s,%s)  
        """.format(table_name)

        # Pull the fields out of the item, pack them into a tuple, and insert them into the database
        params = list()
        params.append(item.get("title", ""))
        params.append(item.get("author", ""))
        params.append(item.get("watch_num", 0))
        params.append(item.get("comment_num", 0))
        params.append(item.get("address_url", ""))
        params.append(item.get("create_time", ""))
        params.append(current_date())

        # Execute the operation of inserting data into the database
        self.cursor.execute(insert_sql, tuple(params))

        # Submit, save to database
        self.conn.commit()

        return item

    def close_spider(self, spider):
        """release database resources"""
        self.cursor.close()
        self.conn.close()
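
As mentioned in the PS above, this pipeline writes to MySQL synchronously. A rough sketch of an asynchronous variant built on Twisted's adbapi connection pool is shown below; the connection parameters are placeholders, and current_date is the same helper assumed to live in cqmmgo/utils.py.

# pipelines.py (asynchronous MySQL variant, sketch)

from twisted.enterprise import adbapi
from cqmmgo.utils import current_date  # assumed helper

class AsyncMysqlPipeline(object):
    def __init__(self):
        # the pool runs queries in a thread pool instead of blocking the reactor
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb', host='host', user='root', passwd='pwd', db='cq',
            charset='utf8', use_unicode=True)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into talk(title,author,watch_num,comment_num,address_url,create_time,insert_time)
            values(%s,%s,%s,%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (
            item.get('title', ''), item.get('author', ''),
            item.get('watch_num', 0), item.get('comment_num', 0),
            item.get('address_url', ''), item.get('create_time', ''),
            current_date()))

    def handle_error(self, failure, item, spider):
        # log the failed insert and keep crawling
        spider.logger.error(failure)

    def close_spider(self, spider):
        self.dbpool.close()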

Of course, we can also define a deduplication pipeline here that uses the post title to drop duplicate data.

# pipelines.py

from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):
    """Deduplication pipeline: drop items whose title has been seen before"""

    def __init__(self):
        self.talk_set = set()

    def process_item(self, item, spider):
        name = item['title']
        if name in self.talk_set:
            raise DropItem("duplicate data, discard:%s" % item)

        self.talk_set.add(name)
        return item
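
The MySQL pipelines above assume that a talk table already exists in the cq database. A one-off helper for creating it could look like the sketch below; the column types are assumptions based on the fields being inserted.

# create_table.py (one-off helper sketch; column types are assumptions)

import MySQLdb

conn = MySQLdb.connect("host", "root", "pwd", "cq", charset="utf8")
cursor = conn.cursor()
cursor.execute("""
    create table if not exists talk (
        id int auto_increment primary key,
        title varchar(255),
        author varchar(64),
        watch_num varchar(32),
        comment_num varchar(32),
        address_url varchar(255),
        create_time varchar(64),
        insert_time varchar(64)
    ) default charset=utf8
""")
conn.commit()
cursor.close()
conn.close()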

Configure the crawler settings

Open the settings.py file and configure the download delay, default request headers, downloader middleware, and item pipelines

# settings.py

# Ignore robots.txt rules
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Host': 'HOST',
    'Referer': 'https://HOST/forum-233-1.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}

DOWNLOADER_MIDDLEWARES = {
    'cqmmgo.middlewares.RandomUADownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

ITEM_PIPELINES = {
    # lower numbers run first: deduplicate before exporting to CSV or writing to MySQL
    'cqmmgo.pipelines.DuplicatesPipeline': 1,
    'cqmmgo.pipelines.TalkPipeline': 2,
    'cqmmgo.pipelines.MysqlPipeline': 6,
    'cqmmgo.pipelines.CqmmgoPipeline': 300,
}

# Custom setting: only keep posts newer than this many hours (read by the spider)
talk_hour_before = 24
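
Note that Scrapy only copies uppercase names from settings.py into its Settings object, which is why the spider imports talk_hour_before directly from the module. If you prefer to read it through self.settings instead, the setting has to be renamed; a sketch under that assumed rename (TALK_HOUR_BEFORE), using a hypothetical demo spider:

# settings.py (assumed rename so Scrapy picks the value up)
TALK_HOUR_BEFORE = 24

# spiders/talk_settings_demo.py (alternative access pattern)
import scrapy

class TalkSettingsDemoSpider(scrapy.Spider):
    name = 'talk_settings_demo'

    def parse(self, response):
        # self.settings is available once the spider is attached to a crawler
        hour_before = self.settings.getint('TALK_HOUR_BEFORE', 24)
        self.logger.info('filtering posts older than %s hours', hour_before)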

Crawler entry point

Create a main.py file in the root directory of the crawler project and run a single spider as follows

# main.py

from scrapy.cmdline import execute
import sys, os

def start():
    sys.path.append(os.path.dirname(__file__))
    # run a single crawler
    execute(["scrapy", "crawl", "talk"])

if __name__ == '__main__':
    start()

Finally

If the Scrapy project contains multiple spiders, we can use the CrawlerProcess class to run them concurrently

# main.py

from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

# Run multiple spiders under the project at the same time
def start():
    setting = get_project_settings()
    process = CrawlerProcess(setting)

    # spiders to skip
    spider_besides = ['other']

    # iterate over all spiders in the project
    for spider_name in process.spider_loader.list():
        if spider_name in spider_besides:
            continue
        print("Now executing spider: %s" % spider_name)
        process.crawl(spider_name)
    process.start()

if __name__ == '__main__':
    start()
