BusinessScraper
This Actor has been deprecated by its developer and is no longer available.
actual_fact/businessscraper
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Getting started with Python and Scrapy",
    "description": "Scrapes titles of websites using Scrapy.",
    "version": "0.0",
    "meta": {
        "templateId": "python-scrapy"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Python Scrapy Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources"
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy and HTTP configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
            "editor": "proxy",
            "prefill": { "useApifyProxy": true },
            "default": { "useApifyProxy": true }
        }
    },
    "required": ["startUrls"]
}
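For reference, a minimal sketch of an input object conforming to this schema. The start URL below is only a placeholder taken from the spider's commented list; substitute whichever listing page you want the crawl to begin from:

# Hypothetical Actor input matching .actor/input_schema.json
actor_input = {
    "startUrls": [
        {"url": "https://infodoanhnghiep.com/Ha-Noi/"},  # placeholder start URL
    ],
    "proxyConfiguration": {"useApifyProxy": True},
}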
src/spiders/__init__.py
1"""
2Scrapy spiders package
3
4This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape
5and process data from websites.
6
7For detailed information on creating and utilizing spiders, refer to the official documentation:
8https://docs.scrapy.org/en/latest/topics/spiders.html
9"""
src/spiders/title.py
import datetime

import scrapy


class TitleSpider(scrapy.Spider):
    name = "title"
    allowed_domains = ["infodoanhnghiep.com"]
    start_urls = [
        # "https://infodoanhnghiep.com/Ha-Noi/",
        # "https://infodoanhnghiep.com/TP-Ho-Chi-Minh/",
        # "https://infodoanhnghiep.com/Can-Tho/",
        # "https://infodoanhnghiep.com/Da-Nang/",
        # "https://infodoanhnghiep.com/Hai-Phong/",
        # "https://infodoanhnghiep.com/An-Giang/",
        # "https://infodoanhnghiep.com/Ba-Ria-Vung-Tau/",
        # "https://infodoanhnghiep.com/Bac-Giang/",
        # "https://infodoanhnghiep.com/Bac-Kan/",
        # "https://infodoanhnghiep.com/Bac-Lieu/",
        # "https://infodoanhnghiep.com/Bac-Ninh/",
        # "https://infodoanhnghiep.com/Ben-Tre/",
        # "https://infodoanhnghiep.com/Binh-Dinh/",
        # "https://infodoanhnghiep.com/Binh-Duong/",
        # "https://infodoanhnghiep.com/Binh-Phuoc/",
        # "https://infodoanhnghiep.com/Binh-Thuan/",
        # "https://infodoanhnghiep.com/Ca-Mau/",
        # "https://infodoanhnghiep.com/Cao-Bang/",
        # "https://infodoanhnghiep.com/Dak-Lak/",
        # "https://infodoanhnghiep.com/Dak-Nong/",
        # "https://infodoanhnghiep.com/Dien-Bien/",
        # "https://infodoanhnghiep.com/Dong-Nai/",
        # "https://infodoanhnghiep.com/Dong-Thap/",
        # "https://infodoanhnghiep.com/Gia-Lai/",
        # "https://infodoanhnghiep.com/Ha-Giang/",
        # "https://infodoanhnghiep.com/Ha-Nam/",
        # "https://infodoanhnghiep.com/Ha-Tinh/",
        # "https://infodoanhnghiep.com/Hai-Duong/",
        # "https://infodoanhnghiep.com/Hau-Giang/",
        # "https://infodoanhnghiep.com/Hoa-Binh/",
        # "https://infodoanhnghiep.com/Hung-Yen/",
        # "https://infodoanhnghiep.com/Khanh-Hoa/",
        # "https://infodoanhnghiep.com/Kien-Giang/",
        # "https://infodoanhnghiep.com/Kon-Tum/",
        # "https://infodoanhnghiep.com/Lai-Chau/",
        # "https://infodoanhnghiep.com/Lam-Dong/",
        # "https://infodoanhnghiep.com/Lang-Son/",
        # "https://infodoanhnghiep.com/Lao-Cai/",
        # "https://infodoanhnghiep.com/Long-An/",
        # "https://infodoanhnghiep.com/Nam-Dinh/",
        # "https://infodoanhnghiep.com/Nghe-An/",
        # "https://infodoanhnghiep.com/Ninh-Binh/",
        # "https://infodoanhnghiep.com/Ninh-Thuan/",
        # "https://infodoanhnghiep.com/Phu-Tho/",
        # "https://infodoanhnghiep.com/Phu-Yen/",
        # "https://infodoanhnghiep.com/Quang-Binh/",
        # "https://infodoanhnghiep.com/Quang-Nam/",
        # "https://infodoanhnghiep.com/Quang-Ngai/",
        # "https://infodoanhnghiep.com/Quang-Ninh/",
        # "https://infodoanhnghiep.com/Quang-Tri/",
        # "https://infodoanhnghiep.com/Soc-Trang/",
        # "https://infodoanhnghiep.com/Son-La/",
        # "https://infodoanhnghiep.com/Tay-Ninh/",
        # "https://infodoanhnghiep.com/Thai-Binh/",
        # "https://infodoanhnghiep.com/Thai-Nguyen/",
        # "https://infodoanhnghiep.com/Thanh-Hoa/",
        # "https://infodoanhnghiep.com/Thua-Thien-Hue/",
        # "https://infodoanhnghiep.com/Tien-Giang/",
        # "https://infodoanhnghiep.com/Tra-Vinh/",
        # "https://infodoanhnghiep.com/Tuyen-Quang/",
        # "https://infodoanhnghiep.com/Vinh-Long/",
        # "https://infodoanhnghiep.com/Vinh-Phuc/",
        # "https://infodoanhnghiep.com/Yen-Bai/"
    ]

    def parse(self, response):
        # # Previous version: scrape business summaries directly from the listing page
        # for doanhnghiep in response.css('div.company-item'):
        #     yield {
        #         'name': doanhnghiep.css('h3.company-name a::text').get(),
        #         'link': doanhnghiep.css('h3.company-name a::attr(href)').get(),
        #         'tax_code': doanhnghiep.css('p::text').re_first(r'Mã số thuế:\s*(\S+)'),
        #         'address': doanhnghiep.css('p::text').re_first(r'Địa chỉ:\s*(.+)').replace('Địa chỉ: ', '')
        #     }
        # Collect the businesses on the listing page and follow each one to its detail page
        for doanhnghiep in response.css('div.company-item'):
            # urljoin resolves the href in case the site returns a relative link
            link = response.urljoin(doanhnghiep.css('h3.company-name a::attr(href)').get())
            yield scrapy.Request(
                url=link,
                callback=self.parse_detail,
                meta={'link': link}
            )
        # Follow the link to the next listing page, if there is one
        next_page = response.css('ul.pagination li.active + li a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

    def parse_detail(self, response):
        # Extract detailed business information from the company detail page
        description = response.css('div.description p::text').getall()
        name = response.css('div.description strong::text').get()
        tax_id = response.css('div.responsive-table-cell[itemprop="taxID"]::text').get()
        status = response.css('div.responsive-table-cell:contains("Tình trạng hoạt động:") + div.responsive-table-cell::text').get()
        address = response.css('div.responsive-table-cell[itemprop="address"]::text').get()
        license_issue_date = response.css('div.responsive-table-cell:contains("Ngày cấp giấy phép:") + div.responsive-table-cell::text').get()
        industry = response.css('div.responsive-table-cell::text').re_first(r'.*Ngành chính.*')

        yield {
            'name': name,
            'tax_id': tax_id,
            'status': status,
            'address': address,
            'license_issue_date': license_issue_date,
            'industry': industry.replace(' (Ngành chính)', '') if industry is not None else None,
            'link': response.meta['link'],
            'scraping_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
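To sanity-check the detail-page selectors without hitting the live site, the spider can be exercised against a locally constructed response. This is a minimal sketch, assuming the project root is on PYTHONPATH; the HTML fragment and URL are made up purely to mimic the selectors used above, and fields absent from the fragment simply come back as None:

# Hypothetical offline check of TitleSpider.parse_detail using a fake response
from scrapy.http import HtmlResponse, Request

from src.spiders.title import TitleSpider

FAKE_HTML = b"""
<div class="description"><strong>Cong ty TNHH Vi du</strong><p>Example description</p></div>
<div class="responsive-table-cell" itemprop="taxID">0101234567</div>
"""

url = "https://infodoanhnghiep.com/example-company/"  # placeholder URL
request = Request(url=url, meta={"link": url})
response = HtmlResponse(url=url, body=FAKE_HTML, encoding="utf-8", request=request)

for item in TitleSpider().parse_detail(response):
    print(item)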
src/__main__.py
1"""
2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
7`scrapy crawl title_spider`.
8
9We recommend you do not modify this file unless you really know what you are doing.
10"""
11
12# We need to configure the logging first before we import anything else, so that nothing else imports
13# `scrapy.utils.log` before we patch it.
14from __future__ import annotations
15from logging import StreamHandler, getLogger
16from typing import Any
17from scrapy.utils import log as scrapy_logging
18from scrapy.utils.project import get_project_settings
19from apify.log import ActorLogFormatter
20
21# Define names of the loggers.
22MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
23OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
24ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES
25
26# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
27# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
28# a specific logger, do it in this file.
29settings = get_project_settings()
30LOGGING_LEVEL = settings['LOG_LEVEL']
31
32# Define a logging handler which will be used for the loggers.
33apify_handler = StreamHandler()
34apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))
35
36
37def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
38 """
39 Configure a logger with the specified settings.
40
41 Args:
42 logger_name: The name of the logger to be configured.
43 log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).
44 handlers: Optional list of logging handlers.
45 """
46 logger = getLogger(logger_name)
47 logger.setLevel(log_level)
48 logger.handlers = []
49
50 for handler in handlers:
51 logger.addHandler(handler)
52
53
54# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
55# the `main.py` and Scrapy components.
56for logger_name in MAIN_LOGGER_NAMES:
57 configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
58
59# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
60# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
61# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
62# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
63# otherwise we would lose some log messages.
64old_configure_logging = scrapy_logging.configure_logging
65
66
67def new_configure_logging(*args: Any, **kwargs: Any) -> None:
68 """
69 We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
70 logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
71 loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
72 these four loggers and the root logger.
73 """
74 old_configure_logging(*args, **kwargs)
75
76 # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
77 # property within spiders. See details in the Spider logger property:
78 # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
79 configure_logger(None, LOGGING_LEVEL, apify_handler)
80
81 # We modify other loggers only by setting up their log level. A custom log handler is added
82 # only to the root logger to avoid duplicate log messages.
83 for logger_name in ALL_LOGGER_NAMES:
84 configure_logger(logger_name, LOGGING_LEVEL)
85
86 # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
87 # messages, especially when running on the platform.
88 configure_logger('httpx', 'WARNING')
89
90
91scrapy_logging.configure_logging = new_configure_logging
92
93# Now we can do the rest of the setup
94import asyncio
95import os
96import nest_asyncio
97from scrapy.utils.reactor import install_reactor
98from .main import main
99
100# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
101# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
102# The reactor installation must be done manually before calling `nest_asyncio.apply()`,
103# otherwise, it will not work correctly on Windows.
104install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
105nest_asyncio.apply()
106
107# Specify the path to the Scrapy project settings module
108os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
109
110# Run the Apify main coroutine
111asyncio.run(main())
src/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TitleItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
src/main.py
1"""
2This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine
3processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by
4applying Apify-related settings. Which includes adding a custom scheduler, retry middleware, and an item pipeline
5for pushing data to the Apify dataset.
6
7Customization:
8--------------
9
10Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy
11components like spiders and handling Actor input. However, make sure you have a clear understanding of your
12modifications. For instance, removing `apply_apify_settings` break the integration between Scrapy and Apify.
13
14Documentation:
15--------------
16
17For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and
18other stuff, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
19"""
20
21from __future__ import annotations
22
23from scrapy.crawler import CrawlerProcess
24
25from apify import Actor
26from apify.scrapy.utils import apply_apify_settings
27
28# Import your Scrapy spider here
29from .spiders.title import TitleSpider as Spider
30
31# Default input values for local execution using `apify run`
32LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]
33
34
35async def main() -> None:
36 """
37 Apify Actor main coroutine for executing the Scrapy spider.
38 """
39 async with Actor:
40 Actor.log.info('Actor is being executed...')
41
42 # Process Actor input
43 actor_input = await Actor.get_input() or {}
44 start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
45 proxy_config = actor_input.get('proxyConfiguration')
46
47 # Add start URLs to the request queue
48 rq = await Actor.open_request_queue()
49 for start_url in start_urls:
50 url = start_url.get('url')
51 await rq.add_request(request={'url': url, 'method': 'GET'})
52
53 # Apply Apify settings, it will override the Scrapy project settings
54 settings = apply_apify_settings(proxy_config=proxy_config)
55
56 # Execute the spider using Scrapy CrawlerProcess
57 process = CrawlerProcess(settings, install_root_handler=False)
58 process.crawl(Spider)
59 process.start()
src/middlewares.py
1"""
2Scrapy middlewares module
3
4This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
5responses, typically used for adding custom headers, retrying requests, and handling exceptions.
6
7There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information
8on creating and utilizing them, refer to the official documentation:
9https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
10https://docs.scrapy.org/en/latest/topics/spider-middleware.html
11"""
12
13from __future__ import annotations
14from typing import Generator, Iterable
15
16from scrapy import Request, Spider, signals
17from scrapy.crawler import Crawler
18from scrapy.http import Response
19
20# useful for handling different item types with a single interface
21from itemadapter import is_item, ItemAdapter
22
23
24class TitleSpiderMiddleware:
25 # Not all methods need to be defined. If a method is not defined,
26 # scrapy acts as if the spider middleware does not modify the
27 # passed objects.
28
29 @classmethod
30 def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware:
31 # This method is used by Scrapy to create your spiders.
32 s = cls()
33 crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
34 return s
35
36 def process_spider_input(self, response: Response, spider: Spider) -> None:
37 # Called for each response that goes through the spider
38 # middleware and into the spider.
39
40 # Should return None or raise an exception.
41 return None

    def process_spider_output(
        self,
        response: Response,
        result: Iterable,
        spider: Spider,
    ) -> Generator[Iterable[Request] | None, None, None]:
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(
        self,
        response: Response,
        exception: BaseException,
        spider: Spider,
    ) -> Iterable[Request] | None:
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(
        self, start_requests: Iterable[Request], spider: Spider
    ) -> Iterable[Request]:
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider: Spider) -> None:
        pass


class TitleDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware:
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request: Request, spider: Spider) -> Request | Response | None:
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request: Request, exception: BaseException, spider: Spider) -> Response | None:
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider: Spider) -> None:
        pass
src/pipelines.py
1"""
2Scrapy item pipelines module
3
4This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
5that handle the scraped items, typically used for cleaning, validating, and persisting data.
6
7For detailed information on creating and utilizing item pipelines, refer to the official documentation:
8http://doc.scrapy.org/en/latest/topics/item-pipeline.html
9"""
10
11from scrapy import Spider
12
13from .items import TitleItem
14
15
16class TitleItemPipeline:
17 """
18 This item pipeline defines processing steps for TitleItem objects scraped by spiders.
19 """
20
21 def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
22 # Do something with the item here, such as cleaning it or persisting it to a database
23 return item
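Since process_item is currently a no-op, here is a minimal sketch of what a concrete cleaning step could look like. The class name is hypothetical and not part of the original project; it uses itemadapter (already imported in src/middlewares.py) so it works for both dict items and scrapy.Item objects, and it would only run if registered in ITEM_PIPELINES:

# Hypothetical cleaning pipeline: strip surrounding whitespace from all string fields
from itemadapter import ItemAdapter


class WhitespaceCleaningPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        for field, value in adapter.items():
            if isinstance(value, str):
                adapter[field] = value.strip()
        return item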
src/settings.py
# Scrapy settings for the src project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import datetime

BOT_NAME = "src"

SPIDER_MODULES = ["src.spiders"]
NEWSPIDER_MODULE = "src.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "src (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "src.middlewares.TitleSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "src.middlewares.TitleDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "src.pipelines.TitleItemPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# DOWNLOADER_MIDDLEWARES = {
#     # ... Other middlewares
#     # 'scratest.middlewares.UARotatorMiddleware': 400,
#     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
#     'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
# }

# # Desired file format
# FEED_FORMAT = "csv"

# # Name of the file where data extracted is stored, time is appended to avoid overwriting
# FEED_URI = "business_%(time)s.csv" % {'time': datetime.datetime.now().strftime('%Y%m%d%H%M%S')}


# SCRAPEOPS_API_KEY = '<your-scrapeops-api-key>'
# SCRAPEOPS_FAKE_USER_AGENT_ENABLED = True
# SCRAPEOPS_NUM_RESULTS = 100

# DOWNLOADER_MIDDLEWARES = {
#     'myproject.middlewares.MyprojectSpiderMiddleware': 400,
#     'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550,
#     'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
# }
# EXTENSIONS = {
#     'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,
# }
# Desired file format
# FEED_FORMAT = "csv"

# # Name of the file where data extracted is stored, time is appended to avoid overwriting
# FEED_URI = "infodoanhnghiep_%(time)s.csv" % {'time': datetime.datetime.now().strftime('%Y%m%d%H%M%S')}

RETRY_ENABLED = True
RETRY_HTTP_CODES = [429]  # Retry when the server responds with HTTP 429 (Too Many Requests)
RETRY_TIMES = 5  # Number of retry attempts


FEEDS = {
    "s3://businessbucketscrapy/%(name)s/%(name)s_%(time)s.csv": {
        "format": "csv",
    }
}

# AWS credentials for the S3 feed export. Supply your own values; do not commit real keys.
AWS_ACCESS_KEY_ID = '<your-aws-access-key-id>'
AWS_SECRET_ACCESS_KEY = '<your-aws-secret-access-key>'
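If you prefer not to keep credentials in settings.py at all, a minimal sketch of reading them from the environment instead (assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are exported in the Actor's environment or set as Apify secret environment variables):

# Hypothetical alternative for src/settings.py: pull the S3 feed credentials from environment variables
import os

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")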
.dockerignore
# Git folder
.git

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.7.0
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.1
botocore==1.34.143
scrapy.cfg
[settings]
default = src.settings

[deploy]
project = src
Maintained by Community