BusinessScraper2
Deprecated
Pricing
Pay per usage
Go to Store
BusinessScraper2
Deprecated
0.0 (0)
Pricing
Pay per usage
0
Total users
2
Monthly users
2
Last modified
a year ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "businessscraper2",
    "title": "Business scraper for infodoanhnghiep.com (Python + Scrapy)",
    "description": "Scrapes business records (name, tax ID, status, address, license issue date, main industry) from infodoanhnghiep.com province listings using Scrapy.",
    "version": "0.0",
    "meta": {
        "templateId": "python-scrapy"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Python Scrapy Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URL",
            "type": "string",
            "description": "Base listing URL of a single province on infodoanhnghiep.com. Page URLs are built by appending 'trang-<page>/' to it.",
            "default": "https://infodoanhnghiep.com/Ha-Noi/",
            "editor": "textfield"
        },
        "pageStart": {
            "title": "Page Start",
            "type": "integer",
            "description": "First listing page to scrape",
            "default": 1,
            "editor": "number"
        },
        "pageEnd": {
            "title": "Page End",
            "type": "integer",
            "description": "Last listing page to scrape (inclusive)",
            "default": 100,
            "editor": "number"
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy and HTTP configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
            "editor": "proxy",
            "prefill": { "useApifyProxy": true },
            "default": { "useApifyProxy": true }
        }
    },
    "required": ["startUrls", "pageStart", "pageEnd"]
}
src/spiders/__init__.py
"""
Scrapy spiders package

This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape
and process data from websites.

For detailed information on creating and utilizing spiders, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/spiders.html
"""
src/spiders/title.py
import datetime
import re
import time

import scrapy


class TitleSpider(scrapy.Spider):
    """Crawl paginated province listings on infodoanhnghiep.com and emit one record per business.

    Spider arguments (passed as keyword arguments, e.g. through ``CrawlerProcess.crawl``):
        start_urls: Base listing URL of a single province, given as a string such as
            ``https://infodoanhnghiep.com/Ha-Noi/``. Listing pages are addressed by appending
            ``trang-<N>/`` to it. A missing trailing slash is added automatically.
        page_start: First listing page to fetch. Defaults to 1 when omitted.
        page_end: Last listing page to fetch (inclusive). Defaults to 100 when omitted. The page
            at ``page_start`` is always fetched, even if ``page_end`` is smaller.

    Raises:
        ValueError: If ``start_urls`` is not a non-empty string, or a page bound is not an integer.
    """

    name = "title"
    allowed_domains = ["infodoanhnghiep.com"]

    DEFAULT_PAGE_START = 1
    DEFAULT_PAGE_END = 100

    # Matches the page number at the end of a listing URL, e.g. ".../Ha-Noi/trang-12/".
    _PAGE_IN_URL = re.compile(r"/trang-(\d+)/?$")

    # Province slugs previously kept here as a commented-out start URL list
    # (each one forms a base URL "https://infodoanhnghiep.com/<slug>/"):
    #   Ha-Noi, TP-Ho-Chi-Minh, Can-Tho, Da-Nang, Hai-Phong, An-Giang, Ba-Ria-Vung-Tau, Bac-Giang,
    #   Bac-Kan, Bac-Lieu, Bac-Ninh, Ben-Tre, Binh-Dinh, Binh-Duong, Binh-Phuoc, Binh-Thuan, Ca-Mau,
    #   Cao-Bang, Dak-Lak, Dak-Nong, Dien-Bien, Dong-Nai, Dong-Thap, Gia-Lai, Ha-Giang, Ha-Nam,
    #   Ha-Tinh, Hai-Duong, Hau-Giang, Hoa-Binh, Hung-Yen, Khanh-Hoa, Kien-Giang, Kon-Tum, Lai-Chau,
    #   Lam-Dong, Lang-Son, Lao-Cai, Long-An, Nam-Dinh, Nghe-An, Ninh-Binh, Ninh-Thuan, Phu-Tho,
    #   Phu-Yen, Quang-Binh, Quang-Nam, Quang-Ngai, Quang-Ninh, Quang-Tri, Soc-Trang, Son-La,
    #   Tay-Ninh, Thai-Binh, Thai-Nguyen, Thanh-Hoa, Thua-Thien-Hue, Tien-Giang, Tra-Vinh,
    #   Tuyen-Quang, Vinh-Long, Vinh-Phuc, Yen-Bai

    def __init__(self, **kw):
        super().__init__(**kw)

        base_url = kw.get('start_urls')
        if not isinstance(base_url, str) or not base_url.strip():
            raise ValueError(
                "start_urls must be a non-empty base listing URL string, "
                "e.g. 'https://infodoanhnghiep.com/Ha-Noi/'"
            )
        # Page URLs are built by plain concatenation, so guarantee exactly one trailing slash.
        self.url = base_url.strip().rstrip('/') + '/'

        self.page_start = self._as_page_number(kw.get('page_start'), self.DEFAULT_PAGE_START)
        self.page_end = self._as_page_number(kw.get('page_end'), self.DEFAULT_PAGE_END)

    @staticmethod
    def _as_page_number(value, default):
        """Return ``value`` converted to ``int``, or ``default`` when ``value`` is ``None``."""
        return default if value is None else int(value)

    def _page_url(self, page_number):
        """Build the URL of listing page ``page_number`` under the base URL."""
        return f"{self.url}trang-{page_number}/"

    def _current_page(self, response):
        """Work out which listing page ``response`` represents.

        The active pagination label is preferred; when it is absent or not numeric the page
        number is read from the response URL, and only then does it fall back to ``page_start``.
        """
        label = response.css('ul.pagination li.active a::text').get()
        if label and label.strip().isdigit():
            return int(label.strip())

        in_url = self._PAGE_IN_URL.search(response.url)
        if in_url:
            return int(in_url.group(1))

        return self.page_start

    def start_requests(self):
        """Start the crawl at the listing page given by ``page_start``."""
        yield scrapy.Request(self._page_url(self.page_start), callback=self.parse)

    def parse(self, response):
        """Schedule a detail request for every business on a listing page, then the next page."""
        # Collect the businesses listed on this page.
        for company in response.css('div.company-item'):
            href = company.css('h3.company-name a::attr(href)').get()
            if not href:
                # A card without a detail link cannot be followed; skip it instead of crashing.
                continue
            detail_url = response.urljoin(href)
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'link': detail_url},
            )

        # Move on to the next listing page, staying within [page_start, page_end].
        current_page_number = self._current_page(response)
        if current_page_number < self.page_end:
            yield scrapy.Request(
                url=self._page_url(current_page_number + 1),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Extract one business record from a business detail page."""
        name = response.css('div.description strong::text').get()
        tax_id = response.css('div.responsive-table-cell[itemprop="taxID"]::text').get()
        status = response.css(
            'div.responsive-table-cell:contains("Tình trạng hoạt động:") + div.responsive-table-cell::text'
        ).get()
        address = response.css('div.responsive-table-cell[itemprop="address"]::text').get()
        license_issue_date = response.css(
            'div.responsive-table-cell:contains("Ngày cấp giấy phép:") + div.responsive-table-cell::text'
        ).get()
        # The main industry is the table cell whose text carries the "Ngành chính" marker.
        industry = response.css('div.responsive-table-cell::text').re_first(r'.*Ngành chính.*')

        yield {
            'name': name,
            'tax_id': tax_id,
            'status': status,
            'address': address,
            'license_issue_date': license_issue_date,
            'industry': industry.replace(' (Ngành chính)', '') if industry is not None else None,
            # Fall back to the response URL if the request was created without the meta entry.
            'link': response.meta.get('link', response.url),
            # Naive local timestamp, kept in the existing output format.
            'scraping_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }
src/__main__.py
1"""2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using7`scrapy crawl title_spider`.8
9We recommend you do not modify this file unless you really know what you are doing.10"""11
12# We need to configure the logging first before we import anything else, so that nothing else imports13# `scrapy.utils.log` before we patch it.14from __future__ import annotations15from logging import StreamHandler, getLogger16from typing import Any17from scrapy.utils import log as scrapy_logging18from scrapy.utils.project import get_project_settings19from apify.log import ActorLogFormatter20
21# Define names of the loggers.22MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']23OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']24ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES25
26# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,27# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for28# a specific logger, do it in this file.29settings = get_project_settings()30LOGGING_LEVEL = settings['LOG_LEVEL']31
32# Define a logging handler which will be used for the loggers.33apify_handler = StreamHandler()34apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))35
36
def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
    """
    Configure a logger with the specified settings.

    Any handlers already attached to the logger are discarded before the given ones are added, so
    calling this without handlers leaves the logger with no handlers of its own.

    Args:
        logger_name: The name of the logger to be configured (`None` selects the root logger).
        log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).
        handlers: Optional logging handlers to attach, in order.
    """
    logger = getLogger(logger_name)
    logger.setLevel(log_level)
    # Drop whatever was attached before so repeated configuration does not duplicate output.
    logger.handlers = []

    for handler in handlers:
        logger.addHandler(handler)
53
# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
# the `main.py` and Scrapy components.
for logger_name in MAIN_LOGGER_NAMES:
    configure_logger(logger_name, LOGGING_LEVEL, apify_handler)

# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
# otherwise we would lose some log messages.
old_configure_logging = scrapy_logging.configure_logging
66
def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    """
    Replacement for Scrapy's `configure_logging` that re-applies the Apify logging setup afterwards.

    We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
    logger is not sufficient, as Scrapy will override it with its own settings (see the loggers Scrapy sets up in
    https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77). Therefore, after delegating to the
    original function, we configure the root logger and every logger named in `ALL_LOGGER_NAMES`.

    Args:
        *args: Positional arguments forwarded unchanged to the original `configure_logging`.
        **kwargs: Keyword arguments forwarded unchanged to the original `configure_logging`.
    """
    old_configure_logging(*args, **kwargs)

    # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
    # property within spiders. See details in the Spider logger property:
    # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
    configure_logger(None, LOGGING_LEVEL, apify_handler)

    # We modify other loggers only by setting up their log level. A custom log handler is added
    # only to the root logger to avoid duplicate log messages.
    for logger_name in ALL_LOGGER_NAMES:
        configure_logger(logger_name, LOGGING_LEVEL)

    # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
    # messages, especially when running on the platform.
    configure_logger('httpx', 'WARNING')
90
# Install the patched function so every later `configure_logging` call from Scrapy goes through it.
scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup. These imports intentionally come after the logging patch above.
import asyncio
import os
import nest_asyncio
from scrapy.utils.reactor import install_reactor
from .main import main

# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor.
# The reactor installation must be done manually before calling `nest_asyncio.apply()`,
# otherwise, it will not work correctly on Windows.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

# Specify the path to the Scrapy project settings module
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

# Run the Apify main coroutine
asyncio.run(main())
src/items.py
1# Define here the models for your scraped items2#3# See documentation in:4# https://docs.scrapy.org/en/latest/topics/items.html5
6import scrapy7
8
class TitleItem(scrapy.Item):
    """Business record with the same keys that `TitleSpider.parse_detail` emits.

    `src/pipelines.py` imports this class by the name `TitleItem`.
    """

    name = scrapy.Field()
    tax_id = scrapy.Field()
    status = scrapy.Field()
    address = scrapy.Field()
    license_issue_date = scrapy.Field()
    industry = scrapy.Field()
    link = scrapy.Field()
    scraping_time = scrapy.Field()


# Backward-compatible alias for the class name generated by the Scrapy project template.
MyprojectItem = TitleItem
src/main.py
"""
This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine
processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by
applying Apify-related settings, which includes adding a custom scheduler, retry middleware, and an item pipeline
for pushing data to the Apify dataset.

Customization:
--------------

Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy
components like spiders and handling Actor input. However, make sure you have a clear understanding of your
modifications. For instance, removing `apply_apify_settings` breaks the integration between Scrapy and Apify.

Documentation:
--------------

For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and
other details, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
"""

21from __future__ import annotations22
23from scrapy.crawler import CrawlerProcess24
25from apify import Actor26from apify.scrapy.utils import apply_apify_settings27
28# Import your Scrapy spider here29from .spiders.title import TitleSpider as Spider30
# Default start URL for local execution using `apify run`. The spider expects ONE base listing URL as a plain
# string (it appends `trang-<page>/` to it), so this mirrors the `startUrls` default of the Actor input schema.
LOCAL_DEFAULT_START_URLS = 'https://infodoanhnghiep.com/Ha-Noi/'


async def main() -> None:
    """
    Apify Actor main coroutine for executing the Scrapy spider.

    Reads `startUrls`, `pageStart`, `pageEnd` and `proxyConfiguration` from the Actor input, applies the Apify
    settings on top of the Scrapy project settings, and runs the spider with those arguments.
    """
    async with Actor:
        Actor.log.info('Actor is being executed...')

        # Process Actor input
        actor_input = await Actor.get_input() or {}
        # `or` also covers an empty string, which would otherwise produce unusable page URLs.
        start_urls = actor_input.get('startUrls') or LOCAL_DEFAULT_START_URLS
        page_start = actor_input.get('pageStart', 1)
        page_end = actor_input.get('pageEnd', 100)
        proxy_config = actor_input.get('proxyConfiguration')

        # Apply Apify settings, it will override the Scrapy project settings
        settings = apply_apify_settings(proxy_config=proxy_config)

        # Execute the spider using Scrapy CrawlerProcess. The imported spider class is passed directly, so the
        # run does not depend on looking the spider up by its name.
        process = CrawlerProcess(settings, install_root_handler=False)
        process.crawl(Spider, start_urls=start_urls, page_start=page_start, page_end=page_end)
        process.start()
src/middlewares.py
"""
Scrapy middlewares module

This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
responses, typically used for adding custom headers, retrying requests, and handling exceptions.

There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information
on creating and utilizing them, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
https://docs.scrapy.org/en/latest/topics/spider-middleware.html
"""

13from __future__ import annotations14from typing import Generator, Iterable15
16from scrapy import Request, Spider, signals17from scrapy.crawler import Crawler18from scrapy.http import Response19
20# useful for handling different item types with a single interface21from itemadapter import is_item, ItemAdapter22
23
class TitleSpiderMiddleware:
    """Pass-through spider middleware: every hook returns its input unchanged.

    Not all methods need to be defined. If a method is not defined, Scrapy acts as if the spider
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware:
        """Create the middleware instance and subscribe it to the `spider_opened` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response: Response, spider: Spider) -> None:
        """Called for each response that goes through the spider middleware and into the spider.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(
        self,
        response: Response,
        result: Iterable,
        spider: Spider,
    ) -> Generator[object, None, None]:
        """Called with the results returned from the spider, after it has processed the response.

        Yields every Request or item object from `result` unchanged.
        """
        yield from result

    def process_spider_exception(
        self,
        response: Response,
        exception: BaseException,
        spider: Spider,
    ) -> Iterable[Request] | None:
        """Called when a spider or `process_spider_input()` (from another middleware) raises.

        Should return either None or an iterable of Request or item objects; this one returns None.
        """
        return None

    def process_start_requests(
        self, start_requests: Iterable[Request], spider: Spider
    ) -> Iterable[Request]:
        """Called with the start requests of the spider.

        Works like `process_spider_output()`, except that there is no associated response.
        Must yield only requests (not items).
        """
        yield from start_requests

    def spider_opened(self, spider: Spider) -> None:
        """Signal handler for `spider_opened`; intentionally does nothing."""
        return None
81
class TitleDownloaderMiddleware:
    """Pass-through downloader middleware: requests and responses are forwarded unchanged.

    Not all methods need to be defined. If a method is not defined, Scrapy acts as if the downloader
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware:
        """Create the middleware instance and subscribe it to the `spider_opened` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request: Request, spider: Spider) -> Request | Response | None:
        """Called for each request that goes through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of installed downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:
        """Called with the response returned from the downloader.

        Must either return a Response object, return a Request object, or raise IgnoreRequest.
        This implementation returns `response` unchanged.
        """
        return response

    def process_exception(self, request: Request, exception: BaseException, spider: Spider) -> Response | None:
        """Called when a download handler or a `process_request()` (from another middleware) raises.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops the process_exception() chain
        - return a Request object: stops the process_exception() chain

        This implementation always returns None.
        """
        return None

    def spider_opened(self, spider: Spider) -> None:
        """Signal handler for `spider_opened`; intentionally does nothing."""
        return None
src/pipelines.py
"""
Scrapy item pipelines module

This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
that handle the scraped items, typically used for cleaning, validating, and persisting data.

For detailed information on creating and utilizing item pipelines, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/item-pipeline.html
"""

11from scrapy import Spider12
13from .items import TitleItem14
15
class TitleItemPipeline:
    """
    This item pipeline defines processing steps for TitleItem objects scraped by spiders.

    It is currently a pass-through: items are returned exactly as received.
    """

    def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
        """Return `item` unchanged; add cleaning or persistence steps here when needed."""
        return item
src/settings.py
# Scrapy settings for the `src` project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import datetime
import os

BOT_NAME = "src"

SPIDER_MODULES = ["src.spiders"]
NEWSPIDER_MODULE = "src.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "myproject (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "myproject.middlewares.MyprojectSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "myproject.middlewares.MyprojectDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "myproject.pipelines.MyprojectPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Retry policy: only HTTP 429 (Too Many Requests) responses are retried, up to 5 times.
RETRY_ENABLED = True
RETRY_HTTP_CODES = [429]
RETRY_TIMES = 5

# Export scraped items as CSV to S3; %(name)s is the spider name and %(time)s the export timestamp.
FEEDS = {
    "s3://businessbucketscrapy/%(name)s/%(name)s_%(time)s.csv": {
        "format": "csv",
    }
}

# SECURITY: credentials must never be committed to source control. They are read from the environment
# instead (e.g. Actor environment variables marked as secret). The AWS key pair and the ScrapeOps API key
# that used to be hard-coded in this file are exposed and must be revoked/rotated.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
.dockerignore
# Git folder
.git

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.7.0
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.1
botocore==1.34.143
scrapy.cfg
[settings]
default = src.settings

[deploy]
project = src