1"""Apify Actor integration for Scrapy projects.
2
3This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
4logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
5
This file is specifically designed to be executed when the project is run as an Apify Actor, either locally using
`apify run` or on the Apify platform. It is not executed when the project is run as a plain Scrapy project using
`scrapy crawl title_spider`.
9
10We recommend you do not modify this file unless you really know what you are doing.
11"""
12
13
14
15from __future__ import annotations
16from logging import StreamHandler, getLogger
17from typing import Any
18from scrapy.utils import log as scrapy_logging
19from scrapy.utils.project import get_project_settings
20from apify.log import ActorLogFormatter
21
22
# Loggers that should emit through the Apify handler directly.
MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
# Third-party library loggers that only need their level managed.
OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
# Every logger this module takes control of.
ALL_LOGGER_NAMES = [*MAIN_LOGGER_NAMES, *OTHER_LOGGER_NAMES]
26
27
28
29
# The Scrapy project settings are loaded at import time because the desired
# logging level is taken from the project's LOG_LEVEL setting.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']

# Single shared handler that formats records with the Apify log formatter
# (including the logger name), so output matches the Apify platform style.
apify_handler = StreamHandler()
apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))
36
37
38def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
39 """Configure a logger with the specified settings.
40
41 Args:
42 logger_name: The name of the logger to be configured.
43 log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).
44 handlers: Optional list of logging handlers.
45 """
46 logger = getLogger(logger_name)
47 logger.setLevel(log_level)
48 logger.handlers = []
49
50 for handler in handlers:
51 logger.addHandler(handler)
52
53
54
55
# Route the main loggers through the Apify handler at the configured level.
for main_logger_name in MAIN_LOGGER_NAMES:
    configure_logger(main_logger_name, LOGGING_LEVEL, apify_handler)
58
59
60
61
62
63
64old_configure_logging = scrapy_logging.configure_logging
65
66
def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    """Replacement for Scrapy's ``configure_logging`` with Apify-aware setup.

    Configuring only the root logger is not sufficient, because Scrapy
    overrides it with its own settings. Scrapy relies on four primary loggers
    (https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77),
    so both the root logger and every known library logger are configured here.
    """
    # Let Scrapy perform its own logging setup first, then override it.
    old_configure_logging(*args, **kwargs)

    # Attach the Apify handler to the root logger; the loggers below are left
    # handler-less and propagate their records up to it.
    configure_logger(None, LOGGING_LEVEL, apify_handler)

    # Strip handlers from every known logger so each record is emitted exactly
    # once, via the root logger configured above.
    for name in ALL_LOGGER_NAMES:
        configure_logger(name, LOGGING_LEVEL)

    # httpx is noisy below WARNING, so cap its level regardless of LOG_LEVEL.
    configure_logger('httpx', 'WARNING')
90
91
92scrapy_logging.configure_logging = new_configure_logging
93
94
# NOTE: these imports are deliberately placed after the logging patch above;
# importing `.main` (and anything it pulls in) may create loggers, which must
# happen only once the Apify logging configuration is in place.
import asyncio
import os
import nest_asyncio
from scrapy.utils.reactor import install_reactor
from .main import main


# Install the asyncio-based Twisted reactor before any other code imports a
# reactor, then patch asyncio (via nest_asyncio) to tolerate re-entrant event
# loops, which running Scrapy inside `asyncio.run` requires.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()


# Point Scrapy at this project's settings module for the spider run.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'


# Run the Actor's asynchronous entry point.
asyncio.run(main())