1"""
2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
7`scrapy crawl title_spider`.
8
9We recommend you do not modify this file unless you really know what you are doing.
10"""
11
12
13
14from __future__ import annotations
15from logging import StreamHandler, getLogger
16from typing import Any
17from scrapy.utils import log as scrapy_logging
18from scrapy.utils.project import get_project_settings
19from apify.log import ActorLogFormatter
20
21
# Loggers that get the Apify stream handler attached directly (see the loop below).
MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
# Additional third-party loggers whose level is managed here but which get no
# dedicated handler of their own.
OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
# Every logger this module (re)configures when Scrapy sets up logging.
ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES
25
26
27
28
# Resolve the Scrapy project settings; their LOG_LEVEL drives the level used
# for every logger configured in this module.
settings = get_project_settings()
LOGGING_LEVEL = settings['LOG_LEVEL']


# Single shared stream handler that formats records the way the Apify
# platform expects (including the logger name in each record).
apify_handler = StreamHandler()
apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))
35
36
def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
    """
    Set a logger's level and replace its handlers with exactly the given ones.

    Args:
        logger_name: The name of the logger to be configured.
        log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).
        handlers: Optional list of logging handlers.
    """
    target = getLogger(logger_name)
    target.setLevel(log_level)
    # Drop whatever handlers were attached before, then add ours one by one.
    target.handlers = []
    for new_handler in handlers:
        target.addHandler(new_handler)
52
53
54
55
# Route each of the main loggers through the Apify handler at the configured level.
for logger_name in MAIN_LOGGER_NAMES:
    configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
58
59
60
61
62
63
# Keep a reference to Scrapy's original logging setup so the wrapper below can
# delegate to it before re-applying our own configuration.
old_configure_logging = scrapy_logging.configure_logging


def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    """
    Wrap Scrapy's `configure_logging` so this module's logger setup survives it.

    Configuring only the root logger is not sufficient, because Scrapy
    overrides it with its own settings and also uses four primary loggers of
    its own (https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77).
    After delegating to the original function we therefore re-configure the
    root logger as well as every logger this module knows about.
    """
    old_configure_logging(*args, **kwargs)

    # The root logger carries the Apify handler; named loggers propagate to it.
    configure_logger(None, LOGGING_LEVEL, apify_handler)

    # Reset the level of every managed logger and strip any handlers Scrapy added.
    for name in ALL_LOGGER_NAMES:
        configure_logger(name, LOGGING_LEVEL)

    # httpx is particularly chatty even at INFO, so cap it at WARNING regardless.
    configure_logger('httpx', 'WARNING')


scrapy_logging.configure_logging = new_configure_logging
92
93
# NOTE(review): these imports are intentionally placed *after* the logging
# patching above — presumably so that any log records emitted at import time
# (e.g. by `.main`) already go through the patched configuration; confirm
# before reordering.
import asyncio
import os
import nest_asyncio
from scrapy.utils.reactor import install_reactor
from .main import main


# Scrapy runs on Twisted; install the asyncio-based reactor and allow nested
# event loops via nest_asyncio so Twisted and asyncio code can share one loop.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

# Point Scrapy at this project's settings module before the spider starts.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

# Run the Actor's async entry point until completion.
asyncio.run(main())