1"""Scrapy middlewares module.
2
3This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
4responses, typically used for adding custom headers, retrying requests, and handling exceptions.
5
6There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information
7on creating and utilizing them, refer to the official documentation:
8https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
9https://docs.scrapy.org/en/latest/topics/spider-middleware.html
10"""
11
12from __future__ import annotations
13from typing import Generator, Iterable
14
15from scrapy import Request, Spider, signals
16from scrapy.crawler import Crawler
17from scrapy.http import Response
18
19# useful for handling different item types with a single interface
20from itemadapter import is_item, ItemAdapter
21
22
class TitleSpiderMiddleware:
    """Pass-through spider middleware.

    Every hook hands back what it receives without modification. Not all
    methods need to be defined: if a method is missing, Scrapy acts as if
    the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware:
        """Build the middleware instance and hook it to ``spider_opened``.

        Args:
            crawler: Crawler whose signal manager the new instance is
                connected to.

        Returns:
            The newly created middleware (not a spider).
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response: Response, spider: Spider) -> None:
        """Accept every response on its way into the spider.

        Called for each response that goes through the spider middleware
        and into the spider. Should return None or raise an exception;
        this implementation always returns None.
        """
        return None

    def process_spider_output(
        self,
        response: Response,
        result: Iterable,
        spider: Spider,
    ) -> Generator[object, None, None]:
        """Re-yield the spider's results unchanged.

        Called with the results returned from the spider, after it has
        processed the response. Each yielded element is a single Request
        or item object (not an iterable of them), hence the ``object``
        element type in the annotation.
        """
        yield from result

    def process_spider_exception(
        self,
        response: Response,
        exception: BaseException,
        spider: Spider,
    ) -> Iterable[object] | None:
        """Decline to handle the exception.

        Called when a spider or a process_spider_input() method (from
        another spider middleware) raises an exception. Should return
        either None or an iterable of Request or item objects; this
        implementation always returns None.
        """
        return None

    def process_start_requests(
        self, start_requests: Iterable[Request], spider: Spider
    ) -> Iterable[Request]:
        """Re-yield the spider's start requests unchanged.

        Called with the start requests of the spider. Works similarly to
        process_spider_output(), except that it does not have a response
        associated. Must yield only requests (not items).
        """
        yield from start_requests

    def spider_opened(self, spider: Spider) -> None:
        """Handle the ``spider_opened`` signal; intentionally a no-op."""
        return None
79
80
class TitleDownloaderMiddleware:
    """Pass-through downloader middleware.

    Every hook leaves requests, responses and exceptions untouched. Not
    all methods need to be defined: if a method is missing, Scrapy acts as
    if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware:
        """Build the middleware instance and hook it to ``spider_opened``.

        Args:
            crawler: Crawler whose signal manager the new instance is
                connected to.

        Returns:
            The newly created middleware (not a spider).
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request: Request, spider: Spider) -> Request | Response | None:
        """Let every request continue through the downloader chain.

        Called for each request that goes through the downloader
        middleware. Must either:

        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:
        """Return the downloaded response unchanged.

        Called with the response returned from the downloader. Must either:

        - return a Response object
        - return a Request object
        - or raise IgnoreRequest

        This implementation returns the very same ``response`` object.
        """
        return response

    def process_exception(
        self, request: Request, exception: BaseException, spider: Spider
    ) -> Request | Response | None:
        """Decline to handle the exception.

        Called when a download handler or a process_request() (from
        another downloader middleware) raises an exception. Must either:

        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain

        The return annotation includes ``Request`` because the contract
        above allows it. This implementation always returns None.
        """
        return None

    def spider_opened(self, spider: Spider) -> None:
        """Handle the ``spider_opened`` signal; intentionally a no-op."""
        return None