ScrapyFy avatar

ScrapyFy

Deprecated
Go to Store
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
ScrapyFy

ScrapyFy

jupri/scrapyfy

Scrapy Runner

.actor/Dockerfile

1# First, specify the base Docker image.
2# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
3# You can also use any other image from Docker Hub.
4FROM apify/actor-python:3.11
5
6# Second, copy just requirements.txt into the actor image,
7# since it should be the only file that affects the dependency install in the next step,
8# in order to speed up the build
9COPY requirements.txt ./
10
11# Install the packages specified in requirements.txt,
12# Print the installed Python version, pip version
13# and all installed packages with their versions for debugging
14RUN echo "Python version:" \
15 && python --version \
16 && echo "Pip version:" \
17 && pip --version \
18 && echo "Installing dependencies:" \
19 && pip install -r requirements.txt \
20 && echo "All installed Python packages:" \
21 && pip freeze
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after installing the dependencies, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28# Specify how to launch the source code of your actor.
29# This Actor is launched with the "python3 -m main" command (i.e. main.py)
30CMD ["python3", "-m", "main"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-1",
4    "title": "Getting started with Python and Scrapy",
5    "description": "Scrapes titles of websites using Scrapy.",
6    "version": "0.0",
7    "meta": {
8         "templateId": "python-scrapy"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile",
12    "storages": {
13        "dataset": {
14            "actorSpecification": 1,
15            "title": "URLs and their titles",
16            "views": {
17                "titles": {
18                    "title": "URLs and their titles",
19                    "transformation": {
20                        "fields": [
21                            "url",
22                            "title"
23                        ]
24                    },
25                    "display": {
26                        "component": "table",
27                        "properties": {
28                            "url": {
29                                "label": "URL",
30                                "format": "text"
31                            },
32                            "title": {
33                                "label": "Title",
34                                "format": "text"
35                            }
36                        }
37                    }
38                }
39            }
40        }
41    }
42}

.actor/input_schema.json

1{
2    "title": "Python Scrapy Scraper",
3    "type": "object",
4    "schemaVersion": 1,
5    "required": ["spiders_code"],
6    "properties": {
7        "spiders_code": {
8            "title": "Spiders Code",
9            "type": "string",
10            "description": "Define your spiders <i>(multiple spiders can be specified)</i>.",
11            "editor": "python",
12            "prefill": "from urllib.parse import urljoin\r\n\r\n### multiple spiders can be specified\r\n\r\nclass TitleSpider(scrapy.Spider):\r\n\r\n    name = 'title_spider'\r\n    allowed_domains = [\"apify.com\"]\r\n    start_urls = [\"https://apify.com\"]\r\n\r\n    custom_settings = {\r\n        'REQUEST_FINGERPRINTER_IMPLEMENTATION'  : '2.7',\r\n        # Obey robots.txt rules\r\n        'ROBOTSTXT_OBEY'                        : True,\r\n        'DEPTH_LIMIT'                           : 2,\r\n        'LOG_ENABLED'                           : False,\r\n        #'CLOSESPIDER_PAGECOUNT'                 : 5,\r\n        'CLOSESPIDER_ITEMCOUNT'                 : 5,\r\n    }\r\n\r\n    def parse(self, response):\r\n        yield {\r\n            'url': response.url,\r\n            'title': response.css('title::text').extract_first(),\r\n        }\r\n        for link_href in response.css('a::attr(\"href\")'):\r\n            link_url = urljoin(response.url, link_href.get())\r\n            if link_url.startswith(('http://', 'https://')):\r\n                yield scrapy.Request(link_url)"
13        },
14
15    "AJAXCRAWL_ENABLED": {"title": "AJAXCRAWL_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
16    "ASYNCIO_EVENT_LOOP": {"title": "ASYNCIO_EVENT_LOOP","type": "string","editor": "textfield","description": ""},
17    "AUTOTHROTTLE_DEBUG": {"title": "AUTOTHROTTLE_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
18    "AUTOTHROTTLE_ENABLED": {"title": "AUTOTHROTTLE_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
19    "AUTOTHROTTLE_MAX_DELAY": {"title": "AUTOTHROTTLE_MAX_DELAY","type": "integer","default": 60,"description": ""},
20    "AUTOTHROTTLE_START_DELAY": {"title": "AUTOTHROTTLE_START_DELAY","type": "integer","default": 5,"description": ""},
21    "AUTOTHROTTLE_TARGET_CONCURRENCY": {"title": "AUTOTHROTTLE_TARGET_CONCURRENCY","type": "integer","default": 1,"description": ""},
22    "BOT_NAME": {"title": "BOT_NAME","type": "string","default": "scrapybot","editor": "textfield","description": ""},
23    "CLOSESPIDER_ERRORCOUNT": {"title": "CLOSESPIDER_ERRORCOUNT","type": "integer","default": 0,"description": ""},
24    "CLOSESPIDER_ITEMCOUNT": {"title": "CLOSESPIDER_ITEMCOUNT","type": "integer","default": 0,"description": ""},
25    "CLOSESPIDER_PAGECOUNT": {"title": "CLOSESPIDER_PAGECOUNT","type": "integer","default": 0,"description": ""},
26    "CLOSESPIDER_TIMEOUT": {"title": "CLOSESPIDER_TIMEOUT","type": "integer","default": 0,"description": ""},
27    "COMMANDS_MODULE": {"title": "COMMANDS_MODULE","type": "string","default": "","editor": "textfield","description": ""},
28    "COMPRESSION_ENABLED": {"title": "COMPRESSION_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
29    "CONCURRENT_ITEMS": {"title": "CONCURRENT_ITEMS","type": "integer","default": 100,"description": ""},
30    "CONCURRENT_REQUESTS": {"title": "CONCURRENT_REQUESTS","type": "integer","default": 16,"description": ""},
31    "CONCURRENT_REQUESTS_PER_DOMAIN": {"title": "CONCURRENT_REQUESTS_PER_DOMAIN","type": "integer","default": 8,"description": ""},
32    "CONCURRENT_REQUESTS_PER_IP": {"title": "CONCURRENT_REQUESTS_PER_IP","type": "integer","default": 0,"description": ""},
33    "COOKIES_DEBUG": {"title": "COOKIES_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
34    "COOKIES_ENABLED": {"title": "COOKIES_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
35    "DEFAULT_ITEM_CLASS": {"title": "DEFAULT_ITEM_CLASS","type": "string","default": "scrapy.item.Item","editor": "textfield","description": ""},
36    "DEFAULT_REQUEST_HEADERS": {"title": "DEFAULT_REQUEST_HEADERS","type": "object","prefill": {
37            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
38            "Accept-Language": "en"
39        },"editor": "json","description": ""},
40    "DEPTH_LIMIT": {"title": "DEPTH_LIMIT","type": "integer","default": 0,"description": ""},
41    "DEPTH_PRIORITY": {"title": "DEPTH_PRIORITY","type": "integer","default": 0,"description": ""},
42    "DEPTH_STATS_VERBOSE": {"title": "DEPTH_STATS_VERBOSE","type": "boolean","default": false,"nullable": true,"description": ""},
43    "DNSCACHE_ENABLED": {"title": "DNSCACHE_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
44    "DNSCACHE_SIZE": {"title": "DNSCACHE_SIZE","type": "integer","default": 10000,"description": ""},
45    "DNS_RESOLVER": {"title": "DNS_RESOLVER","type": "string","default": "scrapy.resolver.CachingThreadedResolver","editor": "textfield","description": ""},
46    "DNS_TIMEOUT": {"title": "DNS_TIMEOUT","type": "integer","default": 60,"description": ""},
47    "DOWNLOADER": {"title": "DOWNLOADER","type": "string","default": "scrapy.core.downloader.Downloader","editor": "textfield","description": ""},
48    "DOWNLOADER_CLIENTCONTEXTFACTORY": {"title": "DOWNLOADER_CLIENTCONTEXTFACTORY","type": "string","default": "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory","editor": "textfield","description": ""},
49    "DOWNLOADER_CLIENT_TLS_CIPHERS": {"title": "DOWNLOADER_CLIENT_TLS_CIPHERS","type": "string","default": "DEFAULT","editor": "textfield","description": ""},
50    "DOWNLOADER_CLIENT_TLS_METHOD": {"title": "DOWNLOADER_CLIENT_TLS_METHOD","type": "string","default": "TLS","editor": "textfield","description": ""},
51    "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING": {"title": "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING","type": "boolean","default": false,"nullable": true,"description": ""},
52    "DOWNLOADER_HTTPCLIENTFACTORY": {"title": "DOWNLOADER_HTTPCLIENTFACTORY","type": "string","default": "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory","editor": "textfield","description": ""},
53    "DOWNLOADER_MIDDLEWARES": {"title": "DOWNLOADER_MIDDLEWARES","type": "object","prefill": {},"editor": "json","description": ""},
54    "DOWNLOADER_MIDDLEWARES_BASE": {"title": "DOWNLOADER_MIDDLEWARES_BASE","type": "object","prefill": {
55            "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
56            "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
57            "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
58            "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
59            "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
60            "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
61            "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
62            "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
63            "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
64            "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
65            "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
66            "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
67            "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
68            "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900
69        },"editor": "json","description": ""},
70    "DOWNLOADER_STATS": {"title": "DOWNLOADER_STATS","type": "boolean","default": true,"nullable": true,"description": ""},
71    "DOWNLOAD_DELAY": {"title": "DOWNLOAD_DELAY","type": "integer","default": 0,"description": ""},
72    "DOWNLOAD_FAIL_ON_DATALOSS": {"title": "DOWNLOAD_FAIL_ON_DATALOSS","type": "boolean","default": true,"nullable": true,"description": ""},
73    "DOWNLOAD_HANDLERS": {"title": "DOWNLOAD_HANDLERS","type": "object","prefill": {},"editor": "json","description": ""},
74    "DOWNLOAD_HANDLERS_BASE": {"title": "DOWNLOAD_HANDLERS_BASE","type": "object","prefill": {
75            "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
76            "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
77            "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
78            "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
79            "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
80            "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler"
81        },"editor": "json","description": ""},
82    "DOWNLOAD_MAXSIZE": {"title": "DOWNLOAD_MAXSIZE","type": "integer","default": 1073741824,"description": ""},
83    "DOWNLOAD_TIMEOUT": {"title": "DOWNLOAD_TIMEOUT","type": "integer","default": 180,"description": ""},
84    "DOWNLOAD_WARNSIZE": {"title": "DOWNLOAD_WARNSIZE","type": "integer","default": 33554432,"description": ""},
85    "DUPEFILTER_CLASS": {"title": "DUPEFILTER_CLASS","type": "string","default": "scrapy.dupefilters.RFPDupeFilter","editor": "textfield","description": ""},
86    "EDITOR": {"title": "EDITOR","type": "string","default": "vi","editor": "textfield","description": ""},
87    "EXTENSIONS": {"title": "EXTENSIONS","type": "object","prefill": {},"editor": "json","description": ""},
88    "EXTENSIONS_BASE": {"title": "EXTENSIONS_BASE","type": "object","prefill": {
89            "scrapy.extensions.corestats.CoreStats": 0,
90            "scrapy.extensions.telnet.TelnetConsole": 0,
91            "scrapy.extensions.memusage.MemoryUsage": 0,
92            "scrapy.extensions.memdebug.MemoryDebugger": 0,
93            "scrapy.extensions.closespider.CloseSpider": 0,
94            "scrapy.extensions.feedexport.FeedExporter": 0,
95            "scrapy.extensions.logstats.LogStats": 0,
96            "scrapy.extensions.spiderstate.SpiderState": 0,
97            "scrapy.extensions.throttle.AutoThrottle": 0
98        },"editor": "json","description": ""},
99    "FEEDS": {"title": "FEEDS","type": "object","prefill": {},"editor": "json","description": ""},
100    "FEED_EXPORTERS": {"title": "FEED_EXPORTERS","type": "object","prefill": {},"editor": "json","description": ""},
101    "FEED_EXPORTERS_BASE": {"title": "FEED_EXPORTERS_BASE","type": "object","prefill": {
102            "json": "scrapy.exporters.JsonItemExporter",
103            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
104            "jsonl": "scrapy.exporters.JsonLinesItemExporter",
105            "jl": "scrapy.exporters.JsonLinesItemExporter",
106            "csv": "scrapy.exporters.CsvItemExporter",
107            "xml": "scrapy.exporters.XmlItemExporter",
108            "marshal": "scrapy.exporters.MarshalItemExporter",
109            "pickle": "scrapy.exporters.PickleItemExporter"
110        },"editor": "json","description": ""},
111    "FEED_EXPORT_BATCH_ITEM_COUNT": {"title": "FEED_EXPORT_BATCH_ITEM_COUNT","type": "integer","default": 0,"description": ""},
112    "FEED_EXPORT_ENCODING": {"title": "FEED_EXPORT_ENCODING","type": "string","editor": "textfield","description": ""},
113    "FEED_EXPORT_FIELDS": {"title": "FEED_EXPORT_FIELDS","type": "string","editor": "textfield","description": ""},
114    "FEED_EXPORT_INDENT": {"title": "FEED_EXPORT_INDENT","type": "integer","default": 0,"description": ""},
115    "FEED_STORAGES": {"title": "FEED_STORAGES","type": "object","prefill": {},"editor": "json","description": ""},
116    "FEED_STORAGES_BASE": {"title": "FEED_STORAGES_BASE","type": "object","prefill": {
117            "": "scrapy.extensions.feedexport.FileFeedStorage",
118            "file": "scrapy.extensions.feedexport.FileFeedStorage",
119            "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
120            "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
121            "s3": "scrapy.extensions.feedexport.S3FeedStorage",
122            "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage"
123        },"editor": "json","description": ""},
124    "FEED_STORAGE_FTP_ACTIVE": {"title": "FEED_STORAGE_FTP_ACTIVE","type": "boolean","default": false,"nullable": true,"description": ""},
125    "FEED_STORAGE_GCS_ACL": {"title": "FEED_STORAGE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
126    "FEED_STORAGE_S3_ACL": {"title": "FEED_STORAGE_S3_ACL","type": "string","default": "","editor": "textfield","description": ""},
127    "FEED_STORE_EMPTY": {"title": "FEED_STORE_EMPTY","type": "boolean","default": false,"nullable": true,"description": ""},
128    "FEED_TEMPDIR": {"title": "FEED_TEMPDIR","type": "string","editor": "textfield","description": ""},
129    "FEED_URI_PARAMS": {"title": "FEED_URI_PARAMS","type": "string","editor": "textfield","description": ""},
130    "FILES_STORE_GCS_ACL": {"title": "FILES_STORE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
131    "FILES_STORE_S3_ACL": {"title": "FILES_STORE_S3_ACL","type": "string","default": "private","editor": "textfield","description": ""},
132    "FTP_PASSIVE_MODE": {"title": "FTP_PASSIVE_MODE","type": "boolean","default": true,"nullable": true,"description": ""},
133    "FTP_PASSWORD": {"title": "FTP_PASSWORD","type": "string","default": "guest","editor": "textfield","description": ""},
134    "FTP_USER": {"title": "FTP_USER","type": "string","default": "anonymous","editor": "textfield","description": ""},
135    "GCS_PROJECT_ID": {"title": "GCS_PROJECT_ID","type": "string","editor": "textfield","description": ""},
136    "HTTPCACHE_ALWAYS_STORE": {"title": "HTTPCACHE_ALWAYS_STORE","type": "boolean","default": false,"nullable": true,"description": ""},
137    "HTTPCACHE_DBM_MODULE": {"title": "HTTPCACHE_DBM_MODULE","type": "string","default": "dbm","editor": "textfield","description": ""},
138    "HTTPCACHE_DIR": {"title": "HTTPCACHE_DIR","type": "string","default": "httpcache","editor": "textfield","description": ""},
139    "HTTPCACHE_ENABLED": {"title": "HTTPCACHE_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
140    "HTTPCACHE_EXPIRATION_SECS": {"title": "HTTPCACHE_EXPIRATION_SECS","type": "integer","default": 0,"description": ""},
141    "HTTPCACHE_GZIP": {"title": "HTTPCACHE_GZIP","type": "boolean","default": false,"nullable": true,"description": ""},
142    "HTTPCACHE_IGNORE_HTTP_CODES": {"title": "HTTPCACHE_IGNORE_HTTP_CODES","type": "array","prefill": [],"editor": "stringList","description": ""},
143    "HTTPCACHE_IGNORE_MISSING": {"title": "HTTPCACHE_IGNORE_MISSING","type": "boolean","default": false,"nullable": true,"description": ""},
144    "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": {"title": "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS","type": "array","prefill": [],"editor": "stringList","description": ""},
145    "HTTPCACHE_IGNORE_SCHEMES": {"title": "HTTPCACHE_IGNORE_SCHEMES","type": "array","prefill": [
146            "file"
147        ],"editor": "stringList","description": ""},
148    "HTTPCACHE_POLICY": {"title": "HTTPCACHE_POLICY","type": "string","default": "scrapy.extensions.httpcache.DummyPolicy","editor": "textfield","description": ""},
149    "HTTPCACHE_STORAGE": {"title": "HTTPCACHE_STORAGE","type": "string","default": "scrapy.extensions.httpcache.FilesystemCacheStorage","editor": "textfield","description": ""},
150    "HTTPPROXY_AUTH_ENCODING": {"title": "HTTPPROXY_AUTH_ENCODING","type": "string","default": "latin-1","editor": "textfield","description": ""},
151    "HTTPPROXY_ENABLED": {"title": "HTTPPROXY_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
152    "IMAGES_STORE_GCS_ACL": {"title": "IMAGES_STORE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
153    "IMAGES_STORE_S3_ACL": {"title": "IMAGES_STORE_S3_ACL","type": "string","default": "private","editor": "textfield","description": ""},
154    "ITEM_PIPELINES": {"title": "ITEM_PIPELINES","type": "object","prefill": {},"editor": "json","description": ""},
155    "ITEM_PIPELINES_BASE": {"title": "ITEM_PIPELINES_BASE","type": "object","prefill": {},"editor": "json","description": ""},
156    "ITEM_PROCESSOR": {"title": "ITEM_PROCESSOR","type": "string","default": "scrapy.pipelines.ItemPipelineManager","editor": "textfield","description": ""},
157    "LOGSTATS_INTERVAL": {"title": "LOGSTATS_INTERVAL","type": "integer","default": 60,"description": ""},
158    "LOG_DATEFORMAT": {"title": "LOG_DATEFORMAT","type": "string","default": "%Y-%m-%d %H:%M:%S","editor": "textfield","description": ""},
159    "LOG_ENABLED": {"title": "LOG_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
160    "LOG_ENCODING": {"title": "LOG_ENCODING","type": "string","default": "utf-8","editor": "textfield","description": ""},
161    "LOG_FILE": {"title": "LOG_FILE","type": "string","editor": "textfield","description": ""},
162    "LOG_FILE_APPEND": {"title": "LOG_FILE_APPEND","type": "boolean","default": true,"nullable": true,"description": ""},
163    "LOG_FORMAT": {"title": "LOG_FORMAT","type": "string","default": "%(asctime)s [%(name)s] %(levelname)s: %(message)s","editor": "textfield","description": ""},
164    "LOG_FORMATTER": {"title": "LOG_FORMATTER","type": "string","default": "scrapy.logformatter.LogFormatter","editor": "textfield","description": ""},
165    "LOG_LEVEL": {"title": "LOG_LEVEL","type": "string","default": "DEBUG","editor": "textfield","description": ""},
166    "LOG_SHORT_NAMES": {"title": "LOG_SHORT_NAMES","type": "boolean","default": false,"nullable": true,"description": ""},
167    "LOG_STDOUT": {"title": "LOG_STDOUT","type": "boolean","default": false,"nullable": true,"description": ""},
168    "MAIL_FROM": {"title": "MAIL_FROM","type": "string","default": "scrapy@localhost","editor": "textfield","description": ""},
169    "MAIL_HOST": {"title": "MAIL_HOST","type": "string","default": "localhost","editor": "textfield","description": ""},
170    "MAIL_PASS": {"title": "MAIL_PASS","type": "string","editor": "textfield","description": ""},
171    "MAIL_PORT": {"title": "MAIL_PORT","type": "integer","default": 25,"description": ""},
172    "MAIL_USER": {"title": "MAIL_USER","type": "string","editor": "textfield","description": ""},
173    "MEMDEBUG_ENABLED": {"title": "MEMDEBUG_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
174    "MEMDEBUG_NOTIFY": {"title": "MEMDEBUG_NOTIFY","type": "array","prefill": [],"editor": "stringList","description": ""},
175    "MEMUSAGE_CHECK_INTERVAL_SECONDS": {"title": "MEMUSAGE_CHECK_INTERVAL_SECONDS","type": "integer","default": 60,"description": ""},
176    "MEMUSAGE_ENABLED": {"title": "MEMUSAGE_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
177    "MEMUSAGE_LIMIT_MB": {"title": "MEMUSAGE_LIMIT_MB","type": "integer","default": 0,"description": ""},
178    "MEMUSAGE_NOTIFY_MAIL": {"title": "MEMUSAGE_NOTIFY_MAIL","type": "array","prefill": [],"editor": "stringList","description": ""},
179    "MEMUSAGE_WARNING_MB": {"title": "MEMUSAGE_WARNING_MB","type": "integer","default": 0,"description": ""},
180    "METAREFRESH_ENABLED": {"title": "METAREFRESH_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
181    "METAREFRESH_IGNORE_TAGS": {"title": "METAREFRESH_IGNORE_TAGS","type": "array","prefill": [],"editor": "stringList","description": ""},
182    "METAREFRESH_MAXDELAY": {"title": "METAREFRESH_MAXDELAY","type": "integer","default": 100,"description": ""},
183    "NEWSPIDER_MODULE": {"title": "NEWSPIDER_MODULE","type": "string","default": "","editor": "textfield","description": ""},
184    "RANDOMIZE_DOWNLOAD_DELAY": {"title": "RANDOMIZE_DOWNLOAD_DELAY","type": "boolean","default": true,"nullable": true,"description": ""},
185    "REACTOR_THREADPOOL_MAXSIZE": {"title": "REACTOR_THREADPOOL_MAXSIZE","type": "integer","default": 10,"description": ""},
186    "REDIRECT_ENABLED": {"title": "REDIRECT_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
187    "REDIRECT_MAX_TIMES": {"title": "REDIRECT_MAX_TIMES","type": "integer","default": 20,"description": ""},
188    "REDIRECT_PRIORITY_ADJUST": {"title": "REDIRECT_PRIORITY_ADJUST","type": "integer","default": 2,"description": ""},
189    "REFERER_ENABLED": {"title": "REFERER_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
190    "REFERRER_POLICY": {"title": "REFERRER_POLICY","type": "string","default": "scrapy.spidermiddlewares.referer.DefaultReferrerPolicy","editor": "textfield","description": ""},
191    "REQUEST_FINGERPRINTER_CLASS": {"title": "REQUEST_FINGERPRINTER_CLASS","type": "string","default": "scrapy.utils.request.RequestFingerprinter","editor": "textfield","description": ""},
192    "REQUEST_FINGERPRINTER_IMPLEMENTATION": {"title": "REQUEST_FINGERPRINTER_IMPLEMENTATION","type": "string","default": "2.6","editor": "textfield","description": ""},
193    "RETRY_ENABLED": {"title": "RETRY_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
194    "RETRY_HTTP_CODES": {"title": "RETRY_HTTP_CODES","type": "array","prefill": [
195            500,
196            502,
197            503,
198            504,
199            522,
200            524,
201            408,
202            429
203        ],"editor": "stringList","description": ""},
204    "RETRY_PRIORITY_ADJUST": {"title": "RETRY_PRIORITY_ADJUST","type": "integer","default": -1,"description": ""},
205    "RETRY_TIMES": {"title": "RETRY_TIMES","type": "integer","default": 2,"description": ""},
206    "ROBOTSTXT_OBEY": {"title": "ROBOTSTXT_OBEY","type": "boolean","default": false,"nullable": true,"description": ""},
207    "ROBOTSTXT_PARSER": {"title": "ROBOTSTXT_PARSER","type": "string","default": "scrapy.robotstxt.ProtegoRobotParser","editor": "textfield","description": ""},
208    "ROBOTSTXT_USER_AGENT": {"title": "ROBOTSTXT_USER_AGENT","type": "string","editor": "textfield","description": ""},
209    "SCHEDULER": {"title": "SCHEDULER","type": "string","default": "scrapy.core.scheduler.Scheduler","editor": "textfield","description": ""},
210    "SCHEDULER_DEBUG": {"title": "SCHEDULER_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
211    "SCHEDULER_DISK_QUEUE": {"title": "SCHEDULER_DISK_QUEUE","type": "string","default": "scrapy.squeues.PickleLifoDiskQueue","editor": "textfield","description": ""},
212    "SCHEDULER_MEMORY_QUEUE": {"title": "SCHEDULER_MEMORY_QUEUE","type": "string","default": "scrapy.squeues.LifoMemoryQueue","editor": "textfield","description": ""},
213    "SCHEDULER_PRIORITY_QUEUE": {"title": "SCHEDULER_PRIORITY_QUEUE","type": "string","default": "scrapy.pqueues.ScrapyPriorityQueue","editor": "textfield","description": ""},
214    "SCRAPER_SLOT_MAX_ACTIVE_SIZE": {"title": "SCRAPER_SLOT_MAX_ACTIVE_SIZE","type": "integer","default": 5000000,"description": ""},
215    "SPIDER_CONTRACTS": {"title": "SPIDER_CONTRACTS","type": "object","prefill": {},"editor": "json","description": ""},
216    "SPIDER_CONTRACTS_BASE": {"title": "SPIDER_CONTRACTS_BASE","type": "object","prefill": {
217            "scrapy.contracts.default.UrlContract": 1,
218            "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
219            "scrapy.contracts.default.ReturnsContract": 2,
220            "scrapy.contracts.default.ScrapesContract": 3
221        },"editor": "json","description": ""},
222    "SPIDER_LOADER_CLASS": {"title": "SPIDER_LOADER_CLASS","type": "string","default": "scrapy.spiderloader.SpiderLoader","editor": "textfield","description": ""},
223    "SPIDER_LOADER_WARN_ONLY": {"title": "SPIDER_LOADER_WARN_ONLY","type": "boolean","default": false,"nullable": true,"description": ""},
224    "SPIDER_MIDDLEWARES": {"title": "SPIDER_MIDDLEWARES","type": "object","prefill": {},"editor": "json","description": ""},
225    "SPIDER_MIDDLEWARES_BASE": {"title": "SPIDER_MIDDLEWARES_BASE","type": "object","prefill": {
226            "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
227            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
228            "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
229            "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
230            "scrapy.spidermiddlewares.depth.DepthMiddleware": 900
231        },"editor": "json","description": ""},
232    "SPIDER_MODULES": {"title": "SPIDER_MODULES","type": "array","prefill": [],"editor": "stringList","description": ""},
233    "STATSMAILER_RCPTS": {"title": "STATSMAILER_RCPTS","type": "array","prefill": [],"editor": "stringList","description": ""},
234    "STATS_CLASS": {"title": "STATS_CLASS","type": "string","default": "scrapy.statscollectors.MemoryStatsCollector","editor": "textfield","description": ""},
235    "STATS_DUMP": {"title": "STATS_DUMP","type": "boolean","default": true,"nullable": true,"description": ""},
236    "TELNETCONSOLE_ENABLED": {"title": "TELNETCONSOLE_ENABLED","type": "integer","default": 1,"description": ""},
237    "TELNETCONSOLE_HOST": {"title": "TELNETCONSOLE_HOST","type": "string","default": "127.0.0.1","editor": "textfield","description": ""},
238    "TELNETCONSOLE_PASSWORD": {"title": "TELNETCONSOLE_PASSWORD","type": "string","editor": "textfield","description": ""},
239    "TELNETCONSOLE_PORT": {"title": "TELNETCONSOLE_PORT","type": "array","prefill": [
240            6023,
241            6073
242        ],"editor": "stringList","description": ""},
243    "TELNETCONSOLE_USERNAME": {"title": "TELNETCONSOLE_USERNAME","type": "string","default": "scrapy","editor": "textfield","description": ""},
244    "TEMPLATES_DIR": {"title": "TEMPLATES_DIR","type": "string","default": "/usr/local/lib/python3.10/dist-packages/scrapy/templates","editor": "textfield","description": ""},
245    "TWISTED_REACTOR": {"title": "TWISTED_REACTOR","type": "string","editor": "textfield","description": ""},
246    "URLLENGTH_LIMIT": {"title": "URLLENGTH_LIMIT","type": "integer","default": 2083,"description": ""},
247    "USER_AGENT": {"title": "USER_AGENT","type": "string","default": "Scrapy/2.9.0 (+https://scrapy.org)","editor": "textfield","description": ""}
248    }
249}

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage
8
9.venv/
10.env/
11__pypackages__
12dist/
13build/
14*.egg-info/
15*.egg
16
17__pycache__
18
19.mypy_cache
20.dmypy.json
21dmypy.json
22.pytest_cache
23
24.scrapy
25*.log

main.py

1'''
2import logging
3from apify.log import ActorLogFormatter
4import scrapy.utils.log
5
6handler = logging.StreamHandler()
7handler.setFormatter(ActorLogFormatter())
8
9apify_logger = logging.getLogger('apify')
10apify_logger.setLevel(logging.INFO)
11apify_logger.addHandler(handler)
12
13apify_client_logger = logging.getLogger('apify_client')
14apify_client_logger.setLevel(logging.DEBUG)
15apify_client_logger.addHandler(handler)
16'''
17
18# We can't attach our log handler to the loggers normally,
19# because Scrapy would remove them in the `configure_logging` call here:
20# https://github.com/scrapy/scrapy/blob/a5c1ef82762c6c0910abea00c0a6249c40005e44/scrapy/utils/log.py#L95
21# (even though `disable_existing_loggers` is set to False :facepalm:).
22# We need to monkeypatch Scrapy's `configure_logging` method like this,
23# so that our handler is attached right after Scrapy calls the `configure_logging` method,
24# because otherwise we would lose some log messages.
25'''
26old_configure_logging = scrapy.utils.log.configure_logging
27
28def new_configure_logging(*args, **kwargs):
29    old_configure_logging(*args, **kwargs)
30
31    # Scrapy uses these four main loggers: https://github.com/scrapy/scrapy/blob/a5c1ef82762c6c0910abea00c0a6249c40005e44/scrapy/utils/log.py#L44-L61
32    logging.getLogger('scrapy').addHandler(handler)
33    logging.getLogger('twisted').addHandler(handler)
34    logging.getLogger('filelock').addHandler(handler)
35    logging.getLogger('hpack').addHandler(handler)
36
37scrapy.utils.log.configure_logging = new_configure_logging
38'''
39
40#################################################################################################################
41import asyncio
42import nest_asyncio
43import scrapy
44from scrapy.crawler import CrawlerProcess
45from scrapy.utils.reactor import install_reactor
46from apify import Actor
47
48
49# This is necessary so that twisted and asyncio work well together
50install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
51nest_asyncio.apply()
52
53SPIDERS = []
54
class ITEM_PIPELINE:
    """Scrapy item pipeline that forwards scraped items to ``Actor.push_data``.

    It also records every spider that gets opened in the module-level
    ``SPIDERS`` list.
    """

    def open_spider(self, spider):
        """Remember the spider that was just opened."""
        SPIDERS.append(spider)

    def close_spider(self, spider):
        """Do nothing when a spider closes; opened spiders stay in ``SPIDERS``."""
        return None

    async def process_item(self, item, spider):
        """Push ``item`` via ``Actor.push_data`` and hand it back unchanged."""
        await Actor.push_data(item)
        return item
65
async def periodic():
    """Print a progress line for every spider in ``SPIDERS`` every 5 seconds.

    The loop never returns on its own; it runs until the task is cancelled.
    """
    while True:
        for s in SPIDERS:
            stats = s.crawler.stats
            # A missing 'item_scraped_count' stat is reported as 0.
            count = stats.get_value('item_scraped_count') or 0
            print(f'⌛ ({s.__class__.__name__}) item_scraped_count: {count}')
        await asyncio.sleep(5)
72
async def main():
    """Execute the spiders supplied in the Actor input and crawl with them.

    The input must contain ``spiders_code``: Python source that defines one or
    more ``scrapy.Spider`` subclasses (the name ``scrapy`` is pre-bound for
    it). Every other input key is passed to ``CrawlerProcess`` as a Scrapy
    setting.

    Raises:
        ValueError: if ``spiders_code`` is missing or empty.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Validate explicitly: an `assert` would be stripped under `python -O`.
        spiders_code = actor_input.pop('spiders_code', None)
        if not spiders_code:
            raise ValueError('parameter required => spiders_code')

        # NOTE(security): the input is executed verbatim. This is the purpose
        # of this Actor, but it means whoever controls the input runs
        # arbitrary code inside the container.
        namespace = {'scrapy': scrapy}
        exec(spiders_code, namespace)

        # Collect the runnable spiders defined (or imported) by the code.
        spider_classes = []
        for key, value in namespace.items():
            if not (isinstance(value, type) and issubclass(value, scrapy.Spider)):
                continue
            # Skip nameless classes such as `scrapy.Spider`/`CrawlSpider` that
            # end up in the namespace through imports: Scrapy refuses to
            # instantiate a spider without a `name`.
            if not getattr(value, 'name', None):
                continue
            # The same class may be bound to several names; schedule it once.
            if value in spider_classes:
                continue
            print('*** SPIDER:', key)
            spider_classes.append(value)

        if spider_classes:
            # Merge user-supplied pipelines with the dataset pipeline instead
            # of letting an input `ITEM_PIPELINES` (even an empty one) replace
            # it, which would stop items from being pushed.
            pipelines = dict(actor_input.pop('ITEM_PIPELINES', None) or {})
            pipelines[ITEM_PIPELINE] = 1

            process = CrawlerProcess({
                'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
                'ITEM_PIPELINES': pipelines,
                **actor_input,
            }, install_root_handler=False)

            for spider_cls in spider_classes:
                process.crawl(spider_cls)

            # Keep a reference to the status task so it is not garbage
            # collected, and cancel it once crawling is over.
            status_task = asyncio.get_event_loop().create_task(periodic())
            try:
                # Per Scrapy's documentation, start() blocks until all
                # scheduled crawlers have finished.
                process.start()
            finally:
                status_task.cancel()
        else:
            print('No named scrapy.Spider subclass found in spiders_code.')

        print('Done.')
104
105asyncio.run(main())

requirements.txt

1# Add your dependencies here.
2# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
3# for how to format them
4apify ~= 1.1.1
5nest-asyncio ~= 1.5.6
6scrapy ~= 2.9.0
Developer
Maintained by Community