ScrapyFy avatar
ScrapyFy
Under maintenance
Try for free

No credit card required

View all Actors
This Actor is under maintenance.

This Actor may be unreliable while under maintenance. Would you like to try a similar Actor instead?

See alternative Actors
ScrapyFy

ScrapyFy

jupri/scrapyfy
Try for free

No credit card required

Scrapy Runner

.actor/Dockerfile

1# First, specify the base Docker image.
2# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
3# You can also use any other image from Docker Hub.
4FROM apify/actor-python:3.11
5
6# Second, copy just requirements.txt into the actor image,
7# since it should be the only file that affects the dependency install in the next step,
8# in order to speed up the build
9COPY requirements.txt ./
10
11# Install the packages specified in requirements.txt,
12# Print the installed Python version, pip version
13# and all installed packages with their versions for debugging
14RUN echo "Python version:" \
15 && python --version \
16 && echo "Pip version:" \
17 && pip --version \
18 && echo "Installing dependencies:" \
19 && pip install -r requirements.txt \
20 && echo "All installed Python packages:" \
21 && pip freeze
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after installing the dependencies, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28# Specify how to launch the source code of your actor.
29# Here the top-level "main" module (main.py) is run with "python3 -m main"
30CMD ["python3", "-m", "main"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-1",
4    "title": "Getting started with Python and Scrapy",
5    "description": "Scrapes titles of websites using Scrapy.",
6    "version": "0.0",
7    "meta": {
8         "templateId": "python-scrapy"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile",
12    "storages": {
13        "dataset": {
14            "actorSpecification": 1,
15            "title": "URLs and their titles",
16            "views": {
17                "titles": {
18                    "title": "URLs and their titles",
19                    "transformation": {
20                        "fields": [
21                            "url",
22                            "title"
23                        ]
24                    },
25                    "display": {
26                        "component": "table",
27                        "properties": {
28                            "url": {
29                                "label": "URL",
30                                "format": "text"
31                            },
32                            "title": {
33                                "label": "Title",
34                                "format": "text"
35                            }
36                        }
37                    }
38                }
39            }
40        }
41    }
42}

.actor/input_schema.json

1{
2    "title": "Python Scrapy Scraper",
3    "type": "object",
4    "schemaVersion": 1,
5    "required": ["spiders_code"],
6    "properties": {
7        "spiders_code": {
8            "title": "Spiders Code",
9            "type": "string",
10            "description": "Define your spiders <i>(multiple spiders can be specified)</i>.",
11            "editor": "python",
12            "prefill": "from urllib.parse import urljoin\r\n\r\n### multiple spiders can be specified\r\n\r\nclass TitleSpider(scrapy.Spider):\r\n\r\n    name = 'title_spider'\r\n    allowed_domains = [\"apify.com\"]\r\n    start_urls = [\"https://apify.com\"]\r\n\r\n    custom_settings = {\r\n        'REQUEST_FINGERPRINTER_IMPLEMENTATION'  : '2.7',\r\n        # Obey robots.txt rules\r\n        'ROBOTSTXT_OBEY'                        : True,\r\n        'DEPTH_LIMIT'                           : 2,\r\n        'LOG_ENABLED'                           : False,\r\n        #'CLOSESPIDER_PAGECOUNT'                 : 5,\r\n        'CLOSESPIDER_ITEMCOUNT'                 : 5,\r\n    }\r\n\r\n    def parse(self, response):\r\n        yield {\r\n            'url': response.url,\r\n            'title': response.css('title::text').extract_first(),\r\n        }\r\n        for link_href in response.css('a::attr(\"href\")'):\r\n            link_url = urljoin(response.url, link_href.get())\r\n            if link_url.startswith(('http://', 'https://')):\r\n                yield scrapy.Request(link_url)"
13        },
14
15    "AJAXCRAWL_ENABLED": {"title": "AJAXCRAWL_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
16    "ASYNCIO_EVENT_LOOP": {"title": "ASYNCIO_EVENT_LOOP","type": "string","editor": "textfield","description": ""},
17    "AUTOTHROTTLE_DEBUG": {"title": "AUTOTHROTTLE_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
18    "AUTOTHROTTLE_ENABLED": {"title": "AUTOTHROTTLE_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
19    "AUTOTHROTTLE_MAX_DELAY": {"title": "AUTOTHROTTLE_MAX_DELAY","type": "integer","default": 60,"description": ""},
20    "AUTOTHROTTLE_START_DELAY": {"title": "AUTOTHROTTLE_START_DELAY","type": "integer","default": 5,"description": ""},
21    "AUTOTHROTTLE_TARGET_CONCURRENCY": {"title": "AUTOTHROTTLE_TARGET_CONCURRENCY","type": "integer","default": 1,"description": ""},
22    "BOT_NAME": {"title": "BOT_NAME","type": "string","default": "scrapybot","editor": "textfield","description": ""},
23    "CLOSESPIDER_ERRORCOUNT": {"title": "CLOSESPIDER_ERRORCOUNT","type": "integer","default": 0,"description": ""},
24    "CLOSESPIDER_ITEMCOUNT": {"title": "CLOSESPIDER_ITEMCOUNT","type": "integer","default": 0,"description": ""},
25    "CLOSESPIDER_PAGECOUNT": {"title": "CLOSESPIDER_PAGECOUNT","type": "integer","default": 0,"description": ""},
26    "CLOSESPIDER_TIMEOUT": {"title": "CLOSESPIDER_TIMEOUT","type": "integer","default": 0,"description": ""},
27    "COMMANDS_MODULE": {"title": "COMMANDS_MODULE","type": "string","default": "","editor": "textfield","description": ""},
28    "COMPRESSION_ENABLED": {"title": "COMPRESSION_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
29    "CONCURRENT_ITEMS": {"title": "CONCURRENT_ITEMS","type": "integer","default": 100,"description": ""},
30    "CONCURRENT_REQUESTS": {"title": "CONCURRENT_REQUESTS","type": "integer","default": 16,"description": ""},
31    "CONCURRENT_REQUESTS_PER_DOMAIN": {"title": "CONCURRENT_REQUESTS_PER_DOMAIN","type": "integer","default": 8,"description": ""},
32    "CONCURRENT_REQUESTS_PER_IP": {"title": "CONCURRENT_REQUESTS_PER_IP","type": "integer","default": 0,"description": ""},
33    "COOKIES_DEBUG": {"title": "COOKIES_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
34    "COOKIES_ENABLED": {"title": "COOKIES_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
35    "DEFAULT_ITEM_CLASS": {"title": "DEFAULT_ITEM_CLASS","type": "string","default": "scrapy.item.Item","editor": "textfield","description": ""},
36    "DEFAULT_REQUEST_HEADERS": {"title": "DEFAULT_REQUEST_HEADERS","type": "object","prefill": {
37            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
38            "Accept-Language": "en"
39        },"editor": "json","description": ""},
40    "DEPTH_LIMIT": {"title": "DEPTH_LIMIT","type": "integer","default": 0,"description": ""},
41    "DEPTH_PRIORITY": {"title": "DEPTH_PRIORITY","type": "integer","default": 0,"description": ""},
42    "DEPTH_STATS_VERBOSE": {"title": "DEPTH_STATS_VERBOSE","type": "boolean","default": false,"nullable": true,"description": ""},
43    "DNSCACHE_ENABLED": {"title": "DNSCACHE_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
44    "DNSCACHE_SIZE": {"title": "DNSCACHE_SIZE","type": "integer","default": 10000,"description": ""},
45    "DNS_RESOLVER": {"title": "DNS_RESOLVER","type": "string","default": "scrapy.resolver.CachingThreadedResolver","editor": "textfield","description": ""},
46    "DNS_TIMEOUT": {"title": "DNS_TIMEOUT","type": "integer","default": 60,"description": ""},
47    "DOWNLOADER": {"title": "DOWNLOADER","type": "string","default": "scrapy.core.downloader.Downloader","editor": "textfield","description": ""},
48    "DOWNLOADER_CLIENTCONTEXTFACTORY": {"title": "DOWNLOADER_CLIENTCONTEXTFACTORY","type": "string","default": "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory","editor": "textfield","description": ""},
49    "DOWNLOADER_CLIENT_TLS_CIPHERS": {"title": "DOWNLOADER_CLIENT_TLS_CIPHERS","type": "string","default": "DEFAULT","editor": "textfield","description": ""},
50    "DOWNLOADER_CLIENT_TLS_METHOD": {"title": "DOWNLOADER_CLIENT_TLS_METHOD","type": "string","default": "TLS","editor": "textfield","description": ""},
51    "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING": {"title": "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING","type": "boolean","default": false,"nullable": true,"description": ""},
52    "DOWNLOADER_HTTPCLIENTFACTORY": {"title": "DOWNLOADER_HTTPCLIENTFACTORY","type": "string","default": "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory","editor": "textfield","description": ""},
53    "DOWNLOADER_MIDDLEWARES": {"title": "DOWNLOADER_MIDDLEWARES","type": "object","prefill": {},"editor": "json","description": ""},
54    "DOWNLOADER_MIDDLEWARES_BASE": {"title": "DOWNLOADER_MIDDLEWARES_BASE","type": "object","prefill": {
55            "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
56            "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
57            "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
58            "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
59            "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
60            "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
61            "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
62            "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
63            "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
64            "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
65            "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
66            "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
67            "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
68            "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900
69        },"editor": "json","description": ""},
70    "DOWNLOADER_STATS": {"title": "DOWNLOADER_STATS","type": "boolean","default": true,"nullable": true,"description": ""},
71    "DOWNLOAD_DELAY": {"title": "DOWNLOAD_DELAY","type": "integer","default": 0,"description": ""},
72    "DOWNLOAD_FAIL_ON_DATALOSS": {"title": "DOWNLOAD_FAIL_ON_DATALOSS","type": "boolean","default": true,"nullable": true,"description": ""},
73    "DOWNLOAD_HANDLERS": {"title": "DOWNLOAD_HANDLERS","type": "object","prefill": {},"editor": "json","description": ""},
74    "DOWNLOAD_HANDLERS_BASE": {"title": "DOWNLOAD_HANDLERS_BASE","type": "object","prefill": {
75            "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
76            "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
77            "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
78            "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
79            "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
80            "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler"
81        },"editor": "json","description": ""},
82    "DOWNLOAD_MAXSIZE": {"title": "DOWNLOAD_MAXSIZE","type": "integer","default": 1073741824,"description": ""},
83    "DOWNLOAD_TIMEOUT": {"title": "DOWNLOAD_TIMEOUT","type": "integer","default": 180,"description": ""},
84    "DOWNLOAD_WARNSIZE": {"title": "DOWNLOAD_WARNSIZE","type": "integer","default": 33554432,"description": ""},
85    "DUPEFILTER_CLASS": {"title": "DUPEFILTER_CLASS","type": "string","default": "scrapy.dupefilters.RFPDupeFilter","editor": "textfield","description": ""},
86    "EDITOR": {"title": "EDITOR","type": "string","default": "vi","editor": "textfield","description": ""},
87    "EXTENSIONS": {"title": "EXTENSIONS","type": "object","prefill": {},"editor": "json","description": ""},
88    "EXTENSIONS_BASE": {"title": "EXTENSIONS_BASE","type": "object","prefill": {
89            "scrapy.extensions.corestats.CoreStats": 0,
90            "scrapy.extensions.telnet.TelnetConsole": 0,
91            "scrapy.extensions.memusage.MemoryUsage": 0,
92            "scrapy.extensions.memdebug.MemoryDebugger": 0,
93            "scrapy.extensions.closespider.CloseSpider": 0,
94            "scrapy.extensions.feedexport.FeedExporter": 0,
95            "scrapy.extensions.logstats.LogStats": 0,
96            "scrapy.extensions.spiderstate.SpiderState": 0,
97            "scrapy.extensions.throttle.AutoThrottle": 0
98        },"editor": "json","description": ""},
99    "FEEDS": {"title": "FEEDS","type": "object","prefill": {},"editor": "json","description": ""},
100    "FEED_EXPORTERS": {"title": "FEED_EXPORTERS","type": "object","prefill": {},"editor": "json","description": ""},
101    "FEED_EXPORTERS_BASE": {"title": "FEED_EXPORTERS_BASE","type": "object","prefill": {
102            "json": "scrapy.exporters.JsonItemExporter",
103            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
104            "jsonl": "scrapy.exporters.JsonLinesItemExporter",
105            "jl": "scrapy.exporters.JsonLinesItemExporter",
106            "csv": "scrapy.exporters.CsvItemExporter",
107            "xml": "scrapy.exporters.XmlItemExporter",
108            "marshal": "scrapy.exporters.MarshalItemExporter",
109            "pickle": "scrapy.exporters.PickleItemExporter"
110        },"editor": "json","description": ""},
111    "FEED_EXPORT_BATCH_ITEM_COUNT": {"title": "FEED_EXPORT_BATCH_ITEM_COUNT","type": "integer","default": 0,"description": ""},
112    "FEED_EXPORT_ENCODING": {"title": "FEED_EXPORT_ENCODING","type": "string","editor": "textfield","description": ""},
113    "FEED_EXPORT_FIELDS": {"title": "FEED_EXPORT_FIELDS","type": "string","editor": "textfield","description": ""},
114    "FEED_EXPORT_INDENT": {"title": "FEED_EXPORT_INDENT","type": "integer","default": 0,"description": ""},
115    "FEED_STORAGES": {"title": "FEED_STORAGES","type": "object","prefill": {},"editor": "json","description": ""},
116    "FEED_STORAGES_BASE": {"title": "FEED_STORAGES_BASE","type": "object","prefill": {
117            "": "scrapy.extensions.feedexport.FileFeedStorage",
118            "file": "scrapy.extensions.feedexport.FileFeedStorage",
119            "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
120            "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
121            "s3": "scrapy.extensions.feedexport.S3FeedStorage",
122            "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage"
123        },"editor": "json","description": ""},
124    "FEED_STORAGE_FTP_ACTIVE": {"title": "FEED_STORAGE_FTP_ACTIVE","type": "boolean","default": false,"nullable": true,"description": ""},
125    "FEED_STORAGE_GCS_ACL": {"title": "FEED_STORAGE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
126    "FEED_STORAGE_S3_ACL": {"title": "FEED_STORAGE_S3_ACL","type": "string","default": "","editor": "textfield","description": ""},
127    "FEED_STORE_EMPTY": {"title": "FEED_STORE_EMPTY","type": "boolean","default": false,"nullable": true,"description": ""},
128    "FEED_TEMPDIR": {"title": "FEED_TEMPDIR","type": "string","editor": "textfield","description": ""},
129    "FEED_URI_PARAMS": {"title": "FEED_URI_PARAMS","type": "string","editor": "textfield","description": ""},
130    "FILES_STORE_GCS_ACL": {"title": "FILES_STORE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
131    "FILES_STORE_S3_ACL": {"title": "FILES_STORE_S3_ACL","type": "string","default": "private","editor": "textfield","description": ""},
132    "FTP_PASSIVE_MODE": {"title": "FTP_PASSIVE_MODE","type": "boolean","default": true,"nullable": true,"description": ""},
133    "FTP_PASSWORD": {"title": "FTP_PASSWORD","type": "string","default": "guest","editor": "textfield","description": ""},
134    "FTP_USER": {"title": "FTP_USER","type": "string","default": "anonymous","editor": "textfield","description": ""},
135    "GCS_PROJECT_ID": {"title": "GCS_PROJECT_ID","type": "string","editor": "textfield","description": ""},
136    "HTTPCACHE_ALWAYS_STORE": {"title": "HTTPCACHE_ALWAYS_STORE","type": "boolean","default": false,"nullable": true,"description": ""},
137    "HTTPCACHE_DBM_MODULE": {"title": "HTTPCACHE_DBM_MODULE","type": "string","default": "dbm","editor": "textfield","description": ""},
138    "HTTPCACHE_DIR": {"title": "HTTPCACHE_DIR","type": "string","default": "httpcache","editor": "textfield","description": ""},
139    "HTTPCACHE_ENABLED": {"title": "HTTPCACHE_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
140    "HTTPCACHE_EXPIRATION_SECS": {"title": "HTTPCACHE_EXPIRATION_SECS","type": "integer","default": 0,"description": ""},
141    "HTTPCACHE_GZIP": {"title": "HTTPCACHE_GZIP","type": "boolean","default": false,"nullable": true,"description": ""},
142    "HTTPCACHE_IGNORE_HTTP_CODES": {"title": "HTTPCACHE_IGNORE_HTTP_CODES","type": "array","prefill": [],"editor": "stringList","description": ""},
143    "HTTPCACHE_IGNORE_MISSING": {"title": "HTTPCACHE_IGNORE_MISSING","type": "boolean","default": false,"nullable": true,"description": ""},
144    "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": {"title": "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS","type": "array","prefill": [],"editor": "stringList","description": ""},
145    "HTTPCACHE_IGNORE_SCHEMES": {"title": "HTTPCACHE_IGNORE_SCHEMES","type": "array","prefill": [
146            "file"
147        ],"editor": "stringList","description": ""},
148    "HTTPCACHE_POLICY": {"title": "HTTPCACHE_POLICY","type": "string","default": "scrapy.extensions.httpcache.DummyPolicy","editor": "textfield","description": ""},
149    "HTTPCACHE_STORAGE": {"title": "HTTPCACHE_STORAGE","type": "string","default": "scrapy.extensions.httpcache.FilesystemCacheStorage","editor": "textfield","description": ""},
150    "HTTPPROXY_AUTH_ENCODING": {"title": "HTTPPROXY_AUTH_ENCODING","type": "string","default": "latin-1","editor": "textfield","description": ""},
151    "HTTPPROXY_ENABLED": {"title": "HTTPPROXY_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
152    "IMAGES_STORE_GCS_ACL": {"title": "IMAGES_STORE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
153    "IMAGES_STORE_S3_ACL": {"title": "IMAGES_STORE_S3_ACL","type": "string","default": "private","editor": "textfield","description": ""},
154    "ITEM_PIPELINES": {"title": "ITEM_PIPELINES","type": "object","prefill": {},"editor": "json","description": ""},
155    "ITEM_PIPELINES_BASE": {"title": "ITEM_PIPELINES_BASE","type": "object","prefill": {},"editor": "json","description": ""},
156    "ITEM_PROCESSOR": {"title": "ITEM_PROCESSOR","type": "string","default": "scrapy.pipelines.ItemPipelineManager","editor": "textfield","description": ""},
157    "LOGSTATS_INTERVAL": {"title": "LOGSTATS_INTERVAL","type": "integer","default": 60,"description": ""},
158    "LOG_DATEFORMAT": {"title": "LOG_DATEFORMAT","type": "string","default": "%Y-%m-%d %H:%M:%S","editor": "textfield","description": ""},
159    "LOG_ENABLED": {"title": "LOG_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
160    "LOG_ENCODING": {"title": "LOG_ENCODING","type": "string","default": "utf-8","editor": "textfield","description": ""},
161    "LOG_FILE": {"title": "LOG_FILE","type": "string","editor": "textfield","description": ""},
162    "LOG_FILE_APPEND": {"title": "LOG_FILE_APPEND","type": "boolean","default": true,"nullable": true,"description": ""},
163    "LOG_FORMAT": {"title": "LOG_FORMAT","type": "string","default": "%(asctime)s [%(name)s] %(levelname)s: %(message)s","editor": "textfield","description": ""},
164    "LOG_FORMATTER": {"title": "LOG_FORMATTER","type": "string","default": "scrapy.logformatter.LogFormatter","editor": "textfield","description": ""},
165    "LOG_LEVEL": {"title": "LOG_LEVEL","type": "string","default": "DEBUG","editor": "textfield","description": ""},
166    "LOG_SHORT_NAMES": {"title": "LOG_SHORT_NAMES","type": "boolean","default": false,"nullable": true,"description": ""},
167    "LOG_STDOUT": {"title": "LOG_STDOUT","type": "boolean","default": false,"nullable": true,"description": ""},
168    "MAIL_FROM": {"title": "MAIL_FROM","type": "string","default": "scrapy@localhost","editor": "textfield","description": ""},
169    "MAIL_HOST": {"title": "MAIL_HOST","type": "string","default": "localhost","editor": "textfield","description": ""},
170    "MAIL_PASS": {"title": "MAIL_PASS","type": "string","editor": "textfield","description": ""},
171    "MAIL_PORT": {"title": "MAIL_PORT","type": "integer","default": 25,"description": ""},
172    "MAIL_USER": {"title": "MAIL_USER","type": "string","editor": "textfield","description": ""},
173    "MEMDEBUG_ENABLED": {"title": "MEMDEBUG_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
174    "MEMDEBUG_NOTIFY": {"title": "MEMDEBUG_NOTIFY","type": "array","prefill": [],"editor": "stringList","description": ""},
175    "MEMUSAGE_CHECK_INTERVAL_SECONDS": {"title": "MEMUSAGE_CHECK_INTERVAL_SECONDS","type": "integer","default": 60,"description": ""},
176    "MEMUSAGE_ENABLED": {"title": "MEMUSAGE_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
177    "MEMUSAGE_LIMIT_MB": {"title": "MEMUSAGE_LIMIT_MB","type": "integer","default": 0,"description": ""},
178    "MEMUSAGE_NOTIFY_MAIL": {"title": "MEMUSAGE_NOTIFY_MAIL","type": "array","prefill": [],"editor": "stringList","description": ""},
179    "MEMUSAGE_WARNING_MB": {"title": "MEMUSAGE_WARNING_MB","type": "integer","default": 0,"description": ""},
180    "METAREFRESH_ENABLED": {"title": "METAREFRESH_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
181    "METAREFRESH_IGNORE_TAGS": {"title": "METAREFRESH_IGNORE_TAGS","type": "array","prefill": [],"editor": "stringList","description": ""},
182    "METAREFRESH_MAXDELAY": {"title": "METAREFRESH_MAXDELAY","type": "integer","default": 100,"description": ""},
183    "NEWSPIDER_MODULE": {"title": "NEWSPIDER_MODULE","type": "string","default": "","editor": "textfield","description": ""},
184    "RANDOMIZE_DOWNLOAD_DELAY": {"title": "RANDOMIZE_DOWNLOAD_DELAY","type": "boolean","default": true,"nullable": true,"description": ""},
185    "REACTOR_THREADPOOL_MAXSIZE": {"title": "REACTOR_THREADPOOL_MAXSIZE","type": "integer","default": 10,"description": ""},
186    "REDIRECT_ENABLED": {"title": "REDIRECT_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
187    "REDIRECT_MAX_TIMES": {"title": "REDIRECT_MAX_TIMES","type": "integer","default": 20,"description": ""},
188    "REDIRECT_PRIORITY_ADJUST": {"title": "REDIRECT_PRIORITY_ADJUST","type": "integer","default": 2,"description": ""},
189    "REFERER_ENABLED": {"title": "REFERER_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
190    "REFERRER_POLICY": {"title": "REFERRER_POLICY","type": "string","default": "scrapy.spidermiddlewares.referer.DefaultReferrerPolicy","editor": "textfield","description": ""},
191    "REQUEST_FINGERPRINTER_CLASS": {"title": "REQUEST_FINGERPRINTER_CLASS","type": "string","default": "scrapy.utils.request.RequestFingerprinter","editor": "textfield","description": ""},
192    "REQUEST_FINGERPRINTER_IMPLEMENTATION": {"title": "REQUEST_FINGERPRINTER_IMPLEMENTATION","type": "string","default": "2.6","editor": "textfield","description": ""},
193    "RETRY_ENABLED": {"title": "RETRY_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
194    "RETRY_HTTP_CODES": {"title": "RETRY_HTTP_CODES","type": "array","prefill": [
195            500,
196            502,
197            503,
198            504,
199            522,
200            524,
201            408,
202            429
203        ],"editor": "stringList","description": ""},
204    "RETRY_PRIORITY_ADJUST": {"title": "RETRY_PRIORITY_ADJUST","type": "integer","default": -1,"description": ""},
205    "RETRY_TIMES": {"title": "RETRY_TIMES","type": "integer","default": 2,"description": ""},
206    "ROBOTSTXT_OBEY": {"title": "ROBOTSTXT_OBEY","type": "boolean","default": false,"nullable": true,"description": ""},
207    "ROBOTSTXT_PARSER": {"title": "ROBOTSTXT_PARSER","type": "string","default": "scrapy.robotstxt.ProtegoRobotParser","editor": "textfield","description": ""},
208    "ROBOTSTXT_USER_AGENT": {"title": "ROBOTSTXT_USER_AGENT","type": "string","editor": "textfield","description": ""},
209    "SCHEDULER": {"title": "SCHEDULER","type": "string","default": "scrapy.core.scheduler.Scheduler","editor": "textfield","description": ""},
210    "SCHEDULER_DEBUG": {"title": "SCHEDULER_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
211    "SCHEDULER_DISK_QUEUE": {"title": "SCHEDULER_DISK_QUEUE","type": "string","default": "scrapy.squeues.PickleLifoDiskQueue","editor": "textfield","description": ""},
212    "SCHEDULER_MEMORY_QUEUE": {"title": "SCHEDULER_MEMORY_QUEUE","type": "string","default": "scrapy.squeues.LifoMemoryQueue","editor": "textfield","description": ""},
213    "SCHEDULER_PRIORITY_QUEUE": {"title": "SCHEDULER_PRIORITY_QUEUE","type": "string","default": "scrapy.pqueues.ScrapyPriorityQueue","editor": "textfield","description": ""},
214    "SCRAPER_SLOT_MAX_ACTIVE_SIZE": {"title": "SCRAPER_SLOT_MAX_ACTIVE_SIZE","type": "integer","default": 5000000,"description": ""},
215    "SPIDER_CONTRACTS": {"title": "SPIDER_CONTRACTS","type": "object","prefill": {},"editor": "json","description": ""},
216    "SPIDER_CONTRACTS_BASE": {"title": "SPIDER_CONTRACTS_BASE","type": "object","prefill": {
217            "scrapy.contracts.default.UrlContract": 1,
218            "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
219            "scrapy.contracts.default.ReturnsContract": 2,
220            "scrapy.contracts.default.ScrapesContract": 3
221        },"editor": "json","description": ""},
222    "SPIDER_LOADER_CLASS": {"title": "SPIDER_LOADER_CLASS","type": "string","default": "scrapy.spiderloader.SpiderLoader","editor": "textfield","description": ""},
223    "SPIDER_LOADER_WARN_ONLY": {"title": "SPIDER_LOADER_WARN_ONLY","type": "boolean","default": false,"nullable": true,"description": ""},
224    "SPIDER_MIDDLEWARES": {"title": "SPIDER_MIDDLEWARES","type": "object","prefill": {},"editor": "json","description": ""},
225    "SPIDER_MIDDLEWARES_BASE": {"title": "SPIDER_MIDDLEWARES_BASE","type": "object","prefill": {
226            "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
227            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
228            "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
229            "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
230            "scrapy.spidermiddlewares.depth.DepthMiddleware": 900
231        },"editor": "json","description": ""},
232    "SPIDER_MODULES": {"title": "SPIDER_MODULES","type": "array","prefill": [],"editor": "stringList","description": ""},
233    "STATSMAILER_RCPTS": {"title": "STATSMAILER_RCPTS","type": "array","prefill": [],"editor": "stringList","description": ""},
234    "STATS_CLASS": {"title": "STATS_CLASS","type": "string","default": "scrapy.statscollectors.MemoryStatsCollector","editor": "textfield","description": ""},
235    "STATS_DUMP": {"title": "STATS_DUMP","type": "boolean","default": true,"nullable": true,"description": ""},
236    "TELNETCONSOLE_ENABLED": {"title": "TELNETCONSOLE_ENABLED","type": "integer","default": 1,"description": ""},
237    "TELNETCONSOLE_HOST": {"title": "TELNETCONSOLE_HOST","type": "string","default": "127.0.0.1","editor": "textfield","description": ""},
238    "TELNETCONSOLE_PASSWORD": {"title": "TELNETCONSOLE_PASSWORD","type": "string","editor": "textfield","description": ""},
239    "TELNETCONSOLE_PORT": {"title": "TELNETCONSOLE_PORT","type": "array","prefill": [
240            6023,
241            6073
242        ],"editor": "stringList","description": ""},
243    "TELNETCONSOLE_USERNAME": {"title": "TELNETCONSOLE_USERNAME","type": "string","default": "scrapy","editor": "textfield","description": ""},
244    "TEMPLATES_DIR": {"title": "TEMPLATES_DIR","type": "string","default": "/usr/local/lib/python3.10/dist-packages/scrapy/templates","editor": "textfield","description": ""},
245    "TWISTED_REACTOR": {"title": "TWISTED_REACTOR","type": "string","editor": "textfield","description": ""},
246    "URLLENGTH_LIMIT": {"title": "URLLENGTH_LIMIT","type": "integer","default": 2083,"description": ""},
247    "USER_AGENT": {"title": "USER_AGENT","type": "string","default": "Scrapy/2.9.0 (+https://scrapy.org)","editor": "textfield","description": ""}
248    }
249}

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage
8
9.venv/
10.env/
11__pypackages__
12dist/
13build/
14*.egg-info/
15*.egg
16
17__pycache__
18
19.mypy_cache
20.dmypy.json
21dmypy.json
22.pytest_cache
23
24.scrapy
25*.log

main.py

1'''
2import logging
3from apify.log import ActorLogFormatter
4import scrapy.utils.log
5
6handler = logging.StreamHandler()
7handler.setFormatter(ActorLogFormatter())
8
9apify_logger = logging.getLogger('apify')
10apify_logger.setLevel(logging.INFO)
11apify_logger.addHandler(handler)
12
13apify_client_logger = logging.getLogger('apify_client')
14apify_client_logger.setLevel(logging.DEBUG)
15apify_client_logger.addHandler(handler)
16'''
17
18# We can't attach our log handler to the loggers normally,
19# because Scrapy would remove them in the `configure_logging` call here:
20# https://github.com/scrapy/scrapy/blob/a5c1ef82762c6c0910abea00c0a6249c40005e44/scrapy/utils/log.py#L95
21# (even though `disable_existing_loggers` is set to False :facepalm:).
22# We need to monkeypatch Scrapy's `configure_logging` method like this,
23# so that our handler is attached right after Scrapy calls the `configure_logging` method,
24# because otherwise we would lose some log messages.
25'''
26old_configure_logging = scrapy.utils.log.configure_logging
27
28def new_configure_logging(*args, **kwargs):
29    old_configure_logging(*args, **kwargs)
30
31    # Scrapy uses these four main loggers: https://github.com/scrapy/scrapy/blob/a5c1ef82762c6c0910abea00c0a6249c40005e44/scrapy/utils/log.py#L44-L61
32    logging.getLogger('scrapy').addHandler(handler)
33    logging.getLogger('twisted').addHandler(handler)
34    logging.getLogger('filelock').addHandler(handler)
35    logging.getLogger('hpack').addHandler(handler)
36
37scrapy.utils.log.configure_logging = new_configure_logging
38'''
39
40#################################################################################################################
41import asyncio
42import nest_asyncio
43import scrapy
44from scrapy.crawler import CrawlerProcess
45from scrapy.utils.reactor import install_reactor
46from apify import Actor
47
48
# Install Twisted's asyncio-based reactor and apply nest_asyncio at import time;
# both are needed so that Twisted and asyncio work well together.
install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
nest_asyncio.apply()

# Module-level registry of spiders: ITEM_PIPELINE.open_spider() appends to it
# and periodic() iterates it to print per-spider progress.
SPIDERS = list()
54
class ITEM_PIPELINE:
    """Scrapy item pipeline that hands every scraped item to ``Actor.push_data``.

    Opened spiders are also recorded in the module-level ``SPIDERS`` list,
    which ``periodic()`` iterates to print progress.
    """

    def open_spider(self, spider):
        """Record ``spider`` in ``SPIDERS``."""
        SPIDERS.append(spider)

    def close_spider(self, spider):
        """Intentionally do nothing; the spider stays listed in ``SPIDERS``."""
        return None

    async def process_item(self, item, spider):
        """Push ``item`` via ``Actor.push_data`` and return it unchanged."""
        await Actor.push_data(item)
        return item
65
async def periodic():
    """Print one progress line per spider in ``SPIDERS``, then sleep 5 seconds, forever.

    The loop has no exit condition of its own; it ends only when the task
    running this coroutine is cancelled or the event loop stops.
    """
    while True:
        for spider in SPIDERS:
            scraped = spider.crawler.stats.get_value('item_scraped_count')
            if not scraped:
                # Falsy results (e.g. a stat that is not set yet) are reported as 0.
                scraped = 0
            spider_name = type(spider).__name__
            print(f'⌛ ({spider_name}) item_scraped_count: {scraped}')
        await asyncio.sleep(5)
72
async def main():
    """Run every spider defined by the user-supplied ``spiders_code`` input.

    Steps:
      1. Read the Actor input; ``spiders_code`` is required, every other key
         is handed to Scrapy as a setting.
      2. Execute ``spiders_code`` in a namespace that pre-provides ``scrapy``.
      3. Collect the named ``scrapy.Spider`` subclasses found in that namespace.
      4. Crawl them all in one ``CrawlerProcess`` while ``periodic()`` prints
         progress.

    Raises:
        ValueError: if ``spiders_code`` is missing or empty.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # code
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        spiders_code = actor_input.pop('spiders_code', None)
        if not spiders_code:
            raise ValueError('parameter required => spiders_code')

        # NOTE(security): this executes arbitrary user-supplied Python code.
        # That is the purpose of this Actor, but it must only ever run inside
        # an isolated environment.
        namespace = {'scrapy': scrapy}
        exec(spiders_code, namespace)

        # get spiders
        spiders = []
        for key, value in namespace.items():
            if not (isinstance(value, type) and issubclass(value, scrapy.Spider)):
                continue
            # Skip classes without a name (e.g. an imported base class such as
            # CrawlSpider): Scrapy cannot run a spider that has no name.
            if not getattr(value, 'name', None):
                continue
            # The same class may be bound to several names; crawl it only once.
            if value in spiders:
                continue
            print('*** SPIDER:', key)
            spiders.append(value)

        # start crawlers
        if spiders:
            # Merge user pipelines with ours instead of letting the input
            # replace them; otherwise an input like `ITEM_PIPELINES: {}` would
            # drop the pipeline that pushes items to the Actor.
            user_pipelines = actor_input.pop('ITEM_PIPELINES', None) or {}
            settings = {
                **actor_input,
                # Must match the reactor installed at import time, so it is
                # applied after (and overrides) any user-provided value.
                'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
                'ITEM_PIPELINES': {**user_pipelines, ITEM_PIPELINE: 1},
            }
            process = CrawlerProcess(settings, install_root_handler=False)

            for spider_cls in spiders:
                process.crawl(spider_cls)

            timer = asyncio.get_event_loop().create_task(periodic())  # periodic status
            try:
                process.start()
            finally:
                # Stop the status printer once crawling has ended (or failed).
                timer.cancel()
        else:
            print('*** No named scrapy.Spider subclass found in spiders_code.')

        print('Done.')
104
# Start the Actor only when this file is executed as the main module
# (e.g. `python3 -m main`), not when it is merely imported.
if __name__ == '__main__':
    asyncio.run(main())

requirements.txt

1# Add your dependencies here.
2# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
3# for how to format them
4apify ~= 1.1.1
5nest-asyncio ~= 1.5.6
6scrapy ~= 2.9.0
Developer
Maintained by Community
Actor metrics
  • 2 monthly users
  • 44.4% runs succeeded
  • days response time
  • Created in Jun 2023
  • Modified 11 months ago