ScrapyFy
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
ScrapyFy
jupri/scrapyfy
Scrapy Runner
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step.
# Keeping it in its own layer lets Docker reuse the cached install when only source files change.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds are fast
# for most source file changes.
COPY . ./

# Specify how to launch the source code of your actor.
# This actor is started by running the "main" module (main.py).
CMD ["python3", "-m", "main"]
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "my-actor-1",
4 "title": "Getting started with Python and Scrapy",
5 "description": "Scrapes titles of websites using Scrapy.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "python-scrapy"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile",
12 "storages": {
13 "dataset": {
14 "actorSpecification": 1,
15 "title": "URLs and their titles",
16 "views": {
17 "titles": {
18 "title": "URLs and their titles",
19 "transformation": {
20 "fields": [
21 "url",
22 "title"
23 ]
24 },
25 "display": {
26 "component": "table",
27 "properties": {
28 "url": {
29 "label": "URL",
30 "format": "text"
31 },
32 "title": {
33 "label": "Title",
34 "format": "text"
35 }
36 }
37 }
38 }
39 }
40 }
41 }
42}
.actor/input_schema.json
1{
2 "title": "Python Scrapy Scraper",
3 "type": "object",
4 "schemaVersion": 1,
5 "required": ["spiders_code"],
6 "properties": {
7 "spiders_code": {
8 "title": "Spiders Code",
9 "type": "string",
10 "description": "Define your spiders <i>(multiple spiders can be specified)</i>.",
11 "editor": "python",
12 "prefill": "from urllib.parse import urljoin\r\n\r\n### multiple spiders can be specified\r\n\r\nclass TitleSpider(scrapy.Spider):\r\n\r\n name = 'title_spider'\r\n allowed_domains = [\"apify.com\"]\r\n start_urls = [\"https://apify.com\"]\r\n\r\n custom_settings = {\r\n 'REQUEST_FINGERPRINTER_IMPLEMENTATION' : '2.7',\r\n # Obey robots.txt rules\r\n 'ROBOTSTXT_OBEY' : True,\r\n 'DEPTH_LIMIT' : 2,\r\n 'LOG_ENABLED' : False,\r\n #'CLOSESPIDER_PAGECOUNT' : 5,\r\n 'CLOSESPIDER_ITEMCOUNT' : 5,\r\n }\r\n\r\n def parse(self, response):\r\n yield {\r\n 'url': response.url,\r\n 'title': response.css('title::text').extract_first(),\r\n }\r\n for link_href in response.css('a::attr(\"href\")'):\r\n link_url = urljoin(response.url, link_href.get())\r\n if link_url.startswith(('http://', 'https://')):\r\n yield scrapy.Request(link_url)"
13 },
14
15 "AJAXCRAWL_ENABLED": {"title": "AJAXCRAWL_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
16 "ASYNCIO_EVENT_LOOP": {"title": "ASYNCIO_EVENT_LOOP","type": "string","editor": "textfield","description": ""},
17 "AUTOTHROTTLE_DEBUG": {"title": "AUTOTHROTTLE_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
18 "AUTOTHROTTLE_ENABLED": {"title": "AUTOTHROTTLE_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
19 "AUTOTHROTTLE_MAX_DELAY": {"title": "AUTOTHROTTLE_MAX_DELAY","type": "integer","default": 60,"description": ""},
20 "AUTOTHROTTLE_START_DELAY": {"title": "AUTOTHROTTLE_START_DELAY","type": "integer","default": 5,"description": ""},
21 "AUTOTHROTTLE_TARGET_CONCURRENCY": {"title": "AUTOTHROTTLE_TARGET_CONCURRENCY","type": "integer","default": 1,"description": ""},
22 "BOT_NAME": {"title": "BOT_NAME","type": "string","default": "scrapybot","editor": "textfield","description": ""},
23 "CLOSESPIDER_ERRORCOUNT": {"title": "CLOSESPIDER_ERRORCOUNT","type": "integer","default": 0,"description": ""},
24 "CLOSESPIDER_ITEMCOUNT": {"title": "CLOSESPIDER_ITEMCOUNT","type": "integer","default": 0,"description": ""},
25 "CLOSESPIDER_PAGECOUNT": {"title": "CLOSESPIDER_PAGECOUNT","type": "integer","default": 0,"description": ""},
26 "CLOSESPIDER_TIMEOUT": {"title": "CLOSESPIDER_TIMEOUT","type": "integer","default": 0,"description": ""},
27 "COMMANDS_MODULE": {"title": "COMMANDS_MODULE","type": "string","default": "","editor": "textfield","description": ""},
28 "COMPRESSION_ENABLED": {"title": "COMPRESSION_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
29 "CONCURRENT_ITEMS": {"title": "CONCURRENT_ITEMS","type": "integer","default": 100,"description": ""},
30 "CONCURRENT_REQUESTS": {"title": "CONCURRENT_REQUESTS","type": "integer","default": 16,"description": ""},
31 "CONCURRENT_REQUESTS_PER_DOMAIN": {"title": "CONCURRENT_REQUESTS_PER_DOMAIN","type": "integer","default": 8,"description": ""},
32 "CONCURRENT_REQUESTS_PER_IP": {"title": "CONCURRENT_REQUESTS_PER_IP","type": "integer","default": 0,"description": ""},
33 "COOKIES_DEBUG": {"title": "COOKIES_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
34 "COOKIES_ENABLED": {"title": "COOKIES_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
35 "DEFAULT_ITEM_CLASS": {"title": "DEFAULT_ITEM_CLASS","type": "string","default": "scrapy.item.Item","editor": "textfield","description": ""},
36 "DEFAULT_REQUEST_HEADERS": {"title": "DEFAULT_REQUEST_HEADERS","type": "object","prefill": {
37 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
38 "Accept-Language": "en"
39 },"editor": "json","description": ""},
40 "DEPTH_LIMIT": {"title": "DEPTH_LIMIT","type": "integer","default": 0,"description": ""},
41 "DEPTH_PRIORITY": {"title": "DEPTH_PRIORITY","type": "integer","default": 0,"description": ""},
42 "DEPTH_STATS_VERBOSE": {"title": "DEPTH_STATS_VERBOSE","type": "boolean","default": false,"nullable": true,"description": ""},
43 "DNSCACHE_ENABLED": {"title": "DNSCACHE_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
44 "DNSCACHE_SIZE": {"title": "DNSCACHE_SIZE","type": "integer","default": 10000,"description": ""},
45 "DNS_RESOLVER": {"title": "DNS_RESOLVER","type": "string","default": "scrapy.resolver.CachingThreadedResolver","editor": "textfield","description": ""},
46 "DNS_TIMEOUT": {"title": "DNS_TIMEOUT","type": "integer","default": 60,"description": ""},
47 "DOWNLOADER": {"title": "DOWNLOADER","type": "string","default": "scrapy.core.downloader.Downloader","editor": "textfield","description": ""},
48 "DOWNLOADER_CLIENTCONTEXTFACTORY": {"title": "DOWNLOADER_CLIENTCONTEXTFACTORY","type": "string","default": "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory","editor": "textfield","description": ""},
49 "DOWNLOADER_CLIENT_TLS_CIPHERS": {"title": "DOWNLOADER_CLIENT_TLS_CIPHERS","type": "string","default": "DEFAULT","editor": "textfield","description": ""},
50 "DOWNLOADER_CLIENT_TLS_METHOD": {"title": "DOWNLOADER_CLIENT_TLS_METHOD","type": "string","default": "TLS","editor": "textfield","description": ""},
51 "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING": {"title": "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING","type": "boolean","default": false,"nullable": true,"description": ""},
52 "DOWNLOADER_HTTPCLIENTFACTORY": {"title": "DOWNLOADER_HTTPCLIENTFACTORY","type": "string","default": "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory","editor": "textfield","description": ""},
53 "DOWNLOADER_MIDDLEWARES": {"title": "DOWNLOADER_MIDDLEWARES","type": "object","prefill": {},"editor": "json","description": ""},
54 "DOWNLOADER_MIDDLEWARES_BASE": {"title": "DOWNLOADER_MIDDLEWARES_BASE","type": "object","prefill": {
55 "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
56 "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
57 "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
58 "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
59 "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
60 "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
61 "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
62 "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
63 "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
64 "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
65 "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
66 "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
67 "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
68 "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900
69 },"editor": "json","description": ""},
70 "DOWNLOADER_STATS": {"title": "DOWNLOADER_STATS","type": "boolean","default": true,"nullable": true,"description": ""},
71 "DOWNLOAD_DELAY": {"title": "DOWNLOAD_DELAY","type": "integer","default": 0,"description": ""},
72 "DOWNLOAD_FAIL_ON_DATALOSS": {"title": "DOWNLOAD_FAIL_ON_DATALOSS","type": "boolean","default": true,"nullable": true,"description": ""},
73 "DOWNLOAD_HANDLERS": {"title": "DOWNLOAD_HANDLERS","type": "object","prefill": {},"editor": "json","description": ""},
74 "DOWNLOAD_HANDLERS_BASE": {"title": "DOWNLOAD_HANDLERS_BASE","type": "object","prefill": {
75 "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
76 "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
77 "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
78 "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
79 "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
80 "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler"
81 },"editor": "json","description": ""},
82 "DOWNLOAD_MAXSIZE": {"title": "DOWNLOAD_MAXSIZE","type": "integer","default": 1073741824,"description": ""},
83 "DOWNLOAD_TIMEOUT": {"title": "DOWNLOAD_TIMEOUT","type": "integer","default": 180,"description": ""},
84 "DOWNLOAD_WARNSIZE": {"title": "DOWNLOAD_WARNSIZE","type": "integer","default": 33554432,"description": ""},
85 "DUPEFILTER_CLASS": {"title": "DUPEFILTER_CLASS","type": "string","default": "scrapy.dupefilters.RFPDupeFilter","editor": "textfield","description": ""},
86 "EDITOR": {"title": "EDITOR","type": "string","default": "vi","editor": "textfield","description": ""},
87 "EXTENSIONS": {"title": "EXTENSIONS","type": "object","prefill": {},"editor": "json","description": ""},
88 "EXTENSIONS_BASE": {"title": "EXTENSIONS_BASE","type": "object","prefill": {
89 "scrapy.extensions.corestats.CoreStats": 0,
90 "scrapy.extensions.telnet.TelnetConsole": 0,
91 "scrapy.extensions.memusage.MemoryUsage": 0,
92 "scrapy.extensions.memdebug.MemoryDebugger": 0,
93 "scrapy.extensions.closespider.CloseSpider": 0,
94 "scrapy.extensions.feedexport.FeedExporter": 0,
95 "scrapy.extensions.logstats.LogStats": 0,
96 "scrapy.extensions.spiderstate.SpiderState": 0,
97 "scrapy.extensions.throttle.AutoThrottle": 0
98 },"editor": "json","description": ""},
99 "FEEDS": {"title": "FEEDS","type": "object","prefill": {},"editor": "json","description": ""},
100 "FEED_EXPORTERS": {"title": "FEED_EXPORTERS","type": "object","prefill": {},"editor": "json","description": ""},
101 "FEED_EXPORTERS_BASE": {"title": "FEED_EXPORTERS_BASE","type": "object","prefill": {
102 "json": "scrapy.exporters.JsonItemExporter",
103 "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
104 "jsonl": "scrapy.exporters.JsonLinesItemExporter",
105 "jl": "scrapy.exporters.JsonLinesItemExporter",
106 "csv": "scrapy.exporters.CsvItemExporter",
107 "xml": "scrapy.exporters.XmlItemExporter",
108 "marshal": "scrapy.exporters.MarshalItemExporter",
109 "pickle": "scrapy.exporters.PickleItemExporter"
110 },"editor": "json","description": ""},
111 "FEED_EXPORT_BATCH_ITEM_COUNT": {"title": "FEED_EXPORT_BATCH_ITEM_COUNT","type": "integer","default": 0,"description": ""},
112 "FEED_EXPORT_ENCODING": {"title": "FEED_EXPORT_ENCODING","type": "string","editor": "textfield","description": ""},
113 "FEED_EXPORT_FIELDS": {"title": "FEED_EXPORT_FIELDS","type": "string","editor": "textfield","description": ""},
114 "FEED_EXPORT_INDENT": {"title": "FEED_EXPORT_INDENT","type": "integer","default": 0,"description": ""},
115 "FEED_STORAGES": {"title": "FEED_STORAGES","type": "object","prefill": {},"editor": "json","description": ""},
116 "FEED_STORAGES_BASE": {"title": "FEED_STORAGES_BASE","type": "object","prefill": {
117 "": "scrapy.extensions.feedexport.FileFeedStorage",
118 "file": "scrapy.extensions.feedexport.FileFeedStorage",
119 "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
120 "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
121 "s3": "scrapy.extensions.feedexport.S3FeedStorage",
122 "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage"
123 },"editor": "json","description": ""},
124 "FEED_STORAGE_FTP_ACTIVE": {"title": "FEED_STORAGE_FTP_ACTIVE","type": "boolean","default": false,"nullable": true,"description": ""},
125 "FEED_STORAGE_GCS_ACL": {"title": "FEED_STORAGE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
126 "FEED_STORAGE_S3_ACL": {"title": "FEED_STORAGE_S3_ACL","type": "string","default": "","editor": "textfield","description": ""},
127 "FEED_STORE_EMPTY": {"title": "FEED_STORE_EMPTY","type": "boolean","default": false,"nullable": true,"description": ""},
128 "FEED_TEMPDIR": {"title": "FEED_TEMPDIR","type": "string","editor": "textfield","description": ""},
129 "FEED_URI_PARAMS": {"title": "FEED_URI_PARAMS","type": "string","editor": "textfield","description": ""},
130 "FILES_STORE_GCS_ACL": {"title": "FILES_STORE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
131 "FILES_STORE_S3_ACL": {"title": "FILES_STORE_S3_ACL","type": "string","default": "private","editor": "textfield","description": ""},
132 "FTP_PASSIVE_MODE": {"title": "FTP_PASSIVE_MODE","type": "boolean","default": true,"nullable": true,"description": ""},
133 "FTP_PASSWORD": {"title": "FTP_PASSWORD","type": "string","default": "guest","editor": "textfield","description": ""},
134 "FTP_USER": {"title": "FTP_USER","type": "string","default": "anonymous","editor": "textfield","description": ""},
135 "GCS_PROJECT_ID": {"title": "GCS_PROJECT_ID","type": "string","editor": "textfield","description": ""},
136 "HTTPCACHE_ALWAYS_STORE": {"title": "HTTPCACHE_ALWAYS_STORE","type": "boolean","default": false,"nullable": true,"description": ""},
137 "HTTPCACHE_DBM_MODULE": {"title": "HTTPCACHE_DBM_MODULE","type": "string","default": "dbm","editor": "textfield","description": ""},
138 "HTTPCACHE_DIR": {"title": "HTTPCACHE_DIR","type": "string","default": "httpcache","editor": "textfield","description": ""},
139 "HTTPCACHE_ENABLED": {"title": "HTTPCACHE_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
140 "HTTPCACHE_EXPIRATION_SECS": {"title": "HTTPCACHE_EXPIRATION_SECS","type": "integer","default": 0,"description": ""},
141 "HTTPCACHE_GZIP": {"title": "HTTPCACHE_GZIP","type": "boolean","default": false,"nullable": true,"description": ""},
142 "HTTPCACHE_IGNORE_HTTP_CODES": {"title": "HTTPCACHE_IGNORE_HTTP_CODES","type": "array","prefill": [],"editor": "stringList","description": ""},
143 "HTTPCACHE_IGNORE_MISSING": {"title": "HTTPCACHE_IGNORE_MISSING","type": "boolean","default": false,"nullable": true,"description": ""},
144 "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": {"title": "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS","type": "array","prefill": [],"editor": "stringList","description": ""},
145 "HTTPCACHE_IGNORE_SCHEMES": {"title": "HTTPCACHE_IGNORE_SCHEMES","type": "array","prefill": [
146 "file"
147 ],"editor": "stringList","description": ""},
148 "HTTPCACHE_POLICY": {"title": "HTTPCACHE_POLICY","type": "string","default": "scrapy.extensions.httpcache.DummyPolicy","editor": "textfield","description": ""},
149 "HTTPCACHE_STORAGE": {"title": "HTTPCACHE_STORAGE","type": "string","default": "scrapy.extensions.httpcache.FilesystemCacheStorage","editor": "textfield","description": ""},
150 "HTTPPROXY_AUTH_ENCODING": {"title": "HTTPPROXY_AUTH_ENCODING","type": "string","default": "latin-1","editor": "textfield","description": ""},
151 "HTTPPROXY_ENABLED": {"title": "HTTPPROXY_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
152 "IMAGES_STORE_GCS_ACL": {"title": "IMAGES_STORE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""},
153 "IMAGES_STORE_S3_ACL": {"title": "IMAGES_STORE_S3_ACL","type": "string","default": "private","editor": "textfield","description": ""},
154 "ITEM_PIPELINES": {"title": "ITEM_PIPELINES","type": "object","prefill": {},"editor": "json","description": ""},
155 "ITEM_PIPELINES_BASE": {"title": "ITEM_PIPELINES_BASE","type": "object","prefill": {},"editor": "json","description": ""},
156 "ITEM_PROCESSOR": {"title": "ITEM_PROCESSOR","type": "string","default": "scrapy.pipelines.ItemPipelineManager","editor": "textfield","description": ""},
157 "LOGSTATS_INTERVAL": {"title": "LOGSTATS_INTERVAL","type": "integer","default": 60,"description": ""},
158 "LOG_DATEFORMAT": {"title": "LOG_DATEFORMAT","type": "string","default": "%Y-%m-%d %H:%M:%S","editor": "textfield","description": ""},
159 "LOG_ENABLED": {"title": "LOG_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
160 "LOG_ENCODING": {"title": "LOG_ENCODING","type": "string","default": "utf-8","editor": "textfield","description": ""},
161 "LOG_FILE": {"title": "LOG_FILE","type": "string","editor": "textfield","description": ""},
162 "LOG_FILE_APPEND": {"title": "LOG_FILE_APPEND","type": "boolean","default": true,"nullable": true,"description": ""},
163 "LOG_FORMAT": {"title": "LOG_FORMAT","type": "string","default": "%(asctime)s [%(name)s] %(levelname)s: %(message)s","editor": "textfield","description": ""},
164 "LOG_FORMATTER": {"title": "LOG_FORMATTER","type": "string","default": "scrapy.logformatter.LogFormatter","editor": "textfield","description": ""},
165 "LOG_LEVEL": {"title": "LOG_LEVEL","type": "string","default": "DEBUG","editor": "textfield","description": ""},
166 "LOG_SHORT_NAMES": {"title": "LOG_SHORT_NAMES","type": "boolean","default": false,"nullable": true,"description": ""},
167 "LOG_STDOUT": {"title": "LOG_STDOUT","type": "boolean","default": false,"nullable": true,"description": ""},
168 "MAIL_FROM": {"title": "MAIL_FROM","type": "string","default": "scrapy@localhost","editor": "textfield","description": ""},
169 "MAIL_HOST": {"title": "MAIL_HOST","type": "string","default": "localhost","editor": "textfield","description": ""},
170 "MAIL_PASS": {"title": "MAIL_PASS","type": "string","editor": "textfield","description": ""},
171 "MAIL_PORT": {"title": "MAIL_PORT","type": "integer","default": 25,"description": ""},
172 "MAIL_USER": {"title": "MAIL_USER","type": "string","editor": "textfield","description": ""},
173 "MEMDEBUG_ENABLED": {"title": "MEMDEBUG_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""},
174 "MEMDEBUG_NOTIFY": {"title": "MEMDEBUG_NOTIFY","type": "array","prefill": [],"editor": "stringList","description": ""},
175 "MEMUSAGE_CHECK_INTERVAL_SECONDS": {"title": "MEMUSAGE_CHECK_INTERVAL_SECONDS","type": "integer","default": 60,"description": ""},
176 "MEMUSAGE_ENABLED": {"title": "MEMUSAGE_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
177 "MEMUSAGE_LIMIT_MB": {"title": "MEMUSAGE_LIMIT_MB","type": "integer","default": 0,"description": ""},
178 "MEMUSAGE_NOTIFY_MAIL": {"title": "MEMUSAGE_NOTIFY_MAIL","type": "array","prefill": [],"editor": "stringList","description": ""},
179 "MEMUSAGE_WARNING_MB": {"title": "MEMUSAGE_WARNING_MB","type": "integer","default": 0,"description": ""},
180 "METAREFRESH_ENABLED": {"title": "METAREFRESH_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
181 "METAREFRESH_IGNORE_TAGS": {"title": "METAREFRESH_IGNORE_TAGS","type": "array","prefill": [],"editor": "stringList","description": ""},
182 "METAREFRESH_MAXDELAY": {"title": "METAREFRESH_MAXDELAY","type": "integer","default": 100,"description": ""},
183 "NEWSPIDER_MODULE": {"title": "NEWSPIDER_MODULE","type": "string","default": "","editor": "textfield","description": ""},
184 "RANDOMIZE_DOWNLOAD_DELAY": {"title": "RANDOMIZE_DOWNLOAD_DELAY","type": "boolean","default": true,"nullable": true,"description": ""},
185 "REACTOR_THREADPOOL_MAXSIZE": {"title": "REACTOR_THREADPOOL_MAXSIZE","type": "integer","default": 10,"description": ""},
186 "REDIRECT_ENABLED": {"title": "REDIRECT_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
187 "REDIRECT_MAX_TIMES": {"title": "REDIRECT_MAX_TIMES","type": "integer","default": 20,"description": ""},
188 "REDIRECT_PRIORITY_ADJUST": {"title": "REDIRECT_PRIORITY_ADJUST","type": "integer","default": 2,"description": ""},
189 "REFERER_ENABLED": {"title": "REFERER_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
190 "REFERRER_POLICY": {"title": "REFERRER_POLICY","type": "string","default": "scrapy.spidermiddlewares.referer.DefaultReferrerPolicy","editor": "textfield","description": ""},
191 "REQUEST_FINGERPRINTER_CLASS": {"title": "REQUEST_FINGERPRINTER_CLASS","type": "string","default": "scrapy.utils.request.RequestFingerprinter","editor": "textfield","description": ""},
192 "REQUEST_FINGERPRINTER_IMPLEMENTATION": {"title": "REQUEST_FINGERPRINTER_IMPLEMENTATION","type": "string","default": "2.6","editor": "textfield","description": ""},
193 "RETRY_ENABLED": {"title": "RETRY_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""},
194 "RETRY_HTTP_CODES": {"title": "RETRY_HTTP_CODES","type": "array","prefill": [
195 500,
196 502,
197 503,
198 504,
199 522,
200 524,
201 408,
202 429
203 ],"editor": "stringList","description": ""},
204 "RETRY_PRIORITY_ADJUST": {"title": "RETRY_PRIORITY_ADJUST","type": "integer","default": -1,"description": ""},
205 "RETRY_TIMES": {"title": "RETRY_TIMES","type": "integer","default": 2,"description": ""},
206 "ROBOTSTXT_OBEY": {"title": "ROBOTSTXT_OBEY","type": "boolean","default": false,"nullable": true,"description": ""},
207 "ROBOTSTXT_PARSER": {"title": "ROBOTSTXT_PARSER","type": "string","default": "scrapy.robotstxt.ProtegoRobotParser","editor": "textfield","description": ""},
208 "ROBOTSTXT_USER_AGENT": {"title": "ROBOTSTXT_USER_AGENT","type": "string","editor": "textfield","description": ""},
209 "SCHEDULER": {"title": "SCHEDULER","type": "string","default": "scrapy.core.scheduler.Scheduler","editor": "textfield","description": ""},
210 "SCHEDULER_DEBUG": {"title": "SCHEDULER_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""},
211 "SCHEDULER_DISK_QUEUE": {"title": "SCHEDULER_DISK_QUEUE","type": "string","default": "scrapy.squeues.PickleLifoDiskQueue","editor": "textfield","description": ""},
212 "SCHEDULER_MEMORY_QUEUE": {"title": "SCHEDULER_MEMORY_QUEUE","type": "string","default": "scrapy.squeues.LifoMemoryQueue","editor": "textfield","description": ""},
213 "SCHEDULER_PRIORITY_QUEUE": {"title": "SCHEDULER_PRIORITY_QUEUE","type": "string","default": "scrapy.pqueues.ScrapyPriorityQueue","editor": "textfield","description": ""},
214 "SCRAPER_SLOT_MAX_ACTIVE_SIZE": {"title": "SCRAPER_SLOT_MAX_ACTIVE_SIZE","type": "integer","default": 5000000,"description": ""},
215 "SPIDER_CONTRACTS": {"title": "SPIDER_CONTRACTS","type": "object","prefill": {},"editor": "json","description": ""},
216 "SPIDER_CONTRACTS_BASE": {"title": "SPIDER_CONTRACTS_BASE","type": "object","prefill": {
217 "scrapy.contracts.default.UrlContract": 1,
218 "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
219 "scrapy.contracts.default.ReturnsContract": 2,
220 "scrapy.contracts.default.ScrapesContract": 3
221 },"editor": "json","description": ""},
222 "SPIDER_LOADER_CLASS": {"title": "SPIDER_LOADER_CLASS","type": "string","default": "scrapy.spiderloader.SpiderLoader","editor": "textfield","description": ""},
223 "SPIDER_LOADER_WARN_ONLY": {"title": "SPIDER_LOADER_WARN_ONLY","type": "boolean","default": false,"nullable": true,"description": ""},
224 "SPIDER_MIDDLEWARES": {"title": "SPIDER_MIDDLEWARES","type": "object","prefill": {},"editor": "json","description": ""},
225 "SPIDER_MIDDLEWARES_BASE": {"title": "SPIDER_MIDDLEWARES_BASE","type": "object","prefill": {
226 "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
227 "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
228 "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
229 "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
230 "scrapy.spidermiddlewares.depth.DepthMiddleware": 900
231 },"editor": "json","description": ""},
232 "SPIDER_MODULES": {"title": "SPIDER_MODULES","type": "array","prefill": [],"editor": "stringList","description": ""},
233 "STATSMAILER_RCPTS": {"title": "STATSMAILER_RCPTS","type": "array","prefill": [],"editor": "stringList","description": ""},
234 "STATS_CLASS": {"title": "STATS_CLASS","type": "string","default": "scrapy.statscollectors.MemoryStatsCollector","editor": "textfield","description": ""},
235 "STATS_DUMP": {"title": "STATS_DUMP","type": "boolean","default": true,"nullable": true,"description": ""},
236 "TELNETCONSOLE_ENABLED": {"title": "TELNETCONSOLE_ENABLED","type": "integer","default": 1,"description": ""},
237 "TELNETCONSOLE_HOST": {"title": "TELNETCONSOLE_HOST","type": "string","default": "127.0.0.1","editor": "textfield","description": ""},
238 "TELNETCONSOLE_PASSWORD": {"title": "TELNETCONSOLE_PASSWORD","type": "string","editor": "textfield","description": ""},
239 "TELNETCONSOLE_PORT": {"title": "TELNETCONSOLE_PORT","type": "array","prefill": [
240 6023,
241 6073
242 ],"editor": "stringList","description": ""},
243 "TELNETCONSOLE_USERNAME": {"title": "TELNETCONSOLE_USERNAME","type": "string","default": "scrapy","editor": "textfield","description": ""},
244 "TEMPLATES_DIR": {"title": "TEMPLATES_DIR","type": "string","default": "/usr/local/lib/python3.10/dist-packages/scrapy/templates","editor": "textfield","description": ""},
245 "TWISTED_REACTOR": {"title": "TWISTED_REACTOR","type": "string","editor": "textfield","description": ""},
246 "URLLENGTH_LIMIT": {"title": "URLLENGTH_LIMIT","type": "integer","default": 2083,"description": ""},
247 "USER_AGENT": {"title": "USER_AGENT","type": "string","default": "Scrapy/2.9.0 (+https://scrapy.org)","editor": "textfield","description": ""}
248 }
249}
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10.venv
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.idea
4.DS_Store
5
6apify_storage
7storage
8
9.venv/
10.env/
11__pypackages__
12dist/
13build/
14*.egg-info/
15*.egg
16
17__pycache__
18
19.mypy_cache
20.dmypy.json
21dmypy.json
22.pytest_cache
23
24.scrapy
25*.log
main.py
1'''
2import logging
3from apify.log import ActorLogFormatter
4import scrapy.utils.log
5
6handler = logging.StreamHandler()
7handler.setFormatter(ActorLogFormatter())
8
9apify_logger = logging.getLogger('apify')
10apify_logger.setLevel(logging.INFO)
11apify_logger.addHandler(handler)
12
13apify_client_logger = logging.getLogger('apify_client')
14apify_client_logger.setLevel(logging.DEBUG)
15apify_client_logger.addHandler(handler)
16'''
17
18# We can't attach our log handler to the loggers normally,
19# because Scrapy would remove them in the `configure_logging` call here:
20# https://github.com/scrapy/scrapy/blob/a5c1ef82762c6c0910abea00c0a6249c40005e44/scrapy/utils/log.py#L95
21# (even though `disable_existing_loggers` is set to False :facepalm:).
22# We need to monkeypatch Scrapy's `configure_logging` method like this,
23# so that our handler is attached right after Scrapy calls the `configure_logging` method,
24# because otherwise we would lose some log messages.
25'''
26old_configure_logging = scrapy.utils.log.configure_logging
27
28def new_configure_logging(*args, **kwargs):
29 old_configure_logging(*args, **kwargs)
30
31 # Scrapy uses these four main loggers: https://github.com/scrapy/scrapy/blob/a5c1ef82762c6c0910abea00c0a6249c40005e44/scrapy/utils/log.py#L44-L61
32 logging.getLogger('scrapy').addHandler(handler)
33 logging.getLogger('twisted').addHandler(handler)
34 logging.getLogger('filelock').addHandler(handler)
35 logging.getLogger('hpack').addHandler(handler)
36
37scrapy.utils.log.configure_logging = new_configure_logging
38'''
39
40#################################################################################################################
41import asyncio
42import nest_asyncio
43import scrapy
44from scrapy.crawler import CrawlerProcess
45from scrapy.utils.reactor import install_reactor
46from apify import Actor
47
48
49# Install Twisted's asyncio-based reactor so that Twisted and asyncio work well together
50install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
51nest_asyncio.apply()
52# Spiders appended by ITEM_PIPELINE.open_spider and read by periodic() for status output
53SPIDERS = []
54
class ITEM_PIPELINE:
    """Scrapy item pipeline that hands every scraped item to ``Actor.push_data``.

    Every spider opened through this pipeline is also appended to the
    module-level ``SPIDERS`` list.
    """

    def open_spider(self, spider):
        """Register ``spider`` in ``SPIDERS`` when it is opened."""
        SPIDERS.append(spider)

    def close_spider(self, spider):
        """Do nothing when ``spider`` is closed."""
        return None

    async def process_item(self, item, spider):
        """Push ``item`` with ``Actor.push_data`` and return it unchanged."""
        await Actor.push_data(item)
        return item
65
async def periodic():
    """Print the scraped-item count of every spider in ``SPIDERS`` every 5 seconds.

    Loops forever; it only stops when the task running it is cancelled.
    """
    while True:
        for spider in SPIDERS:
            scraped = spider.crawler.stats.get_value('item_scraped_count')
            if not scraped:
                # The stat is missing (or zero) until the first item is scraped.
                scraped = 0
            print(f'⌛ ({spider.__class__.__name__}) item_scraped_count: {scraped}')
        await asyncio.sleep(5)
72
async def main():
    """Run every Scrapy spider defined by the ``spiders_code`` Actor input.

    The ``spiders_code`` string is executed, every named ``scrapy.Spider``
    subclass found in the resulting namespace is crawled, and the remaining
    input keys are passed to Scrapy as settings.

    Raises:
        ValueError: if ``spiders_code`` is missing or empty.
    """
    async with Actor:
        # Namespace the user code runs in; `scrapy` is made available to it.
        GLOBE = {'scrapy': scrapy}

        actor_input = await Actor.get_input() or {}

        # code
        # Validate explicitly: an `assert` would be stripped under `python -O`.
        spiders_code = actor_input.pop('spiders_code', None)
        if not spiders_code:
            raise ValueError('parameter required => spiders_code')

        # SECURITY NOTE(review): this executes arbitrary Python taken from the
        # Actor input. That is the purpose of this Actor, but it must never be
        # fed input from a source the Actor's owner does not trust.
        exec(spiders_code, GLOBE)

        # get spiders
        start = []
        for k, v in GLOBE.items():
            if not (isinstance(v, type) and issubclass(v, scrapy.Spider)):
                continue
            # Skip nameless classes (e.g. base classes the user code imported,
            # such as `from scrapy import Spider`), which Scrapy cannot run,
            # and classes already collected under another name.
            if not getattr(v, 'name', None) or v in start:
                continue
            print('*** SPIDER:', k)
            start.append(v)

        # start crawlers
        if start:
            # Merge user-supplied pipelines with the Actor pipeline instead of
            # letting the input replace it; otherwise an input such as
            # `"ITEM_PIPELINES": {}` would stop items from being pushed.
            user_pipelines = actor_input.pop('ITEM_PIPELINES', None) or {}
            process = CrawlerProcess({
                'TWISTED_REACTOR' : 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
                **actor_input,
                'ITEM_PIPELINES' : { ITEM_PIPELINE: 1, **user_pipelines },
            }, install_root_handler=False)

            for t in start:
                process.crawl(t)

            timer = asyncio.get_event_loop().create_task(periodic())  # periodic status
            try:
                process.start()
            finally:
                # Stop the status printer once crawling has ended.
                timer.cancel()

        print('Done.')
104
105asyncio.run(main())
requirements.txt
1# Add your dependencies here.
2# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
3# for how to format them
4apify ~= 1.1.1
5nest-asyncio ~= 1.5.6
6scrapy ~= 2.9.0
Developer
Maintained by Community
Categories