
ScrapyFy
Deprecated
Pricing
Pay per usage
Go to Store

ScrapyFy
Deprecated
Scrapy Runner
0.0 (0)
Pricing
Pay per usage
1
Total users
16
Monthly users
2
Last modified
2 years ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11
# Second, copy just requirements.txt into the actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./
# Install the packages specified in requirements.txt.
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./
# Specify how to launch the source code of your actor.
# This actor's entry point is main.py, so it is run as the "main" module.
CMD ["python3", "-m", "main"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-1", "title": "Getting started with Python and Scrapy", "description": "Scrapes titles of websites using Scrapy.", "version": "0.0", "meta": { "templateId": "python-scrapy" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "URLs and their titles", "views": { "titles": { "title": "URLs and their titles", "transformation": { "fields": [ "url", "title" ] }, "display": { "component": "table", "properties": { "url": { "label": "URL", "format": "text" }, "title": { "label": "Title", "format": "text" } } } } } } }}
.actor/input_schema.json
{ "title": "Python Scrapy Scraper", "type": "object", "schemaVersion": 1, "required": ["spiders_code"], "properties": { "spiders_code": { "title": "Spiders Code", "type": "string", "description": "Define your spiders <i>(multiple spiders can be specified)</i>.", "editor": "python", "prefill": "from urllib.parse import urljoin\r\n\r\n### multiple spiders can be specified\r\n\r\nclass TitleSpider(scrapy.Spider):\r\n\r\n name = 'title_spider'\r\n allowed_domains = [\"apify.com\"]\r\n start_urls = [\"https://apify.com\"]\r\n\r\n custom_settings = {\r\n 'REQUEST_FINGERPRINTER_IMPLEMENTATION' : '2.7',\r\n # Obey robots.txt rules\r\n 'ROBOTSTXT_OBEY' : True,\r\n 'DEPTH_LIMIT' : 2,\r\n 'LOG_ENABLED' : False,\r\n #'CLOSESPIDER_PAGECOUNT' : 5,\r\n 'CLOSESPIDER_ITEMCOUNT' : 5,\r\n }\r\n\r\n def parse(self, response):\r\n yield {\r\n 'url': response.url,\r\n 'title': response.css('title::text').extract_first(),\r\n }\r\n for link_href in response.css('a::attr(\"href\")'):\r\n link_url = urljoin(response.url, link_href.get())\r\n if link_url.startswith(('http://', 'https://')):\r\n yield scrapy.Request(link_url)" },
"AJAXCRAWL_ENABLED": {"title": "AJAXCRAWL_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""}, "ASYNCIO_EVENT_LOOP": {"title": "ASYNCIO_EVENT_LOOP","type": "string","editor": "textfield","description": ""}, "AUTOTHROTTLE_DEBUG": {"title": "AUTOTHROTTLE_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""}, "AUTOTHROTTLE_ENABLED": {"title": "AUTOTHROTTLE_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""}, "AUTOTHROTTLE_MAX_DELAY": {"title": "AUTOTHROTTLE_MAX_DELAY","type": "integer","default": 60,"description": ""}, "AUTOTHROTTLE_START_DELAY": {"title": "AUTOTHROTTLE_START_DELAY","type": "integer","default": 5,"description": ""}, "AUTOTHROTTLE_TARGET_CONCURRENCY": {"title": "AUTOTHROTTLE_TARGET_CONCURRENCY","type": "integer","default": 1,"description": ""}, "BOT_NAME": {"title": "BOT_NAME","type": "string","default": "scrapybot","editor": "textfield","description": ""}, "CLOSESPIDER_ERRORCOUNT": {"title": "CLOSESPIDER_ERRORCOUNT","type": "integer","default": 0,"description": ""}, "CLOSESPIDER_ITEMCOUNT": {"title": "CLOSESPIDER_ITEMCOUNT","type": "integer","default": 0,"description": ""}, "CLOSESPIDER_PAGECOUNT": {"title": "CLOSESPIDER_PAGECOUNT","type": "integer","default": 0,"description": ""}, "CLOSESPIDER_TIMEOUT": {"title": "CLOSESPIDER_TIMEOUT","type": "integer","default": 0,"description": ""}, "COMMANDS_MODULE": {"title": "COMMANDS_MODULE","type": "string","default": "","editor": "textfield","description": ""}, "COMPRESSION_ENABLED": {"title": "COMPRESSION_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "CONCURRENT_ITEMS": {"title": "CONCURRENT_ITEMS","type": "integer","default": 100,"description": ""}, "CONCURRENT_REQUESTS": {"title": "CONCURRENT_REQUESTS","type": "integer","default": 16,"description": ""}, "CONCURRENT_REQUESTS_PER_DOMAIN": {"title": "CONCURRENT_REQUESTS_PER_DOMAIN","type": "integer","default": 8,"description": ""}, 
"CONCURRENT_REQUESTS_PER_IP": {"title": "CONCURRENT_REQUESTS_PER_IP","type": "integer","default": 0,"description": ""}, "COOKIES_DEBUG": {"title": "COOKIES_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""}, "COOKIES_ENABLED": {"title": "COOKIES_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "DEFAULT_ITEM_CLASS": {"title": "DEFAULT_ITEM_CLASS","type": "string","default": "scrapy.item.Item","editor": "textfield","description": ""}, "DEFAULT_REQUEST_HEADERS": {"title": "DEFAULT_REQUEST_HEADERS","type": "object","prefill": { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en" },"editor": "json","description": ""}, "DEPTH_LIMIT": {"title": "DEPTH_LIMIT","type": "integer","default": 0,"description": ""}, "DEPTH_PRIORITY": {"title": "DEPTH_PRIORITY","type": "integer","default": 0,"description": ""}, "DEPTH_STATS_VERBOSE": {"title": "DEPTH_STATS_VERBOSE","type": "boolean","default": false,"nullable": true,"description": ""}, "DNSCACHE_ENABLED": {"title": "DNSCACHE_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "DNSCACHE_SIZE": {"title": "DNSCACHE_SIZE","type": "integer","default": 10000,"description": ""}, "DNS_RESOLVER": {"title": "DNS_RESOLVER","type": "string","default": "scrapy.resolver.CachingThreadedResolver","editor": "textfield","description": ""}, "DNS_TIMEOUT": {"title": "DNS_TIMEOUT","type": "integer","default": 60,"description": ""}, "DOWNLOADER": {"title": "DOWNLOADER","type": "string","default": "scrapy.core.downloader.Downloader","editor": "textfield","description": ""}, "DOWNLOADER_CLIENTCONTEXTFACTORY": {"title": "DOWNLOADER_CLIENTCONTEXTFACTORY","type": "string","default": "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory","editor": "textfield","description": ""}, "DOWNLOADER_CLIENT_TLS_CIPHERS": {"title": "DOWNLOADER_CLIENT_TLS_CIPHERS","type": "string","default": "DEFAULT","editor": 
"textfield","description": ""}, "DOWNLOADER_CLIENT_TLS_METHOD": {"title": "DOWNLOADER_CLIENT_TLS_METHOD","type": "string","default": "TLS","editor": "textfield","description": ""}, "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING": {"title": "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING","type": "boolean","default": false,"nullable": true,"description": ""}, "DOWNLOADER_HTTPCLIENTFACTORY": {"title": "DOWNLOADER_HTTPCLIENTFACTORY","type": "string","default": "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory","editor": "textfield","description": ""}, "DOWNLOADER_MIDDLEWARES": {"title": "DOWNLOADER_MIDDLEWARES","type": "object","prefill": {},"editor": "json","description": ""}, "DOWNLOADER_MIDDLEWARES_BASE": {"title": "DOWNLOADER_MIDDLEWARES_BASE","type": "object","prefill": { "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100, "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300, "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350, "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400, "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500, "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550, "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560, "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580, "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590, "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600, "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700, "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750, "scrapy.downloadermiddlewares.stats.DownloaderStats": 850, "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900 },"editor": "json","description": ""}, "DOWNLOADER_STATS": {"title": "DOWNLOADER_STATS","type": "boolean","default": true,"nullable": true,"description": ""}, "DOWNLOAD_DELAY": {"title": "DOWNLOAD_DELAY","type": "integer","default": 0,"description": ""}, 
"DOWNLOAD_FAIL_ON_DATALOSS": {"title": "DOWNLOAD_FAIL_ON_DATALOSS","type": "boolean","default": true,"nullable": true,"description": ""}, "DOWNLOAD_HANDLERS": {"title": "DOWNLOAD_HANDLERS","type": "object","prefill": {},"editor": "json","description": ""}, "DOWNLOAD_HANDLERS_BASE": {"title": "DOWNLOAD_HANDLERS_BASE","type": "object","prefill": { "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler", "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler", "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler", "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler", "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler", "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler" },"editor": "json","description": ""}, "DOWNLOAD_MAXSIZE": {"title": "DOWNLOAD_MAXSIZE","type": "integer","default": 1073741824,"description": ""}, "DOWNLOAD_TIMEOUT": {"title": "DOWNLOAD_TIMEOUT","type": "integer","default": 180,"description": ""}, "DOWNLOAD_WARNSIZE": {"title": "DOWNLOAD_WARNSIZE","type": "integer","default": 33554432,"description": ""}, "DUPEFILTER_CLASS": {"title": "DUPEFILTER_CLASS","type": "string","default": "scrapy.dupefilters.RFPDupeFilter","editor": "textfield","description": ""}, "EDITOR": {"title": "EDITOR","type": "string","default": "vi","editor": "textfield","description": ""}, "EXTENSIONS": {"title": "EXTENSIONS","type": "object","prefill": {},"editor": "json","description": ""}, "EXTENSIONS_BASE": {"title": "EXTENSIONS_BASE","type": "object","prefill": { "scrapy.extensions.corestats.CoreStats": 0, "scrapy.extensions.telnet.TelnetConsole": 0, "scrapy.extensions.memusage.MemoryUsage": 0, "scrapy.extensions.memdebug.MemoryDebugger": 0, "scrapy.extensions.closespider.CloseSpider": 0, "scrapy.extensions.feedexport.FeedExporter": 0, "scrapy.extensions.logstats.LogStats": 0, "scrapy.extensions.spiderstate.SpiderState": 0, "scrapy.extensions.throttle.AutoThrottle": 0 },"editor": "json","description": 
""}, "FEEDS": {"title": "FEEDS","type": "object","prefill": {},"editor": "json","description": ""}, "FEED_EXPORTERS": {"title": "FEED_EXPORTERS","type": "object","prefill": {},"editor": "json","description": ""}, "FEED_EXPORTERS_BASE": {"title": "FEED_EXPORTERS_BASE","type": "object","prefill": { "json": "scrapy.exporters.JsonItemExporter", "jsonlines": "scrapy.exporters.JsonLinesItemExporter", "jsonl": "scrapy.exporters.JsonLinesItemExporter", "jl": "scrapy.exporters.JsonLinesItemExporter", "csv": "scrapy.exporters.CsvItemExporter", "xml": "scrapy.exporters.XmlItemExporter", "marshal": "scrapy.exporters.MarshalItemExporter", "pickle": "scrapy.exporters.PickleItemExporter" },"editor": "json","description": ""}, "FEED_EXPORT_BATCH_ITEM_COUNT": {"title": "FEED_EXPORT_BATCH_ITEM_COUNT","type": "integer","default": 0,"description": ""}, "FEED_EXPORT_ENCODING": {"title": "FEED_EXPORT_ENCODING","type": "string","editor": "textfield","description": ""}, "FEED_EXPORT_FIELDS": {"title": "FEED_EXPORT_FIELDS","type": "string","editor": "textfield","description": ""}, "FEED_EXPORT_INDENT": {"title": "FEED_EXPORT_INDENT","type": "integer","default": 0,"description": ""}, "FEED_STORAGES": {"title": "FEED_STORAGES","type": "object","prefill": {},"editor": "json","description": ""}, "FEED_STORAGES_BASE": {"title": "FEED_STORAGES_BASE","type": "object","prefill": { "": "scrapy.extensions.feedexport.FileFeedStorage", "file": "scrapy.extensions.feedexport.FileFeedStorage", "ftp": "scrapy.extensions.feedexport.FTPFeedStorage", "gs": "scrapy.extensions.feedexport.GCSFeedStorage", "s3": "scrapy.extensions.feedexport.S3FeedStorage", "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage" },"editor": "json","description": ""}, "FEED_STORAGE_FTP_ACTIVE": {"title": "FEED_STORAGE_FTP_ACTIVE","type": "boolean","default": false,"nullable": true,"description": ""}, "FEED_STORAGE_GCS_ACL": {"title": "FEED_STORAGE_GCS_ACL","type": "string","default": "","editor": "textfield","description": 
""}, "FEED_STORAGE_S3_ACL": {"title": "FEED_STORAGE_S3_ACL","type": "string","default": "","editor": "textfield","description": ""}, "FEED_STORE_EMPTY": {"title": "FEED_STORE_EMPTY","type": "boolean","default": false,"nullable": true,"description": ""}, "FEED_TEMPDIR": {"title": "FEED_TEMPDIR","type": "string","editor": "textfield","description": ""}, "FEED_URI_PARAMS": {"title": "FEED_URI_PARAMS","type": "string","editor": "textfield","description": ""}, "FILES_STORE_GCS_ACL": {"title": "FILES_STORE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""}, "FILES_STORE_S3_ACL": {"title": "FILES_STORE_S3_ACL","type": "string","default": "private","editor": "textfield","description": ""}, "FTP_PASSIVE_MODE": {"title": "FTP_PASSIVE_MODE","type": "boolean","default": true,"nullable": true,"description": ""}, "FTP_PASSWORD": {"title": "FTP_PASSWORD","type": "string","default": "guest","editor": "textfield","description": ""}, "FTP_USER": {"title": "FTP_USER","type": "string","default": "anonymous","editor": "textfield","description": ""}, "GCS_PROJECT_ID": {"title": "GCS_PROJECT_ID","type": "string","editor": "textfield","description": ""}, "HTTPCACHE_ALWAYS_STORE": {"title": "HTTPCACHE_ALWAYS_STORE","type": "boolean","default": false,"nullable": true,"description": ""}, "HTTPCACHE_DBM_MODULE": {"title": "HTTPCACHE_DBM_MODULE","type": "string","default": "dbm","editor": "textfield","description": ""}, "HTTPCACHE_DIR": {"title": "HTTPCACHE_DIR","type": "string","default": "httpcache","editor": "textfield","description": ""}, "HTTPCACHE_ENABLED": {"title": "HTTPCACHE_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""}, "HTTPCACHE_EXPIRATION_SECS": {"title": "HTTPCACHE_EXPIRATION_SECS","type": "integer","default": 0,"description": ""}, "HTTPCACHE_GZIP": {"title": "HTTPCACHE_GZIP","type": "boolean","default": false,"nullable": true,"description": ""}, "HTTPCACHE_IGNORE_HTTP_CODES": {"title": 
"HTTPCACHE_IGNORE_HTTP_CODES","type": "array","prefill": [],"editor": "stringList","description": ""}, "HTTPCACHE_IGNORE_MISSING": {"title": "HTTPCACHE_IGNORE_MISSING","type": "boolean","default": false,"nullable": true,"description": ""}, "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": {"title": "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS","type": "array","prefill": [],"editor": "stringList","description": ""}, "HTTPCACHE_IGNORE_SCHEMES": {"title": "HTTPCACHE_IGNORE_SCHEMES","type": "array","prefill": [ "file" ],"editor": "stringList","description": ""}, "HTTPCACHE_POLICY": {"title": "HTTPCACHE_POLICY","type": "string","default": "scrapy.extensions.httpcache.DummyPolicy","editor": "textfield","description": ""}, "HTTPCACHE_STORAGE": {"title": "HTTPCACHE_STORAGE","type": "string","default": "scrapy.extensions.httpcache.FilesystemCacheStorage","editor": "textfield","description": ""}, "HTTPPROXY_AUTH_ENCODING": {"title": "HTTPPROXY_AUTH_ENCODING","type": "string","default": "latin-1","editor": "textfield","description": ""}, "HTTPPROXY_ENABLED": {"title": "HTTPPROXY_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "IMAGES_STORE_GCS_ACL": {"title": "IMAGES_STORE_GCS_ACL","type": "string","default": "","editor": "textfield","description": ""}, "IMAGES_STORE_S3_ACL": {"title": "IMAGES_STORE_S3_ACL","type": "string","default": "private","editor": "textfield","description": ""}, "ITEM_PIPELINES": {"title": "ITEM_PIPELINES","type": "object","prefill": {},"editor": "json","description": ""}, "ITEM_PIPELINES_BASE": {"title": "ITEM_PIPELINES_BASE","type": "object","prefill": {},"editor": "json","description": ""}, "ITEM_PROCESSOR": {"title": "ITEM_PROCESSOR","type": "string","default": "scrapy.pipelines.ItemPipelineManager","editor": "textfield","description": ""}, "LOGSTATS_INTERVAL": {"title": "LOGSTATS_INTERVAL","type": "integer","default": 60,"description": ""}, "LOG_DATEFORMAT": {"title": "LOG_DATEFORMAT","type": "string","default": "%Y-%m-%d 
%H:%M:%S","editor": "textfield","description": ""}, "LOG_ENABLED": {"title": "LOG_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "LOG_ENCODING": {"title": "LOG_ENCODING","type": "string","default": "utf-8","editor": "textfield","description": ""}, "LOG_FILE": {"title": "LOG_FILE","type": "string","editor": "textfield","description": ""}, "LOG_FILE_APPEND": {"title": "LOG_FILE_APPEND","type": "boolean","default": true,"nullable": true,"description": ""}, "LOG_FORMAT": {"title": "LOG_FORMAT","type": "string","default": "%(asctime)s [%(name)s] %(levelname)s: %(message)s","editor": "textfield","description": ""}, "LOG_FORMATTER": {"title": "LOG_FORMATTER","type": "string","default": "scrapy.logformatter.LogFormatter","editor": "textfield","description": ""}, "LOG_LEVEL": {"title": "LOG_LEVEL","type": "string","default": "DEBUG","editor": "textfield","description": ""}, "LOG_SHORT_NAMES": {"title": "LOG_SHORT_NAMES","type": "boolean","default": false,"nullable": true,"description": ""}, "LOG_STDOUT": {"title": "LOG_STDOUT","type": "boolean","default": false,"nullable": true,"description": ""}, "MAIL_FROM": {"title": "MAIL_FROM","type": "string","default": "scrapy@localhost","editor": "textfield","description": ""}, "MAIL_HOST": {"title": "MAIL_HOST","type": "string","default": "localhost","editor": "textfield","description": ""}, "MAIL_PASS": {"title": "MAIL_PASS","type": "string","editor": "textfield","description": ""}, "MAIL_PORT": {"title": "MAIL_PORT","type": "integer","default": 25,"description": ""}, "MAIL_USER": {"title": "MAIL_USER","type": "string","editor": "textfield","description": ""}, "MEMDEBUG_ENABLED": {"title": "MEMDEBUG_ENABLED","type": "boolean","default": false,"nullable": true,"description": ""}, "MEMDEBUG_NOTIFY": {"title": "MEMDEBUG_NOTIFY","type": "array","prefill": [],"editor": "stringList","description": ""}, "MEMUSAGE_CHECK_INTERVAL_SECONDS": {"title": "MEMUSAGE_CHECK_INTERVAL_SECONDS","type": 
"integer","default": 60,"description": ""}, "MEMUSAGE_ENABLED": {"title": "MEMUSAGE_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "MEMUSAGE_LIMIT_MB": {"title": "MEMUSAGE_LIMIT_MB","type": "integer","default": 0,"description": ""}, "MEMUSAGE_NOTIFY_MAIL": {"title": "MEMUSAGE_NOTIFY_MAIL","type": "array","prefill": [],"editor": "stringList","description": ""}, "MEMUSAGE_WARNING_MB": {"title": "MEMUSAGE_WARNING_MB","type": "integer","default": 0,"description": ""}, "METAREFRESH_ENABLED": {"title": "METAREFRESH_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "METAREFRESH_IGNORE_TAGS": {"title": "METAREFRESH_IGNORE_TAGS","type": "array","prefill": [],"editor": "stringList","description": ""}, "METAREFRESH_MAXDELAY": {"title": "METAREFRESH_MAXDELAY","type": "integer","default": 100,"description": ""}, "NEWSPIDER_MODULE": {"title": "NEWSPIDER_MODULE","type": "string","default": "","editor": "textfield","description": ""}, "RANDOMIZE_DOWNLOAD_DELAY": {"title": "RANDOMIZE_DOWNLOAD_DELAY","type": "boolean","default": true,"nullable": true,"description": ""}, "REACTOR_THREADPOOL_MAXSIZE": {"title": "REACTOR_THREADPOOL_MAXSIZE","type": "integer","default": 10,"description": ""}, "REDIRECT_ENABLED": {"title": "REDIRECT_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "REDIRECT_MAX_TIMES": {"title": "REDIRECT_MAX_TIMES","type": "integer","default": 20,"description": ""}, "REDIRECT_PRIORITY_ADJUST": {"title": "REDIRECT_PRIORITY_ADJUST","type": "integer","default": 2,"description": ""}, "REFERER_ENABLED": {"title": "REFERER_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "REFERRER_POLICY": {"title": "REFERRER_POLICY","type": "string","default": "scrapy.spidermiddlewares.referer.DefaultReferrerPolicy","editor": "textfield","description": ""}, "REQUEST_FINGERPRINTER_CLASS": {"title": "REQUEST_FINGERPRINTER_CLASS","type": "string","default": 
"scrapy.utils.request.RequestFingerprinter","editor": "textfield","description": ""}, "REQUEST_FINGERPRINTER_IMPLEMENTATION": {"title": "REQUEST_FINGERPRINTER_IMPLEMENTATION","type": "string","default": "2.6","editor": "textfield","description": ""}, "RETRY_ENABLED": {"title": "RETRY_ENABLED","type": "boolean","default": true,"nullable": true,"description": ""}, "RETRY_HTTP_CODES": {"title": "RETRY_HTTP_CODES","type": "array","prefill": [ 500, 502, 503, 504, 522, 524, 408, 429 ],"editor": "stringList","description": ""}, "RETRY_PRIORITY_ADJUST": {"title": "RETRY_PRIORITY_ADJUST","type": "integer","default": -1,"description": ""}, "RETRY_TIMES": {"title": "RETRY_TIMES","type": "integer","default": 2,"description": ""}, "ROBOTSTXT_OBEY": {"title": "ROBOTSTXT_OBEY","type": "boolean","default": false,"nullable": true,"description": ""}, "ROBOTSTXT_PARSER": {"title": "ROBOTSTXT_PARSER","type": "string","default": "scrapy.robotstxt.ProtegoRobotParser","editor": "textfield","description": ""}, "ROBOTSTXT_USER_AGENT": {"title": "ROBOTSTXT_USER_AGENT","type": "string","editor": "textfield","description": ""}, "SCHEDULER": {"title": "SCHEDULER","type": "string","default": "scrapy.core.scheduler.Scheduler","editor": "textfield","description": ""}, "SCHEDULER_DEBUG": {"title": "SCHEDULER_DEBUG","type": "boolean","default": false,"nullable": true,"description": ""}, "SCHEDULER_DISK_QUEUE": {"title": "SCHEDULER_DISK_QUEUE","type": "string","default": "scrapy.squeues.PickleLifoDiskQueue","editor": "textfield","description": ""}, "SCHEDULER_MEMORY_QUEUE": {"title": "SCHEDULER_MEMORY_QUEUE","type": "string","default": "scrapy.squeues.LifoMemoryQueue","editor": "textfield","description": ""}, "SCHEDULER_PRIORITY_QUEUE": {"title": "SCHEDULER_PRIORITY_QUEUE","type": "string","default": "scrapy.pqueues.ScrapyPriorityQueue","editor": "textfield","description": ""}, "SCRAPER_SLOT_MAX_ACTIVE_SIZE": {"title": "SCRAPER_SLOT_MAX_ACTIVE_SIZE","type": "integer","default": 
5000000,"description": ""}, "SPIDER_CONTRACTS": {"title": "SPIDER_CONTRACTS","type": "object","prefill": {},"editor": "json","description": ""}, "SPIDER_CONTRACTS_BASE": {"title": "SPIDER_CONTRACTS_BASE","type": "object","prefill": { "scrapy.contracts.default.UrlContract": 1, "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1, "scrapy.contracts.default.ReturnsContract": 2, "scrapy.contracts.default.ScrapesContract": 3 },"editor": "json","description": ""}, "SPIDER_LOADER_CLASS": {"title": "SPIDER_LOADER_CLASS","type": "string","default": "scrapy.spiderloader.SpiderLoader","editor": "textfield","description": ""}, "SPIDER_LOADER_WARN_ONLY": {"title": "SPIDER_LOADER_WARN_ONLY","type": "boolean","default": false,"nullable": true,"description": ""}, "SPIDER_MIDDLEWARES": {"title": "SPIDER_MIDDLEWARES","type": "object","prefill": {},"editor": "json","description": ""}, "SPIDER_MIDDLEWARES_BASE": {"title": "SPIDER_MIDDLEWARES_BASE","type": "object","prefill": { "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50, "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500, "scrapy.spidermiddlewares.referer.RefererMiddleware": 700, "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800, "scrapy.spidermiddlewares.depth.DepthMiddleware": 900 },"editor": "json","description": ""}, "SPIDER_MODULES": {"title": "SPIDER_MODULES","type": "array","prefill": [],"editor": "stringList","description": ""}, "STATSMAILER_RCPTS": {"title": "STATSMAILER_RCPTS","type": "array","prefill": [],"editor": "stringList","description": ""}, "STATS_CLASS": {"title": "STATS_CLASS","type": "string","default": "scrapy.statscollectors.MemoryStatsCollector","editor": "textfield","description": ""}, "STATS_DUMP": {"title": "STATS_DUMP","type": "boolean","default": true,"nullable": true,"description": ""}, "TELNETCONSOLE_ENABLED": {"title": "TELNETCONSOLE_ENABLED","type": "integer","default": 1,"description": ""}, "TELNETCONSOLE_HOST": {"title": "TELNETCONSOLE_HOST","type": 
"string","default": "127.0.0.1","editor": "textfield","description": ""}, "TELNETCONSOLE_PASSWORD": {"title": "TELNETCONSOLE_PASSWORD","type": "string","editor": "textfield","description": ""}, "TELNETCONSOLE_PORT": {"title": "TELNETCONSOLE_PORT","type": "array","prefill": [ 6023, 6073 ],"editor": "stringList","description": ""}, "TELNETCONSOLE_USERNAME": {"title": "TELNETCONSOLE_USERNAME","type": "string","default": "scrapy","editor": "textfield","description": ""}, "TEMPLATES_DIR": {"title": "TEMPLATES_DIR","type": "string","default": "/usr/local/lib/python3.10/dist-packages/scrapy/templates","editor": "textfield","description": ""}, "TWISTED_REACTOR": {"title": "TWISTED_REACTOR","type": "string","editor": "textfield","description": ""}, "URLLENGTH_LIMIT": {"title": "URLLENGTH_LIMIT","type": "integer","default": 2083,"description": ""}, "USER_AGENT": {"title": "USER_AGENT","type": "string","default": "Scrapy/2.9.0 (+https://scrapy.org)","editor": "textfield","description": ""} }}
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache

.scrapy
*.log
main.py
1'''2import logging3from apify.log import ActorLogFormatter4import scrapy.utils.log5
6handler = logging.StreamHandler()7handler.setFormatter(ActorLogFormatter())8
9apify_logger = logging.getLogger('apify')10apify_logger.setLevel(logging.INFO)11apify_logger.addHandler(handler)12
13apify_client_logger = logging.getLogger('apify_client')14apify_client_logger.setLevel(logging.DEBUG)15apify_client_logger.addHandler(handler)16'''17
18# We can't attach our log handler to the loggers normally,19# because Scrapy would remove them in the `configure_logging` call here:20# https://github.com/scrapy/scrapy/blob/a5c1ef82762c6c0910abea00c0a6249c40005e44/scrapy/utils/log.py#L9521# (even though `disable_existing_loggers` is set to False :facepalm:).22# We need to monkeypatch Scrapy's `configure_logging` method like this,23# so that our handler is attached right after Scrapy calls the `configure_logging` method,24# because otherwise we would lose some log messages.25'''26old_configure_logging = scrapy.utils.log.configure_logging27
28def new_configure_logging(*args, **kwargs):29 old_configure_logging(*args, **kwargs)30
31 # Scrapy uses these four main loggers: https://github.com/scrapy/scrapy/blob/a5c1ef82762c6c0910abea00c0a6249c40005e44/scrapy/utils/log.py#L44-L6132 logging.getLogger('scrapy').addHandler(handler)33 logging.getLogger('twisted').addHandler(handler)34 logging.getLogger('filelock').addHandler(handler)35 logging.getLogger('hpack').addHandler(handler)36
37scrapy.utils.log.configure_logging = new_configure_logging38'''39
40#################################################################################################################41import asyncio42import nest_asyncio43import scrapy44from scrapy.crawler import CrawlerProcess45from scrapy.utils.reactor import install_reactor46from apify import Actor47
48
# This is necessary so that twisted and asyncio work well together.
# The reactor must be installed before any crawler is created; the same reactor
# path is passed to Scrapy as the TWISTED_REACTOR setting in main().
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
# NOTE(review): presumably needed because process.start() is called from inside
# the already-running asyncio.run() loop — confirm against nest_asyncio docs.
nest_asyncio.apply()

# Spiders that have been opened so far. ITEM_PIPELINE.open_spider appends to this
# list and periodic() reads it to print per-spider progress.
SPIDERS = []
class ITEM_PIPELINE:
    """Item pipeline that hands every scraped item to ``Actor.push_data``.

    It also records each opened spider in the module-level ``SPIDERS`` list so
    that the periodic status task can report on it.
    """

    def open_spider(self, spider):
        """Remember the spider that has just been opened."""
        SPIDERS.append(spider)

    def close_spider(self, spider):
        """Nothing to do on close; the spider stays in ``SPIDERS``."""
        return None  # print('CLOSE', spider)

    async def process_item(self, item, spider):
        """Push ``item`` via the Actor, then return it unchanged."""
        await Actor.push_data(item)
        return item
async def periodic():
    """Print the scraped-item count of every registered spider, then sleep 5 s; repeat forever."""
    while True:
        for spider in SPIDERS:
            scraped = spider.crawler.stats.get_value('item_scraped_count')
            if not scraped:
                # The stat is missing (or zero) until the first item is scraped.
                scraped = 0
            label = spider.__class__.__name__
            print(f'⌛ ({label}) item_scraped_count: {scraped}')
        # Yield to the event loop between status reports.
        await asyncio.sleep(5)
async def main():
    """Actor entry point: run every spider defined in the ``spiders_code`` input.

    The input's ``spiders_code`` string is executed, every named
    ``scrapy.Spider`` subclass it defines is collected, and all of them are run
    in one ``CrawlerProcess``. Every remaining input key is passed to Scrapy as
    a setting.

    Raises:
        ValueError: if the input has no (or an empty) ``spiders_code``.
    """
    async with Actor:
        # Namespace the user code runs in; `scrapy` is pre-imported so the code
        # can reference `scrapy.Spider` / `scrapy.Request` without importing it.
        GLOBE = {'scrapy': scrapy}

        actor_input = await Actor.get_input() or {}

        # code
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        spiders_code = actor_input.pop('spiders_code', None)
        if not spiders_code:
            raise ValueError('parameter required => spiders_code')

        # SECURITY(review): this executes arbitrary code taken from the actor
        # input. That is the purpose of this actor, but it must never be fed
        # input from an untrusted party.
        exec(spiders_code, GLOBE)

        # get spiders
        start = []
        for k, v in GLOBE.items():
            if not (isinstance(v, type) and issubclass(v, scrapy.Spider)):
                continue
            # Base classes pulled in by the user code (e.g. `from scrapy import
            # Spider`) have no `name` and cannot be crawled; skip them.
            if not getattr(v, 'name', None):
                print('*** SKIPPED (no name):', k)
                continue
            print('*** SPIDER:', k)
            start.append(v)

        # start crawlers
        if start:
            # Keep any pipelines the user configured, but always include ours;
            # otherwise an `ITEM_PIPELINES` input value (the input schema
            # prefills `{}`) would replace it and no item would be pushed.
            user_pipelines = actor_input.pop('ITEM_PIPELINES', None) or {}
            settings = {
                **actor_input,
                # Must match the reactor installed at import time, so it is
                # applied after the user settings rather than before them.
                'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
                'ITEM_PIPELINES': {**user_pipelines, ITEM_PIPELINE: 1},
            }
            process = CrawlerProcess(settings, install_root_handler=False)

            for t in start:
                process.crawl(t)

            # periodic status
            timer = asyncio.get_event_loop().create_task(periodic())
            try:
                process.start()
            finally:
                # Stop the status reporter once crawling ends or fails.
                timer.cancel()
        else:
            print('No spiders found in spiders_code.')

        print('Done.')
# Run the actor only when executed as a script/module (`python3 -m main`),
# so that importing this module does not start a crawl.
if __name__ == '__main__':
    asyncio.run(main())
requirements.txt
# Add your dependencies here.
# See https://pip.pypa.io/en/latest/reference/requirements-file-format/
# for how to format them
apify ~= 1.1.1
nest-asyncio ~= 1.5.6
scrapy ~= 2.9.0