ScrapyFy avatar
ScrapyFy
Under maintenance
Try for free

No credit card required

View all Actors
This Actor is under maintenance.

This Actor may be unreliable while under maintenance. Would you like to try a similar Actor instead?

See alternative Actors
ScrapyFy

ScrapyFy

jupri/scrapyfy
Try for free

No credit card required

Scrapy Runner

The code examples below show how to run the Actor and get its results. To run the code, you need to have an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token, which you can find under Settings > Integrations in Apify Console. Learn more

Node.js

Python

curl

# Run the jupri/scrapyfy Actor through the Apify HTTP API:
#   1. store the API token in a shell variable,
#   2. write the Actor input to input.json,
#   3. POST that file to the Actor's "runs" endpoint.

# Set API token
# Single-quoted so the shell treats the value as a plain string; unquoted,
# the `<` and `>` of the placeholder would be parsed as redirections.
API_TOKEN='<YOUR_API_TOKEN>'

# Prepare Actor input
# The heredoc delimiter is quoted ('EOF'), so the shell performs no variable,
# command, or backslash expansion on the JSON below; it is written verbatim.
cat > input.json <<'EOF'
{
  "spiders_code": "from urllib.parse import urljoin\r\n\r\n### multiple spiders can be specified\r\n\r\nclass TitleSpider(scrapy.Spider):\r\n\r\n    name = 'title_spider'\r\n    allowed_domains = [\"apify.com\"]\r\n    start_urls = [\"https://apify.com\"]\r\n\r\n    custom_settings = {\r\n        'REQUEST_FINGERPRINTER_IMPLEMENTATION'  : '2.7',\r\n        # Obey robots.txt rules\r\n        'ROBOTSTXT_OBEY'                        : True,\r\n        'DEPTH_LIMIT'                           : 2,\r\n        'LOG_ENABLED'                           : False,\r\n        #'CLOSESPIDER_PAGECOUNT'                 : 5,\r\n        'CLOSESPIDER_ITEMCOUNT'                 : 5,\r\n    }\r\n\r\n    def parse(self, response):\r\n        yield {\r\n            'url': response.url,\r\n            'title': response.css('title::text').extract_first(),\r\n        }\r\n        for link_href in response.css('a::attr(\"href\")'):\r\n            link_url = urljoin(response.url, link_href.get())\r\n            if link_url.startswith(('http://', 'https://')):\r\n                yield scrapy.Request(link_url)",
  "DEFAULT_REQUEST_HEADERS": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en"
  },
  "DOWNLOADER_MIDDLEWARES": {},
  "DOWNLOADER_MIDDLEWARES_BASE": {
    "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
    "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
    "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
    "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
    "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
    "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
    "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
    "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
    "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
    "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
    "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900
  },
  "DOWNLOAD_HANDLERS": {},
  "DOWNLOAD_HANDLERS_BASE": {
    "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
    "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
    "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
    "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
    "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
    "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler"
  },
  "EXTENSIONS": {},
  "EXTENSIONS_BASE": {
    "scrapy.extensions.corestats.CoreStats": 0,
    "scrapy.extensions.telnet.TelnetConsole": 0,
    "scrapy.extensions.memusage.MemoryUsage": 0,
    "scrapy.extensions.memdebug.MemoryDebugger": 0,
    "scrapy.extensions.closespider.CloseSpider": 0,
    "scrapy.extensions.feedexport.FeedExporter": 0,
    "scrapy.extensions.logstats.LogStats": 0,
    "scrapy.extensions.spiderstate.SpiderState": 0,
    "scrapy.extensions.throttle.AutoThrottle": 0
  },
  "FEEDS": {},
  "FEED_EXPORTERS": {},
  "FEED_EXPORTERS_BASE": {
    "json": "scrapy.exporters.JsonItemExporter",
    "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
    "jsonl": "scrapy.exporters.JsonLinesItemExporter",
    "jl": "scrapy.exporters.JsonLinesItemExporter",
    "csv": "scrapy.exporters.CsvItemExporter",
    "xml": "scrapy.exporters.XmlItemExporter",
    "marshal": "scrapy.exporters.MarshalItemExporter",
    "pickle": "scrapy.exporters.PickleItemExporter"
  },
  "FEED_STORAGES": {},
  "FEED_STORAGES_BASE": {
    "": "scrapy.extensions.feedexport.FileFeedStorage",
    "file": "scrapy.extensions.feedexport.FileFeedStorage",
    "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
    "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
    "s3": "scrapy.extensions.feedexport.S3FeedStorage",
    "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage"
  },
  "HTTPCACHE_IGNORE_HTTP_CODES": [],
  "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": [],
  "HTTPCACHE_IGNORE_SCHEMES": [
    "file"
  ],
  "ITEM_PIPELINES": {},
  "ITEM_PIPELINES_BASE": {},
  "MEMDEBUG_NOTIFY": [],
  "MEMUSAGE_NOTIFY_MAIL": [],
  "METAREFRESH_IGNORE_TAGS": [],
  "RETRY_HTTP_CODES": [
    500,
    502,
    503,
    504,
    522,
    524,
    408,
    429
  ],
  "SPIDER_CONTRACTS": {},
  "SPIDER_CONTRACTS_BASE": {
    "scrapy.contracts.default.UrlContract": 1,
    "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
    "scrapy.contracts.default.ReturnsContract": 2,
    "scrapy.contracts.default.ScrapesContract": 3
  },
  "SPIDER_MIDDLEWARES": {},
  "SPIDER_MIDDLEWARES_BASE": {
    "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
    "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
    "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
    "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
    "scrapy.spidermiddlewares.depth.DepthMiddleware": 900
  },
  "SPIDER_MODULES": [],
  "STATSMAILER_RCPTS": [],
  "TELNETCONSOLE_PORT": [
    6023,
    6073
  ]
}
EOF

# Run the Actor using an HTTP API
# See the full API reference at https://docs.apify.com/api/v2
# The token is sent in the Authorization header rather than as a `?token=`
# query parameter, so it does not become part of the request URL.
curl "https://api.apify.com/v2/acts/jupri~scrapyfy/runs" \
  -X POST \
  -d @input.json \
  -H 'Content-Type: application/json' \
  -H "Authorization: Bearer $API_TOKEN"
Developer
Maintained by Community
Actor metrics
  • 2 monthly users
  • 81.0% runs succeeded
  • 0.0 days response time
  • Created in Jun 2023
  • Modified 10 months ago