ScrapyFy
Under maintenanceTry for free
No credit card required
View all Actors
This Actor may be unreliable while under maintenance. Would you like to try a similar Actor instead?
See alternative ActorsThe code examples below show how to run the Actor and get its results. To run the code, you need to have an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token, which you can find under Settings > Integrations in Apify Console. Learn more
Node.js
Python
curl
1import { ApifyClient } from 'apify-client';
2
3// Initialize the ApifyClient with your Apify API token
4const client = new ApifyClient({
5 token: '<YOUR_API_TOKEN>',
6});
7
8// Prepare Actor input
9const input = {
10 "spiders_code": "from urllib.parse import urljoin\r\n\r\n### multiple spiders can be specified\r\n\r\nclass TitleSpider(scrapy.Spider):\r\n\r\n name = 'title_spider'\r\n allowed_domains = [\"apify.com\"]\r\n start_urls = [\"https://apify.com\"]\r\n\r\n custom_settings = {\r\n 'REQUEST_FINGERPRINTER_IMPLEMENTATION' : '2.7',\r\n # Obey robots.txt rules\r\n 'ROBOTSTXT_OBEY' : True,\r\n 'DEPTH_LIMIT' : 2,\r\n 'LOG_ENABLED' : False,\r\n #'CLOSESPIDER_PAGECOUNT' : 5,\r\n 'CLOSESPIDER_ITEMCOUNT' : 5,\r\n }\r\n\r\n def parse(self, response):\r\n yield {\r\n 'url': response.url,\r\n 'title': response.css('title::text').extract_first(),\r\n }\r\n for link_href in response.css('a::attr(\"href\")'):\r\n link_url = urljoin(response.url, link_href.get())\r\n if link_url.startswith(('http://', 'https://')):\r\n yield scrapy.Request(link_url)",
11 "DEFAULT_REQUEST_HEADERS": {
12 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
13 "Accept-Language": "en"
14 },
15 "DOWNLOADER_MIDDLEWARES": {},
16 "DOWNLOADER_MIDDLEWARES_BASE": {
17 "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
18 "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
19 "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
20 "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
21 "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
22 "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
23 "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
24 "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
25 "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
26 "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
27 "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
28 "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
29 "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
30 "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900
31 },
32 "DOWNLOAD_HANDLERS": {},
33 "DOWNLOAD_HANDLERS_BASE": {
34 "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
35 "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
36 "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
37 "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
38 "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
39 "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler"
40 },
41 "EXTENSIONS": {},
42 "EXTENSIONS_BASE": {
43 "scrapy.extensions.corestats.CoreStats": 0,
44 "scrapy.extensions.telnet.TelnetConsole": 0,
45 "scrapy.extensions.memusage.MemoryUsage": 0,
46 "scrapy.extensions.memdebug.MemoryDebugger": 0,
47 "scrapy.extensions.closespider.CloseSpider": 0,
48 "scrapy.extensions.feedexport.FeedExporter": 0,
49 "scrapy.extensions.logstats.LogStats": 0,
50 "scrapy.extensions.spiderstate.SpiderState": 0,
51 "scrapy.extensions.throttle.AutoThrottle": 0
52 },
53 "FEEDS": {},
54 "FEED_EXPORTERS": {},
55 "FEED_EXPORTERS_BASE": {
56 "json": "scrapy.exporters.JsonItemExporter",
57 "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
58 "jsonl": "scrapy.exporters.JsonLinesItemExporter",
59 "jl": "scrapy.exporters.JsonLinesItemExporter",
60 "csv": "scrapy.exporters.CsvItemExporter",
61 "xml": "scrapy.exporters.XmlItemExporter",
62 "marshal": "scrapy.exporters.MarshalItemExporter",
63 "pickle": "scrapy.exporters.PickleItemExporter"
64 },
65 "FEED_STORAGES": {},
66 "FEED_STORAGES_BASE": {
67 "": "scrapy.extensions.feedexport.FileFeedStorage",
68 "file": "scrapy.extensions.feedexport.FileFeedStorage",
69 "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
70 "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
71 "s3": "scrapy.extensions.feedexport.S3FeedStorage",
72 "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage"
73 },
74 "HTTPCACHE_IGNORE_HTTP_CODES": [],
75 "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": [],
76 "HTTPCACHE_IGNORE_SCHEMES": [
77 "file"
78 ],
79 "ITEM_PIPELINES": {},
80 "ITEM_PIPELINES_BASE": {},
81 "MEMDEBUG_NOTIFY": [],
82 "MEMUSAGE_NOTIFY_MAIL": [],
83 "METAREFRESH_IGNORE_TAGS": [],
84 "RETRY_HTTP_CODES": [
85 500,
86 502,
87 503,
88 504,
89 522,
90 524,
91 408,
92 429
93 ],
94 "SPIDER_CONTRACTS": {},
95 "SPIDER_CONTRACTS_BASE": {
96 "scrapy.contracts.default.UrlContract": 1,
97 "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
98 "scrapy.contracts.default.ReturnsContract": 2,
99 "scrapy.contracts.default.ScrapesContract": 3
100 },
101 "SPIDER_MIDDLEWARES": {},
102 "SPIDER_MIDDLEWARES_BASE": {
103 "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
104 "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
105 "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
106 "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
107 "scrapy.spidermiddlewares.depth.DepthMiddleware": 900
108 },
109 "SPIDER_MODULES": [],
110 "STATSMAILER_RCPTS": [],
111 "TELNETCONSOLE_PORT": [
112 6023,
113 6073
114 ]
115};
116
117(async () => {
118 // Run the Actor and wait for it to finish
119 const run = await client.actor("jupri/scrapyfy").call(input);
120
121 // Fetch and print Actor results from the run's dataset (if any)
122 console.log('Results from dataset');
123 console.log(`💾 Check your data here: https://console.apify.com/storage/datasets/${run.defaultDatasetId}`);
124 const { items } = await client.dataset(run.defaultDatasetId).listItems();
125 items.forEach((item) => {
126 console.dir(item);
127 });
128})();
129
130// 📚 Want to learn more 📖? Go to → https://docs.apify.com/api/client/js/docs
Developer
Maintained by Community
Actor metrics
- 2 monthly users
- 44.4% runs succeeded
- days response time
- Created in Jun 2023
- Modified 11 months ago
Categories