ScrapyFy avatar
ScrapyFy
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
ScrapyFy

ScrapyFy

jupri/scrapyfy

Scrapy Runner

The code examples below show how to run the Actor and get its results. To run the code, you need to have an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token, which you can find under Settings > Integrations in Apify Console. Learn more

// Example: run the "jupri/scrapyfy" Actor through the Apify API client and
// print every item of the run's default dataset.
import { ApifyClient } from 'apify-client';

// Initialize the ApifyClient with your Apify API token
const client = new ApifyClient({
    token: '<YOUR_API_TOKEN>',
});

// Prepare Actor input.
// `spiders_code` carries the Python source of the spider(s) as a single string;
// the remaining keys are named after Scrapy settings (presumably forwarded to
// Scrapy by the Actor — confirm against the Actor's input schema).
const input = {
    "spiders_code": "from urllib.parse import urljoin\r\n\r\n### multiple spiders can be specified\r\n\r\nclass TitleSpider(scrapy.Spider):\r\n\r\n    name = 'title_spider'\r\n    allowed_domains = [\"apify.com\"]\r\n    start_urls = [\"https://apify.com\"]\r\n\r\n    custom_settings = {\r\n        'REQUEST_FINGERPRINTER_IMPLEMENTATION'  : '2.7',\r\n        # Obey robots.txt rules\r\n        'ROBOTSTXT_OBEY'                        : True,\r\n        'DEPTH_LIMIT'                           : 2,\r\n        'LOG_ENABLED'                           : False,\r\n        #'CLOSESPIDER_PAGECOUNT'                 : 5,\r\n        'CLOSESPIDER_ITEMCOUNT'                 : 5,\r\n    }\r\n\r\n    def parse(self, response):\r\n        yield {\r\n            'url': response.url,\r\n            'title': response.css('title::text').extract_first(),\r\n        }\r\n        for link_href in response.css('a::attr(\"href\")'):\r\n            link_url = urljoin(response.url, link_href.get())\r\n            if link_url.startswith(('http://', 'https://')):\r\n                yield scrapy.Request(link_url)",
    "DEFAULT_REQUEST_HEADERS": {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en"
    },
    "DOWNLOADER_MIDDLEWARES": {},
    "DOWNLOADER_MIDDLEWARES_BASE": {
        "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
        "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
        "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
        "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
        "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
        "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
        "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
        "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
        "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
        "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
        "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
        "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
        "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
        "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900
    },
    "DOWNLOAD_HANDLERS": {},
    "DOWNLOAD_HANDLERS_BASE": {
        "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
        "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
        "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
        "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
        "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
        "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler"
    },
    "EXTENSIONS": {},
    "EXTENSIONS_BASE": {
        "scrapy.extensions.corestats.CoreStats": 0,
        "scrapy.extensions.telnet.TelnetConsole": 0,
        "scrapy.extensions.memusage.MemoryUsage": 0,
        "scrapy.extensions.memdebug.MemoryDebugger": 0,
        "scrapy.extensions.closespider.CloseSpider": 0,
        "scrapy.extensions.feedexport.FeedExporter": 0,
        "scrapy.extensions.logstats.LogStats": 0,
        "scrapy.extensions.spiderstate.SpiderState": 0,
        "scrapy.extensions.throttle.AutoThrottle": 0
    },
    "FEEDS": {},
    "FEED_EXPORTERS": {},
    "FEED_EXPORTERS_BASE": {
        "json": "scrapy.exporters.JsonItemExporter",
        "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        "jsonl": "scrapy.exporters.JsonLinesItemExporter",
        "jl": "scrapy.exporters.JsonLinesItemExporter",
        "csv": "scrapy.exporters.CsvItemExporter",
        "xml": "scrapy.exporters.XmlItemExporter",
        "marshal": "scrapy.exporters.MarshalItemExporter",
        "pickle": "scrapy.exporters.PickleItemExporter"
    },
    "FEED_STORAGES": {},
    "FEED_STORAGES_BASE": {
        "": "scrapy.extensions.feedexport.FileFeedStorage",
        "file": "scrapy.extensions.feedexport.FileFeedStorage",
        "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
        "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
        "s3": "scrapy.extensions.feedexport.S3FeedStorage",
        "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage"
    },
    "HTTPCACHE_IGNORE_HTTP_CODES": [],
    "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": [],
    "HTTPCACHE_IGNORE_SCHEMES": [
        "file"
    ],
    "ITEM_PIPELINES": {},
    "ITEM_PIPELINES_BASE": {},
    "MEMDEBUG_NOTIFY": [],
    "MEMUSAGE_NOTIFY_MAIL": [],
    "METAREFRESH_IGNORE_TAGS": [],
    "RETRY_HTTP_CODES": [
        500,
        502,
        503,
        504,
        522,
        524,
        408,
        429
    ],
    "SPIDER_CONTRACTS": {},
    "SPIDER_CONTRACTS_BASE": {
        "scrapy.contracts.default.UrlContract": 1,
        "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
        "scrapy.contracts.default.ReturnsContract": 2,
        "scrapy.contracts.default.ScrapesContract": 3
    },
    "SPIDER_MIDDLEWARES": {},
    "SPIDER_MIDDLEWARES_BASE": {
        "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
        "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
        "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
        "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
        "scrapy.spidermiddlewares.depth.DepthMiddleware": 900
    },
    "SPIDER_MODULES": [],
    "STATSMAILER_RCPTS": [],
    "TELNETCONSOLE_PORT": [
        6023,
        6073
    ]
};

/**
 * Starts the Actor with `input`, waits for the run to finish, then prints a
 * link to the run's default dataset followed by each dataset item.
 *
 * @returns {Promise<void>} Settles once all items have been printed; rejects
 *     if starting the run or reading the dataset fails.
 */
async function main() {
    // Run the Actor and wait for it to finish
    const run = await client.actor("jupri/scrapyfy").call(input);

    // Fetch and print Actor results from the run's dataset (if any)
    console.log('Results from dataset');
    console.log(`💾 Check your data here: https://console.apify.com/storage/datasets/${run.defaultDatasetId}`);
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    for (const item of items) {
        console.dir(item);
    }
}

// Never leave the promise floating: report the failure and signal it through
// the exit code instead of relying on an unhandled-rejection crash.
main().catch((err) => {
    console.error('Actor run failed:', err);
    // `process` only exists under Node.js; guard so the snippet also works
    // when bundled for other runtimes.
    if (typeof process !== 'undefined') {
        process.exitCode = 1;
    }
});

// 📚 Want to learn more 📖? Go to → https://docs.apify.com/api/client/js/docs
Developer
Maintained by Community