![ScrapyFy avatar](https://images.apifyusercontent.com/EBjZLtJ46zhZBsjODOrLw6Hqf8TRL-9olS0nfrzgJdQ/rs:fill:92:92/aHR0cHM6Ly9hcGlmeS1pbWFnZS11cGxvYWRzLXByb2QuczMuYW1hem9uYXdzLmNvbS9WZWprM0NMZ2lXamhkaUg0Qi9ITExKSllXZEVmanUyaXZpcC1DdGlVN0huV0VBQVRLZmouanBlZw.webp)
ScrapyFy
Deprecated · View all Actors

![ScrapyFy](https://images.apifyusercontent.com/EBjZLtJ46zhZBsjODOrLw6Hqf8TRL-9olS0nfrzgJdQ/rs:fill:92:92/aHR0cHM6Ly9hcGlmeS1pbWFnZS11cGxvYWRzLXByb2QuczMuYW1hem9uYXdzLmNvbS9WZWprM0NMZ2lXamhkaUg0Qi9ITExKSllXZEVmanUyaXZpcC1DdGlVN0huV0VBQVRLZmouanBlZw.webp)
This Actor is deprecated
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors

![ScrapyFy](https://images.apifyusercontent.com/EBjZLtJ46zhZBsjODOrLw6Hqf8TRL-9olS0nfrzgJdQ/rs:fill:92:92/aHR0cHM6Ly9hcGlmeS1pbWFnZS11cGxvYWRzLXByb2QuczMuYW1hem9uYXdzLmNvbS9WZWprM0NMZ2lXamhkaUg0Qi9ITExKSllXZEVmanUyaXZpcC1DdGlVN0huV0VBQVRLZmouanBlZw.webp)
ScrapyFy
jupri/scrapyfy
Scrapy Runner
The code examples below show how to run the Actor and get its results. To run the code, you need to have an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token, which you can find under Settings > Integrations in Apify Console. Learn more
"""Run the (deprecated) `jupri/scrapyfy` Apify Actor and print its results.

The Actor executes user-supplied Scrapy spider code (`spiders_code`) with the
Scrapy settings passed alongside it in the run input.
"""
from apify_client import ApifyClient

# Initialize the ApifyClient with your Apify API token.
# Find the token under Settings > Integrations in Apify Console.
client = ApifyClient("<YOUR_API_TOKEN>")

# Prepare the Actor input.
# NOTE(review): `spiders_code` references `scrapy.Spider` / `scrapy.Request`
# without importing `scrapy` — presumably the Actor injects it into the
# execution namespace; confirm against the Actor's documentation.
run_input = {
    # Multiple spiders can be defined in this string; the Actor runs them.
    "spiders_code": """from urllib.parse import urljoin\r
\r
### multiple spiders can be specified\r
\r
class TitleSpider(scrapy.Spider):\r
\r
    name = 'title_spider'\r
    allowed_domains = [\"apify.com\"]\r
    start_urls = [\"https://apify.com\"]\r
\r
    custom_settings = {\r
        'REQUEST_FINGERPRINTER_IMPLEMENTATION' : '2.7',\r
        # Obey robots.txt rules\r
        'ROBOTSTXT_OBEY' : True,\r
        'DEPTH_LIMIT' : 2,\r
        'LOG_ENABLED' : False,\r
        #'CLOSESPIDER_PAGECOUNT' : 5,\r
        'CLOSESPIDER_ITEMCOUNT' : 5,\r
    }\r
\r
    def parse(self, response):\r
        yield {\r
            'url': response.url,\r
            'title': response.css('title::text').extract_first(),\r
        }\r
        for link_href in response.css('a::attr(\"href\")'):\r
            link_url = urljoin(response.url, link_href.get())\r
            if link_url.startswith(('http://', 'https://')):\r
                yield scrapy.Request(link_url)""",
    # --- Scrapy settings forwarded to the crawler (defaults shown) ---
    "DEFAULT_REQUEST_HEADERS": {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    },
    "DOWNLOADER_MIDDLEWARES": {},
    "DOWNLOADER_MIDDLEWARES_BASE": {
        "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
        "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
        "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
        "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
        "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
        "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
        "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
        "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
        "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
        "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
        "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
        "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
        "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
        "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900,
    },
    "DOWNLOAD_HANDLERS": {},
    "DOWNLOAD_HANDLERS_BASE": {
        "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
        "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
        "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
        "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
        "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
        "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler",
    },
    "EXTENSIONS": {},
    "EXTENSIONS_BASE": {
        "scrapy.extensions.corestats.CoreStats": 0,
        "scrapy.extensions.telnet.TelnetConsole": 0,
        "scrapy.extensions.memusage.MemoryUsage": 0,
        "scrapy.extensions.memdebug.MemoryDebugger": 0,
        "scrapy.extensions.closespider.CloseSpider": 0,
        "scrapy.extensions.feedexport.FeedExporter": 0,
        "scrapy.extensions.logstats.LogStats": 0,
        "scrapy.extensions.spiderstate.SpiderState": 0,
        "scrapy.extensions.throttle.AutoThrottle": 0,
    },
    "FEEDS": {},
    "FEED_EXPORTERS": {},
    "FEED_EXPORTERS_BASE": {
        "json": "scrapy.exporters.JsonItemExporter",
        "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        "jsonl": "scrapy.exporters.JsonLinesItemExporter",
        "jl": "scrapy.exporters.JsonLinesItemExporter",
        "csv": "scrapy.exporters.CsvItemExporter",
        "xml": "scrapy.exporters.XmlItemExporter",
        "marshal": "scrapy.exporters.MarshalItemExporter",
        "pickle": "scrapy.exporters.PickleItemExporter",
    },
    "FEED_STORAGES": {},
    "FEED_STORAGES_BASE": {
        "": "scrapy.extensions.feedexport.FileFeedStorage",
        "file": "scrapy.extensions.feedexport.FileFeedStorage",
        "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
        "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
        "s3": "scrapy.extensions.feedexport.S3FeedStorage",
        "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage",
    },
    "HTTPCACHE_IGNORE_HTTP_CODES": [],
    "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": [],
    "HTTPCACHE_IGNORE_SCHEMES": ["file"],
    "ITEM_PIPELINES": {},
    "ITEM_PIPELINES_BASE": {},
    "MEMDEBUG_NOTIFY": [],
    "MEMUSAGE_NOTIFY_MAIL": [],
    "METAREFRESH_IGNORE_TAGS": [],
    "RETRY_HTTP_CODES": [
        500,
        502,
        503,
        504,
        522,
        524,
        408,
        429,
    ],
    "SPIDER_CONTRACTS": {},
    "SPIDER_CONTRACTS_BASE": {
        "scrapy.contracts.default.UrlContract": 1,
        "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
        "scrapy.contracts.default.ReturnsContract": 2,
        "scrapy.contracts.default.ScrapesContract": 3,
    },
    "SPIDER_MIDDLEWARES": {},
    "SPIDER_MIDDLEWARES_BASE": {
        "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
        "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
        "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
        "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
        "scrapy.spidermiddlewares.depth.DepthMiddleware": 900,
    },
    "SPIDER_MODULES": [],
    "STATSMAILER_RCPTS": [],
    "TELNETCONSOLE_PORT": [
        6023,
        6073,
    ],
}

# Run the Actor and wait for it to finish
run = client.actor("jupri/scrapyfy").call(run_input=run_input)

# Fetch and print Actor results from the run's dataset (if there are any)
print("💾 Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)

# 📚 Want to learn more 📖? Go to → https://docs.apify.com/api/client/python/docs/quick-start
Developer
Maintained by Community
Categories