This Actor has been deprecated by its developer and is no longer available.
ScrapyFy

jupri/scrapyfy

Scrapy Runner

The code example below shows how to run the Actor and fetch its results. To run it, you need an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token, which you can find under Settings > Integrations in Apify Console.
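If you prefer not to paste the token directly into the script, one small variation (not part of the generated example) is to read it from an environment variable before constructing the client; the APIFY_TOKEN name below is just the name assumed for this sketch.

import os

from apify_client import ApifyClient

# Read the API token from the environment instead of hard-coding it.
# APIFY_TOKEN is an assumed variable name; export it in your shell first.
client = ApifyClient(os.environ["APIFY_TOKEN"])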

from apify_client import ApifyClient

# Initialize the ApifyClient with your Apify API token
client = ApifyClient("<YOUR_API_TOKEN>")

# Prepare the Actor input
run_input = {
    "spiders_code": """from urllib.parse import urljoin

### multiple spiders can be specified

class TitleSpider(scrapy.Spider):

    name = 'title_spider'
    allowed_domains = ["apify.com"]
    start_urls = ["https://apify.com"]

    custom_settings = {
        'REQUEST_FINGERPRINTER_IMPLEMENTATION'  : '2.7',
        # Obey robots.txt rules
        'ROBOTSTXT_OBEY'                        : True,
        'DEPTH_LIMIT'                           : 2,
        'LOG_ENABLED'                           : False,
        #'CLOSESPIDER_PAGECOUNT'                 : 5,
        'CLOSESPIDER_ITEMCOUNT'                 : 5,
    }

    def parse(self, response):
        yield {
            'url': response.url,
            'title': response.css('title::text').extract_first(),
        }
        for link_href in response.css('a::attr("href")'):
            link_url = urljoin(response.url, link_href.get())
            if link_url.startswith(('http://', 'https://')):
                yield scrapy.Request(link_url)""",
    "DEFAULT_REQUEST_HEADERS": {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    },
    "DOWNLOADER_MIDDLEWARES": {},
    "DOWNLOADER_MIDDLEWARES_BASE": {
        "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
        "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
        "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
        "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
        "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
        "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
        "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
        "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
        "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
        "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
        "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
        "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
        "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
        "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900,
    },
    "DOWNLOAD_HANDLERS": {},
    "DOWNLOAD_HANDLERS_BASE": {
        "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
        "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
        "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
        "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
        "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
        "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler",
    },
    "EXTENSIONS": {},
    "EXTENSIONS_BASE": {
        "scrapy.extensions.corestats.CoreStats": 0,
        "scrapy.extensions.telnet.TelnetConsole": 0,
        "scrapy.extensions.memusage.MemoryUsage": 0,
        "scrapy.extensions.memdebug.MemoryDebugger": 0,
        "scrapy.extensions.closespider.CloseSpider": 0,
        "scrapy.extensions.feedexport.FeedExporter": 0,
        "scrapy.extensions.logstats.LogStats": 0,
        "scrapy.extensions.spiderstate.SpiderState": 0,
        "scrapy.extensions.throttle.AutoThrottle": 0,
    },
    "FEEDS": {},
    "FEED_EXPORTERS": {},
    "FEED_EXPORTERS_BASE": {
        "json": "scrapy.exporters.JsonItemExporter",
        "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        "jsonl": "scrapy.exporters.JsonLinesItemExporter",
        "jl": "scrapy.exporters.JsonLinesItemExporter",
        "csv": "scrapy.exporters.CsvItemExporter",
        "xml": "scrapy.exporters.XmlItemExporter",
        "marshal": "scrapy.exporters.MarshalItemExporter",
        "pickle": "scrapy.exporters.PickleItemExporter",
    },
    "FEED_STORAGES": {},
    "FEED_STORAGES_BASE": {
        "": "scrapy.extensions.feedexport.FileFeedStorage",
        "file": "scrapy.extensions.feedexport.FileFeedStorage",
        "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
        "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
        "s3": "scrapy.extensions.feedexport.S3FeedStorage",
        "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage",
    },
    "HTTPCACHE_IGNORE_HTTP_CODES": [],
    "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": [],
    "HTTPCACHE_IGNORE_SCHEMES": ["file"],
    "ITEM_PIPELINES": {},
    "ITEM_PIPELINES_BASE": {},
    "MEMDEBUG_NOTIFY": [],
    "MEMUSAGE_NOTIFY_MAIL": [],
    "METAREFRESH_IGNORE_TAGS": [],
    "RETRY_HTTP_CODES": [
        500,
        502,
        503,
        504,
        522,
        524,
        408,
        429,
    ],
    "SPIDER_CONTRACTS": {},
    "SPIDER_CONTRACTS_BASE": {
        "scrapy.contracts.default.UrlContract": 1,
        "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
        "scrapy.contracts.default.ReturnsContract": 2,
        "scrapy.contracts.default.ScrapesContract": 3,
    },
    "SPIDER_MIDDLEWARES": {},
    "SPIDER_MIDDLEWARES_BASE": {
        "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
        "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
        "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
        "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
        "scrapy.spidermiddlewares.depth.DepthMiddleware": 900,
    },
    "SPIDER_MODULES": [],
    "STATSMAILER_RCPTS": [],
    "TELNETCONSOLE_PORT": [
        6023,
        6073,
    ],
}

# Run the Actor and wait for it to finish
run = client.actor("jupri/scrapyfy").call(run_input=run_input)

# Fetch and print Actor results from the run's dataset (if there are any)
print("💾 Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)

# 📚 Want to learn more 📖? Go to → https://docs.apify.com/api/client/python/docs/quick-start
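As an alternative to iterating item by item, the dataset client also exposes list_items(), which returns a page of results in a single call. The snippet below is a sketch that reuses the client and run variables from the example above; the limit value and the url/title fields (the ones TitleSpider yields) are only illustrative.

# Fetch up to 100 items in one call instead of streaming them one by one.
# Reuses `client` and `run` from the example above; limit=100 is arbitrary.
page = client.dataset(run["defaultDatasetId"]).list_items(limit=100)
for item in page.items:
    print(item.get("url"), "->", item.get("title"))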
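The generated run_input above spells out every Scrapy setting with what appear to be its default values. If all you need is your own spider code, a much shorter input will likely do; the sketch below assumes the Actor falls back to its defaults for any setting you omit, which is not stated on this page, so check it against the Actor's input schema. As in the example above, the embedded code references scrapy without importing it, which suggests the runner provides it; add an explicit import scrapy at the top of spiders_code if it does not.

# Minimal sketch: supply only the spider code and rely on default settings.
# Assumption: settings omitted from run_input fall back to the Actor's defaults.
from apify_client import ApifyClient

client = ApifyClient("<YOUR_API_TOKEN>")

minimal_input = {
    "spiders_code": """class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }""",
}

run = client.actor("jupri/scrapyfy").call(run_input=minimal_input)
print("Dataset:", run["defaultDatasetId"])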
Maintained by Community