ScrapyFy avatar
ScrapyFy
Under maintenance
Try for free

No credit card required

View all Actors
This Actor is under maintenance.

This Actor may be unreliable while under maintenance. Would you like to try a similar Actor instead?

See alternative Actors
ScrapyFy

ScrapyFy

jupri/scrapyfy
Try for free

No credit card required

Scrapy Runner

The code examples below show how to run the Actor and get its results. To run the code, you need to have an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token, which you can find under Settings > Integrations in Apify Console. Learn more

Node.js

Python

curl

"""Run the `jupri/scrapyfy` Apify Actor and print its dataset items.

The Actor executes the Scrapy spider source supplied in ``spiders_code``;
the remaining keys of ``run_input`` mirror Scrapy's own settings
(middlewares, handlers, extensions, feed exporters, retry codes, ...).
Requires an Apify API token — replace "<YOUR_API_TOKEN>" below.
"""
from apify_client import ApifyClient

# Initialize the ApifyClient with your Apify API token
client = ApifyClient("<YOUR_API_TOKEN>")

# Prepare the Actor input.
# NOTE: ``spiders_code`` is sent as a string and executed by the Actor,
# not locally; it references ``scrapy`` without importing it, so the Actor
# presumably injects that name — confirm against the Actor's README.
run_input = {
    "spiders_code": """from urllib.parse import urljoin\r
\r
### multiple spiders can be specified\r
\r
class TitleSpider(scrapy.Spider):\r
\r
    name = 'title_spider'\r
    allowed_domains = [\"apify.com\"]\r
    start_urls = [\"https://apify.com\"]\r
\r
    custom_settings = {\r
        'REQUEST_FINGERPRINTER_IMPLEMENTATION'  : '2.7',\r
        # Obey robots.txt rules\r
        'ROBOTSTXT_OBEY'                        : True,\r
        'DEPTH_LIMIT'                           : 2,\r
        'LOG_ENABLED'                           : False,\r
        #'CLOSESPIDER_PAGECOUNT'                 : 5,\r
        'CLOSESPIDER_ITEMCOUNT'                 : 5,\r
    }\r
\r
    def parse(self, response):\r
        yield {\r
            'url': response.url,\r
            'title': response.css('title::text').extract_first(),\r
        }\r
        for link_href in response.css('a::attr(\"href\")'):\r
            link_url = urljoin(response.url, link_href.get())\r
            if link_url.startswith(('http://', 'https://')):\r
                yield scrapy.Request(link_url)""",
    "DEFAULT_REQUEST_HEADERS": {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    },
    "DOWNLOADER_MIDDLEWARES": {},
    "DOWNLOADER_MIDDLEWARES_BASE": {
        "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
        "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
        "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
        "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
        "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
        "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
        "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
        "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
        "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
        "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
        "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
        "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
        "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
        "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900,
    },
    "DOWNLOAD_HANDLERS": {},
    "DOWNLOAD_HANDLERS_BASE": {
        "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
        "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
        "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
        "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
        "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
        "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler",
    },
    "EXTENSIONS": {},
    "EXTENSIONS_BASE": {
        "scrapy.extensions.corestats.CoreStats": 0,
        "scrapy.extensions.telnet.TelnetConsole": 0,
        "scrapy.extensions.memusage.MemoryUsage": 0,
        "scrapy.extensions.memdebug.MemoryDebugger": 0,
        "scrapy.extensions.closespider.CloseSpider": 0,
        "scrapy.extensions.feedexport.FeedExporter": 0,
        "scrapy.extensions.logstats.LogStats": 0,
        "scrapy.extensions.spiderstate.SpiderState": 0,
        "scrapy.extensions.throttle.AutoThrottle": 0,
    },
    "FEEDS": {},
    "FEED_EXPORTERS": {},
    "FEED_EXPORTERS_BASE": {
        "json": "scrapy.exporters.JsonItemExporter",
        "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        "jsonl": "scrapy.exporters.JsonLinesItemExporter",
        "jl": "scrapy.exporters.JsonLinesItemExporter",
        "csv": "scrapy.exporters.CsvItemExporter",
        "xml": "scrapy.exporters.XmlItemExporter",
        "marshal": "scrapy.exporters.MarshalItemExporter",
        "pickle": "scrapy.exporters.PickleItemExporter",
    },
    "FEED_STORAGES": {},
    "FEED_STORAGES_BASE": {
        "": "scrapy.extensions.feedexport.FileFeedStorage",
        "file": "scrapy.extensions.feedexport.FileFeedStorage",
        "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
        "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
        "s3": "scrapy.extensions.feedexport.S3FeedStorage",
        "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage",
    },
    "HTTPCACHE_IGNORE_HTTP_CODES": [],
    "HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS": [],
    "HTTPCACHE_IGNORE_SCHEMES": ["file"],
    "ITEM_PIPELINES": {},
    "ITEM_PIPELINES_BASE": {},
    "MEMDEBUG_NOTIFY": [],
    "MEMUSAGE_NOTIFY_MAIL": [],
    "METAREFRESH_IGNORE_TAGS": [],
    "RETRY_HTTP_CODES": [
        500,
        502,
        503,
        504,
        522,
        524,
        408,
        429,
    ],
    "SPIDER_CONTRACTS": {},
    "SPIDER_CONTRACTS_BASE": {
        "scrapy.contracts.default.UrlContract": 1,
        "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
        "scrapy.contracts.default.ReturnsContract": 2,
        "scrapy.contracts.default.ScrapesContract": 3,
    },
    "SPIDER_MIDDLEWARES": {},
    "SPIDER_MIDDLEWARES_BASE": {
        "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
        "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
        "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
        "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
        "scrapy.spidermiddlewares.depth.DepthMiddleware": 900,
    },
    "SPIDER_MODULES": [],
    "STATSMAILER_RCPTS": [],
    "TELNETCONSOLE_PORT": [
        6023,
        6073,
    ],
}

# Run the Actor and wait for it to finish (blocks until the run completes)
run = client.actor("jupri/scrapyfy").call(run_input=run_input)

# Fetch and print Actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)
Developer
Maintained by Community
Actor metrics
  • 1 monthly users
  • 88.1% runs succeeded
  • 0.0 days response time
  • Created in Jun 2023
  • Modified 10 months ago