from apify_client import ApifyClient

# Initialize the ApifyClient with your API token
client = ApifyClient("<YOUR_API_TOKEN>")

# Prepare the Actor input
run_input = {
9 "startUrls": [{ "url": "https://crawlee.dev/js" }],
10 "respectRobotsTxtFile": True,
11 "globs": [{ "glob": "https://crawlee.dev/js/*/*" }],
12 "pseudoUrls": [],
13 "excludes": [{ "glob": "/**/*.{png,jpg,jpeg,pdf}" }],
14 "linkSelector": "a[href]",
15 "pageFunction": """async function pageFunction(context) {
16 const { $, request, log } = context;
17
18 // The \"$\" property contains the Cheerio object which is useful
19 // for querying DOM elements and extracting data from them.
20 const pageTitle = $('title').first().text();
21
22 // The \"request\" property contains various information about the web page loaded.
23 const url = request.url;
24
25 // Use \"log\" object to print information to Actor log.
26 log.info('Page scraped', { url, pageTitle });
27
28 // Return an object with the data extracted from the page.
29 // It will be stored to the resulting dataset.
30 return {
31 url,
32 pageTitle
33 };
34}""",
35 "proxyConfiguration": { "useApifyProxy": True },
36 "initialCookies": [],
37 "additionalMimeTypes": [],
38 "preNavigationHooks": """// We need to return array of (possibly async) functions here.
39// The functions accept two arguments: the \"crawlingContext\" object
40// and \"requestAsBrowserOptions\" which are passed to the `requestAsBrowser()`
41// function the crawler calls to navigate..
42[
43 async (crawlingContext, requestAsBrowserOptions) => {
44 // ...
45 }
46]""",
47 "postNavigationHooks": """// We need to return array of (possibly async) functions here.
48// The functions accept a single argument: the \"crawlingContext\" object.
49[
50 async (crawlingContext) => {
51 // ...
52 },
53]""",
54 "customData": {},
55}

# Run the Actor and wait for it to finish
run = client.actor("apify/cheerio-scraper").call(run_input=run_input)
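
# A hedged sanity check, not part of the original example: the run dict
# returned by call() should carry a "status" field (e.g. "SUCCEEDED" or
# "FAILED"); verify the exact field names against your installed
# apify-client version before relying on this.
if run.get("status") != "SUCCEEDED":
    print(f"⚠️ Run finished with status {run.get('status')!r}")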

# Fetch and print Actor results from the run's dataset (if there are any)
print("💾 Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)
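
# A minimal sketch of persisting the scraped items locally, using only the
# standard library; the file name "results.json" is an illustrative
# assumption, not something the original example specifies.
import json

items = list(client.dataset(run["defaultDatasetId"]).iterate_items())
with open("results.json", "w", encoding="utf-8") as f:
    json.dump(items, f, ensure_ascii=False, indent=2)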