import os

from apify_client import ApifyClient


# Name of the environment variable main() reads the API token from.
API_TOKEN_ENV_VAR = "APIFY_TOKEN"

# Fallback used when the environment variable is not set; replace it with a
# real token if you prefer to keep the token in the script.
API_TOKEN_PLACEHOLDER = "<YOUR_API_TOKEN>"

# Identifier of the Actor started by main().
ACTOR_ID = "apify/jsdom-scraper"

# JavaScript source sent as the "pageFunction" input field. It returns the
# URL and the document title of each page it is invoked for.
PAGE_FUNCTION = """async function pageFunction(context) {
    const { window, request, log } = context;

    // The "window" property contains the JSDOM object which is useful
    // for querying DOM elements and extracting data from them.
    const pageTitle = window.document.title;

    // The "request" property contains various information about the web page loaded.
    const url = request.url;

    // Use "log" object to print information to Actor log.
    log.info('Page scraped', { url, pageTitle });

    // Return an object with the data extracted from the page.
    // It will be stored to the resulting dataset.
    return {
        url,
        pageTitle
    };
}"""

# JavaScript source sent as the "preNavigationHooks" input field
# (an array holding one placeholder hook).
PRE_NAVIGATION_HOOKS = """// We need to return array of (possibly async) functions here.
// The functions accept two arguments: the "crawlingContext" object
// and "requestAsBrowserOptions" which are passed to the `requestAsBrowser()`
// function the crawler calls to navigate..
[
    async (crawlingContext, requestAsBrowserOptions) => {
        // ...
    }
]"""

# JavaScript source sent as the "postNavigationHooks" input field
# (an array holding one placeholder hook).
POST_NAVIGATION_HOOKS = """// We need to return array of (possibly async) functions here.
// The functions accept a single argument: the "crawlingContext" object.
[
    async (crawlingContext) => {
        // ...
    },
]"""

# Input object handed to the Actor run.
run_input = {
    "startUrls": [{"url": "https://crawlee.dev/js"}],
    "respectRobotsTxtFile": True,
    "globs": [{"glob": "https://crawlee.dev/js/*/*"}],
    "pseudoUrls": [],
    "excludes": [{"glob": "/**/*.{png,jpg,jpeg,pdf}"}],
    "linkSelector": "a[href]",
    "runScripts": False,
    "pageFunction": PAGE_FUNCTION,
    "proxyConfiguration": {"useApifyProxy": True},
    "initialCookies": [],
    "additionalMimeTypes": [],
    "preNavigationHooks": PRE_NAVIGATION_HOOKS,
    "postNavigationHooks": POST_NAVIGATION_HOOKS,
    "customData": {},
}


def main() -> None:
    """Run the Actor with ``run_input`` and print every item of its dataset.

    The API token is read from the ``APIFY_TOKEN`` environment variable and
    falls back to the ``<YOUR_API_TOKEN>`` placeholder when it is unset.

    Raises:
        RuntimeError: If the Actor call returns no run object.
    """
    token = os.environ.get(API_TOKEN_ENV_VAR, API_TOKEN_PLACEHOLDER)
    client = ApifyClient(token)

    run = client.actor(ACTOR_ID).call(run_input=run_input)
    # call() is typed as returning an optional run object; fail with a clear
    # message instead of a TypeError when subscripting None below.
    if run is None:
        raise RuntimeError(f"Actor {ACTOR_ID!r} did not return a run object")

    dataset_id = run["defaultDatasetId"]
    print(
        "💾 Check your data here: "
        f"https://console.apify.com/storage/datasets/{dataset_id}"
    )
    for item in client.dataset(dataset_id).iterate_items():
        print(item)


if __name__ == "__main__":
    main()