from apify_client import ApifyClient
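# Initialize the ApifyClient with your API token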
client = ApifyClient("<YOUR_API_TOKEN>")
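# Prepare the Actor input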
run_input = {
    # The DEVELOPMENT run mode enables debugging features such as the
    # `debugger;` statement in pageFunction; use "PRODUCTION" for real runs.
    "runMode": "DEVELOPMENT",
    "startUrls": [{ "url": "https://crawlee.dev/js" }],
    "respectRobotsTxtFile": True,
    "linkSelector": "a[href]",
    "globs": [{ "glob": "https://crawlee.dev/js/*/*" }],
    "pseudoUrls": [],
    "excludes": [{ "glob": "/**/*.{png,jpg,jpeg,pdf}" }],
16 "pageFunction": """// The function accepts a single argument: the \"context\" object.
17// For a complete list of its properties and functions,
18// see https://apify.com/apify/web-scraper#page-function
19async function pageFunction(context) {
20 // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
21 // debugger;
22
23 // jQuery is handy for finding DOM elements and extracting data from them.
24 // To use it, make sure to enable the \"Inject jQuery\" option.
25 const $ = context.jQuery;
26 const pageTitle = $('title').first().text();
27 const h1 = $('h1').first().text();
28 const first_h2 = $('h2').first().text();
29 const random_text_from_the_page = $('p').first().text();
30
31
32 // Print some information to Actor log
33 context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);
34
35 // Manually add a new page to the queue for scraping.
36 await context.enqueueRequest({ url: 'http://www.example.com' });
37
38 // Return an object with the data extracted from the page.
39 // It will be stored to the resulting dataset.
40 return {
41 url: context.request.url,
42 pageTitle,
43 h1,
44 first_h2,
45 random_text_from_the_page
46 };
47}""",
48 "proxyConfiguration": { "useApifyProxy": True },
49 "initialCookies": [],
50 "waitUntil": ["networkidle2"],
51 "preNavigationHooks": """// We need to return array of (possibly async) functions here.
52// The functions accept two arguments: the \"crawlingContext\" object
53// and \"gotoOptions\".
54[
55 async (crawlingContext, gotoOptions) => {
56 // ...
57 },
58]
59""",
60 "postNavigationHooks": """// We need to return array of (possibly async) functions here.
61// The functions accept a single argument: the \"crawlingContext\" object.
62[
63 async (crawlingContext) => {
64 // ...
65 },
66]""",
67 "breakpointLocation": "NONE",
68 "customData": {},
69}
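# Run the Actor and wait for it to finish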
run = client.actor("apify/web-scraper").call(run_input=run_input)
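# Fetch and print Actor results from the run's dataset (if there are any)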
print("💾 Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)
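
# Optionally persist the items locally as well. A minimal sketch, assuming
# plain-JSON output is wanted; the "results.json" filename is illustrative.
import json

items = list(client.dataset(run["defaultDatasetId"]).iterate_items())
with open("results.json", "w", encoding="utf-8") as f:
    json.dump(items, f, ensure_ascii=False, indent=2)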