Web Scraper Task

  • undrtkr984/web-scraper-task
  • Users: 22
  • Runs: 347
  • Created by Matt

To run the code example below, you need an Apify account. Replace <YOUR_API_TOKEN> in the code with your API token. For a more detailed explanation, read about running Actors via the API in the Apify Docs.
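Since "startUrls" accepts a list of { "url": ... } objects, the Actor input can be built for many pages at once. A minimal sketch of such a helper (the function name make_run_input is illustrative, not part of the Apify client):

```python
def make_run_input(urls, link_selector="a[href]"):
    """Build a minimal Web Scraper input dict for the given start URLs."""
    return {
        "runMode": "DEVELOPMENT",
        # Each start URL becomes its own { "url": ... } object.
        "startUrls": [{"url": u} for u in urls],
        "linkSelector": link_selector,
        "globs": [{"glob": "*/*"}],
        "pseudoUrls": [],
    }

run_input = make_run_input(["https://example.com", "https://example.org"])
```

The resulting dict can then be extended with the remaining fields shown in the full example below before being passed to the Actor.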

from apify_client import ApifyClient

# Initialize the ApifyClient with your API token
client = ApifyClient("<YOUR_API_TOKEN>")

# Prepare the Actor input
run_input = {
    "runMode": "DEVELOPMENT",
    "startUrls": [{ "url": "" }],
    "linkSelector": "a[href]",
    "globs": [{ "glob": "*/*" }],
    "pseudoUrls": [],
    "pageFunction": """// The function accepts a single argument: the \"context\" object.
// For a complete list of its properties and functions,
// see 
async function pageFunction(context) {
    // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
    // debugger; 

    // jQuery is handy for finding DOM elements and extracting data from them.
    // To use it, make sure to enable the \"Inject jQuery\" option.
    const $ = context.jQuery;
    const pageTitle = $('title').first().text();
    const h1 = $('h1').first().text();
    const first_h2 = $('h2').first().text();
    const random_text_from_the_page = $('p').first().text();

    // Print some information to the Actor log.
    context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);

    // Manually add a new page to the queue for scraping.
    await context.enqueueRequest({ url: '' });

    // Return an object with the data extracted from the page.
    // It will be stored to the resulting dataset.
    return {
        url: context.request.url,
        pageTitle,
        h1,
        first_h2,
        random_text_from_the_page,
    };
}""",
    "proxyConfiguration": { "useApifyProxy": True },
    "initialCookies": [],
    "waitUntil": ["networkidle2"],
    "preNavigationHooks": """// We need to return an array of (possibly async) functions here.
// The functions accept two arguments: the \"crawlingContext\" object
// and \"gotoOptions\".
[
    async (crawlingContext, gotoOptions) => {
        // ...
    },
]""",
    "postNavigationHooks": """// We need to return an array of (possibly async) functions here.
// The functions accept a single argument: the \"crawlingContext\" object.
[
    async (crawlingContext) => {
        // ...
    },
]""",
    "breakpointLocation": "NONE",
    "customData": {},
}

# Run the Actor and wait for it to finish
run = client.actor("undrtkr984/web-scraper-task").call(run_input=run_input)

# Fetch and print Actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)
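The items yielded by iterate_items() are plain Python dicts shaped like the object returned from pageFunction, so they can be post-processed with the standard library. A sketch that writes such items to CSV (using sample data in place of a live dataset, since a real run would yield these from client.dataset(...).iterate_items()):

```python
import csv
import io

# Sample items shaped like pageFunction's return value; in practice they
# would come from client.dataset(run["defaultDatasetId"]).iterate_items().
items = [
    {"url": "https://example.com", "pageTitle": "Example Domain"},
    {"url": "https://example.org", "pageTitle": "Example Domain"},
]

# Serialize the dicts to CSV in memory; swap io.StringIO for open(...) to
# write a file instead.
buffer = io.StringIO()
writer = csv.DictWriter(buffer, fieldnames=["url", "pageTitle"])
writer.writeheader()
writer.writerows(items)
print(buffer.getvalue())
```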