1import { ApifyClient } from 'apify-client';
2
3
4
5const client = new ApifyClient({
6 token: '<YOUR_API_TOKEN>',
7});
8
9
10const input = {
11 "runMode": "DEVELOPMENT",
12 "startUrls": [
13 {
14 "url": "https://crawlee.dev/js"
15 }
16 ],
17 "respectRobotsTxtFile": true,
18 "linkSelector": "a[href]",
19 "globs": [
20 {
21 "glob": "https://crawlee.dev/js/*/*"
22 }
23 ],
24 "pseudoUrls": [],
25 "excludes": [
26 {
27 "glob": "/**/*.{png,jpg,jpeg,pdf}"
28 }
29 ],
30 "pageFunction": `// The function accepts a single argument: the "context" object.
31 // For a complete list of its properties and functions,
32 // see https://apify.com/apify/web-scraper#page-function
33 async function pageFunction(context) {
34 // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
35 // debugger;
36
37 // jQuery is handy for finding DOM elements and extracting data from them.
38 // To use it, make sure to enable the "Inject jQuery" option.
39 const $ = context.jQuery;
40 const pageTitle = $('title').first().text();
41 const h1 = $('h1').first().text();
42 const first_h2 = $('h2').first().text();
43 const random_text_from_the_page = $('p').first().text();
44
45
46 // Print some information to Actor log
47 context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);
48
49 // Manually add a new page to the queue for scraping.
50 await context.enqueueRequest({ url: 'http://www.example.com' });
51
52 // Return an object with the data extracted from the page.
53 // It will be stored to the resulting dataset.
54 return {
55 url: context.request.url,
56 pageTitle,
57 h1,
58 first_h2,
59 random_text_from_the_page
60 };
61 }`,
62 "proxyConfiguration": {
63 "useApifyProxy": true
64 },
65 "initialCookies": [],
66 "waitUntil": [
67 "networkidle2"
68 ],
69 "preNavigationHooks": `// We need to return array of (possibly async) functions here.
70 // The functions accept two arguments: the "crawlingContext" object
71 // and "gotoOptions".
72 [
73 async (crawlingContext, gotoOptions) => {
74 // ...
75 },
76 ]`,
77 "postNavigationHooks": `// We need to return array of (possibly async) functions here.
78 // The functions accept a single argument: the "crawlingContext" object.
79 [
80 async (crawlingContext) => {
81 // ...
82 },
83 ]`,
84 "breakpointLocation": "NONE",
85 "customData": {}
86};
87
88
89const run = await client.actor("apify/web-scraper").call(input);
90
91
92console.log('Results from dataset');
93console.log(`💾 Check your data here: https://console.apify.com/storage/datasets/${run.defaultDatasetId}`);
94const { items } = await client.dataset(run.defaultDatasetId).listItems();
95items.forEach((item) => {
96 console.dir(item);
97});
98
99