import { ApifyClient } from 'apify-client';

// Initialize the ApifyClient with your Apify API token
const client = new ApifyClient({
    token: '<YOUR_API_TOKEN>',
});

// Prepare the Actor input
const input = {
    "startUrls": [
        {
            "url": "https://crawlee.dev/js"
        }
    ],
    "respectRobotsTxtFile": true,
    "globs": [
        {
            "glob": "https://crawlee.dev/js/*/*"
        }
    ],
    "pseudoUrls": [],
    "excludes": [
        {
            "glob": "/**/*.{png,jpg,jpeg,pdf}"
        }
    ],
    "linkSelector": "a[href]",
29 "pageFunction": async function pageFunction(context) {
30 const { $, request, log } = context;
31
32
33
34 const pageTitle = $('title').first().text();
35
36
37 const url = request.url;
38
39
40 log.info('Page scraped', { url, pageTitle });
41
42
43
44 return {
45 url,
46 pageTitle
47 };
48 },
49 "proxyConfiguration": {
50 "useApifyProxy": true
51 },
52 "initialCookies": [],
53 "additionalMimeTypes": [],
54 "preNavigationHooks": `// We need to return array of (possibly async) functions here.
55 // The functions accept two arguments: the "crawlingContext" object
56 // and "requestAsBrowserOptions" which are passed to the `requestAsBrowser()`
57 // function the crawler calls to navigate..
58 [
59 async (crawlingContext, requestAsBrowserOptions) => {
60 // ...
61 }
62 ]`,
63 "postNavigationHooks": `// We need to return array of (possibly async) functions here.
64 // The functions accept a single argument: the "crawlingContext" object.
65 [
66 async (crawlingContext) => {
67 // ...
68 },
69 ]`,
70 "customData": {}
71};
72
73
74const run = await client.actor("apify/cheerio-scraper").call(input);
75
76
console.log('Results from dataset');
console.log(`💾 Check your data here: https://console.apify.com/storage/datasets/${run.defaultDatasetId}`);
const { items } = await client.dataset(run.defaultDatasetId).listItems();
items.forEach((item) => {
    console.dir(item);
});
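// Usage sketch (assumptions: Node.js 18+ and an ESM package, since the script
// uses top-level await): save this file as, e.g., "main.mjs", replace
// <YOUR_API_TOKEN> with your Apify API token, then run:
//   npm install apify-client
//   node main.mjs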