1// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).
2import axios from 'axios';
3// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).
4import * as cheerio from 'cheerio';
5// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).
6import { Actor } from 'apify';
// This is an ESM project, so relative imports must include the file extension.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
9// import { router } from './routes.js';
10import https from 'https'
// NOTE(review): `rejectUnauthorized: false` disables TLS certificate verification for every
// request sent through this agent, so the fetched content cannot be trusted to come from the
// intended host. Behavior is kept as-is; confirm this is really required for the target sites.
const httpsAgent = new https.Agent({ rejectUnauthorized: false });

// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

try {
    // Structure of input is defined in input_schema.json.
    // getInput() can resolve to null when the run has no input, so do not destructure it blindly.
    const input = await Actor.getInput();
    const url = input?.url;
    if (typeof url !== 'string' || url.trim() === '') {
        throw new Error('Actor input must contain a non-empty string "url".');
    }

    // Fetch the HTML content of the page.
    const response = await axios.get(url, { httpsAgent });

    // Parse the downloaded HTML with Cheerio to enable data extraction.
    const $ = cheerio.load(response.data);

    // Log every heading found on the page (tag name and text).
    // The headings are only logged; they are not part of the stored record below.
    $('h1, h2, h3, h4, h5, h6').each((i, element) => {
        const heading = $(element);
        const headingObject = {
            level: heading.prop('tagName').toLowerCase(),
            text: heading.text(),
        };
        console.log('Extracted heading', headingObject);
    });

    // Save the raw response body to the Dataset - a table-like storage.
    await Actor.pushData({ output: response.data });
} catch (err) {
    // Keep the full error (including stack) in the log, then end the run as failed with a
    // readable status message instead of dying on an unhandled rejection.
    console.error(err);
    const reason = err instanceof Error ? err.message : String(err);
    await Actor.fail(`Actor failed: ${reason}`);
}

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();