1import { Actor } from "apify";
2import { PlaywrightCrawler, Dataset } from "crawlee";
3
/**
 * Collects the table-of-contents data from the currently loaded page.
 *
 * This function is passed to `page.evaluate`, so it executes inside the
 * browser page, not in Node.js. It must stay self-contained: it may only use
 * DOM globals and values it defines itself, and it must return plain
 * serializable data.
 *
 * @returns {{
 *   Category: string,
 *   Title: string,
 *   MainParagraphText: string,
 *   Links: Array<{ linkText: string, link: string }>,
 *   PDFs: Array<{ linkText: string, link: string }>,
 * }} The scraped record; text fields fall back to "N/A" when the element is
 *   missing or empty.
 */
function extractPageData() {
  const tocSelector = ".table-of-contents.root";

  const pageData = {
    Category: "Rules and Regulations",
    Title: document.querySelector(".logo h2")?.innerText || "N/A",
    MainParagraphText: document.querySelector(tocSelector)?.innerText || "N/A",
    Links: [],
    PDFs: [],
  };

  for (const anchor of document.querySelectorAll(`${tocSelector} a`)) {
    const entry = {
      linkText: anchor.innerText || "N/A",
      link: anchor.href || "",
    };

    // Anchors without a target carry no information worth storing.
    if (entry.link === "") {
      continue;
    }

    // Skip e-mail links. Only the URL scheme is checked so that ordinary
    // pages whose address merely contains the word "mailto" are kept.
    if (entry.link.toLowerCase().startsWith("mailto:")) {
      continue;
    }

    // Skip anchors whose visible text parses as a number
    // (presumably bare section/page numbers -- confirm with the data owner).
    if (!Number.isNaN(Number(entry.linkText))) {
      continue;
    }

    // Classify on the part before any query string or fragment, ignoring
    // case, so "doc.PDF" and "doc.pdf?download=1" are recognized as PDFs.
    const pathPart = entry.link.split(/[?#]/, 1)[0].toLowerCase();
    if (pathPart.endsWith(".pdf")) {
      pageData.PDFs.push(entry);
    } else {
      pageData.Links.push(entry);
    }
  }

  return pageData;
}

// Entry point: crawl the start URL with Playwright and store one dataset
// record per processed page.
Actor.main(async () => {
  const crawler = new PlaywrightCrawler({
    // Upper bound on processed requests. The handler below does not enqueue
    // further links, so only the start URLs are visited.
    maxRequestsPerCrawl: 100,

    /**
     * Handles one loaded page: logs its URL and title, extracts the
     * table-of-contents data in the browser, and pushes it to the dataset.
     */
    async requestHandler({ request, page, log }) {
      log.info(`Processing ${request.url}`);

      const title = await page.title();
      log.info(`${title}`, { url: request.loadedUrl });

      const pageData = await page.evaluate(extractPageData);

      await Dataset.pushData({
        url: request.url,
        ...pageData,
      });
    },
  });

  await crawler.run(["https://www.ecfr.gov/current/title-12/chapter-VII"]);
});