1const { Dataset, createPlaywrightRouter } = require("crawlee");
2const pdfParse = require("pdf-parse");
3const router = createPlaywrightRouter();
4const { load } = require("cheerio");
/**
 * Default route: logs the title of the loaded page and queues two sets of
 * links found on it.
 *
 *  - `.a-btn[href*=page]` links are enqueued without a label (presumably the
 *    pagination buttons of a listing page — confirm against the target site).
 *  - `article h3 a` links are enqueued with the "detail" label.
 */
router.addDefaultHandler(async (context) => {
    const { request, page, enqueueLinks, log } = context;

    const pageTitle = await page.title();
    log.info(`${pageTitle}`, { url: request.loadedUrl });

    const unlabeledSelector = ".a-btn[href*=page]";
    const detailSelector = "article h3 a";

    // Unlabeled links first, then the links routed to the "detail" handler.
    await enqueueLinks({ selector: unlabeledSelector });
    await enqueueLinks({ selector: detailSelector, label: "detail" });
});
/**
 * Downloads one PDF and extracts its text with pdf-parse.
 *
 * Never rejects: a failure is reported through the `error` field of the
 * returned object so one broken PDF does not fail the whole page.
 *
 * @param {object} page - Playwright page; its `request` context performs the download.
 * @param {{linkText: string, link: string, category: string}} pdf - Link descriptor collected from the page.
 * @returns {Promise<object>} The descriptor plus `text`, `info`, `metadata` and `error: null`
 *   on success, or the descriptor plus a truthy `error` on failure.
 */
async function fetchPdfWithText(page, pdf) {
    try {
        const pdfResponse = await page.request.fetch(pdf.link);
        // Hand pdf-parse the Buffer itself. `Buffer#buffer` is the backing
        // ArrayBuffer, which is not guaranteed to contain exactly the
        // Buffer's bytes (pooled allocations / non-zero byteOffset).
        const parsed = await pdfParse(await pdfResponse.body());
        return {
            ...pdf,
            text: parsed.text,
            info: parsed.info,
            metadata: parsed.metadata?._metadata || parsed.metadata,
            error: null,
        };
    } catch (e) {
        return { ...pdf, error: e.message || e.code || true };
    }
}

/**
 * Resolves the text behind a non-PDF link.
 *
 * Only links whose text contains both "Read it" and "the Federal Register"
 * are downloaded; their text is read from `#fulltext_content_area`. Every
 * other link is returned with `text: undefined`.
 *
 * Never rejects: a failure is reported through the `error` field.
 *
 * @param {object} page - Playwright page; its `request` context performs the download.
 * @param {{linkText: string, link: string, category: string}} link - Link descriptor collected from the page.
 * @returns {Promise<object>} The descriptor plus `text`, or the descriptor plus `error`.
 */
async function fetchLinkWithText(page, link) {
    try {
        const linkText = link.linkText;
        const isFederalRegisterLink =
            linkText?.includes("Read it") &&
            linkText?.includes("the Federal Register");

        let text;
        if (isFederalRegisterLink) {
            const federalRegisterResponse = await page.request.fetch(
                link.link
            );
            const $ = load(await federalRegisterResponse.text());
            text = $("#fulltext_content_area").text();
        }
        return { ...link, text };
    } catch (e) {
        return { ...link, error: e.message || e.code || true };
    }
}

/**
 * Tells whether any recorded error message of the request mentions that the
 * dataset item was too large.
 *
 * The check is a substring match per message: the recorded messages carry
 * more than the bare phrase, so an exact array-element match would never hit.
 *
 * @param {string[] | undefined} errorMessages - `request.errorMessages`.
 * @returns {boolean} True when an earlier attempt failed because of item size.
 */
function hasOversizedItemError(errorMessages) {
    return (errorMessages ?? []).some((message) =>
        String(message).includes("Data item is too large")
    );
}

/**
 * "detail" route: scrapes an article page, resolves the text of the PDFs and
 * Federal Register links it references, and stores one dataset item.
 *
 * If an earlier attempt for this request failed because the item was too
 * large, the per-document texts are replaced with a placeholder and only
 * that reduced item is stored.
 */
router.addHandler("detail", async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // Runs inside the browser, so it must not reference anything from the
    // Node.js scope. A missing element throws here and fails the request.
    const result = await page.evaluate(() => {
        const data = {
            Category: document.querySelector("a[class*=m-breadcrumbs]")
                .innerText,
            Title: document.querySelector("h1").innerText,
            MainParagraphText: document.querySelector(
                ".m-full-width-text p:first-of-type"
            ).innerText,
            Text: document.querySelector(".m-full-width-text").innerText,
            PDFs: [],
            Links: [],
        };

        // Walk the direct children in document order. Each H4/H5 starts a
        // new category; links before the first heading are ignored.
        let category;
        for (const el of document.querySelectorAll(".m-full-width-text >*")) {
            if (el.tagName === "H5" || el.tagName === "H4") {
                category = el.getAttribute("id") || el.innerText;
            }
            if (!category) continue;

            // Only the first link of each child element is collected.
            const link = el.querySelector("a");
            if (!link) continue;

            const entry = {
                linkText: link.innerText,
                link: link.href,
                category,
            };
            if (link.href.includes(".pdf")) data.PDFs.push(entry);
            else data.Links.push(entry);
        }

        return data;
    });

    // The helpers never reject, so Promise.all cannot fail fast here.
    const PDFs = await Promise.all(
        result.PDFs.map((pdf) => fetchPdfWithText(page, pdf))
    );
    const Links = await Promise.all(
        result.Links.map((link) => fetchLinkWithText(page, link))
    );

    if (hasOversizedItemError(request.errorMessages)) {
        const withoutText = (item) => ({
            ...item,
            text: "Please get Manually",
        });
        await Dataset.pushData({
            url: request.url,
            ...result,
            PDFs: PDFs.map(withoutText),
            Links: Links.map(withoutText),
        });
        // Stop here: pushing the full item again would hit the same size
        // limit (or store a duplicate record).
        return;
    }

    await Dataset.pushData({ url: request.url, ...result, PDFs, Links });
});
// Public surface of this module: the router with both handlers registered.
module.exports = { router: router };