1const { Dataset, createPlaywrightRouter, LoggerText } = require("crawlee");
2const pdfParse = require("pdf-parse");
// Router instance shared by this module: the default and "detail" handlers
// further down are registered on it, and it is exported at the end of the file.
3const router = createPlaywrightRouter();
4const { load } = require("cheerio");
5
6
7const fs = require("fs");
8const { promisify } = require("util");
9
/**
 * Default (listing page) handler.
 *
 * - Follows the pagination link(s) matched by `#edit-next-container a`.
 * - Enqueues every `tbody td a` link under the "detail" label, except links
 *   ending in `.pdf`: those are downloaded and parsed right here and stored
 *   in the default Dataset.
 *
 * The PDF downloads are awaited before the handler returns, so their results
 * are stored before the request is reported as handled.
 */
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    /**
     * Downloads one PDF, extracts its text and stores the outcome.
     *
     * Download/parse failures are stored as an error record rather than
     * thrown, so one broken PDF does not abort the listing page.
     *
     * @param {string} pdfLink - Absolute URL of the PDF.
     * @returns {Promise<void>}
     */
    async function fetchPDF(pdfLink) {
        try {
            const { default: fetch } = await import("node-fetch");
            const response = await fetch(pdfLink);
            if (!response.ok) {
                // Fail early with the HTTP status instead of letting the
                // parser choke on an HTML error page.
                throw new Error(`HTTP ${response.status} ${response.statusText}`);
            }

            // Hand the parser a Node Buffer rather than a bare ArrayBuffer.
            const pdfBuffer = Buffer.from(await response.arrayBuffer());
            const pdfText = await pdfParse(pdfBuffer);

            await Dataset.pushData({
                url: pdfLink,
                pdftext: pdfText.text,
            });
        } catch (error) {
            await Dataset.pushData({
                link: pdfLink,
                error: `Error fetching or parsing PDF from ${pdfLink} and ${error}`,
            });
        }
    }

    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await enqueueLinks({
        selector: "#edit-next-container a",
    });

    // transformRequestFunction is synchronous, so it only records the PDF
    // URLs; the downloads are started and awaited afterwards. A Set avoids
    // downloading the same PDF twice when it is linked more than once.
    const pdfUrls = new Set();

    await enqueueLinks({
        selector: "tbody td a",
        label: "detail",
        transformRequestFunction(candidate) {
            if (candidate.url.endsWith(".pdf")) {
                log.info(`PDF ${candidate.url}`);
                pdfUrls.add(candidate.url);
                // A falsy return value keeps the PDF out of the request queue.
                return false;
            }
            return candidate;
        },
    });

    const outcomes = await Promise.allSettled(
        [...pdfUrls].map((pdfUrl) => fetchPDF(pdfUrl))
    );

    // fetchPDF only rejects when even the error record could not be stored;
    // report that instead of leaving an unhandled rejection behind.
    for (const outcome of outcomes) {
        if (outcome.status === "rejected") {
            log.error(`Failed to store PDF result: ${outcome.reason}`);
        }
    }
});
85
/** Origin against which relative hrefs found on linked pages are resolved. */
const DETAIL_LINK_BASE = "https://ncua.gov";

/** Text stored instead of bulky content when a record has to be trimmed. */
const OVERSIZE_PLACEHOLDER = "Please retrieve manually due to size limitations";

/** Error-message fragment used to recognise a rejected, oversized Dataset item. */
const OVERSIZE_ERROR_FRAGMENT = "Data item is too large";

/**
 * Runs inside the browser page (passed to `page.evaluate`), so it must not
 * reference anything from the Node.js scope.
 *
 * @returns {{Docket: string, Date: string, Category: string, Title: string,
 *   MainParagraphText: string,
 *   Links: Array<{linkText: string, link: string}>,
 *   PDFs: Array<{linkText: string, link: string}>}}
 *   Page fields ("N/A" when an element is missing) plus the body links,
 *   split into PDF links and all other links.
 */
function scrapeDetailPage() {
    const textOf = (selector) =>
        document.querySelector(selector)?.innerText || "N/A";

    const result = {
        Docket: textOf("span[class*=docket]"),
        Date: textOf("span[class*=date]"),
        Category: textOf("a[href*=letters-credit-unions-other-guidance]"),
        Title: textOf(".pseudo-title"),
        MainParagraphText: textOf(".row.no-gutters .body"),
        Links: [],
        PDFs: [],
    };

    for (const el of document.querySelectorAll(".row.no-gutters .body a")) {
        const entry = {
            linkText: el.innerText || "N/A",
            link: el.href || "",
        };

        // Links whose label is purely numeric are skipped
        // (presumably footnote markers; verify against the site).
        const hasNumericLabel = !Number.isNaN(Number(entry.linkText));
        if (
            hasNumericLabel ||
            entry.link === "" ||
            entry.link.includes("mailto")
        ) {
            continue;
        }

        if (entry.link.endsWith(".pdf")) {
            result.PDFs.push(entry);
        } else {
            result.Links.push(entry);
        }
    }

    return result;
}

/**
 * Resolves, de-duplicates and classifies the hrefs found on a linked page.
 *
 * @param {string[]} hrefs - Raw `href` attribute values.
 * @returns {{innerLinks: string[], PDFLinks: string[]}} `PDFLinks` holds the
 *   URLs ending in ".pdf"; `innerLinks` holds the rest, minus `@ncua.gov`
 *   addresses and `#ftn` footnote anchors.
 */
function classifyHrefs(hrefs) {
    const absolute = hrefs.map((href) => {
        if (href.startsWith("http")) {
            return href;
        }
        try {
            // Proper resolution also copes with "page.html" and "//host/x",
            // which plain string concatenation turned into broken URLs.
            return new URL(href, DETAIL_LINK_BASE).href;
        } catch {
            return DETAIL_LINK_BASE + href;
        }
    });

    const unique = [...new Set(absolute)];

    return {
        PDFLinks: unique.filter((url) => url.endsWith(".pdf")),
        innerLinks: unique.filter(
            (url) =>
                !url.endsWith(".pdf") &&
                !url.endsWith("@ncua.gov") &&
                !url.includes("#ftn")
        ),
    };
}

/**
 * Downloads a PDF through the page's request context and extracts its text.
 *
 * @param {object} page - Playwright page of the current request.
 * @param {string} pdfUrl - URL of the PDF.
 * @returns {Promise<string>} Extracted text.
 * @throws {Error} On a non-2xx response or when the body cannot be parsed.
 */
async function downloadPdfText(page, pdfUrl) {
    const response = await page.request.fetch(pdfUrl);
    if (!response.ok()) {
        throw new Error(`HTTP ${response.status()} for ${pdfUrl}`);
    }
    // Pass the Buffer itself: its `.buffer` property is the backing
    // ArrayBuffer, which can be a larger shared allocation with an offset.
    const parsed = await pdfParse(await response.body());
    return parsed.text;
}

/**
 * Fetches the page behind a body link and collects its text and links.
 * Never rejects; failures are reported through an `error` property.
 *
 * @param {object} page - Playwright page of the current request.
 * @param {{linkText: string, link: string}} link - Link scraped from the body.
 * @returns {Promise<object>} The link enriched with `innerText`,
 *   `innerLinks` and `PDFLinks`, or with `error`.
 */
async function fetchLinkedPage(page, link) {
    try {
        // Locals, not shared globals: these fetches run concurrently and
        // must not overwrite each other's results.
        let innerText = "";
        let innerLinks = [];
        let PDFLinks = [];

        if (!link.link.includes(".pdf")) {
            const response = await page.request.fetch(link.link);
            const $ = load(await response.text());
            const hrefs = $(".layout-content")
                .find("a")
                .map((i, el) => $(el).attr("href"))
                .get();
            ({ innerLinks, PDFLinks } = classifyHrefs(hrefs));
            innerText = $("p").text();
        }

        return { ...link, innerText, innerLinks, PDFLinks };
    } catch (e) {
        // Report the real cause, in the same form as the PDF helpers.
        return { ...link, error: e.message || e.code || true };
    }
}

/**
 * Downloads every PDF referenced by a linked page, one after another.
 * Never rejects; per-PDF failures are recorded inside `pdfDataArray`.
 *
 * @param {object} page - Playwright page of the current request.
 * @param {object} linkRecord - Result of `fetchLinkedPage`.
 * @returns {Promise<object>} `linkRecord` plus `pdfDataArray`.
 */
async function collectInnerPdfs(page, linkRecord) {
    const pdfDataArray = [];

    // Error records carry no PDFLinks; treat that as "nothing to download".
    for (const pdfLink of linkRecord.PDFLinks ?? []) {
        try {
            pdfDataArray.push({
                link: pdfLink,
                text: await downloadPdfText(page, pdfLink),
            });
        } catch (innerError) {
            pdfDataArray.push({
                link: pdfLink,
                error: innerError.message || innerError.code || true,
            });
        }
    }

    return { ...linkRecord, pdfDataArray };
}

/**
 * Downloads a PDF that is linked directly from the detail page body.
 * Never rejects; failures are reported through an `error` property.
 *
 * @param {object} page - Playwright page of the current request.
 * @param {{linkText: string, link: string}} pdf - PDF link from the body.
 * @returns {Promise<object>} `pdf` plus `text`, or plus `error`.
 */
async function readDirectPdf(page, pdf) {
    try {
        return { ...pdf, text: await downloadPdfText(page, pdf.link) };
    } catch (e) {
        return { ...pdf, error: e.message || e.code || true };
    }
}

/**
 * "detail" handler: scrapes a detail page, follows its body links, extracts
 * the text of every referenced PDF and stores one record in the default
 * Dataset. Errors are logged and do not fail the request.
 */
router.addHandler("detail", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const result = await page.evaluate(scrapeDetailPage);

        // The helpers below never reject, so Promise.all cannot short-circuit.
        const Links = await Promise.all(
            result.Links.map((link) => fetchLinkedPage(page, link))
        );
        const InnerPDFs = await Promise.all(
            Links.map((linkRecord) => collectInnerPdfs(page, linkRecord))
        );
        const PDFs = await Promise.all(
            result.PDFs.map((pdf) => readDirectPdf(page, pdf))
        );

        try {
            await Dataset.pushData({
                url: request.url,
                ...result,
                Links,
                PDFs,
                InnerPDFs,
            });
        } catch (pushError) {
            const message =
                pushError instanceof Error
                    ? pushError.message
                    : String(pushError);
            if (!message.includes(OVERSIZE_ERROR_FRAGMENT)) {
                throw pushError;
            }

            // This handler swallows its own errors, so the request is never
            // retried with the failure recorded on it; the size fallback
            // therefore has to react to the rejected push directly.
            log.warning(`Storing trimmed record for ${request.url}: ${message}`);
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: OVERSIZE_PLACEHOLDER,
                })),
                Links: Links.map((item) => ({
                    ...item,
                    text: OVERSIZE_PLACEHOLDER,
                    // innerText is the bulky field on link records.
                    innerText: OVERSIZE_PLACEHOLDER,
                })),
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});
// Public API of this module: the router with its handlers registered.
module.exports = { router: router };