1const { Actor, Dataset } = require("apify");
2const { PlaywrightCrawler } = require("crawlee");
3const PdfParse = require("pdf-parse");
4
/**
 * Scrapes the textual content of the current page and the text of every PDF
 * it links to.
 *
 * @param {object} page - Playwright-style page; only `page.$$eval(selector, fn)` is used.
 * @returns {Promise<{
 *   heading: string[],
 *   mainContent: string[],
 *   tableData: string[],
 *   pdfLinks: string[],
 *   PdfData: Array<{ pdfLink: string, text?: string, error?: string }>
 * }>} Trimmed text of the matched elements, the unique PDF URLs found on the
 *   page, and one entry per PDF holding either its extracted `text` or an
 *   `error` message. A failing PDF never rejects the whole call.
 */
async function extractText(page) {
  // The two callbacks below are handed to `$$eval`, so they must stay
  // self-contained: they cannot close over anything from this scope.
  const collectTrimmedText = (elements) => elements.map((element) => element.textContent.trim());
  const collectPdfLinks = (links) => [
    // A Set drops repeated links so the same PDF is not fetched twice.
    ...new Set(
      links
        .map((link) => link.href)
        .filter((href) => typeof href === "string" && href.toLowerCase().endsWith(".pdf")),
    ),
  ];

  // The four queries are independent, so run them concurrently.
  const [heading, mainContent, tableData, pdfLinks] = await Promise.all([
    page.$$eval("td b", collectTrimmedText),
    page.$$eval("td p", collectTrimmedText),
    page.$$eval("table tbody .tablebg tr", collectTrimmedText),
    page.$$eval("a", collectPdfLinks),
  ]);

  const data = {
    heading,
    mainContent,
    tableData,
    pdfLinks,
    PdfData: [],
  };

  // Nothing to download: skip loading the HTTP client altogether.
  if (pdfLinks.length === 0) {
    return data;
  }

  const { default: fetch } = await import("node-fetch");

  // Downloads one PDF and extracts its text; failures are reported per link.
  const readPdf = async (pdfLink) => {
    try {
      const response = await fetch(pdfLink);
      if (!response.ok) {
        // Without this check an HTML error page would be fed to the PDF parser.
        throw new Error(`Request failed with HTTP ${response.status} ${response.statusText}`);
      }
      // pdf-parse expects a Node Buffer rather than a raw ArrayBuffer.
      const buffer = Buffer.from(await response.arrayBuffer());
      const { text } = await PdfParse(buffer);
      return { pdfLink, text };
    } catch (error) {
      return { pdfLink, error: error?.message || String(error) };
    }
  };

  data.PdfData = await Promise.all(pdfLinks.map(readPdf));

  return data;
}
43
/*
 * Actor entry point.
 *
 * Crawls the notification index page for one year, enqueues every `a.link2`
 * link found there, and for each of those pages stores the result of
 * `extractText` in the default dataset as `{ Year, URL, Data }`.
 */
Actor.main(async () => {
  // Index page that lists the notifications; every other URL is a detail page.
  const START_URL = "https://rbi.org.in/Scripts/NotificationUser.aspx";
  const year = 2024;

  const crawler = new PlaywrightCrawler({
    async requestHandler({ request, page, log, enqueueLinks }) {
      log.info(`Processing ${request.url} for year ${year}`);

      // Detail page: extract its content and store it.
      if (request.url !== START_URL) {
        const extractedData = await extractText(page);

        await Dataset.pushData({
          Year: year,
          URL: request.url,
          Data: extractedData,
        });
        return;
      }

      // Index page. The crawler has already navigated to request.url before
      // calling this handler, so no extra page.goto() is needed here.

      // For years before 2015 the "Archives" entry is clicked first.
      // NOTE(review): presumably older year buttons are hidden until then — confirm.
      if (year < 2015) {
        await page.getByText("Archives").click();
        await page.waitForTimeout(2000);
      }

      // Click the year button, then the element whose id is `${year}0`.
      await page.locator(`#btn${year}`).click();
      await page.locator(`//*[@id="${year}0"]`).click();

      // Fixed pause before the links are collected.
      await page.waitForTimeout(2000);

      await enqueueLinks({
        selector: "a.link2",
        // Returning false skips the request, so the index page is not re-enqueued.
        transformRequestFunction: (req) => (req.url === START_URL ? false : req),
      });
    },
  });

  await crawler.run([START_URL]);
});