RBI Latest

Deprecated

Developed by Yash Agarwal
Maintained by Community

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 2
Monthly users: 2
Last modified: a year ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "RBI-latest",
    "title": "Project Playwright Crawler JavaScript",
    "description": "Crawlee and Playwright project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}
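
For reference, a run input matching this schema might look like the sketch below (the RBI URL is just an illustration). Note that src/main.js further down currently hard-codes its start URL rather than reading startUrls from the input.

{
    "startUrls": [
        { "url": "https://rbi.org.in/Scripts/NotificationUser.aspx" }
    ]
}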

src/main.js

const { Actor, Dataset } = require("apify");
const { PlaywrightCrawler } = require("crawlee");
const PdfParse = require("pdf-parse");

async function extractText(page) {
    const data = {
        heading: [], // Array to hold heading data
        mainContent: [], // Array for main content data
        tableData: [], // Array for table data
        pdfLinks: [], // PDF link URLs
        PdfData: [], // Extracted data from PDFs
    };

    // Headings
    data.heading = await page.$$eval("td b", (headers) => headers.map((header) => header.textContent.trim()));

    // Main content
    data.mainContent = await page.$$eval("td p", (paragraphs) => paragraphs.map((paragraph) => paragraph.textContent.trim()));

    // Table data
    data.tableData = await page.$$eval("table tbody .tablebg tr", (rows) => rows.map((row) => row.textContent.trim()));

    // PDF links
    data.pdfLinks = await page.$$eval("a", (links) => links.filter((link) => link.href.toLowerCase().endsWith(".pdf")).map((link) => link.href));

    // Fetch and parse the linked PDFs in parallel. node-fetch v3 is
    // ESM-only, so it is loaded with a dynamic import from CommonJS.
    const { default: fetch } = await import("node-fetch");
    const pdfPromises = data.pdfLinks.map(async (pdfLink) => {
        try {
            const response = await fetch(pdfLink);
            // pdf-parse expects a Node.js Buffer, not a raw ArrayBuffer.
            const buffer = Buffer.from(await response.arrayBuffer());
            const pdfText = await PdfParse(buffer);
            return { pdfLink, text: pdfText.text };
        } catch (error) {
            return { pdfLink, error: error.message || true };
        }
    });

    data.PdfData = await Promise.all(pdfPromises);

    return data;
}

Actor.main(async () => {
    const year = 2024;

    const crawler = new PlaywrightCrawler({
        async requestHandler({ request, page, log, enqueueLinks }) {
            log.info(`Processing ${request.url} for year ${year}`);

            if (request.url === "https://rbi.org.in/Scripts/NotificationUser.aspx") {
                // The crawler has already navigated here, so no extra
                // page.goto() is needed. Open the listing for the target year.
                if (year < 2015) {
                    await page.getByText("Archives").click();
                    await page.waitForTimeout(2000);
                }
                await page.locator(`#btn${year}`).click();
                await page.locator(`//*[@id="${year}0"]`).click();

                await page.waitForTimeout(2000);

                // Enqueue the notification detail links, skipping the listing page itself.
                await enqueueLinks({
                    selector: "a.link2",
                    transformRequestFunction: (req) => (req.url === "https://rbi.org.in/Scripts/NotificationUser.aspx" ? false : req),
                });
            } else {
                // Extract content from the current notification page
                const extractedData = await extractText(page);

                await Dataset.pushData({
                    Year: year,
                    URL: request.url,
                    Data: extractedData,
                });
            }
        },
    });

    await crawler.run(["https://rbi.org.in/Scripts/NotificationUser.aspx"]);
});
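
Since `year` is hard-coded above, a natural extension would be to read it from the Actor input instead. Below is a minimal, self-contained sketch assuming a hypothetical `year` input field (it would also need to be added to .actor/input_schema.json to appear in the UI); Actor.getInput() returns the JSON input the run was started with, or null when none was provided.

const { Actor } = require("apify");
const { PlaywrightCrawler } = require("crawlee");

Actor.main(async () => {
    const input = (await Actor.getInput()) ?? {};
    const year = input.year ?? 2024; // hypothetical field, not yet in input_schema.json
    const startUrls = (input.startUrls ?? [{ url: "https://rbi.org.in/Scripts/NotificationUser.aspx" }])
        .map((source) => source.url);

    const crawler = new PlaywrightCrawler({
        async requestHandler({ request, log }) {
            // Stand-in handler; the real one would be the handler shown above.
            log.info(`Would process ${request.url} for year ${year}`);
        },
    });

    await crawler.run(startUrls);
});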

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
# Added by Apify CLI
.venv

Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

crawlee.json

{
    "purgeOnStart": false
}
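
With purgeOnStart disabled, Crawlee keeps the local ./storage folder (request queue, datasets, key-value stores) between runs instead of wiping it on startup, so an interrupted crawl can resume. A minimal sketch of clearing it explicitly when a fresh crawl is needed, using Crawlee's purgeDefaultStorages() helper:

// Sketch: clear the persisted default storages manually,
// since purgeOnStart is disabled in crawlee.json.
const { purgeDefaultStorages } = require("crawlee");

(async () => {
    await purgeDefaultStorages();
    console.log("Default local storages purged.");
})();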

package.json

{
    "name": "consumerfinance",
    "version": "0.0.1",
    "type": "commonjs",
    "description": "consumerfinance rules extractor",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "node-fetch": "^3.3.2",
        "pdf-parse": "^1.1.1",
        "playwright": "^1.44.0"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "@playwright/test": "^1.43.1",
        "@types/pdf-parse": "^1.1.4",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, no tests yet, sad!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "Moazzam Malek"
}

start.bat

(download-only file; contents not included in this listing)

start.sh

(download-only file; contents not included in this listing)