RBI Latest avatar
RBI Latest
Try for free

No credit card required

View all Actors
RBI Latest

RBI Latest

nondescript_cord/rbi-latest
Try for free

No credit card required

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2	"actorSpecification": 1,
	"name": "rbi-latest",
4	"title": "Project Playwright Crawler JavaScript",
5	"description": "Crawlee and Playwright project in JavaScript.",
6	"version": "0.0",
7	"meta": {
8		"templateId": "js-crawlee-playwright-chrome"
9	},
10	"input": "./input_schema.json",
11	"dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "Start URLs",
8            "type": "array",
9            "description": "URLs to start with.",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://apify.com"
14                }
15            ]
16        }
17    }
18}

src/main.js

1const { Actor, Dataset } = require("apify");
2const { PlaywrightCrawler } = require("crawlee");
3const PdfParse = require("pdf-parse");
4
/**
 * Extract headings, paragraphs, table rows, and linked PDF contents from an
 * RBI notification detail page.
 *
 * @param {import('playwright').Page} page - Playwright page already navigated
 *     to the notification detail page.
 * @returns {Promise<{heading: string[], mainContent: string[], tableData: string[],
 *     pdfLinks: string[], PdfData: Array<{pdfLink: string, text?: string, error?: string|true}>}>}
 *     Extracted page data. Each `PdfData` entry is `{ pdfLink, text }` on
 *     success or `{ pdfLink, error }` on failure — PDF errors never throw.
 */
async function extractText(page) {
    const data = {
        heading: [], // Array to hold heading data
        mainContent: [], // Array for main content data
        tableData: [], // Array for table data
        pdfLinks: [], // PDF link URLs
        PdfData: [], // Extracted data from PDFs
    };

    // Heading: bold cells inside the page's layout tables.
    data.heading = await page.$$eval("td b", (headers) => headers.map((header) => header.textContent.trim()));

    // Main content paragraphs.
    data.mainContent = await page.$$eval("td p", (paragraphs) => paragraphs.map((paragraph) => paragraph.textContent.trim()));

    // Table rows (one concatenated text string per row).
    data.tableData = await page.$$eval("table tbody .tablebg tr", (rows) => rows.map((row) => row.textContent.trim()));

    // All anchors whose resolved href ends with ".pdf" (case-insensitive).
    data.pdfLinks = await page.$$eval("a", (links) => links.filter((link) => link.href.toLowerCase().endsWith(".pdf")).map((link) => link.href));

    // Fetch and parse the PDFs in parallel. Uses the global fetch built into
    // Node.js 18+ (the actor's Docker base image), so the node-fetch dynamic
    // import is no longer needed. A failure on one PDF is recorded for that
    // link only and does not abort the others.
    const pdfPromises = data.pdfLinks.map(async (pdfLink) => {
        try {
            const response = await fetch(pdfLink);
            if (!response.ok) {
                // Surface HTTP errors instead of feeding an error page to the parser.
                throw new Error(`HTTP ${response.status} fetching ${pdfLink}`);
            }
            // pdf-parse expects a Node Buffer, not a raw ArrayBuffer.
            const buffer = Buffer.from(await response.arrayBuffer());
            const pdfText = await PdfParse(buffer);
            return { pdfLink, text: pdfText.text };
        } catch (error) {
            return { pdfLink, error: error.message || true };
        }
    });

    data.PdfData = await Promise.all(pdfPromises);

    return data;
}
43
Actor.main(async () => {
    // Year of RBI notifications to scrape. Hard-coded; years before 2015 are
    // reached through the "Archives" control handled below.
    const year = 2024;

    const crawler = new PlaywrightCrawler({

        async requestHandler({ request, page, log, enqueueLinks }) {
            log.info(`Processing ${request.url} for year ${year}`);

            // The start URL is the notification index page: drive its year
            // filter UI and enqueue the individual notification links.
            if (request.url === "https://rbi.org.in/Scripts/NotificationUser.aspx") {
                await page.goto("https://rbi.org.in/Scripts/NotificationUser.aspx");

                // Navigate based on year. NOTE(review): assumes pre-2015 years
                // sit behind an "Archives" toggle on the RBI site — confirm
                // against the live page structure.
                if (year < 2015) {
                    await page.getByText("Archives").click();
                    await page.waitForTimeout(2000);
                }
                // Click the year button, then the element with id `${year}0` —
                // presumably the "all months" entry for that year; verify on site.
                await page.locator(`#btn${year}`).click();
                await page.locator(`//*[@id="${year}0"]`).click();

                // Fixed wait for the notification list to re-render after the click.
                await page.waitForTimeout(2000);

                // Enqueue every notification link (`a.link2`), dropping any
                // link that points back at the index page itself.
                await enqueueLinks({
                    selector: "a.link2",
                    // Receives one candidate request; returning false skips it.
                    transformRequestFunction: (reqs) => {
                        return reqs.url === "https://rbi.org.in/Scripts/NotificationUser.aspx" ? false : reqs;
                    },
                });

            } else {
                // Detail page: extract content (including linked PDFs) and
                // push it to the default dataset.
                const extractedData = await extractText(page);

                await Dataset.pushData({
                    Year: year,
                    URL: request.url,
                    Data: extractedData,
                });
            }
        },
    });

    // Seed the crawl with the notification index page.
    await crawler.run(["https://rbi.org.in/Scripts/NotificationUser.aspx"]);
});

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
9
10# Added by Apify CLI
11.venv

Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:20
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Since we do this after NPM install, quick build will be really fast
24# for most source file changes.
25COPY --chown=myuser . ./
26
27
28# Run the image. If you know you won't need headful browsers,
29# you can remove the XVFB start script for a micro perf gain.
30CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

crawlee.json

1{
2	"purgeOnStart": false
3}

package.json

1{
2	"name": "consumerfinance",
3	"version": "0.0.1",
4	"type": "commonjs",
5	"description": "consumerfinance rules extractor",
6	"dependencies": {
		"apify": "^3.1.10",
		"crawlee": "^3.5.4",
		"node-fetch": "^3.3.2",
		"pdf-parse": "^1.1.1",
		"playwright": "^1.44.0"
15	},
16	"devDependencies": {
17		"@apify/eslint-config": "^0.4.0",
18		"@playwright/test": "^1.43.1",
19		"@types/pdf-parse": "^1.1.4",
20		"eslint": "^8.50.0"
21	},
22	"scripts": {
23		"start": "node src/main.js",
24		"lint": "eslint ./src --ext .js,.jsx",
25		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
27		"postinstall": "npx crawlee install-playwright-browsers"
28	},
29	"author": "Moazzam Malek"
30}

start.bat

Download

start.sh

Download
Developer
Maintained by Community
Actor metrics
  • 2 monthly users
  • 0 stars
  • 100.0% runs succeeded
  • Created in May 2024
  • Modified about 1 month ago
Categories