RBI Latest avatar
RBI Latest
Try for free

No credit card required

View all Actors
RBI Latest

RBI Latest

nondescript_cord/rbi-latest
Try for free

No credit card required

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2	"actorSpecification": 1,
	"name": "rbi-latest",
4	"title": "Project Playwright Crawler JavaScript",
5	"description": "Crawlee and Playwright project in JavaScript.",
6	"version": "0.0",
7	"meta": {
8		"templateId": "js-crawlee-playwright-chrome"
9	},
10	"input": "./input_schema.json",
11	"dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "Start URLs",
8            "type": "array",
9            "description": "URLs to start with.",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://apify.com"
14                }
15            ]
16        }
17    }
18}

src/main.js

1const { Actor, Dataset } = require("apify");
2const { PlaywrightCrawler } = require("crawlee");
3const PdfParse = require("pdf-parse");
4
/**
 * Extract headings, paragraphs, table rows, and linked PDF contents from an
 * RBI notification detail page.
 *
 * @param {import('playwright').Page} page - Playwright page already navigated
 *     to the notification detail page.
 * @returns {Promise<{heading: string[], mainContent: string[], tableData: string[],
 *     pdfLinks: string[], PdfData: Array<{pdfLink: string, text?: string, error?: string|true}>}>}
 *     Extracted page data. Each `PdfData` entry is `{ pdfLink, text }` on
 *     success or `{ pdfLink, error }` on failure — PDF errors never throw.
 */
async function extractText(page) {
    const data = {
        heading: [], // Array to hold heading data
        mainContent: [], // Array for main content data
        tableData: [], // Array for table data
        pdfLinks: [], // PDF link URLs
        PdfData: [], // Extracted data from PDFs
    };

    // Heading: bold cells inside the page's layout tables.
    data.heading = await page.$$eval("td b", (headers) => headers.map((header) => header.textContent.trim()));

    // Main content paragraphs.
    data.mainContent = await page.$$eval("td p", (paragraphs) => paragraphs.map((paragraph) => paragraph.textContent.trim()));

    // Table rows (one concatenated text string per row).
    data.tableData = await page.$$eval("table tbody .tablebg tr", (rows) => rows.map((row) => row.textContent.trim()));

    // All anchors whose resolved href ends with ".pdf" (case-insensitive).
    data.pdfLinks = await page.$$eval("a", (links) => links.filter((link) => link.href.toLowerCase().endsWith(".pdf")).map((link) => link.href));

    // Fetch and parse the PDFs in parallel. Uses the global fetch built into
    // Node.js 18+ (the actor's Docker base image), so the node-fetch dynamic
    // import is no longer needed. A failure on one PDF is recorded for that
    // link only and does not abort the others.
    const pdfPromises = data.pdfLinks.map(async (pdfLink) => {
        try {
            const response = await fetch(pdfLink);
            if (!response.ok) {
                // Surface HTTP errors instead of feeding an error page to the parser.
                throw new Error(`HTTP ${response.status} fetching ${pdfLink}`);
            }
            // pdf-parse expects a Node Buffer, not a raw ArrayBuffer.
            const buffer = Buffer.from(await response.arrayBuffer());
            const pdfText = await PdfParse(buffer);
            return { pdfLink, text: pdfText.text };
        } catch (error) {
            return { pdfLink, error: error.message || true };
        }
    });

    data.PdfData = await Promise.all(pdfPromises);

    return data;
}
43
Actor.main(async () => {
    // Year of RBI notifications to scrape. Hard-coded; years before 2015 are
    // reached through the "Archives" control handled below.
    const year = 2024;

    const crawler = new PlaywrightCrawler({

        async requestHandler({ request, page, log, enqueueLinks }) {
            log.info(`Processing ${request.url} for year ${year}`);

            // The start URL is the notification index page: drive its year
            // filter UI and enqueue the individual notification links.
            if (request.url === "https://rbi.org.in/Scripts/NotificationUser.aspx") {
                await page.goto("https://rbi.org.in/Scripts/NotificationUser.aspx");

                // Navigate based on year. NOTE(review): assumes pre-2015 years
                // sit behind an "Archives" toggle on the RBI site — confirm
                // against the live page structure.
                if (year < 2015) {
                    await page.getByText("Archives").click();
                    await page.waitForTimeout(2000);
                }
                // Click the year button, then the element with id `${year}0` —
                // presumably the "all months" entry for that year; verify on site.
                await page.locator(`#btn${year}`).click();
                await page.locator(`//*[@id="${year}0"]`).click();

                // Fixed wait for the notification list to re-render after the click.
                await page.waitForTimeout(2000);

                // Enqueue every notification link (`a.link2`), dropping any
                // link that points back at the index page itself.
                await enqueueLinks({
                    selector: "a.link2",
                    // Receives one candidate request; returning false skips it.
                    transformRequestFunction: (reqs) => {
                        return reqs.url === "https://rbi.org.in/Scripts/NotificationUser.aspx" ? false : reqs;
                    },
                });

            } else {
                // Detail page: extract content (including linked PDFs) and
                // push it to the default dataset.
                const extractedData = await extractText(page);

                await Dataset.pushData({
                    Year: year,
                    URL: request.url,
                    Data: extractedData,
                });
            }
        },
    });

    // Seed the crawl with the notification index page.
    await crawler.run(["https://rbi.org.in/Scripts/NotificationUser.aspx"]);
});

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
9
10# Added by Apify CLI
11.venv

Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:20
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Since we do this after NPM install, quick build will be really fast
24# for most source file changes.
25COPY --chown=myuser . ./
26
27
28# Run the image. If you know you won't need headful browsers,
29# you can remove the XVFB start script for a micro perf gain.
30CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

crawlee.json

1{
2	"purgeOnStart": false
3}

package.json

1{
2	"name": "consumerfinance",
3	"version": "0.0.1",
4	"type": "commonjs",
5	"description": "consumerfinance rules extractor",
6	"dependencies": {
		"apify": "^3.1.10",
		"crawlee": "^3.5.4",
		"node-fetch": "^3.3.2",
		"pdf-parse": "^1.1.1",
		"playwright": "^1.44.0"
15	},
16	"devDependencies": {
17		"@apify/eslint-config": "^0.4.0",
18		"@playwright/test": "^1.43.1",
19		"@types/pdf-parse": "^1.1.4",
20		"eslint": "^8.50.0"
21	},
22	"scripts": {
23		"start": "node src/main.js",
24		"lint": "eslint ./src --ext .js,.jsx",
25		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
27		"postinstall": "npx crawlee install-playwright-browsers"
28	},
29	"author": "Moazzam Malek"
30}

start.bat

Download

start.sh

Download
Developer
Maintained by Community
Actor metrics
  • 2 monthly users
  • 0 stars
  • 100.0% runs succeeded
  • Created in May 2024
  • Modified about 1 month ago
Categories