R R I
nondescript_cord/r-r-i
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
    "extends": "@apify",
    "root": true
}
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
crawlee.json
{
    "purgeOnStart": false
}
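With purgeOnStart set to false, Crawlee keeps the request queue, dataset and key-value store from previous local runs instead of wiping them on startup, so an interrupted crawl can pick up where it left off.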
Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, a quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
package.json
{
    "name": "consumerfinance",
    "version": "0.0.1",
    "type": "commonjs",
    "description": "consumerfinance rules extractor",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4",
        "node-fetch": "^3.3.2",
        "pdf-parse": "^1.1.1",
        "playwright": "^1.43.1"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "@playwright/test": "^1.43.1",
        "@types/pdf-parse": "^1.1.4",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, no tests yet, sad!\" && exit 1",
        "postinstall": "npx crawlee install-playwright-browsers"
    },
    "author": "Moazzam Malek"
}
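Note the postinstall hook: npx crawlee install-playwright-browsers downloads the browser binaries Playwright needs, so after a plain npm install the project can be started directly with npm start.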
start.bat
start.sh
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "R-R-I",
    "title": "Project Playwright Crawler JavaScript",
    "description": "Crawlee and Playwright project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
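The dockerfile and input paths here resolve relative to the .actor/ directory, so the platform build uses .actor/Dockerfile below (Node 18 base image); the root-level Dockerfile (Node 20) appears to be an unused leftover from the template.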
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, a quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/input_schema.json
{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}
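The schema declares a startUrls input, but src/main.js below hardcodes the NCUA start page and never reads the actor input. A minimal sketch of honoring the schema (assuming the requestListSources shape of { url: ... } objects, with the hardcoded URL as a fallback) could look like:

const { Actor } = require("apify");

Actor.main(async () => {
    // Read the input defined by .actor/input_schema.json.
    const input = (await Actor.getInput()) ?? {};

    // requestListSources entries are objects like { url: "https://..." }.
    const startUrls = (input.startUrls ?? []).map((source) => source.url);
    if (startUrls.length === 0) {
        startUrls.push("https://ncua.gov/regulation-supervision/rules-regulations");
    }

    // ...then pass startUrls to crawler.run(startUrls) as in src/main.js below.
});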
src/main.js
const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");
const { router } = require("./routes.js");

Actor.main(async () => {
    const startUrls = [
        "https://ncua.gov/regulation-supervision/rules-regulations",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            // Crawlee launches a persistent browser context by default, so
            // context options such as javaScriptEnabled can ride along here;
            // the target pages are static, so disabling JS speeds things up.
            launchOptions: { javaScriptEnabled: false },
        },

        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);

    // Export the collected dataset to the default key-value store.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});
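Dataset.exportToCSV("OUTPUT") and Dataset.exportToJSON("OUTPUT") serialize the entire default dataset into the default key-value store under the OUTPUT key (locally, under storage/key_value_stores/default/). A minimal sketch of reading the JSON export back from another script, assuming the default local storage:

const { KeyValueStore } = require("crawlee");

(async () => {
    const store = await KeyValueStore.open();
    // Reads the export record created by src/main.js.
    const records = await store.getValue("OUTPUT");
    console.log(`Exported ${Array.isArray(records) ? records.length : 0} records`);
})();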
src/routes.js
const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");

const router = createPlaywrightRouter();

router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await enqueueLinks({
        selector: ".field-type-text_with_summary h2 a[href*=regulatory-review]",
        label: "detail5",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=regulatory-reform-agenda]",
        label: "detail6",
    });

    await enqueueLinks({
        selector:
            ".field-type-text_with_summary h2 a[href*=interpretive-rulings-policy-statements]",
        label: "detail7",
    });
});

// The three detail pages share the same structure, so one handler serves
// the "detail5", "detail6" and "detail7" labels.
const detailHandler = async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        // Collect the page title, the main body text and every link in the
        // body, splitting PDF links from regular ones.
        const result = await page.evaluate(() => {
            const data = {
                Category: "Rules and Regulations",
                Title: document.querySelector(".page-title")?.innerText || "N/A",
                MainParagraphText:
                    document.querySelector(".field-type-text_with_summary")
                        ?.innerText || "N/A",
                Links: [],
                PDFs: [],
            };

            const linkElements = document.querySelectorAll(
                ".field-type-text_with_summary a"
            );
            for (const el of Array.from(linkElements)) {
                const obj = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                // Skip footnote-style numeric links, mailto links and empty hrefs.
                if (
                    Number.isNaN(Number(obj.linkText)) &&
                    !obj.link.includes("mailto") &&
                    obj.link !== ""
                ) {
                    if (obj.link.endsWith(".pdf")) {
                        data.PDFs.push(obj);
                    } else {
                        data.Links.push(obj);
                    }
                }
            }

            return data;
        });

        // Download and parse every PDF. Each item catches its own errors, so
        // one broken PDF doesn't fail the whole request.
        const PDFs = (
            await Promise.allSettled(
                result.PDFs.map(async (pdf) => {
                    try {
                        const pdfResponse = await page.request.fetch(pdf.link);
                        // pdf-parse accepts the response body Buffer directly.
                        const pdfText = await pdfParse(await pdfResponse.body());
                        return { ...pdf, text: pdfText.text };
                    } catch (e) {
                        return { ...pdf, error: e.message || e.code || true };
                    }
                })
            )
        ).map((p) => p.value);

        // If a previous attempt failed because the dataset item exceeded the
        // size limit, push a stripped-down record instead.
        const tooLarge = request.errorMessages.some((msg) =>
            msg.includes("Data item is too large")
        );
        if (tooLarge) {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
                Links: result.Links.map((item) => ({
                    ...item,
                    text: "Please retrieve manually due to size limitations",
                })),
            });
        } else {
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs,
            });
        }
    } catch (error) {
        log.error(`An unexpected error occurred: ${error.message || error.code}`);
    }
};

router.addHandler("detail5", detailHandler);
router.addHandler("detail6", detailHandler);
router.addHandler("detail7", detailHandler);

module.exports = { router };
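For reference, pdf-parse takes a Buffer and resolves with an object whose text field holds the extracted plain text, alongside metadata such as numpages and info. A standalone sketch (sample.pdf is a hypothetical file used only for illustration):

const fs = require("fs");
const pdfParse = require("pdf-parse");

// Hypothetical local file, for illustration only.
const buffer = fs.readFileSync("./sample.pdf");

pdfParse(buffer).then((data) => {
    // data.text: extracted text; data.numpages: page count.
    console.log(data.numpages, data.text.slice(0, 200));
});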
Developer
Maintained by Community

Actor Metrics
2 monthly users
1 star
>99% runs succeeded
Created in May 2024
Modified 8 months ago