# Base image for the actor. The list of available images is documented at
# https://crawlee.dev/docs/guides/docker-images, and any other image from
# Docker Hub can be used here instead.
FROM apify/actor-node-playwright-chrome:18

# Bring in only the NPM manifests first, so the dependency layer below is
# reused from the Docker layer cache until package*.json changes.
COPY --chown=myuser package*.json ./

# Install NPM packages without development and optional dependencies to keep
# the image small, with the progress bar turned off. Afterwards print the
# dependency tree plus the Node.js and NPM versions for debugging, and
# finally delete the NPM cache directory.
RUN npm --quiet set progress=false && \
    npm install --omit=dev --omit=optional && \
    echo "Installed NPM packages:" && \
    (npm list --omit=dev --all || true) && \
    echo "Node.js version:" && \
    node --version && \
    echo "NPM version:" && \
    npm --version && \
    rm -r ~/.npm

# Copy the rest of the project. Because this comes after the NPM install,
# rebuilds triggered by source-only changes skip the install layer.
COPY --chown=myuser . ./


# Container entry point: run the XVFB start script, then `npm start`.
# If headful browsers are never needed, the XVFB start script can be
# removed for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
	"actorSpecification": 1,
	"name": "Legal Opinions",
	"title": "Project Playwright Crawler JavaScript",
	"description": "Crawlee and Playwright project in JavaScript.",
	"version": "0.0",
	"meta": {
		"templateId": "js-crawlee-playwright-chrome"
	},
	"input": "./input_schema.json",
	"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}

src/main.js

const { Actor } = require("apify");
const { PlaywrightCrawler, Dataset } = require("crawlee");

/**
 * Actor entry point.
 *
 * Crawls the two consumerfinance.gov rule listings below with a
 * PlaywrightCrawler whose requests are handled by the router exported from
 * ./routes.js, then exports the default dataset as CSV and as JSON.
 */
Actor.main(async () => {
    const { router } = require("./routes.js");
    const startUrls = [
        "https://www.consumerfinance.gov/rules-policy/final-rules/",
        "https://www.consumerfinance.gov/rules-policy/rules-under-development/",
    ];

    // const proxyConfiguration = await Actor.createProxyConfiguration();

    const crawler = new PlaywrightCrawler({
        // proxyConfiguration,
        maxConcurrency: 3,
        launchContext: {
            // NOTE(review): javaScriptEnabled is documented by Playwright as a
            // browser-context option; confirm it takes effect when passed here.
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    });

    await crawler.run(startUrls);

    // Each export is stored as a single record under the given key. Using the
    // same key for both would leave only the export written last, so the CSV
    // gets its own key while "OUTPUT" keeps holding the JSON export.
    await Dataset.exportToCSV("OUTPUT_CSV");
    await Dataset.exportToJSON("OUTPUT");
});

src/routes.js

const { Dataset, createPlaywrightRouter } = require("crawlee");
const pdfParse = require("pdf-parse");
const { load } = require("cheerio");

const router = createPlaywrightRouter();

// Text looked for in the error messages of earlier attempts of a request.
// Presumably produced by Dataset.pushData() for oversized items; verify
// against the crawlee version in use.
const TOO_LARGE_MARKER = "Data item is too large";

// Stored in place of extracted text when the full item could not be saved.
const TEXT_PLACEHOLDER = "Please get Manually";

/**
 * Default handler: logs the page title, then enqueues links matching
 * `.a-btn[href*=page]` (unlabelled, so they come back to this handler) and
 * links matching `article h3 a` for the "detail" handler.
 */
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    await enqueueLinks({
        selector: ".a-btn[href*=page]",
    });
    await enqueueLinks({
        selector: "article h3 a",
        label: "detail",
    });
});

/**
 * Downloads one PDF through the page's request context and parses it.
 *
 * Never rejects: on failure the returned object carries an `error` field
 * instead of the parsed fields.
 *
 * @param {object} page Playwright page of the current request.
 * @param {{linkText: string, link: string, category: string}} pdf Link record.
 * @returns {Promise<object>} `pdf` plus `text`, `info`, `metadata` and
 *     `error: null`, or `pdf` plus a truthy `error`.
 */
async function fetchPdfDetails(page, pdf) {
    try {
        const pdfResponse = await page.request.fetch(pdf.link);

        // body() resolves to a Node.js Buffer, which is handed to pdf-parse
        // as-is. Its `.buffer` property is the backing ArrayBuffer, which can
        // be larger than, and offset from, the bytes of this response.
        const parsed = await pdfParse(await pdfResponse.body());

        return {
            ...pdf,
            text: parsed.text,
            info: parsed.info,
            metadata: parsed.metadata?._metadata || parsed.metadata,
            error: null,
        };
    } catch (e) {
        return { ...pdf, error: e.message || e.code || true };
    }
}

/**
 * Resolves the text behind one non-PDF link.
 *
 * Only links whose text contains both "Read it" and "the Federal Register"
 * are fetched; their `#fulltext_content_area` text is returned as `text`.
 * For every other link `text` stays undefined.
 *
 * Never rejects: on failure the returned object carries an `error` field.
 *
 * @param {object} page Playwright page of the current request.
 * @param {{linkText: string, link: string, category: string}} link Link record.
 * @returns {Promise<object>} `link` plus `text`, or `link` plus `error`.
 */
async function fetchLinkDetails(page, link) {
    try {
        let text;
        const isFederalRegisterLink =
            link.linkText?.includes("Read it") &&
            link.linkText?.includes("the Federal Register");
        if (isFederalRegisterLink) {
            const federalRegisterResponse = await page.request.fetch(link.link);
            const $ = load(await federalRegisterResponse.text());
            text = $("#fulltext_content_area").text();
        }
        return { ...link, text };
    } catch (e) {
        return { ...link, error: e.message || e.code || true };
    }
}

/**
 * "detail" handler: extracts one record from a rule page, enriches its PDF
 * and link lists with fetched text, and pushes it to the default dataset.
 *
 * If an earlier attempt of the same request failed with a "Data item is too
 * large" error, the record is pushed once with every PDF/link text replaced
 * by a placeholder instead of the full text.
 */
router.addHandler("detail", async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    // Runs in the browser. A selector that matches nothing makes the
    // `.innerText` access throw, which fails this attempt of the request.
    const result = await page.evaluate(() => {
        const record = {
            Category: document.querySelector("a[class*=m-breadcrumbs]")
                .innerText,
            Title: document.querySelector("h1").innerText,
            MainParagraphText: document.querySelector(
                ".m-full-width-text p:first-of-type"
            ).innerText,
            Text: document.querySelector(".m-full-width-text").innerText,
            PDFs: [],
            Links: [],
        };

        // Walk the direct children of the text container in document order.
        // Each H4/H5 starts a new category (its id, or its text when it has
        // no id); the first link of every following element is filed under
        // that category. Elements before the first heading are skipped.
        let category;
        for (const el of document.querySelectorAll(".m-full-width-text >*")) {
            if (el.tagName === "H5" || el.tagName === "H4") {
                category = el.getAttribute("id") || el.innerText;
            }
            if (!category) continue;

            const link = el.querySelector("a");
            if (!link) continue;

            const entry = {
                linkText: link.innerText,
                link: link.href,
                category,
            };
            if (link.href.includes(".pdf")) record.PDFs.push(entry);
            else record.Links.push(entry);
        }

        return record;
    });

    // The helpers catch their own errors, so Promise.all never rejects here.
    const PDFs = await Promise.all(
        result.PDFs.map((pdf) => fetchPdfDetails(page, pdf))
    );
    const Links = await Promise.all(
        result.Links.map((link) => fetchLinkDetails(page, link))
    );

    // errorMessages holds one entry per failed attempt. Each entry is
    // searched for the marker, because an entry is a whole message and
    // would never equal the bare marker string.
    const previouslyTooLarge = (request.errorMessages || []).some((message) =>
        String(message).includes(TOO_LARGE_MARKER)
    );

    if (previouslyTooLarge) {
        await Dataset.pushData({
            url: request.url,
            ...result,
            PDFs: PDFs.map((item) => ({
                ...item,
                text: TEXT_PLACEHOLDER,
            })),
            Links: Links.map((item) => ({
                ...item,
                text: TEXT_PLACEHOLDER,
            })),
        });
        // Stop here: pushing the full item again would repeat the failure.
        return;
    }

    await Dataset.pushData({ url: request.url, ...result, PDFs, Links });
});

module.exports = { router };

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv

package.json

{
	"name": "consumer",
	"version": "0.0.1",
	"type": "commonjs",
	"description": "consumer rules extractor",
	"dependencies": {
		"22": "^0.0.0",
		"apify": "^3.1.10",
		"crawlee": "^3.5.4",
		"pdf-parse": "^1.1.1",
		"playwright": "*"
	},
	"devDependencies": {
		"@apify/eslint-config": "^0.4.0",
		"@types/pdf-parse": "^1.1.4",
		"eslint": "^8.50.0"
	},
	"scripts": {
		"start": "node src/main.js",
		"lint": "eslint ./src --ext .js,.jsx",
		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
		"postinstall": "npx crawlee install-playwright-browsers"
	},
	"author": "Dhaval Rupapara"
}

start.bat

Download

start.sh

Download

YouTube Scraper

streamers/youtube-scraper

YouTube crawler and video scraper. Alternative YouTube API with no limits or quotas. Extract and download channel name, likes, number of views, and number of subscribers.

Streamers

30K

4.6

Reddit Scraper Lite

trudax/reddit-scraper-lite

Pay Per Result, unlimited Reddit web scraper to crawl posts, comments, communities, and users without login. Limit web scraping by number of posts or items and extract all data in a dataset in multiple formats.

Gustavo Rudiger

3.9

Reddit Scraper

trudax/reddit-scraper

Unlimited Reddit web scraper to crawl posts, comments, communities, and users without login. Limit web scraping by number of posts or items and extract all data in a dataset in multiple formats.

Gustavo Rudiger

7.5K

4.1

YouTube Shorts Scraper

streamers/youtube-shorts-scraper

Extract YouTube Shorts data from one or multiple YouTube channels. Get video URL, caption, timestamp, likes, dislikes, views and comments count, basic channel info, and more. This alternative YouTube API has no limits or quotas. Download the data in JSON, CSV, and Excel.

Streamers

17K

3.8

(New) Reddit Scraper Pro - Fast, Affordable, Supported

harshmaur/reddit-scraper-pro

Reddit Scraper Pro is a powerful, user-friendly tool for extracting data from Reddit. Offers scraping of posts, users, comments, and communities, advanced search capabilities, and multiple export options. Perfect for brand monitoring, trend tracking, and competitor research. Supports n8n integration

Harsh Maur

624

5.0

Fast News Scraper

timgreen/fast-news-scraper

Extract full article text and metadata from popular news sites like The New York Times, AP News, Reuters, CNBC, NPR, and Wired. Scrape thousands of articles in just a few minutes.

Tim Green

382

5.0

Rumble all-inclusive scraper

azzouzana/rumble-all-inclusive-scraper

🚀 Extract video (revenue included), channels, playlists details & top search results from Rumble using our fast scraper. Just provide channel, video, playlist or search URLs, and export data in JSON, CSV, or HTML. 🔥 Quick, easy, and ready to use!

Azzouzana

5.0

AI LinkedIn Job Matcher

james.logantech/ai-linkedin-job-matcher

AI LinkedIn Job Matcher helps job seekers find the most relevant LinkedIn job postings using NLP, and OpenAI's GPT-4. It analyzes job descriptions, matches them to resumes, and ranks opportunities by relevance. Automate job searching, save time and discover the best career matches easily!

James

Reddit scraper

curious_coder/reddit-scraper

Scrape reddit posts and comments from reddit search and communities

Curious Coder

258

Feedly Scraper

mscraper/feedly-scraper

Feedly Scraper is a specialized web scraping tool designed to extract news from Feedly. The scraper exports the accumulated data to various formats like JSON, XML, CSV, or Excel

mscraper

MyAnimeList Scraper

rikunk/my-anime-list-scraper

MyAnimeList Scraper is a comprehensive tool that provides detailed insights into anime and manga series, extracting information on content, episodes, characters, statistics, and reviews. It's designed to serve both casual fans and professionals, offering a view into trends, and user preferences.