Deprecated

Pricing

Pay per usage

See alternative Actors

Go to Store

Letters To Credit Unions And Other Guidance

Deprecated

See alternative Actors

Developed by

Yash Agarwal

0.0 (0)

Pricing

Pay per usage

Total users

Monthly users

Last modified

a year ago

Automation

Open source

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv

crawlee.json

{
	"purgeOnStart": false
}

package.json

{
	"name": "consumerfinance",
	"version": "0.0.1",
	"type": "commonjs",
	"description": "consumerfinance rules extractor",
	"dependencies": {
		"22": "^0.0.0",
		"apify": "^3.1.10",
		"crawlee": "^3.5.4",
		"node-fetch": "^3.3.2",
		"pdf-parse": "^1.1.1",
		"playwright": "*"
	},
	"devDependencies": {
		"@apify/eslint-config": "^0.4.0",
		"@types/pdf-parse": "^1.1.4",
		"eslint": "^8.50.0"
	},
	"scripts": {
		"start": "node src/main.js",
		"lint": "eslint ./src --ext .js,.jsx",
		"lint:fix": "eslint ./src --ext .js,.jsx --fix",
		"test": "echo \"Error: oops, the no tests yet, sad!\" && exit 1",
		"postinstall": "npx crawlee install-playwright-browsers"
	},
	"author": "Moazzam Malek"
}

start.bat

Download

start.sh

Download

.actor/actor.json

{
	"actorSpecification": 1,
	"name": "Letters-to-Credit-Unions-and-Other-Guidance",
	"title": "Project Playwright Crawler JavaScript",
	"description": "Crawlee and Playwright project in JavaScript.",
	"version": "0.0",
	"meta": {
		"templateId": "js-crawlee-playwright-chrome"
	},
	"input": "./input_schema.json",
	"dockerfile": "./Dockerfile"
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging. `npm list` is allowed to fail (`|| true`) so that it
# cannot break the build, and the npm cache (~/.npm) is removed at the end
# of the same RUN step.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./


# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
# Because of the `&&`, `npm start` only runs when the XVFB start script
# exits successfully.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://apify.com"
                }
            ]
        }
    }
}

src/main.js

1const { Actor } = require("apify");
2const { PlaywrightCrawler, Dataset } = require("crawlee");
3
/**
 * Actor entry point.
 *
 * Builds a PlaywrightCrawler that is driven by the router from `./routes.js`,
 * starts it on the first page of the ncua.gov "letters-credit-unions-other-guidance"
 * listing and, once the crawl has finished, calls the CSV and JSON dataset exports.
 */
Actor.main(async () => {
    // The routes module is loaded inside the callback, i.e. only once Actor.main runs it.
    const routes = require("./routes.js");

    const listingUrl =
        "https://ncua.gov/regulation-supervision/letters-credit-unions-other-guidance?page=0&sort=date&dir=desc&sq=";

    const crawlerOptions = {
        maxConcurrency: 3,
        launchContext: {
            launchOptions: { javaScriptEnabled: false },
        },
        maxRequestRetries: 5,
        requestHandler: routes.router,
        requestHandlerTimeoutSecs: 300,
        navigationTimeoutSecs: 300,
    };

    const crawler = new PlaywrightCrawler(crawlerOptions);
    await crawler.run([listingUrl]);

    // NOTE(review): both exports are written under the same key "OUTPUT" —
    // confirm that the JSON export does not replace the CSV one.
    await Dataset.exportToCSV("OUTPUT");
    await Dataset.exportToJSON("OUTPUT");
});

src/routes.js

1const { Dataset, createPlaywrightRouter, LoggerText } = require("crawlee");
2const pdfParse = require("pdf-parse");
3const router = createPlaywrightRouter();
4const { load } = require("cheerio");
5// const fetch = require("node-fetch");
6// const { default: fetch } = await import("node-fetch");
7const fs = require("fs");
8const { promisify } = require("util");
9
/**
 * Downloads one PDF, extracts its text with pdf-parse and stores the outcome
 * in the dataset.
 *
 * Download and parse failures are not propagated: they are stored as a
 * `{ link, error }` item instead, so a single broken PDF does not fail the
 * listing page that linked to it.
 *
 * @param {string} pdfLink Absolute URL of the PDF document.
 * @returns {Promise<void>} Settles once a success or an error item has been pushed.
 */
async function fetchPDF(pdfLink) {
    try {
        // The package is loaded with a dynamic import because this file is CommonJS
        // (see the commented-out `require("node-fetch")` at the top of the file).
        const { default: fetch } = await import("node-fetch");
        const response = await fetch(pdfLink);
        if (!response.ok) {
            // Do not hand an HTML error page to the PDF parser; report the real cause.
            throw new Error(`HTTP ${response.status} ${response.statusText}`);
        }
        const pdfBuffer = Buffer.from(await response.arrayBuffer());
        const pdfText = await pdfParse(pdfBuffer);
        await Dataset.pushData({
            url: pdfLink,
            pdftext: pdfText.text,
        });
    } catch (error) {
        await Dataset.pushData({
            link: pdfLink,
            error: `Error fetching or parsing PDF from ${pdfLink} and ${error}`,
        });
    }
}

/**
 * Default (listing page) handler.
 *
 * Enqueues the links matched by `#edit-next-container a` (presumably the
 * listing's "next page" control — confirm against the site markup) and the
 * links in the table body. Table links that end in `.pdf` are not enqueued as
 * "detail" pages; they are downloaded and parsed directly via `fetchPDF`.
 */
router.addDefaultHandler(async ({ request, page, enqueueLinks, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await enqueueLinks({
        selector: "#edit-next-container a",
    });

    // PDF links bypass the request queue, so they are de-duplicated here.
    const pdfUrls = new Set();

    await enqueueLinks({
        selector: "tbody td a",
        label: "detail",
        transformRequestFunction(candidate) {
            if (candidate.url.endsWith(".pdf")) {
                log.info(`PDF ${candidate.url}`);
                pdfUrls.add(candidate.url);
                // Returning false keeps the PDF out of the request queue.
                return false;
            }
            return candidate;
        },
    });

    // Wait for every PDF before the handler returns, so that no download is
    // left running (and no rejection left unhandled) after the request is done.
    const outcomes = await Promise.allSettled(
        [...pdfUrls].map((pdfUrl) => fetchPDF(pdfUrl))
    );
    for (const outcome of outcomes) {
        if (outcome.status === "rejected") {
            // Only reachable when even the error item could not be stored.
            log.error(`Storing PDF result failed: ${outcome.reason}`);
        }
    }
});
85
// Text written in place of large fields when an item exceeds the dataset size limit.
const OVERSIZE_PLACEHOLDER = "Please retrieve manually due to size limitations";

/**
 * Fetches a PDF through the page's request context and returns its text.
 *
 * @param {object} page Playwright page whose `request` context is used for the download.
 * @param {string} pdfUrl URL of the PDF document.
 * @returns {Promise<string>} Text extracted by pdf-parse.
 * @throws {Error} When the response status is not OK or the PDF cannot be parsed.
 */
async function fetchPdfText(page, pdfUrl) {
    const response = await page.request.fetch(pdfUrl);
    if (!response.ok()) {
        throw new Error(`${response.status()} ${response.statusText()}`);
    }
    // Pass the Buffer itself: its `.buffer` property is the backing ArrayBuffer,
    // which may be larger than (and offset from) the actual response bytes.
    const parsed = await pdfParse(await response.body());
    return parsed.text;
}

/**
 * Fetches the page behind a body link and extracts its paragraph text plus the
 * links found inside `.layout-content`, split into PDF and non-PDF links.
 *
 * Never rejects: failures are returned as `{ ...link, error }`.
 *
 * @param {object} page Playwright page whose `request` context is used.
 * @param {{ linkText: string, link: string }} link Link collected from the detail page.
 * @returns {Promise<object>} The link enriched with `innerText`, `innerLinks` and
 *     `PDFLinks`, or with `error`.
 */
async function scrapeLinkedPage(page, link) {
    if (link.link.includes(".pdf")) {
        // PDF-like URLs are not fetched as HTML; return empty, per-link defaults.
        return { ...link, innerText: "", innerLinks: [], PDFLinks: [] };
    }

    try {
        const response = await page.request.fetch(link.link);
        if (!response.ok()) {
            throw new Error(`${response.status()} ${response.statusText()}`);
        }

        const $ = load(await response.text());
        const baseUrl = response.url();
        const hrefs = $(".layout-content")
            .find("a")
            .map((i, el) => $(el).attr("href"))
            .get();

        const uniqueLinks = new Set();
        for (const href of hrefs) {
            if (href.startsWith("http")) {
                uniqueLinks.add(href);
                continue;
            }
            try {
                // Resolve relative hrefs against the page they were found on.
                uniqueLinks.add(new URL(href, baseUrl).href);
            } catch {
                // Malformed href: nothing usable to record.
            }
        }

        const allLinks = [...uniqueLinks];
        const PDFLinks = allLinks.filter((href) => href.endsWith(".pdf"));
        const innerLinks = allLinks.filter(
            (href) =>
                !href.endsWith(".pdf") &&
                !href.endsWith("@ncua.gov") &&
                !href.includes("#ftn")
        );

        return {
            ...link,
            innerText: $("p").text(),
            innerLinks,
            PDFLinks,
        };
    } catch (e) {
        return { ...link, error: e.message || e.code || true };
    }
}

/**
 * Parses every PDF listed in `linkRecord.PDFLinks`, one after another.
 *
 * @param {object} page Playwright page whose `request` context is used.
 * @param {object} linkRecord Result of `scrapeLinkedPage`.
 * @returns {Promise<object>} `linkRecord` plus `pdfDataArray`, holding one
 *     `{ link, text }` or `{ link, error }` entry per PDF.
 */
async function collectInnerPdfs(page, linkRecord) {
    const pdfDataArray = [];
    // Records that ended in an error carry no PDFLinks.
    for (const pdfLink of linkRecord.PDFLinks ?? []) {
        try {
            pdfDataArray.push({
                link: pdfLink,
                text: await fetchPdfText(page, pdfLink),
            });
        } catch (innerError) {
            pdfDataArray.push({
                link: pdfLink,
                error: innerError.message || innerError.code || true,
            });
        }
    }
    return { ...linkRecord, pdfDataArray };
}

/**
 * Parses a PDF that is linked directly from the detail page body.
 *
 * @param {object} page Playwright page whose `request` context is used.
 * @param {{ linkText: string, link: string }} pdf PDF link collected from the detail page.
 * @returns {Promise<object>} `pdf` plus `text`, or `pdf` plus `error`.
 */
async function parseDirectPdf(page, pdf) {
    try {
        return { ...pdf, text: await fetchPdfText(page, pdf.link) };
    } catch (e) {
        return { ...pdf, error: e.message || e.code || true };
    }
}

/**
 * "detail" handler: scrapes one detail page, follows the links in its body,
 * parses the PDFs it finds and pushes a single item to the dataset.
 *
 * Errors are logged and not rethrown, so a failing detail page is not retried.
 */
router.addHandler("detail", async ({ request, page, log }) => {
    try {
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const result = await page.evaluate(() => {
            const textOf = (selector) =>
                document.querySelector(selector)?.innerText || "N/A";

            const scraped = {
                Docket: textOf("span[class*=docket]"),
                Date: textOf("span[class*=date]"),
                Category: textOf("a[href*=letters-credit-unions-other-guidance]"),
                Title: textOf(".pseudo-title"),
                MainParagraphText: textOf(".row.no-gutters .body"),
                Links: [],
                PDFs: [],
            };

            for (const el of document.querySelectorAll(".row.no-gutters .body a")) {
                const entry = {
                    linkText: el.innerText || "N/A",
                    link: el.href || "",
                };
                // Anchors whose text is purely numeric are skipped
                // (presumably footnote markers — confirm against the site).
                const hasNumericText = !Number.isNaN(Number(entry.linkText));
                if (hasNumericText || entry.link === "" || entry.link.includes("mailto")) {
                    continue;
                }
                if (entry.link.endsWith(".pdf")) {
                    scraped.PDFs.push(entry);
                } else {
                    scraped.Links.push(entry);
                }
            }

            return scraped;
        });

        // The helpers never reject, so Promise.all cannot short-circuit here.
        const Links = await Promise.all(
            result.Links.map((link) => scrapeLinkedPage(page, link))
        );
        const InnerPDFs = await Promise.all(
            Links.map((linkRecord) => collectInnerPdfs(page, linkRecord))
        );
        const PDFs = await Promise.all(
            result.PDFs.map((pdf) => parseDirectPdf(page, pdf))
        );

        try {
            await Dataset.pushData({
                url: request.url,
                ...result,
                Links,
                PDFs,
                InnerPDFs,
            });
        } catch (pushError) {
            if (!String(pushError?.message).includes("Data item is too large")) {
                throw pushError;
            }
            // The item exceeds the dataset size limit: store it again without
            // the large text fields and mark it for manual processing.
            log.warning(`Item for ${request.url} is too large, storing a trimmed version`);
            await Dataset.pushData({
                url: request.url,
                ...result,
                PDFs: PDFs.map((item) => ({
                    ...item,
                    text: OVERSIZE_PLACEHOLDER,
                })),
                Links: Links.map((item) => ({
                    ...item,
                    innerText: OVERSIZE_PLACEHOLDER,
                    text: OVERSIZE_PLACEHOLDER,
                })),
            });
        }
    } catch (error) {
        log.error(
            `An unexpected error occurred: ${error.message || error.code}`
        );
    }
});
315module.exports = { router };

TikTok Hashtag Scraper

clockworks/tiktok-hashtag-scraper

Scrape TikTok hashtag data. Just add one or more hashtags and extract TikTok videos with that hashtag: URLs, likes, country of creation, video and music metadata, TikTok creator data. Export scraped data, run the scraper via API, schedule and monitor runs or integrate with other tools.

Clockworks

4.6

TikTok Profile Scraper

clockworks/tiktok-profile-scraper

Extract data from TikTok profiles and videos they post. Get TikTok profile data, URLs, numbers of shares, followers, comments, hearts, video and music metadata. Export scraped data, run the scraper via API, schedule and monitor runs or integrate with other tools.

Clockworks

9.2K

4.7

TikTok Sound Scraper

clockworks/tiktok-sound-scraper

Scrape TikTok videos with a chosen sound. Just add one or more sound URLs and extract tiktoks that have it: URLs, likes, country of creation, video and music metadata, creator data. Export scraped data, run the scraper via API, schedule and monitor runs or integrate with other tools.

Clockworks

403

5.0

TikTok Discover Scraper

clockworks/tiktok-discover-scraper

Scrape TikTok Discover data. Just add one or more hashtags and the scraper will extract related videos, tag breadcrumbs, similar trends, and subtopics. Export scraped data, run the scraper via API, schedule and monitor runs, or integrate with other tools.

Clockworks

211

4.3

TikTok User Search Scraper

clockworks/tiktok-user-search-scraper

Extract data about users based on TikTok user search. You'll get full user profiles, including name, nickname, signature, number of followers, number of videos, bio link, and author’s ID.

Clockworks

239

4.4

Federal Credit Union Act

nondescript_cord/Federal-credit-union-act

Yash Agarwal

Google Maps Scraper

compass/crawler-google-places

Extract data from thousands of Google Maps locations and businesses, including reviews, reviewer details, images, contact info, opening hours, location, prices & more. Export scraped data, run the scraper via API, schedule and monitor runs, or integrate with other tools.

Compass

122K

4.2

TikTok Data Extractor

clockworks/free-tiktok-scraper

Extract data about videos, users, and channels based on hashtags or scrape full user profiles including posts, total likes, name, nickname, numbers of comments, shares, followers, following, and more.

Clockworks

28K

4.8

Contact Details Scraper

vdrmota/contact-info-scraper

Free email extractor and lead scraper to extract and download emails, phone numbers, Facebook, Twitter, LinkedIn, and Instagram profiles from any website. Extract contact information at scale from lists of URLs and download the data as Excel, CSV, JSON, HTML, and XML.

Vojta Drmota

33K

3.7

TikTok Comments Scraper

clockworks/tiktok-comments-scraper

Extract TikTok comments. Just add a TikTok URL and get TikTok video and profile data: comments, URLs, numbers of shares, followers, hashtags, hearts, video, and music metadata. Export scraped data, run the scraper via API, schedule and monitor runs or integrate with other tools.

Clockworks

10K

4.3

TikTok Video Scraper

clockworks/tiktok-video-scraper

Extract data from chosen tiktoks. Just add a TikTok URL and get TikTok video and profile data: URLs, numbers of shares, followers, hashtags, hearts, video, and music metadata. Export scraped data, run the scraper via API, schedule and monitor runs or integrate with other tools.

Clockworks

3.3K

4.8