Federal Credit Union Act avatar
Federal Credit Union Act

Pricing

Pay per usage

Go to Store
Federal Credit Union Act

Federal Credit Union Act

Developed by

Yash Agarwal

Yash Agarwal

Maintained by Community

0.0 (0)

Pricing

Pay per usage

1

Total users

2

Monthly users

1

Runs succeeded

>99%

Last modified

a year ago

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
dist
node_modules
apify_storage
crawlee_storage
storage
# Added by Apify CLI
.venv

Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# NOTE(review): this image presumably bundles Node 20, Chromium/Playwright
# and the start_xvfb_and_run_cmd.sh wrapper used in CMD below — confirm
# the tag matches the Node version used for local development.
FROM apify/actor-node-playwright-chrome:20
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

package.json

{
"name": "my-crawler",
"version": "0.0.1",
"type": "module",
"description": "This is an example of a Crawlee project.",
"dependencies": {
"apify": "^3.2.0",
"crawlee": "^3.0.0",
"playwright": "*"
},
"scripts": {
"start": "node src/main.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
"postinstall": "npx crawlee install-playwright-browsers"
},
"author": "It's not you it's me",
"license": "ISC"
}

start.bat

Download

start.sh

Download

.actor/actor.json

{
"actorSpecification": 1,
"name": "Federal-credit-union-act",
"version": "0.0",
"buildTag": "latest",
"environmentVariables": {}
}

src/main.js

import { Actor } from "apify";
import { PlaywrightCrawler, Dataset } from "crawlee";

// Scrapes the Federal Credit Union Act (12 U.S.C. ch. 14) from the
// U.S. House Office of the Law Revision Counsel site and stores the
// chapter text plus its outbound links and PDFs in the default dataset.
Actor.main(async () => {
    const crawler = new PlaywrightCrawler({
        // Safety cap so a misconfigured start URL cannot crawl forever.
        maxRequestsPerCrawl: 100,

        async requestHandler({ request, page, log }) {
            log.info(`Processing ${request.url}`);

            const title = await page.title();
            log.info(`${title}`, { url: request.loadedUrl });

            // Runs inside the browser context; only serializable data
            // can cross back to Node.
            const result = await page.evaluate(() => {
                const extracted = {
                    Category: "Rules and Regulations",
                    Title:
                        document.querySelector(".chapter-head")?.innerText || "N/A",
                    MainParagraphText:
                        document.querySelector(".documentViewer")?.innerText || "N/A",
                    Links: [],
                    PDFs: [],
                };

                const anchors = document.querySelectorAll(".documentViewer a");
                for (const el of anchors) {
                    const item = {
                        linkText: el.innerText || "N/A",
                        link: el.href || "",
                    };

                    // Skip anchors whose text is purely numeric (footnote
                    // markers), mailto links, and anchors without an href.
                    // Number.isNaN avoids the coercing global isNaN;
                    // startsWith("mailto:") checks the protocol instead of
                    // matching "mailto" anywhere in the URL.
                    const numericValue = Number(item.linkText);
                    if (
                        Number.isNaN(numericValue) &&
                        !item.link.startsWith("mailto:") &&
                        item.link !== ""
                    ) {
                        if (item.link.endsWith(".pdf")) {
                            extracted.PDFs.push(item);
                        } else {
                            extracted.Links.push(item);
                        }
                    }
                }

                return extracted;
            });

            // Save the scraped record to the default dataset.
            await Dataset.pushData({
                url: request.url,
                ...result,
            });
        },
    });

    // Run the crawler with the initial URL.
    await crawler.run([
        "https://uscode.house.gov/view.xhtml?path=/prelim@title12/chapter14&edition=prelim",
    ]);
});

src/routes.js

import { createPlaywrightRouter } from 'crawlee';

// Shared Playwright router: the default handler discovers pages, the
// 'detail' handler records them. Wire it into a crawler via
// `requestHandler: router`.
export const router = createPlaywrightRouter();

// Default handler: enqueue every crawlee.dev link found on the current
// page and route the new requests to the 'detail' handler below.
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://crawlee.dev/**'],
        label: 'detail',
    });
});

// Detail handler: store each visited page's final URL and title.
router.addHandler('detail', async ({ request, page, log, pushData }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    await pushData({
        url: request.loadedUrl,
        title,
    });
});