Yahoo Finance Daily News avatar

Yahoo Finance Daily News

Deprecated
Go to Store
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Yahoo Finance Daily News

Yahoo Finance Daily News

bravolad/yahoo-finance-daily-news

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-puppeteer-chrome:20 AS builder
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install all dependencies. Don't audit to speed up the installation.
11RUN npm install --include=dev --audit=false
12
13# Next, copy the source files using the user set
14# in the base image.
15COPY --chown=myuser . ./
16
17# Compile the TypeScript sources into ./dist
18# (runs the "build" script from package.json).
19RUN npm run build
20
21# Create final image
22FROM apify/actor-node-puppeteer-chrome:20
23
24# Copy just package.json and package-lock.json
25# to speed up the build using Docker layer cache.
26COPY --chown=myuser package*.json ./
27
28# Install NPM packages, skip optional and development dependencies to
29# keep the image small. Avoid logging too much and print the dependency
30# tree for debugging
31RUN npm --quiet set progress=false \
32    && npm install --omit=dev --omit=optional \
33    && echo "Installed NPM packages:" \
34    && (npm list --omit=dev --all || true) \
35    && echo "Node.js version:" \
36    && node --version \
37    && echo "NPM version:" \
38    && npm --version \
39    && rm -r ~/.npm
40
41# Copy built JS files from builder image
42COPY --from=builder --chown=myuser /home/myuser/dist ./dist
43
44# Next, copy the remaining files and directories with the source code.
45# Since we do this after NPM install, quick build will be really fast
46# for most source file changes.
47COPY --chown=myuser . ./
48
49
50# Run the image. If you know you won't need headful browsers,
51# you can remove the XVFB start script for a micro perf gain.
52CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

.actor/actor.json

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "Start URLs",
8            "type": "array",
9            "description": "URLs to start with.",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://finance.yahoo.com/news"
14                }
15            ]
16        }
17    }
18}

src/main.ts

import { Actor } from 'apify';
import { PuppeteerCrawler, Request } from 'crawlee';
import { router } from './routes.js';

// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

/**
 * Shape of the Actor input (see .actor/input_schema.json).
 *
 * The input arrives as plain JSON, so start URLs are plain strings or
 * `{ url }` objects (as in the schema prefill) rather than `Request`
 * instances; `Request` stays in the union for backward compatibility.
 */
interface Input {
    startUrls?: (string | Request | { url: string })[];
}

/** Used when the input does not provide any start URL. */
const DEFAULT_START_URLS = ['https://finance.yahoo.com/news'];

// Define the URLs to start the crawler with - get them from the input of the Actor or use the default list.
// The default also applies when the input is missing entirely or `startUrls` is an empty array,
// because a crawl with no start URLs would finish immediately without scraping anything.
const input = await Actor.getInput<Input>();
const startUrls = input?.startUrls?.length ? input.startUrls : DEFAULT_START_URLS;

// Create a proxy configuration that will rotate proxies from Apify Proxy.
const proxyConfiguration = await Actor.createProxyConfiguration();

// Create a PuppeteerCrawler that will use the proxy configuration and handle requests with the router from routes.js file.
const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    requestHandler: router,
});

// Run the crawler with the start URLs and wait for it to finish.
await crawler.run(startUrls);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();

src/routes.ts

import { Dataset, createPuppeteerRouter, log } from 'crawlee';

/** Number of articles stored so far; shared by every handler invocation in this process. */
let resultCount = 0;
/** Upper bound on the number of articles pushed to the dataset in one run. */
const MAX_RESULTS = 57;

export const router = createPuppeteerRouter();

/**
 * Logs that the result limit was hit and asks the crawler to stop.
 *
 * Aborting the autoscaled pool makes `crawler.run()` resolve, so the caller's
 * shutdown code still runs - unlike `process.exit()`, which terminates the
 * process on the spot.
 *
 * @param crawler The crawler instance taken from the handler context.
 */
async function stopCrawler(crawler: { autoscaledPool?: { abort(): Promise<void> } }): Promise<void> {
    log.info(`Reached the maximum of ${MAX_RESULTS} results. Stopping the crawler.`);
    await crawler.autoscaledPool?.abort();
}

/**
 * Default handler (requests without a label): enqueues links matching the
 * news glob so that they are processed by the 'detail' handler.
 */
router.addDefaultHandler(async ({ enqueueLinks, crawler }) => {
    if (resultCount >= MAX_RESULTS) {
        await stopCrawler(crawler);
        return;
    }

    log.info(`Enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://finance.yahoo.com/news/*'],
        label: 'detail',
    });
});

/**
 * 'detail' handler: extracts title, author, time and body text from one
 * article page and stores them in the default dataset.
 */
router.addHandler('detail', async ({ request, page, crawler }) => {
    if (resultCount >= MAX_RESULTS) {
        await stopCrawler(crawler);
        return;
    }

    // Text of the first element matching `selector`, or null when nothing matches.
    // `$$eval` is used because `$eval` throws on a missing element, which would fail
    // (and retry) the whole request just because an optional field is absent.
    const optionalText = (selector: string) => page.$$eval(selector, (els) => els[0]?.textContent ?? null);

    // The headline is required: `$eval` throws if it is missing, so the request is reported as failed.
    const title = await page.$eval('h1[data-test-locator="headline"]', (el) => el.textContent);
    const author = await optionalText('.caas-attr-item-author a');
    const time = await optionalText('.caas-attr-time-style time');
    const content = await page.$$eval('.caas-body p', (paragraphs) => paragraphs.map((p) => p.textContent).join('\n'));

    log.info(`Scraped data from ${request.loadedUrl}`, { title, author, time });

    // Handlers run concurrently and the limit may have been reached while this page was
    // being scraped. Re-check and reserve the slot with no `await` in between, so the
    // dataset never receives more than MAX_RESULTS items.
    if (resultCount >= MAX_RESULTS) {
        await stopCrawler(crawler);
        return;
    }
    resultCount++;

    try {
        await Dataset.pushData({
            url: request.loadedUrl,
            title,
            author,
            time,
            content,
        });
    } catch (error) {
        // The item was not stored - release the reserved slot before reporting the failure.
        resultCount--;
        throw error;
    }
});

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "root": true,
3    "env": {
4        "browser": true,
5        "es2020": true,
6        "node": true
7    },
8    "extends": [
9        "@apify/eslint-config-ts"
10    ],
11    "parserOptions": {
12        "project": "./tsconfig.json",
13        "ecmaVersion": 2020
14    },
15    "ignorePatterns": [
16        "node_modules",
17        "dist",
18        "**/*.d.ts"
19    ]
20}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-puppeteer-typescript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of an Apify actor.",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.1.10",
11        "crawlee": "^3.5.4",
12        "puppeteer": "*"
13    },
14    "devDependencies": {
15        "@apify/eslint-config-ts": "^0.3.0",
16        "@apify/tsconfig": "^0.1.0",
17        "@typescript-eslint/eslint-plugin": "^6.7.2",
18        "@typescript-eslint/parser": "^6.7.2",
19        "eslint": "^8.50.0",
20        "tsx": "^4.6.2",
21        "typescript": "^5.3.3"
22    },
23    "scripts": {
24        "start": "npm run start:dev",
25        "start:prod": "node dist/main.js",
26        "start:dev": "tsx src/main.ts",
27        "build": "tsc",
28        "lint": "eslint ./src --ext .ts",
29        "lint:fix": "eslint ./src --ext .ts --fix",
30        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
31    },
32    "author": "It's not you it's me",
33    "license": "ISC"
34}

tsconfig.json

1{
2    "extends": "@apify/tsconfig",
3    "compilerOptions": {
4        "module": "NodeNext",
5        "moduleResolution": "NodeNext",
6        "target": "ES2022",
7        "outDir": "dist",
8        "noUnusedLocals": false,
9        "skipLibCheck": true,
10        "lib": ["DOM"]
11    },
12    "include": [
13        "./src/**/*"
14    ]
15}
Developer
Maintained by Community