Yahoo Finance Daily News
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Yahoo Finance Daily News
bravolad/yahoo-finance-daily-news
.actor/Dockerfile
# Two-stage build: the "builder" stage compiles the TypeScript sources,
# the final stage contains only production dependencies plus the compiled JS.
#
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:20 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
# --chown hands the files to the non-root user of the base image
# (its home directory is /home/myuser, see the dist path below);
# without it COPY creates root-owned files that npm cannot update.
COPY --chown=myuser package*.json ./

# Install all dependencies, including devDependencies (they provide the
# TypeScript compiler). Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project: the "build" script runs tsc, which writes to ./dist.
RUN npm run build

# Create final image
FROM apify/actor-node-puppeteer-chrome:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image.
# --from=builder is required: without it COPY reads from the build context,
# where dist/ does not exist, instead of from the first stage.
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./


# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
.actor/actor.json
.actor/input_schema.json
1{
2 "title": "PuppeteerCrawler Template",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "Start URLs",
8 "type": "array",
9 "description": "URLs to start with.",
10 "editor": "requestListSources",
11 "prefill": [
12 {
13 "url": "https://finance.yahoo.com/news"
14 }
15 ]
16 }
17 }
18}
src/main.ts
import { Actor } from 'apify';
import { PuppeteerCrawler, Request } from 'crawlee';
import { router } from './routes.js';

/** Shape of the Actor input read by this script. */
interface Input {
    startUrls: Request[];
}

/** Start URLs used when the Actor input does not provide `startUrls`. */
const DEFAULT_START_URLS = ['https://finance.yahoo.com/news'];

// Configure the Actor for its environment before touching any other Actor API.
await Actor.init();

// Take the start URLs from the Actor input; fall back to the default list
// when there is no input at all or it has no `startUrls` field.
const { startUrls = DEFAULT_START_URLS } = await Actor.getInput<Input>() ?? {};

// Proxy configuration created with default options.
const proxyConfiguration = await Actor.createProxyConfiguration();

// The crawler delegates every request to the router defined in routes.ts.
const crawler = new PuppeteerCrawler({
    requestHandler: router,
    proxyConfiguration,
});

// Crawl from the start URLs and wait until the crawler finishes.
await crawler.run(startUrls);

// Exit the Actor process through the SDK once the crawl is done.
await Actor.exit();
src/routes.ts
import { Dataset, createPuppeteerRouter, log } from 'crawlee';

/** Number of articles stored so far in this run; shared by all handler invocations. */
let resultCount = 0;

/** Maximum number of articles to store in a single run. */
const MAX_RESULTS = 57;

export const router = createPuppeteerRouter();

/**
 * Stops the crawl once the result limit has been reached.
 *
 * Aborting the crawler's autoscaled pool lets `crawler.run()` resolve, so the
 * caller can still shut the Actor down properly. Calling `process.exit()` here
 * would kill the process in the middle of other in-flight handlers.
 *
 * @param crawler - The crawler instance taken from the handler context.
 * @returns `true` when the limit was reached and the handler should return immediately.
 */
async function stopIfLimitReached(
    crawler: { autoscaledPool?: { abort(): Promise<void> } },
): Promise<boolean> {
    if (resultCount < MAX_RESULTS) return false;

    log.info(`Reached the maximum of ${MAX_RESULTS} results. Stopping the crawler.`);
    await crawler.autoscaledPool?.abort();
    return true;
}

/**
 * Default handler (requests without a label, i.e. the start URLs):
 * enqueues every link matching the news glob with the `detail` label.
 */
router.addDefaultHandler(async ({ enqueueLinks, crawler }) => {
    if (await stopIfLimitReached(crawler)) return;

    log.info(`Enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://finance.yahoo.com/news/*'],
        label: 'detail',
    });
});

/**
 * `detail` handler: extracts headline, author, time and body text from the
 * page and stores them as one dataset item.
 *
 * `page.$eval` rejects when its selector matches nothing, so a page without
 * these elements fails the request instead of producing a partial item.
 */
router.addHandler('detail', async ({ request, page, crawler }) => {
    if (await stopIfLimitReached(crawler)) return;

    const title = await page.$eval('h1[data-test-locator="headline"]', (el) => el.textContent);
    const author = await page.$eval('.caas-attr-item-author a', (el) => el.textContent);
    const time = await page.$eval('.caas-attr-time-style time', (el) => el.textContent);
    const content = await page.$$eval('.caas-body p', (paragraphs) => paragraphs.map((p) => p.textContent).join('\n'));

    // Other handlers may have filled the quota while this page was being
    // scraped. Re-check and reserve a slot synchronously (no await between
    // the check and the increment) so MAX_RESULTS is never exceeded.
    if (resultCount >= MAX_RESULTS) return;
    resultCount++;

    log.info(`Scraped data from ${request.loadedUrl}`, { title, author, time });

    try {
        await Dataset.pushData({
            url: request.loadedUrl,
            title,
            author,
            time,
            content,
        });
    } catch (error) {
        // The item was not stored; release the reserved slot before failing the request.
        resultCount--;
        throw error;
    }
});
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "root": true,
3 "env": {
4 "browser": true,
5 "es2020": true,
6 "node": true
7 },
8 "extends": [
9 "@apify/eslint-config-ts"
10 ],
11 "parserOptions": {
12 "project": "./tsconfig.json",
13 "ecmaVersion": 2020
14 },
15 "ignorePatterns": [
16 "node_modules",
17 "dist",
18 "**/*.d.ts"
19 ]
20}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
package.json
1{
2 "name": "crawlee-puppeteer-typescript",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is an example of an Apify actor.",
6 "engines": {
7 "node": ">=18.0.0"
8 },
9 "dependencies": {
10 "apify": "^3.1.10",
11 "crawlee": "^3.5.4",
12 "puppeteer": "*"
13 },
14 "devDependencies": {
15 "@apify/eslint-config-ts": "^0.3.0",
16 "@apify/tsconfig": "^0.1.0",
17 "@typescript-eslint/eslint-plugin": "^6.7.2",
18 "@typescript-eslint/parser": "^6.7.2",
19 "eslint": "^8.50.0",
20 "tsx": "^4.6.2",
21 "typescript": "^5.3.3"
22 },
23 "scripts": {
24 "start": "npm run start:dev",
25 "start:prod": "node dist/main.js",
26 "start:dev": "tsx src/main.ts",
27 "build": "tsc",
28 "lint": "eslint ./src --ext .ts",
29 "lint:fix": "eslint ./src --ext .ts --fix",
30 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
31 },
32 "author": "It's not you it's me",
33 "license": "ISC"
34}
tsconfig.json
1{
2 "extends": "@apify/tsconfig",
3 "compilerOptions": {
4 "module": "NodeNext",
5 "moduleResolution": "NodeNext",
6 "target": "ES2022",
7 "outDir": "dist",
8 "noUnusedLocals": false,
9 "skipLibCheck": true,
10 "lib": ["DOM"]
11 },
12 "include": [
13 "./src/**/*"
14 ]
15}
Developer
Maintained by Community
Categories