Link Extractor
Try for free
No credit card required
View all Actors
Link Extractor
juansgaitan/link-extractor
Try for free
No credit card required
Extracts links for an array of paths/usernames: each entry is resolved against a baseUrl, the resulting page is opened, and a pageFunction collects the links from it.
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-puppeteer

# Copy just package.json and package-lock.json first, since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{
  "name": "apify-project",
  "version": "0.0.1",
  "description": "",
  "author": "It's not you it's me",
  "license": "ISC",
  "dependencies": {
    "apify": "latest",
    "type-check": "latest"
  },
  "scripts": {
    "start": "node main.js"
  }
}
main.js
1const { URL } = require('url');
2const Apify = require('apify');
3const { typeCheck } = require('type-check');
4
// Console shortcuts used for all progress and diagnostic output.
const { log, dir } = console;

// type-check signature that the actor INPUT must satisfy before any work starts.
const INPUT_TYPE = `{
  baseUrl: String,
  pageFunction: String,
  waitForCssSelector: String,
  usernames: [String],
}`;

/**
 * Builds a resolver bound to a base URL.
 *
 * @param {string} baseUrl - Absolute URL that relative inputs are resolved against.
 * @returns {function(string): URL} Function turning a path (or absolute URL) into a URL object.
 */
const parseUrlFor = baseUrl => input => new URL(input, baseUrl);

// Resolver for the current run; stays null until the main routine assigns
// parseUrlFor(baseUrl) once the input has been validated.
let parseUrl = null;
16
/**
 * Opens `url` in a new browser page, waits for `cssSelector`, runs the
 * user-supplied page function inside the page and collects the links it returns.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} username - Identifier copied verbatim into the result.
 * @param {string} url - Absolute URL of the page to open.
 * @param {string} pageFunc - Source text of a function body evaluated in the page;
 *   it must return an array of link strings.
 * @param {string} cssSelector - Selector that must appear before links are extracted.
 * @returns {Promise<{username: string, postsLinks: URL[], errorMessage?: string}>}
 *   `postsLinks` holds the returned links resolved through the module-level
 *   `parseUrl`; `errorMessage` is set (and `postsLinks` left empty) when the
 *   page answers with a non-2xx status.
 * @throws {Error} When navigation, the selector wait or the page function fails.
 */
async function extractUrls(browser, username, url, pageFunc, cssSelector) {
  let page = null;
  const result = {
    username,
    postsLinks: [],
  };
  try {
    page = await browser.newPage();
    log(`New browser page for: ${url}`);

    const response = await page.goto(url, { waitUntil: 'networkidle2' });
    if (!response) {
      // goto() can resolve without a response object; fail with a clear reason
      // instead of a TypeError on the status lookup below.
      throw new Error('navigation finished without a response');
    }
    // Current Puppeteer exposes status() as a method while early releases used
    // a plain property; support both so a 2xx page is never misreported.
    const status = typeof response.status === 'function'
      ? response.status()
      : response.status;
    if (!(status >= 200 && status < 300)) {
      log('Response:', status);
      return Object.assign({}, result, {
        errorMessage: `${url} responded ${status}. Verify the username.`,
      });
    }
    await page.waitForSelector(cssSelector);

    // SECURITY NOTE(review): pageFunc is caller-supplied source compiled with
    // `new Function`. It runs in the page context via page.evaluate, not in this
    // Node process, but it must only ever come from a trusted actor input.
    const postsUrls = await page.evaluate((fn) => {
      const func = new Function(fn);
      return func();
    }, pageFunc);
    if (!Array.isArray(postsUrls)) {
      throw new Error('pageFunction must return an array of links');
    }

    // Wrap the call so map's index/array arguments never reach the resolver.
    const parsedPostsUrls = postsUrls.map(link => parseUrl(link));
    result.postsLinks.push(...parsedPostsUrls);
  } catch (error) {
    // Keep the original error attached for debugging (ignored by runtimes
    // that do not support the `cause` option).
    throw new Error(`The page ${url}, could not be loaded: ${error}`, { cause: error });
  } finally {
    if (page) {
      // Best effort: a failed close is logged but must not mask the real outcome.
      await page.close().catch(error => log(`Error closing page: (${url}): ${error}.`));
    }
  }
  return result;
}
52
// Actor entry point: validates INPUT, opens one page per username concurrently,
// stores the collected links under the OUTPUT key and always closes the browser.
Apify.main(async () => {
  let input = await Apify.getValue('INPUT');
  // INPUT may arrive as a raw JSON string; decode it before validating.
  if (typeof input === 'string') {
    input = JSON.parse(input);
  }
  log(input);
  if (!typeCheck(INPUT_TYPE, input)) {
    log('Expected input:');
    log(INPUT_TYPE);
    log('Received input:');
    dir(input);
    throw new Error('Received invalid input');
  }
  const {
    baseUrl,
    usernames,
    pageFunction,
    waitForCssSelector,
  } = input;
  log(baseUrl, usernames);

  log('Openning browser...');
  const browser = await Apify.launchPuppeteer();
  log('New browser window.');

  try {
    // extractUrls reads this module-level resolver, so set it before any page starts.
    parseUrl = parseUrlFor(baseUrl);
    const allExtractedUrls = usernames.map((username) => {
      const { href } = parseUrl(username);
      return extractUrls(browser, username, href, pageFunction, waitForCssSelector);
    });
    // Fail-fast: the first rejected page rejects the whole run.
    const urls = await Promise.all(allExtractedUrls);
    await Apify.setValue('OUTPUT', urls);
    log(JSON.stringify(urls, null, 2));
  } finally {
    // Close the browser on failure as well, so a rejected page does not leave
    // the browser process running.
    log('Closing browser.');
    await browser.close();
  }
});
Developer
Maintained by Community
Actor metrics
- 6 monthly users
- 2 stars
- Created in Oct 2017
- Modified about 2 years ago
Categories