Link Extractor avatar
Link Extractor

Pricing

Pay per usage

Go to Store
Link Extractor

Link Extractor

Developed by

Juan Gaitán Villamizar

Juan Gaitán Villamizar

Maintained by Community

Extracts links from an array of paths or usernames: each entry is resolved against a baseUrl, the resulting page is opened, and a user-supplied pageFunction returns the links found on it.

0.0 (0)

Pricing

Pay per usage

2

Total users

169

Monthly users

7

0

Last modified

3 years ago

Dockerfile

1# This is a template for a Dockerfile used to run acts in Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-puppeteer
5
6# Copy just package.json and package-lock.json first, since they should be
7# the only files that affect "npm install" in the next step; this speeds up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY --chown=node:node . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

1{
2    "name": "apify-project",
3    "version": "0.0.1",
4    "description": "",
5    "author": "It's not you it's me",
6    "license": "ISC",
7    "dependencies": {
8        "apify": "latest",
9        "type-check": "latest"
10    },
11    "scripts": {
12        "start": "node main.js"
13    }
14}

main.js

1const { URL } = require('url');
2const Apify = require('apify');
3const { typeCheck } = require('type-check');
4
// Console shortcuts used for all actor logging.
const { log, dir } = console;

// Expected shape of the actor input, written in the `type-check` type syntax.
// All four fields are required by this schema.
const INPUT_TYPE = `{
  baseUrl: String,
  pageFunction: String,
  waitForCssSelector: String,
  usernames: [String],
}`;

/**
 * Builds a resolver that turns a path (or absolute URL) into a `URL` object
 * resolved against `baseUrl`.
 *
 * @param {string} baseUrl - Base used to resolve relative inputs.
 * @returns {(input: string) => URL} Resolver; throws `TypeError` on input
 *   that cannot be parsed against `baseUrl`.
 */
const parseUrlFor = baseUrl => input => new URL(input, baseUrl);

// Resolver bound to the run's `baseUrl`. It stays `null` until the entry
// point assigns it; `extractUrls` reads it when mapping extracted links.
let parseUrl = null;
16
/**
 * Opens `url` in a new browser page, waits for `cssSelector`, runs the
 * user-supplied page function inside the page and resolves every link it
 * returns with the module-level `parseUrl` resolver.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} username - Identifier echoed back in the result.
 * @param {string} url - Absolute URL to navigate to.
 * @param {string} pageFunc - Source text of a function body executed in the
 *   page context; it must return an array of URLs or paths.
 * @param {string} cssSelector - Selector awaited before `pageFunc` runs.
 * @returns {Promise<{username: string, postsLinks: URL[], errorMessage?: string}>}
 *   The extracted links, or an empty list plus `errorMessage` when the page
 *   answered with a non-2xx status.
 * @throws {Error} When the page cannot be opened, navigated or evaluated;
 *   the underlying error is attached as `cause`.
 */
async function extractUrls(browser, username, url, pageFunc, cssSelector) {
  let page = null;
  try {
    page = await browser.newPage();
    log(`New browser page for: ${url}`);

    const response = await page.goto(url, { waitUntil: 'networkidle2' });
    // Puppeteer >= 1.0 exposes the status as a method; pre-1.0 releases used
    // a plain property. Support both so the 2xx check tests the real code
    // instead of the stringified function.
    const status = typeof response.status === 'function'
      ? response.status()
      : response.status;
    if (!/^2\d{2}$/.test(String(status))) {
      log('Response:', status);
      return {
        username,
        postsLinks: [],
        errorMessage: `${url} responded ${status}. Verify the username.`,
      };
    }
    await page.waitForSelector(cssSelector);

    // NOTE(security): `pageFunc` is arbitrary code taken from the actor input
    // and compiled with `new Function` inside the page. This is the actor's
    // contract, but it must only ever be fed trusted input.
    const postsUrls = await page.evaluate((fn) => {
      const func = new Function(fn);
      return func();
    }, pageFunc);

    if (!Array.isArray(postsUrls)) {
      throw new TypeError(`pageFunction must return an array of links, got: ${typeof postsUrls}`);
    }

    // Wrap the resolver so `map` cannot leak its index/array arguments into it.
    const postsLinks = postsUrls.map(link => parseUrl(link));
    return { username, postsLinks };
  } catch (error) {
    const wrapped = new Error(`The page ${url}, could not be loaded: ${error}`);
    // Keep the original error (and its stack) reachable for debugging.
    wrapped.cause = error;
    throw wrapped;
  } finally {
    if (page) {
      // Best effort: a failed close must not mask the real outcome.
      await page.close().catch(error => log(`Error closing page: (${url}): ${error}.`));
    }
  }
}
52
// Actor entry point: reads and validates INPUT, visits one page per username
// (all concurrently, sharing a single browser) and stores the collected
// results under the OUTPUT key.
Apify.main(async () => {
  let input = await Apify.getValue('INPUT');
  // INPUT may arrive as a raw JSON string; normalise it to an object.
  if (typeof input === 'string') {
    input = JSON.parse(input);
  }
  log(input);
  if (!typeCheck(INPUT_TYPE, input)) {
    log('Expected input:');
    log(INPUT_TYPE);
    log('Received input:');
    dir(input);
    throw new Error('Received invalid input');
  }
  const {
    baseUrl,
    usernames,
    pageFunction,
    waitForCssSelector,
  } = input;
  log(baseUrl, usernames);

  log('Openning browser...');
  const browser = await Apify.launchPuppeteer();
  log('New browser window.');

  // Everything that can fail after launch runs inside try/finally so the
  // browser process is released even when a page or the store write rejects.
  try {
    // `extractUrls` reads this module-level resolver, so set it before fan-out.
    parseUrl = parseUrlFor(baseUrl);
    const allExtractedUrls = usernames.map((username) => {
      const { href } = parseUrl(username);
      return extractUrls(browser, username, href, pageFunction, waitForCssSelector);
    });
    // Fail-fast: the first rejected page aborts the whole run.
    const urls = await Promise.all(allExtractedUrls);
    await Apify.setValue('OUTPUT', urls);
    log(JSON.stringify(urls, null, 2));
  } finally {
    log('Closing browser.');
    await browser.close();
  }
});