Link Extractor avatar

Link Extractor

Try for free

No credit card required

View all Actors
Link Extractor

Link Extractor

juansgaitan/link-extractor
Try for free

No credit card required

Extracts links for an array of paths/usernames: each entry is resolved against a baseUrl, the resulting page is opened, and a pageFunction is run on it to collect the links.

Dockerfile

1# This is a template for a Dockerfile used to run acts in Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-puppeteer
5
6# Second, copy just package.json and package-lock.json, since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY --chown=node:node . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

1{
2    "name": "apify-project",
3    "version": "0.0.1",
4    "description": "",
5    "author": "It's not you it's me",
6    "license": "ISC",
7    "dependencies": {
8        "apify": "latest",
9        "type-check": "latest"
10    },
11    "scripts": {
12        "start": "node main.js"
13    }
14}

main.js

1const { URL } = require('url');
2const Apify = require('apify');
3const { typeCheck } = require('type-check');
4
// Shorthand aliases for console.log / console.dir, used for all output in this file.
5const { log, dir } = console;
6
// Type descriptor string handed to type-check's `typeCheck` to validate the
// actor INPUT before any browser work starts.
7const INPUT_TYPE = `{
8  baseUrl: String,
9  pageFunction: String,
10  waitForCssSelector: String,
11  usernames: [String],
12}`;
13
// Curried resolver factory: given a base URL, returns a function that resolves
// a relative (or absolute) input against that base and returns a WHATWG `URL`.
14const parseUrlFor = baseUrl => input => new URL(input, baseUrl);
// Resolver bound to the configured baseUrl. It stays null until the Apify.main
// callback assigns it; extractUrls reads it, so it must be set before that runs.
15let parseUrl = null;
16
/**
 * Opens `url` in a new browser page, waits for `cssSelector`, runs the
 * user-supplied page function inside the page and resolves every link it
 * returns with the module-level `parseUrl` resolver.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` is used.
 * @param {string} username - Identifier echoed back in the result.
 * @param {string} url - Absolute URL of the page to open.
 * @param {string} pageFunc - Source text of a function BODY that is evaluated
 *   in the page context and must return an array of URL strings.
 * @param {string} cssSelector - Selector awaited before `pageFunc` runs.
 * @returns {Promise<object>} `{ username, postsLinks }`, where `postsLinks`
 *   holds the values returned by `parseUrl`. When the page answers with a
 *   non-2xx status, `postsLinks` is empty and an `errorMessage` is added.
 * @throws {Error} If navigation, the selector wait, the page function or the
 *   link resolution fails; the original error is attached as `cause`.
 */
async function extractUrls(browser, username, url, pageFunc, cssSelector) {
  let page = null;
  const result = {
    username,
    postsLinks: [],
  };
  try {
    page = await browser.newPage();
    log(`New browser page for: ${url}`);

    const response = await page.goto(url, { waitUntil: 'networkidle2' });
    // page.goto() can resolve to null (e.g. same-document navigation), so
    // fail with a clear message instead of a TypeError on `null.status`.
    if (!response) {
      throw new Error('no response was received for the main resource');
    }
    // Puppeteer >= 1.0 exposes status() as a method; pre-1.0 releases exposed
    // it as a plain property. Testing the method itself against the regex
    // never matches, which made every page look like a failed response.
    const status = typeof response.status === 'function'
      ? response.status()
      : response.status;
    if (!/^2\d{2}$/.test(String(status))) {
      log('Response:', status);
      return Object.assign({}, result, {
        errorMessage: `${url} responded ${status}. Verify the username.`,
      });
    }
    await page.waitForSelector(cssSelector);

    // SECURITY NOTE: `pageFunc` is arbitrary code taken from the actor input
    // and compiled with `new Function`. It runs inside the browser page, not
    // in the Node.js process, but it must still come from a trusted source.
    const postsUrls = await page.evaluate((fn) => {
      const func = new Function(fn);
      return func();
    }, pageFunc);

    if (!Array.isArray(postsUrls)) {
      throw new TypeError('pageFunction must return an array of URL strings');
    }
    // Wrap the call so map's extra (index, array) arguments are never
    // forwarded to the resolver.
    const parsedPostsUrls = postsUrls.map(link => parseUrl(link));
    result.postsLinks.push(...parsedPostsUrls);
  } catch (error) {
    // Keep the original error reachable for debugging (ignored by runtimes
    // that predate the Error `cause` option).
    throw new Error(`The page ${url}, could not be loaded: ${error}`, { cause: error });
  } finally {
    if (page) {
      // Best effort: a failure to close the page must not mask the result.
      await page.close().catch(error => log(`Error closing page: (${url}): ${error}.`));
    }
  }
  return result;
}
52
// Actor entry point: reads and validates INPUT, opens one browser, extracts
// the links for every username in parallel and stores the combined result
// under the OUTPUT key.
Apify.main(async () => {
  let input = await Apify.getValue('INPUT');
  // INPUT may arrive as a raw JSON string; normalise it to an object.
  if (typeof input === 'string') {
    input = JSON.parse(input);
  }
  log(input);
  if (!typeCheck(INPUT_TYPE, input)) {
    log('Expected input:');
    log(INPUT_TYPE);
    log('Received input:');
    dir(input);
    throw new Error('Received invalid input');
  }
  const {
    baseUrl,
    usernames,
    pageFunction,
    waitForCssSelector,
  } = input;
  log(baseUrl, usernames);

  log('Openning browser...');
  const browser = await Apify.launchPuppeteer();
  log('New browser window.');

  // Everything that uses the browser sits inside try/finally so the browser
  // is closed even when an extraction or the OUTPUT write rejects.
  try {
    // extractUrls resolves links through the module-level parseUrl, so it
    // must be bound to baseUrl before any extraction starts.
    parseUrl = parseUrlFor(baseUrl);
    const allExtractedUrls = usernames.map((username) => {
      const { href } = parseUrl(username);
      return extractUrls(browser, username, href, pageFunction, waitForCssSelector);
    });
    const urls = await Promise.all(allExtractedUrls);
    await Apify.setValue('OUTPUT', urls);
    log(JSON.stringify(urls, null, 2));
  } finally {
    log('Closing browser.');
    await browser.close();
  }
});
Developer
Maintained by Community
Actor metrics
  • 6 monthly users
  • 2 stars
  • Created in Oct 2017
  • Modified about 2 years ago
Categories