Link Extractor
Pricing
Pay per usage
Go to Store
Link Extractor
Extracts links for an array of paths/usernames, resolving each against a baseUrl and collecting results with a user-supplied pageFunction.
0.0 (0)
Pricing
Pay per usage
2
Total users
174
Monthly users
4
Runs succeeded
0%
Last modified
3 years ago
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-puppeteer

# Second, copy just package.json and package-lock.json since it should be
# the only file that affects "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{
  "name": "apify-project",
  "version": "0.0.1",
  "description": "",
  "author": "It's not you it's me",
  "license": "ISC",
  "dependencies": {
    "apify": "latest",
    "type-check": "latest"
  },
  "scripts": {
    "start": "node main.js"
  }
}
main.js
const { URL } = require('url');
const Apify = require('apify');
const { typeCheck } = require('type-check');

const { log, dir } = console;

// type-check signature the actor INPUT must satisfy (validated in Apify.main below).
const INPUT_TYPE = `{
    baseUrl: String,
    pageFunction: String,
    waitForCssSelector: String,
    usernames: [String],
}`;

// Returns a resolver that parses `input` relative to `baseUrl` (WHATWG URL).
const parseUrlFor = baseUrl => input => new URL(input, baseUrl);

// Bound inside Apify.main() once the input's baseUrl is known;
// read by extractUrls() when mapping the scraped link strings.
let parseUrl = null;
/**
 * Opens a new page in `browser`, navigates to `url` and extracts post links
 * by evaluating the user-supplied page function in the page context.
 *
 * @param {Object} browser - Puppeteer Browser instance.
 * @param {String} username - Username the URL was built from (echoed in the result).
 * @param {String} url - Absolute profile URL to visit.
 * @param {String} pageFunc - Source of a function evaluated in the page;
 *     expected to return an array of URL strings.
 * @param {String} cssSelector - Selector to wait for before evaluating `pageFunc`.
 * @returns {Promise<Object>} `{ username, postsLinks }`, plus `errorMessage`
 *     when the server did not answer with a 2xx status.
 * @throws {Error} When navigation, waiting or evaluation fails.
 */
async function extractUrls(browser, username, url, pageFunc, cssSelector) {
    let page = null;
    const result = {
        username,
        postsLinks: [],
    };
    try {
        page = await browser.newPage();
        log(`New browser page for: ${url}`);

        const response = await page.goto(url, { waitUntil: 'networkidle2' });
        // BUG FIX: in Puppeteer >= 1.0 (used by the apify/actor-node-puppeteer
        // base image) Response.status is a method, not a property. The original
        // regex-tested the function object itself, so every response was
        // reported as an error. Check the numeric status code instead.
        const status = response.status();
        if (status < 200 || status >= 300) {
            log('Response:', status);
            return Object.assign({}, result, {
                errorMessage: `${url} responded ${status}. Verify the username.`,
            });
        }
        await page.waitForSelector(cssSelector);

        // NOTE(review): new Function() runs user-supplied code in the page
        // context — eval-like, acceptable only because the actor input is
        // provided by the actor's own operator.
        const postsUrls = await page.evaluate((fn) => {
            const func = new Function(fn);
            return func();
        }, pageFunc);

        // Resolve each scraped link against the actor's baseUrl.
        const parsedPostsUrls = postsUrls.map(parseUrl);
        result.postsLinks.push(...parsedPostsUrls);
    } catch (error) {
        throw new Error(`The page ${url}, could not be loaded: ${error}`);
    } finally {
        // Always release the page, even when navigation/evaluation threw.
        if (page) {
            await page.close().catch(error => log(`Error closing page: (${url}): ${error}.`));
        }
    }
    return result;
}
/**
 * Actor entry point: validates INPUT against INPUT_TYPE, launches Puppeteer,
 * extracts post links for every username in parallel and stores the result
 * under the OUTPUT key.
 */
Apify.main(async () => {
    let input = await Apify.getValue('INPUT');
    // INPUT may arrive as a raw JSON string depending on how the act was invoked.
    if (typeof input === 'string') {
        input = JSON.parse(input);
    }
    log(input);
    if (!typeCheck(INPUT_TYPE, input)) {
        log('Expected input:');
        log(INPUT_TYPE);
        log('Received input:');
        dir(input);
        throw new Error('Received invalid input');
    }
    const {
        baseUrl,
        usernames,
        pageFunction,
        waitForCssSelector,
    } = input;
    log(baseUrl, usernames);

    log('Opening browser...');
    const browser = await Apify.launchPuppeteer();
    log('New browser window.');

    try {
        parseUrl = parseUrlFor(baseUrl);
        // Kick off all extractions concurrently against the shared browser;
        // Promise.all fails fast if any single page throws.
        const allExtractedUrls = usernames.map((username) => {
            const { href } = parseUrl(username);
            return extractUrls(browser, username, href, pageFunction, waitForCssSelector);
        });
        const urls = await Promise.all(allExtractedUrls);
        await Apify.setValue('OUTPUT', urls);
        log(JSON.stringify(urls, null, 2));
    } finally {
        // ROBUSTNESS FIX: close the browser even when extraction fails, so a
        // failed run does not leave the Chromium process alive. (Also fixes
        // the "Openning" typo in the earlier log message.)
        log('Closing browser.');
        await browser.close();
    }
});