Link Extractor

No credit card required

Link Extractor

Link Extractor

juansgaitan/link-extractor

No credit card required

Extracts links for each path/username in an array: every entry is resolved against a baseUrl, the resulting page is loaded, and a pageFunction evaluated in that page returns the links.

Dockerfile

# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-puppeteer

# Copy just package.json and package-lock.json first, since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY --chown=node:node . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

1{ 2 "name": "apify-project", 3 "version": "0.0.1", 4 "description": "", 5 "author": "It's not you it's me", 6 "license": "ISC", 7 "dependencies": { 8 "apify": "latest", 9 "type-check": "latest" 10 }, 11 "scripts": { 12 "start": "node main.js" 13 } 14}

main.js

const { URL } = require('url');
const Apify = require('apify');
const { typeCheck } = require('type-check');

const { log, dir } = console;

// Shape the actor INPUT must satisfy, written in the type syntax passed to typeCheck().
const INPUT_TYPE = `{
  baseUrl: String,
  pageFunction: String,
  waitForCssSelector: String,
  usernames: [String],
}`;

/**
 * Builds a resolver that turns a (possibly relative) URL string into a URL
 * object resolved against `baseUrl`.
 *
 * @param {string} baseUrl - Base used to resolve relative inputs.
 * @returns {function(string): URL} Resolver bound to `baseUrl`.
 */
const parseUrlFor = baseUrl => input => new URL(input, baseUrl);

// Assigned once in the main callback, before any extractUrls() call is started.
let parseUrl = null;

/**
 * Reads the HTTP status code from a navigation response.
 * Supports both API shapes: `status()` as a method and `status` as a plain property.
 *
 * @param {object} response - Response object resolved by `page.goto`.
 * @returns {number} The HTTP status code.
 */
function getStatusCode(response) {
  return typeof response.status === 'function' ? response.status() : response.status;
}

/**
 * Opens `url` in a new browser page, waits for `cssSelector`, runs `pageFunc`
 * inside the page and collects the URLs it returns, resolved via `parseUrl`.
 *
 * @param {object} browser - Browser instance exposing `newPage()`.
 * @param {string} username - Username the result is reported under.
 * @param {string} url - Absolute URL of the page to load.
 * @param {string} pageFunc - Body of a function evaluated in the page; must return an array of URL strings.
 * @param {string} cssSelector - Selector that must appear before `pageFunc` runs.
 * @returns {Promise<object>} `{ username, postsLinks }`, plus `errorMessage` when the
 *   page answered with a non-2xx status.
 * @throws {Error} When navigation, waiting, evaluation or URL parsing fails; the
 *   original error is attached as `cause`.
 */
async function extractUrls(browser, username, url, pageFunc, cssSelector) {
  let page = null;
  const result = {
    username,
    postsLinks: [],
  };
  try {
    page = await browser.newPage();
    log(`New browser page for: ${url}`);

    const response = await page.goto(url, { waitUntil: 'networkidle2' });
    // goto() may resolve without a response object; fail with a clear message
    // instead of a TypeError on the status lookup below.
    if (!response) {
      throw new Error('no response was received');
    }

    const status = getStatusCode(response);
    if (!/^2\d{2}$/.test(String(status))) {
      log('Response:', status);
      return Object.assign({}, result, {
        errorMessage: `${url} responded ${status}. Verify the username.`,
      });
    }
    await page.waitForSelector(cssSelector);

    // SECURITY NOTE: `pageFunc` comes straight from the actor input and is
    // compiled with `new Function`, i.e. it is arbitrary code. It executes in
    // the page context (via page.evaluate), not in this Node process, but it
    // must still only ever be supplied by a trusted party.
    const postsUrls = await page.evaluate((fn) => {
      const func = new Function(fn);
      return func();
    }, pageFunc);

    if (!Array.isArray(postsUrls)) {
      throw new Error('pageFunction must return an array of URLs');
    }

    const parsedPostsUrls = postsUrls.map(link => parseUrl(link));
    result.postsLinks.push(...parsedPostsUrls);
  } catch (error) {
    const wrapped = new Error(`The page ${url}, could not be loaded: ${error}`);
    // Keep the original error (and its stack) reachable for debugging.
    wrapped.cause = error;
    throw wrapped;
  } finally {
    if (page) {
      await page.close().catch(error => log(`Error closing page: (${url}): ${error}.`));
    }
  }
  return result;
}

// Entry point: validate INPUT, scrape every username concurrently, store the
// combined results under the OUTPUT key.
Apify.main(async () => {
  let input = await Apify.getValue('INPUT');
  if (typeof input === 'string') {
    input = JSON.parse(input);
  }
  log(input);
  if (!typeCheck(INPUT_TYPE, input)) {
    log('Expected input:');
    log(INPUT_TYPE);
    log('Received input:');
    dir(input);
    throw new Error('Received invalid input');
  }
  const {
    baseUrl,
    usernames,
    pageFunction,
    waitForCssSelector,
  } = input;
  log(baseUrl, usernames);

  log('Openning browser...');
  const browser = await Apify.launchPuppeteer();
  log('New browser window.');

  try {
    parseUrl = parseUrlFor(baseUrl);
    const allExtractedUrls = usernames.map((username) => {
      const { href } = parseUrl(username);
      return extractUrls(browser, username, href, pageFunction, waitForCssSelector);
    });
    const urls = await Promise.all(allExtractedUrls);
    await Apify.setValue('OUTPUT', urls);
    log(JSON.stringify(urls, null, 2));
  } finally {
    // Close the browser even when an extraction rejects, so it is not leaked.
    log('Closing browser.');
    await browser.close();
  }
});
Developer
Maintained by Community
Actor stats
  • 103 users
  • 799 runs
  • Modified about 1 year ago
Categories

You might also like these Actors