Product Hunt User Finder (2023)
Pricing: Pay per usage
Deprecated

This actor is an updated version of the original Product Hunt User Finder, rebuilt for the current Product Hunt website. It takes a list of Twitter usernames and matches each one to its Product Hunt profile. A match is only found when the Product Hunt username is the same as (or very close to) the Twitter handle, which is often the case (see the sketch below).
Last modified: 2 years ago
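As a minimal sketch of the matching idea (not the actor's actual crawl, which uses CheerioCrawler with proxies, as shown in the source below), assume each Twitter handle maps directly to a producthunt.com/@<handle> URL, exactly as the getProfileUrl helper in src/modules/product-hunt.js does; a 404 then means no match. The handles here are hypothetical, and global fetch requires Node 18+:

// Illustrative only: probe hypothetical handles with plain fetch (Node 18+).
const twitterHandles = ['alice', 'bob'];

for (const handle of twitterHandles) {
    // Assume the Product Hunt username equals the Twitter handle.
    const url = `https://www.producthunt.com/@${handle}`;
    const { status } = await fetch(url, { method: 'HEAD' });

    // A 404 response means no Product Hunt profile uses this handle.
    console.log(`${handle}: ${status === 404 ? 'no match' : url}`);
}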
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "project-cheerio-crawler-javascript", "title": "Project Cheerio Crawler Javascript", "description": "Crawlee and Cheerio project in javascript.", "version": "0.0", "meta": { "templateId": "js-crawlee-cheerio" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "CheerioCrawler Template", "type": "object", "schemaVersion": 1, "properties": { "usernames": { "title": "Usernames", "type": "string", "description": "Product Hunt usernames", "editor": "textarea" } }, "required": ["usernames"]}
src/modules/helpers.js
export const collectUsernamesArray = (usernamesInput) => {
    // Split the raw input on spaces, newlines, tabs, and commas.
    const matches = [...usernamesInput.matchAll(/[^ \n\t,]+/g)];

    if (matches.length === 0) {
        throw new Error('There are no usernames to process');
    }

    return matches.map((match) => match[0]);
};

// Returns column names with a unique, zero-padded, ordered prefix
// (01_..., 02_..., ...); the counter lives in the closure.
export const getColumnName = (() => {
    let index = 1;

    return (prefix, name) => {
        const formattedIndex = index.toString().padStart(2, '0');

        index++;

        return `${formattedIndex}_${prefix}_${name}`;
    };
})();
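As a usage illustration (not part of the repository), successive calls to getColumnName yield ordered, zero-padded column names; the hardcoded keys in src/routes.js follow the same pattern:

import { getColumnName } from './src/modules/helpers.js';

console.log(getColumnName('ph', 'url'));    // '01_ph_url'
console.log(getColumnName('ph', 'status')); // '02_ph_status'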
src/modules/product-hunt.js
import { collectUsernamesArray } from './helpers.js';

const getProfileUrl = (username) => {
    return `https://www.producthunt.com/@${username}`;
};

const getUsernameFromUrl = (url) => {
    // Profile URLs always contain "@<username>", so capture everything after the "@".
    const regex = /@(.+)/;
    const match = url.match(regex);

    return match[1];
};

const collectProfileUrls = (usernamesInput) => {
    const usernames = collectUsernamesArray(usernamesInput);
    const profileUrls = [];

    for (const username of usernames) {
        profileUrls.push(getProfileUrl(username));
    }

    if (profileUrls.length === 0) {
        throw new Error('There are no links to crawl');
    }

    return profileUrls;
};

const getProfileName = ($) => {
    return $('h1').text();
};

const getProfileAbout = ($) => {
    return $('h2:contains("About") + p').text();
};

const getProfileFollowersCount = ($) => {
    const followersText = $('header a:contains("followers")').text();

    // Strip all thousands separators, not just the first one,
    // so counts like "1,234,567" parse correctly.
    return parseInt(followersText.replace(/,/g, ''), 10);
};

export default {
    getProfileUrl,
    getUsernameFromUrl,
    collectProfileUrls,
    getProfileName,
    getProfileAbout,
    getProfileFollowersCount
};
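To sanity-check these selectors in isolation, they can be run against handmade HTML with cheerio (already a dependency of crawlee). The snippet below only mirrors the structure the selectors expect; the markup and values are made up:

import * as cheerio from 'cheerio';
import productHunt from './src/modules/product-hunt.js';

// Hypothetical markup shaped like a Product Hunt profile page.
const html = `
    <header><a href="/@alice/followers">1,234 followers</a></header>
    <h1>Alice Example</h1>
    <h2>About</h2>
    <p>Maker and product enthusiast.</p>`;

const $ = cheerio.load(html);

console.log(productHunt.getProfileName($));           // 'Alice Example'
console.log(productHunt.getProfileAbout($));          // 'Maker and product enthusiast.'
console.log(productHunt.getProfileFollowersCount($)); // 1234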
src/main.js
import { Actor } from 'apify';
import { CheerioCrawler } from 'crawlee';
import { router } from './routes.js';
import productHunt from './modules/product-hunt.js';

await Actor.init();

const { usernames } = await Actor.getInput();
const profileUrls = productHunt.collectProfileUrls(usernames);

const proxyConfiguration = await Actor.createProxyConfiguration();
const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router
});

await crawler.run(profileUrls);

await Actor.exit();
src/routes.js
import { Dataset, createCheerioRouter } from 'crawlee';
import productHunt from './modules/product-hunt.js';

export const router = createCheerioRouter();

router.addDefaultHandler(async ({ request: { loadedUrl }, response: { statusCode }, $ }) => {
    let urlData = {
        '01_ph_url': loadedUrl,
        '02_ph_status': statusCode,
    };

    // A 404 means the username has no Product Hunt profile;
    // in that case only the URL and status are stored.
    if (statusCode !== 404) {
        const name = productHunt.getProfileName($);
        const about = productHunt.getProfileAbout($);
        const followers = productHunt.getProfileFollowersCount($);

        urlData = {
            ...urlData,
            '03_ph_name': name,
            '04_ph_followers': followers,
            '05_ph_about': about
        };
    }

    await Dataset.pushData(urlData);

    console.log(`${loadedUrl} parsed`);
});
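Each crawled URL therefore becomes one dataset record. For a matched profile the record looks roughly like this (all values are made up); a 404 record contains only the first two fields:

{
    "01_ph_url": "https://www.producthunt.com/@alice",
    "02_ph_status": 200,
    "03_ph_name": "Alice Example",
    "04_ph_followers": 1234,
    "05_ph_about": "Maker and product enthusiast."
}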
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
package.json
{ "name": "crawlee-cheerio-javascript", "version": "0.0.1", "type": "module", "description": "This is a boilerplate of an Apify actor.", "engines": { "node": ">=16.0.0" }, "dependencies": { "apify": "^3.0.0", "crawlee": "^3.0.0" }, "devDependencies": { "@apify/eslint-config": "^0.3.1", "eslint": "^8.36.0" }, "scripts": { "start": "node src/main.js", "lint": "eslint ./src --ext .js,.jsx", "lint:fix": "eslint ./src --ext .js,.jsx --fix", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}