Web Scraper Task Malu Practice 1

Deprecated

Developed by Malu Alvarado

Maintained by Community

A test Actor built from a Web Scraper task; it metamorphs into apify/web-scraper with input derived from that task.

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 1
Monthly users: 1
Last modified: 2 years ago

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16
# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./
# Optionally, specify how to launch the source code of your Actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "My input schema",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "category": {
            "title": "Category",
            "type": "string",
            "description": "Economist.com category to be scraped",
            "editor": "textarea",
            "prefill": "briefing"
        }
    }
}
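At runtime, the Actor reads this input with Apify.getInput() (see main.js below). Here is a minimal sketch of what the schema yields, assuming the Apify SDK v2 pinned in package.json; note that "prefill" only pre-populates the input editor in the Apify Console and is not applied as a runtime default, so a fallback in code is still useful.

// Minimal sketch: reading the input defined by INPUT_SCHEMA.json (Apify SDK v2).
const Apify = require('apify');

Apify.main(async () => {
    // With the schema above, the input object looks like: { category: 'briefing' }.
    // "prefill" is editor-only, so we provide a runtime fallback as well.
    const { category = 'briefing' } = (await Apify.getInput()) ?? {};
    console.log(`Will scrape https://www.economist.com/${category}/?page=1`);
});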

main.js

// This is the main Node.js source code file of your Actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get the input of the Actor. Input fields can be modified in the INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/platform/actors/development/actor-definition/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Here you can prepare your input for the Actor apify/web-scraper. This input is
    // based on the Actor task you used as the starting point.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "closeCookieModals": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        "excludes": [
            {
                "glob": "/**/*.{png,jpg,jpeg,pdf}"
            }
        ],
        // Leftover from the Web Scraper template; it matches nothing on economist.com.
        "globs": [
            {
                "glob": "https://crawlee.dev/*/*"
            }
        ],
        "headless": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "linkSelector": "a",
        "pageFunction": "async function pageFunction(context) {\n    // request is an instance of Apify.Request (https://sdk.apify.com/docs/api/request)\n    // $ is an instance of jQuery (http://jquery.com/)\n    const request = context.request;\n    const $ = context.jQuery;\n    const pageNum = parseInt(request.url.split('?page=').pop());\n\n    context.log.info(`Scraping ${context.request.url}`);\n\n    // Extract all articles.\n    const articles = [];\n    $('article').each((index, articleEl) => {\n        const $articleEl = $(articleEl);\n\n        // H3 contains 2 child elements: the first is the topic, the second the article title.\n        const $h3El = $articleEl.find('h3');\n\n        // Extract additional info and push it to the data object.\n        articles.push({\n            pageNum,\n            topic: $h3El.children().first().text(),\n            title: $h3El.children().last().text(),\n            url: $articleEl.find('a')[0].href,\n            teaser: $articleEl.find('.teaser__text').text(),\n        });\n    });\n\n    // Return results.\n    return articles;\n}",
        "postNavigationHooks": `// We need to return an array of (possibly async) functions here.
// The functions accept a single argument: the "crawlingContext" object.
[
    async (crawlingContext) => {
        // ...
    },
]`,
        "preNavigationHooks": `// We need to return an array of (possibly async) functions here.
// The functions accept two arguments: the "crawlingContext" object
// and "gotoOptions".
[
    async (crawlingContext, gotoOptions) => {
        // ...
    },
]`,
        "proxyConfiguration": {
            "useApifyProxy": true
        },
        "startUrls": [
            {
                "url": `https://www.economist.com/${input.category}/?page=1`,
                "method": "GET"
            }
        ],
        // This was originally a second, duplicate "globs" key, which silently overrode
        // the one above. The "purl" syntax belongs to "pseudoUrls", which is the field
        // apify/web-scraper expects for this pattern.
        "pseudoUrls": [
            {
                "purl": `https://www.economist.com/${input.category}/?page=[\\d+]`,
                "method": "GET"
            }
        ],
        "runMode": "DEVELOPMENT",
        "useChrome": false,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now let's metamorph into the Actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});
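The pageFunction above is stored as an escaped one-line string, which is hard to read. Here is the same function unescaped; it runs inside apify/web-scraper on every loaded page, with context.jQuery available because injectJQuery is set to true.

async function pageFunction(context) {
    // request is an instance of Apify.Request (https://sdk.apify.com/docs/api/request)
    // $ is an instance of jQuery (http://jquery.com/)
    const request = context.request;
    const $ = context.jQuery;
    const pageNum = parseInt(request.url.split('?page=').pop());

    context.log.info(`Scraping ${context.request.url}`);

    // Extract all articles.
    const articles = [];
    $('article').each((index, articleEl) => {
        const $articleEl = $(articleEl);

        // H3 contains 2 child elements: the first is the topic, the second the article title.
        const $h3El = $articleEl.find('h3');

        // Extract additional info and push it to the data object.
        articles.push({
            pageNum,
            topic: $h3El.children().first().text(),
            title: $h3El.children().last().text(),
            url: $articleEl.find('a')[0].href,
            teaser: $articleEl.find('.teaser__text').text(),
        });
    });

    // Return results.
    return articles;
}

Apify.metamorph() then replaces this Actor's Docker image with that of apify/web-scraper while keeping the same run and its default storages, so the articles returned by the pageFunction end up in this run's dataset.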

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.2.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}