Vdab-be jobs

Deprecated

Pricing: Pay per usage

Developed by Alexey Udovydchenko · Maintained by Community

Pick any job category from the Belgian national job board https://www.vdab.be/jobs, optionally apply sorting or filtering, and get list results for around 1 cent per 100 jobs!
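
For example, a minimal input (illustrative values; the start URL is the schema prefill below, which sorts the "financieel" category) could look like this:

{
    "startUrls": ["https://www.vdab.be/vindeenjob/jobs/financieel?sort=1"],
    "resultsLimit": 100,
    "proxyConfiguration": { "useApifyProxy": true }
}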

Rating: 0.0 (0 reviews)

Total users: 6

Monthly users: 1

Last modified: 3 years ago

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "default-output",
    "title": "Live output",
    "description": "Auto format for dataset items",
    "version": "0.0.1",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "",
            "description": "",
            "views": {}
        }
    }
}
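
The dataset "views" object is left empty here. As a sketch only (the field names come from src/routes.js below; the view structure follows the Apify actor specification), a table view could be declared like this:

"views": {
    "overview": {
        "title": "Jobs",
        "transformation": {
            "fields": ["job_title", "advertiser_name", "advertiser_location", "source_url"]
        },
        "display": {
            "component": "table",
            "properties": {
                "job_title": { "label": "Job title" },
                "source_url": { "label": "Job URL", "format": "link" }
            }
        }
    }
}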

src/main.js

import { Actor } from 'apify';
import { KeyValueStore, CheerioCrawler } from 'crawlee';
import { handleSearch } from './routes.js';

await Actor.init();

// Read the actor input; fall back to defaults when a field
// (or the whole input) is missing.
const input = await KeyValueStore.getInput() ?? {};
const {
    startUrls = [],
    proxyConfiguration = {
        useApifyProxy: true,
    },
} = input;

const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);

// Every request is handled by the search-page handler in routes.js.
const crawler = new CheerioCrawler({
    proxyConfiguration: proxyConfig,
    async requestHandler(context) {
        return handleSearch(context, input);
    },
});

await crawler.run(startUrls);

// Exit successfully
await Actor.exit();
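
When running locally with the Apify CLI (an assumed development setup, not pinned down by this repo), KeyValueStore.getInput() reads the input from storage/key_value_stores/default/INPUT.json, and the crawl is started with:

apify run --purge

The --purge flag clears the local storage folders first, so requests left over from a previous run are not reused.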

src/routes.js

import { Dataset, log } from 'crawlee';

export const handleSearch = async (context, input) => {
    const { request, $, crawler } = context;
    const { url, userData } = request;
    const { page = 1 } = userData;
    const { resultsLimit = 0 } = input;

    // Extract one item per search result on the current page.
    const items = Array.from($('.result-item')).map((x) => {
        const source_url = $('a', x).attr('href');
        const job_title = $('.result-title', x).text().trim();
        const loc = Array.from($('.location-span strong', x)).map((a) => $(a).text().trim());
        const advertiser_name = loc?.[0];
        const advertiser_location = loc?.[1];
        const full_text = $('p.job-description', x).text().trim();
        return {
            source_url,
            job_title,
            advertiser_name,
            advertiser_location,
            full_text,
            searchUrl: url,
        };
    });

    if (!items?.length) {
        log.info(`[NO-DATA]: no jobs at ${url}`);
        return;
    }

    // Vdab lists 10 jobs per page, so (page - 1) * 10 items were scraped before
    // this page. Example: with resultsLimit = 25 and page = 3, itemsCounter is 20,
    // so only the first 5 items of this page are pushed.
    const itemsCounter = (page - 1) * 10;
    const resultsCounter = itemsCounter + items.length;

    await Dataset.pushData(items.slice(0, resultsLimit && resultsCounter > resultsLimit ? resultsLimit - itemsCounter : undefined));

    // Total number of jobs reported by the site, with dot thousands separators
    // removed, e.g. "1.234" -> 1234.
    const counter = parseInt($('span.amount-of-jobs.desktop-block > strong').text().replace(/\./g, ''), 10) || 0;

    // Enqueue the next page while the site still has more jobs and the
    // user-defined limit (required for paging) has not been reached.
    if (!(resultsCounter >= counter) && resultsLimit && !(resultsCounter >= resultsLimit)) {
        const pagedUrl = new URL(url);
        pagedUrl.searchParams.set('pageNumber', page + 1);
        await crawler.requestQueue.addRequest({
            url: pagedUrl.toString(),
            userData: {
                page: page + 1,
            },
        });
    } else {
        log.info(`[DONE]: ${resultsCounter} job(s) out of ${counter} at ${url}`);
    }
};
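
Each dataset item produced by handleSearch therefore has the following shape (illustrative values):

{
    "source_url": "https://www.vdab.be/vindeenjob/vacatures/...",
    "job_title": "Boekhouder",
    "advertiser_name": "Example BV",
    "advertiser_location": "Gent",
    "full_text": "...",
    "searchUrl": "https://www.vdab.be/vindeenjob/jobs/financieel?sort=1"
}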

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
dist
node_modules
apify_storage
storage

Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Run the image.
CMD npm start --silent
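
To sanity-check the image locally (illustrative commands; the tag name is arbitrary):

docker build -t vdab-be-jobs .
docker run vdab-be-jobs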

INPUT_SCHEMA.json

{
    "title": "Input schema",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Add Vdab.be job category URLs you want to scrape",
            "type": "array",
            "description": "Add one or more job categories with optional sorting and filtering",
            "editor": "stringList",
            "placeholderValue": "URL",
            "prefill": ["https://www.vdab.be/vindeenjob/jobs/financieel?sort=1"],
            "patternValue": "https:\\/\\/(www\\.)?vdab\\.be\\/.+",
            "uniqueItems": true
        },
        "resultsLimit": {
            "title": "Max results",
            "type": "integer",
            "description": "How many jobs you want to scrape from each category URL",
            "editor": "number",
            "unit": "per start URL",
            "default": 100
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "A proxy server is required to run this actor!",
            "prefill": {
                "useApifyProxy": true
            },
            "editor": "proxy",
            "sectionCaption": "Proxy configuration",
            "sectionDescription": "Select your proxy here."
        }
    }
}
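
The patternValue above restricts start URLs to the vdab.be domain. A quick standalone check of what it accepts (not part of the actor; the regex is the schema value with its JSON escaping removed):

import assert from 'node:assert';

const pattern = /https:\/\/(www\.)?vdab\.be\/.+/;

assert.ok(pattern.test('https://www.vdab.be/vindeenjob/jobs/financieel?sort=1'));
assert.ok(pattern.test('https://vdab.be/jobs'));
assert.ok(!pattern.test('https://example.com/jobs'));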

package.json

{
    "name": "crawlee-cheerio-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify actor.",
    "engines": {
        "node": ">=16.0.0"
    },
    "dependencies": {
        "apify": "^3.0.0",
        "crawlee": "^3.0.0"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.3.1",
        "eslint": "^8.20.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
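
The scripts section gives the local entry points (standard npm usage):

npm install
npm start        # node src/main.js
npm run lint     # eslint over ./src with .js/.jsx extensions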