2GIS Company Card parser

Under maintenance

Developed by Daniil Gerasimenko, maintained by Community

Scrapes basic information from 2GIS company pages.

Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 2
Runs succeeded: >99%
Response time: 7.2 hours
Last modified: 3 months ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using the Docker layer cache.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent
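To smoke-test the image before deploying, a plain Docker build and run from the repository root should work; the 2gis-parser tag below is an arbitrary example, and since src/main.js calls Actor.createProxyConfiguration(), a local run will likely also need Apify Proxy credentials (e.g. APIFY_PROXY_PASSWORD) passed through:

# Illustrative local build and run; the image tag is arbitrary.
docker build -f .actor/Dockerfile -t 2gis-parser .
docker run --rm -e APIFY_PROXY_PASSWORD 2gis-parser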

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Project Cheerio Crawler Javascript",
    "description": "Crawlee and Cheerio project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-cheerio"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "CheerioCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with.",
            "editor": "requestListSources",
            "prefill": [
                {
                    "url": "https://2gis.ru/moscow/firm/70000001023063516"
                }
            ]
        },
        "maxRequestsPerCrawl": {
            "title": "Max Requests per Crawl",
            "type": "integer",
            "description": "Maximum number of requests that can be made by this crawler.",
            "default": 100
        }
    }
}
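For reference, a concrete run input accepted by this schema might look like the following; the URL is the schema's prefill, and the request limit of 50 is just an illustrative value:

{
    "startUrls": [
        { "url": "https://2gis.ru/moscow/firm/70000001023063516" }
    ],
    "maxRequestsPerCrawl": 50
}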

src/main.js

// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
import { Actor } from 'apify';
// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
import { CheerioCrawler } from 'crawlee';
// This is an ESM project, and as such, it requires you to specify extensions in your relative imports.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
// import { router } from './routes.js';

// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// The structure of the input is defined in input_schema.json.
const {
    startUrls = ['https://crawlee.dev'],
    maxRequestsPerCrawl = 100,
} = await Actor.getInput() ?? {};

const proxyConfiguration = await Actor.createProxyConfiguration();

const crawler = new CheerioCrawler({
    proxyConfiguration,
    maxRequestsPerCrawl,
    async requestHandler({ request, $, log }) {
        // Extract the title from the page.
        const title = $('title').text();
        log.info(title, { url: request.loadedUrl });

        // 2GIS routes a company's website link through a link.2gis redirect, with the
        // plain domain as the anchor text. Keep only anchors that look like a domain
        // (contain a dot) and skip hh.ru job-board links.
        const results = [];
        $('a[href*="link.2gis"]').each((index, el) => {
            const innerText = $(el).text();

            if (innerText.includes('.') && !innerText.includes('hh')) {
                results.push({ site: innerText, title });
            }
        });

        // Await the dataset write instead of firing it from a synchronous callback.
        await Actor.pushData(results);
    },
});

await crawler.run(startUrls);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
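Each anchor that passes the filter becomes one dataset item, so a run over a single company card yields records shaped roughly like this (field names come from src/main.js; the values are illustrative, not taken from a real run):

{
    "site": "company-site.ru",
    "title": "Company name | Moscow | 2GIS"
}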

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
    "name": "crawlee-cheerio-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.2.6",
        "crawlee": "^3.11.5",
        "lodash": "^4.17.21"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

Pricing

Pricing model: Pay per usage

This Actor is priced per platform usage: the Actor itself is free, and you pay only for the Apify platform resources its runs consume.