Actor picture

RealityMix.cz Scraper

leskovde/realitymix-scraper

A web scraper for the realitymix.cz real estate portal. The scraper looks through listings with an applied filter. The filter is specified in the actor's input. The output is all available listings with the filter applied.

No credit card required

Author's avatarDenis Leskovar
  • Modified
  • Users10
  • Runs211
Actor picture
RealityMix.cz Scraper

src/consts.js

// Lookup tables that translate the Czech display names accepted in the actor
// input into the numeric identifiers that are substituted into the
// realitymix.cz search URL.
module.exports = {
    // Region name -> identifier (used for the `adresa_kraj_id` query parameter).
    STATES: {
        "Praha" : 19,
        "Jihočeský kraj" : 27,
        "Jihomoravský kraj" : 60,
        "Karlovarský kraj" : 51, 
        "Královéhradecký kraj" : 78, 
        "Liberecký kraj": 86, 
        "Moravskoslezský kraj" : 94, 
        "Olomoucký kraj" : 124, 
        "Pardubický kraj" : 132, 
        "Plzeňský kraj" : 141, 
        "Středočeský kraj" : 116, 
        "Ústecký kraj" : 108, 
        "Vysočina" : 35, 
        "Zlínský kraj" : 43
    },
    // Layout name -> identifier (used for the `dispozice` query parameter).
    // NOTE(review): the identifiers are not sequential; presumably they mirror
    // the portal's own numbering - confirm before changing any of them.
    LAYOUTS: {
        "Garsoniéra" : 1, 
        "1+kk" : 2, 
        "1+1" : 9, 
        "2+kk" : 10, 
        "2+1" : 3, 
        "3+kk" : 4, 
        "3+1" : 11, 
        "4+kk" : 5, 
        "4+1" : 12, 
        "5+kk" : 6, 
        "5+1" : 14, 
        "6+kk" : 16,
        "6+1" : 13, 
        "7+kk" : 8, 
        "7+1" : 17, 
        "Atypický" : 7,
        "Jiný" : 15
    },
    // Advertisement type -> identifier (used for the `inzerat_typ` query parameter).
    ADVERTTYPES : {
        "Prodej" : 1,
        "Pronájem" : 2,
        "Vše" : 0
    },
    // Building type -> identifier (used for the `nemovitost_typ` query parameter).
    BUILDINGTYPES : {
        "Byty" : 4,
        "Domy" : 6,
        "Chaty" : 10,
        "Pozemky" : 3,
    },
    // Ownership type -> identifier (used for the `vlastnictvi` query parameter).
    OWNERSHIPTYPES : {
        "osobní vlastnictví" : 1, 
        "družstevní vlastnictví" : 2, 
        "jiné vlastnictví" : 3
    }
};

src/routes.js

This file is 134 lines long. Only the first 50 are shown. Show all

const Apify = require("apify");

const {
    utils: { log },
} = Apify;

exports.handleStart = async ({ request, page, crawler }) => {
    log.info("[START]: Start.");

    const lastPageNumber = await page.$$eval("li.paginator__list-item", $list => {
        const classContainer = Array.from($list)
        const item = classContainer[classContainer.length - 2];

        return parseInt(item.querySelector('a').textContent);
    });

    const pageUrl = page.url();

    for (let index = 1; index < lastPageNumber; index++) {
        const link = pageUrl.concat(`&stranka=${index}`);
        const request = {
            userData: {
                label: "LIST",
            },
            url: link,
        };
        log.info(`Adding listing page url: ${request.url}`);
        await crawler.requestQueue.addRequest(request);
    }

    log.info("[START]: Listing pages pushed.");
};

exports.handleList = async ({ request, page, crawler }) => {
    log.info("[LIST]: Start.");
   
    const listings = await page.$$eval(".advert-list-items__content", ($listing) => {
            const items = [];
            $listing.forEach(($item) => {
                const link = $item.querySelector("h2 > a").getAttribute("href");
                items.push({
                    userData: {
                        label: "DETAIL",
                    },
                    url: link,
                });
            });
            return items;
        });

src/tools.js

const Apify = require('apify');
const { STATES, LAYOUTS, ADVERTTYPES, BUILDINGTYPES, OWNERSHIPTYPES } = require('./consts');

const { utils: { log } } = Apify;

/**
 * Reads the actor input, translates the textual filter values into the numeric
 * identifiers defined in `./consts`, and normalizes the optional numeric
 * bounds so they can be interpolated into the search URL.
 *
 * @returns {Promise<object>} Object with the keys `state`, `priceStart`,
 *     `priceEnd`, `layout`, `advertType`, `buildingType`, `areaFrom`, `areaTo`,
 *     `levelFrom`, `levelTo` and `ownershipType`. The five enumerated filters
 *     hold numeric identifiers; a missing optional bound is an empty string.
 * @throws {Error} When no input object is available or when one of the
 *     enumerated filters is not a known value.
 */
const getAndValidateInput = async () => {
    const input = await Apify.getInput();

    // Fail with a readable message instead of an opaque destructuring TypeError.
    if (input === null || typeof input !== "object") {
        throw new Error("Invalid input: an input object is required.");
    }

    // Own-property check only: the `in` operator would also accept inherited
    // keys such as "constructor" or "toString" and return a function.
    const lookup = (table, key, message) => {
        if (Object.prototype.hasOwnProperty.call(table, key)) {
            return table[key];
        }
        throw new Error(message);
    };

    // Optional bounds: both undefined and null mean "not set", which the URL
    // expresses as an empty parameter value.
    const orEmpty = (value) => ((value === undefined || value === null) ? "" : value);

    // Validation order (and therefore which error is reported first) is
    // state, layout, advert type, building type, ownership type.
    const state = lookup(STATES, input.state, "Invalid state.");
    const layout = lookup(LAYOUTS, input.layout, "Invalid layout.");
    const advertType = lookup(ADVERTTYPES, input.advertType, "Invalid advert type.");
    const buildingType = lookup(BUILDINGTYPES, input.buildingType, "Invalid building type.");
    const ownershipType = lookup(OWNERSHIPTYPES, input.ownershipType, "Invalid ownership type.");

    // Key order here defines the order of the log overview below.
    const filters = {
        state,
        priceStart: orEmpty(input.priceStart),
        priceEnd: orEmpty(input.priceEnd),
        layout,
        advertType,
        buildingType,
        areaFrom: orEmpty(input.areaFrom),
        areaTo: orEmpty(input.areaTo),
        levelFrom: orEmpty(input.levelFrom),
        levelTo: orEmpty(input.levelTo),
        ownershipType,
    };

    log.info("----- Input overview -----");
    for (const [name, value] of Object.entries(filters)) {
        log.info(`Search ${name}: ${value}`);
    }
    log.info("----- End of input overview -----");

    return filters;
};

module.exports = {
    getAndValidateInput
};

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify"
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
node_modules

apify_storage

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:16

# Second, copy just package.json and package-lock.json, since they should be
# the only files that affect "npm install" in the next step. Copying them
# separately speeds up the build.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Keep the log output short, but print the dependency
# tree and the Node.js/NPM versions for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since this happens after the NPM install, a rebuild is fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
  "title": "RealityMix properties",
  "description": "Properties for flat scraping.",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "state": {
      "title": "State",
      "type": "string",
      "description": "The state in Czech",
      "editor": "select",
      "default": "Praha",
      "enum": ["Praha", "Jihočeský kraj", "Jihomoravský kraj", "Karlovarský kraj", "Královéhradecký kraj", "Liberecký kraj", "Moravskoslezský kraj", "Olomoucký kraj", "Pardubický kraj", "Plzeňský kraj", "Středočeský kraj", "Ústecký kraj", "Vysočina", "Zlínský kraj"]
    },
    "priceStart": {
      "title": "PriceRangeStart",
      "type": "integer",
      "description": "The lower bound of the price range.",
      "editor": "number"
    },
    "priceEnd": {
      "title": "PriceRangeEnd",
      "type": "integer",
      "description": "The upper bound of the price range.",
      "editor": "number"
    },
    "layout": {
      "title": "FlatLayout",
      "type": "string",
      "description": "The layout of the property.",
      "editor": "select",
      "default": "1+kk",
      "enum": ["Garsoniéra", "1+kk", "1+1", "2+kk", "2+1", "3+kk", "3+1", "4+kk", "4+1", "5+kk", "5+1", "6+kk", "6+1", "7+kk", "7+1", "Atypický", "Jiný"]
    },
    "advertType": {
      "title": "AdvertisementType",
      "type": "string",
      "description": "The type of the advertisement",
      "editor": "select",
      "default": "Vše",
      "enum": ["Prodej", "Pronájem", "Vše"]
    },
    "buildingType": {
      "title": "BuildingType",
      "type": "string",
      "description": "The type of the building",
      "editor": "select",
      "default": "Byty",
      "enum": ["Byty"]
    },
    "areaFrom": {
      "title": "AreaRangeFrom",
      "type": "integer",
      "description": "The lower bound of the m2 area.",
      "editor": "number"
    },
    "areaTo": {
      "title": "AreaRangeTo",
      "type": "integer",
      "description": "The upper bound of the m2 area.",
      "editor": "number"
    },
    "levelFrom": {
      "title": "FlatLevelRangeFrom",
      "type": "integer",
      "description": "The lower bound of the level on which the flat should be located.",
      "editor": "number"
    },
    "levelTo": {
      "title": "FlatLevelRangeTo",
      "type": "integer",
      "description": "The upper bound of the level on which the flat should be located.",
      "editor": "number"
    },
    "ownershipType": {
      "title": "OwnershipType",
      "type": "string",
      "description": "The type of the ownership.",
      "editor": "select",
      "default": "osobní vlastnictví",
      "enum": ["osobní vlastnictví", "družstevní vlastnictví", "jiné vlastnictví"]
    }
  },
  "required": [
    "state",
    "layout",
    "advertType",
    "buildingType",
    "ownershipType"
  ]
}

README.md

# RealityMix Listings Scraper

Puppeteer-based real estate advertisement scraper.

## Usage

Several input properties support a full text specification of their value. Once the required input is specified, the actor constructs a URL with the specified filter. The linked page features 20 listings. The actor switches pages during the search, which allows it to collect more than 20 listings in a single run. The Actor uses Puppeteer; the minimum required memory for running is 2048 MB.

## Input

Since some properties support full text values, we provide a comprehensive list of supported values.

| Property name     | Possible values        | Required           |
| ----------------- | ---------------------- | ------------------ |
| **state**         | Praha                  | **yes**            |
|                   | Jihočeský kraj         |                    |
|                   | Jihomoravský kraj      |                    |
|                   | Karlovarský kraj       |                    |
|                   | Královéhradecký kraj   |                    |
|                   | Liberecký kraj         |                    |
|                   | Moravskoslezský kraj   |                    |
|                   | Olomoucký kraj         |                    |
|                   | Pardubický kraj        |                    |
|                   | Plzeňský kraj          |                    |
|                   | Středočeský kraj       |                    |
|                   | Ústecký kraj           |                    |
|                   | Vysočina               |                    |
|                   | Zlínský kraj           |                    |
| **layout**        | Garsoniéra             | **yes**            |
|                   | 1+kk                   |                    |
|                   | 1+1                    |                    |
|                   | 2+kk                   |                    |
|                   | 2+1                    |                    |
|                   | 3+kk                   |                    |
|                   | 3+1                    |                    |
|                   | 4+kk                   |                    |
|                   | 4+1                    |                    |
|                   | 5+kk                   |                    |
|                   | 5+1                    |                    |
|                   | 6+kk                   |                    |
|                   | 6+1                    |                    |
|                   | 7+kk                   |                    |
|                   | 7+1                    |                    |
|                   | Atypický               |                    |
|                   | Jiný                   |                    |
| **advertType**    | Prodej                 | **yes**            |
|                   | Pronájem               |                    |
|                   | Vše                    |                    |
| **buildingType**  | Byty                   | **yes**            |
|                   | Domy                   |                    |
|                   | Chaty                  |                    |
|                   | Pozemky                |                    |
| **ownershipType** | osobní vlastnictví     | **yes**            |
|                   | družstevní vlastnictví |                    |
|                   | jiné vlastnictví       |                    |
| **priceStart**    | 0, 1, ...              | **no**             |
| **priceEnd**      | 0, 1, ...              | **no**             |
| **areaFrom**      | 0, 1, ...              | **no**             |
| **areaTo**        | 0, 1, ...              | **no**             |
| **levelFrom**     | 0, 1, ...              | **no**             |
| **levelTo**       | 0, 1, ...              | **no**             |

## Output

The output of the Actor features details of each listing. The detail contains the type of the advertised building, its location, and its price. URLs to provided images are listed in the result as well. Additionally, the output contains the content of the listing's overview table. Moreover, the full text description is also included.

apify.json

{
	"name": "realitymix-scraper",
	"version": "0.0",
	"buildTag": "latest",
	"env": null,
	"template": "project_puppeteer_crawler"
}

main.js

const Apify = require('apify');
const { getAndValidateInput } = require('./src/tools');
const { handleStart, handleList, handleDetail } = require('./src/routes');

const { utils: { log } } = Apify;

// Actor entry point: builds the filtered search URL from the validated input
// and crawls it, routing every opened page to the handler for its label.
Apify.main(async () => {
    const filters = await getAndValidateInput();

    // Query parameters in the exact order the search URL expects them;
    // parameters without a corresponding filter are sent with an empty value.
    const queryParts = [
        `form%5Badresa_kraj_id%5D[]=${filters.state}`,
        'form%5Badresa_obec_id%5D=',
        'form%5Bcena_mena%5D=',
        `form%5Bcena_normalizovana__from%5D=${filters.priceStart}`,
        `form%5Bcena_normalizovana__to%5D=${filters.priceEnd}`,
        `form%5Bdispozice%5D[]=${filters.layout}`,
        'form%5Bexclusive%5D=',
        'form%5Bfk_rk%5D=',
        `form%5Binzerat_typ%5D=${filters.advertType}`,
        `form%5Bnemovitost_typ%5D[]=${filters.buildingType}`,
        `form%5Bplocha__from%5D=${filters.areaFrom}`,
        `form%5Bplocha__to%5D=${filters.areaTo}`,
        `form%5Bpodlazi_cislo__from%5D=${filters.levelFrom}`,
        `form%5Bpodlazi_cislo__to%5D=${filters.levelTo}`,
        'form%5Bprojekt_id%5D=',
        'form%5Bsearch_in_city%5D=',
        'form%5Bsearch_in_text%5D=',
        'form%5Bstari_inzeratu%5D=',
        'form%5Bstav_objektu%5D=',
        'form%5Btop_nabidky%5D=',
        `form%5Bvlastnictvi%5D[]=${filters.ownershipType}`,
    ];
    const startUrl = `https://realitymix.cz/vypis-nabidek/?${queryParts.join('&')}`;

    log.info(`Starting at url: ${startUrl}`);

    const requestList = await Apify.openRequestList('start-urls', [startUrl]);
    const requestQueue = await Apify.openRequestQueue();
    const proxyConfiguration = await Apify.createProxyConfiguration();

    // Requests without a known label (i.e. the start URL) go to handleStart.
    const labelledRoutes = new Map([
        ['LIST', (context) => handleList(context)],
        ['DETAIL', (context) => handleDetail(context)],
    ]);

    const routePage = async (context) => {
        const { request } = context;
        const { url } = request;
        const { label } = request.userData;
        log.info('Page opened.', { label, url });

        if (labelledRoutes.has(label)) {
            return labelledRoutes.get(label)(context);
        }
        return handleStart(context);
    };

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        requestQueue,
        proxyConfiguration,
        launchContext: {
            useChrome: false,
            stealth: true,
        },
        handlePageFunction: routePage,
    });

    log.info('Starting the crawl.');
    await crawler.run();
    log.info('Crawl finished.');
});

package-lock.json

This file is 3393 lines long. Only the first 50 are shown. Show all

{
	"name": "realitymix-scraper",
	"version": "0.0.1",
	"lockfileVersion": 1,
	"requires": true,
	"dependencies": {
		"@apify/consts": {
			"version": "1.4.0",
			"resolved": "https://registry.npmjs.org/@apify/consts/-/consts-1.4.0.tgz",
			"integrity": "sha512-OcysWtfs+NOVlGHdIIDci7iOhr0ZDyCoZcsG/VpfJCBwna+w7AVfxX3TmYdFgrJCuq7KM7y/8BYQ+azJmceALQ=="
		},
		"@apify/datastructures": {
			"version": "1.0.1",
			"resolved": "https://registry.npmjs.org/@apify/datastructures/-/datastructures-1.0.1.tgz",
			"integrity": "sha512-AgnrfMjzDph+Te5WGNnIsz3+dJM7v/Sqo82nWwSqca292paRotUhORXr9Ik+d0yurC5LutDAhcvu8VZ8SfANGg=="
		},
		"@apify/eslint-config": {
			"version": "0.1.4",
			"resolved": "https://registry.npmjs.org/@apify/eslint-config/-/eslint-config-0.1.4.tgz",
			"integrity": "sha512-sbEpFJk+drdTxRVRoL3Ou0h9pmfu/BAiAxZDH3ANHuF7NoprLV1tQvs3PRu+IsFhxIHihI/6znY19KnPOq1dpA==",
			"dev": true,
			"requires": {
				"eslint-config-airbnb": "^18.2.0",
				"eslint-config-airbnb-base": "^14.2.0",
				"eslint-import-resolver-typescript": "^2.2.1",
				"eslint-plugin-import": "^2.22.0",
				"eslint-plugin-jsx-a11y": "^6.2.3",
				"eslint-plugin-promise": "^4.2.1",
				"eslint-plugin-react": "^7.20.0",
				"eslint-plugin-react-hooks": "^4.1.0"
			}
		},
		"@apify/log": {
			"version": "1.1.4",
			"resolved": "https://registry.npmjs.org/@apify/log/-/log-1.1.4.tgz",
			"integrity": "sha512-NkuEAJSvywASI+8tsEIuctEgPF7GObfIqNfP0w6me/aIGaMuFVleGy+UF7FLeJbPRAYw6uAgtciiWcE10mfbDQ==",
			"requires": {
				"@apify/consts": "^1.4.0",
				"ansi-colors": "^4.1.1"
			}
		},
		"@apify/ps-tree": {
			"version": "1.1.4",
			"resolved": "https://registry.npmjs.org/@apify/ps-tree/-/ps-tree-1.1.4.tgz",
			"integrity": "sha512-zUMjF8hO82Wg4+NHW+plQZ/HSPqAeYt0Ejo8t28hShAMYRbIpLjdAh3Nts1PL1+3YdxlY9BxHvUCY+hXbxWYRg==",
			"requires": {
				"event-stream": "3.3.4"
			}
		},
		"@apify/storage-local": {

package.json

{
	"name": "realitymix-scraper",
	"version": "0.0.1",
	"description": "This is a boilerplate of an Apify actor.",
	"dependencies": {
		"apify": "^2.0.7",
		"puppeteer": "*"
	},
	"devDependencies": {
		"@apify/eslint-config": "^0.1.3",
		"eslint": "^7.0.0"
	},
	"scripts": {
		"start": "node main.js",
		"lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
		"lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
	},
	"author": "It's not you it's me",
	"license": "ISC"
}