RealityMix.cz Scraper avatar

RealityMix.cz Scraper

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
RealityMix.cz Scraper

RealityMix.cz Scraper

leskovde/realitymix-scraper

A web scraper for the realitymix.cz real estate portal. The scraper looks through the listings that match a filter specified in the actor's input. The output is every available listing that matches that filter.

src/consts.js

1module.exports = {
2    STATES: {
3        "Praha" : 19,
4        "Jihočeský kraj" : 27,
5        "Jihomoravský kraj" : 60,
6        "Karlovarský kraj" : 51, 
7        "Královéhradecký kraj" : 78, 
8        "Liberecký kraj": 86, 
9        "Moravskoslezský kraj" : 94, 
10        "Olomoucký kraj" : 124, 
11        "Pardubický kraj" : 132, 
12        "Plzeňský kraj" : 141, 
13        "Středočeský kraj" : 116, 
14        "Ústecký kraj" : 108, 
15        "Vysočina" : 35, 
16        "Zlínský kraj" : 43
17    },
18    LAYOUTS: {
19        "Garsoniéra" : 1, 
20        "1+kk" : 2, 
21        "1+1" : 9, 
22        "2+kk" : 10, 
23        "2+1" : 3, 
24        "3+kk" : 4, 
25        "3+1" : 11, 
26        "4+kk" : 5, 
27        "4+1" : 12, 
28        "5+kk" : 6, 
29        "5+1" : 14, 
30        "6+kk" : 16,
31        "6+1" : 13, 
32        "7+kk" : 8, 
33        "7+1" : 17, 
34        "Atypický" : 7,
35        "Jiný" : 15
36    },
37    ADVERTTYPES : {
38        "Prodej" : 1,
39        "Pronájem" : 2,
40        "Vše" : 0
41    },
42    BUILDINGTYPES : {
43        "Byty" : 4,
44        "Domy" : 6,
45        "Chaty" : 10,
46        "Pozemky" : 3,
47    },
48    OWNERSHIPTYPES : {
49        "osobní vlastnictví" : 1, 
50        "družstevní vlastnictví" : 2, 
51        "jiné vlastnictví" : 3
52    }
53};

src/routes.js

1const Apify = require("apify");
2
3const {
4    utils: { log },
5} = Apify;
6
7exports.handleStart = async ({ request, page, crawler }) => {
8    log.info("[START]: Start.");
9
10    const lastPageNumber = await page.$$eval("li.paginator__list-item", $list => {
11        const classContainer = Array.from($list)
12        const item = classContainer[classContainer.length - 2];
13
14        return parseInt(item.querySelector('a').textContent);
15    });
16
17    const pageUrl = page.url();
18
19    for (let index = 1; index < lastPageNumber; index++) {
20        const link = pageUrl.concat(`&stranka=${index}`);
21        const request = {
22            userData: {
23                label: "LIST",
24            },
25            url: link,
26        };
27        log.info(`Adding listing page url: ${request.url}`);
28        await crawler.requestQueue.addRequest(request);
29    }
30
31    log.info("[START]: Listing pages pushed.");
32};
33
34exports.handleList = async ({ request, page, crawler }) => {
35    log.info("[LIST]: Start.");
36   
37    const listings = await page.$$eval(".advert-list-items__content", ($listing) => {
38            const items = [];
39            $listing.forEach(($item) => {
40                const link = $item.querySelector("h2 > a").getAttribute("href");
41                items.push({
42                    userData: {
43                        label: "DETAIL",
44                    },
45                    url: link,
46                });
47            });
48            return items;
49        });
50
51    log.info("[LIST]: Scraped.");
52
53    for (let index = 0; index < listings.length; index++) {
54        const request = listings[index];
55        await crawler.requestQueue.addRequest(request);
56    }
57
58    log.info("[LIST]: Listings pushed.");
59};
60
61exports.handleDetail = async ({ request, page }) => {
62    log.info("[DETAIL]: Getting detail info.");
63
64    let {
65        buildingType,
66        location,
67        price 
68    } = await page.$$eval(".advert-detail-fixed-top__content-info", $list => {
69        const item = Array.from($list)[0];
70        
71        const headerText = item.querySelector("h4").textContent.trim();
72        const buildingType = headerText.split(",")[0].trim();
73        const location = item.querySelector("p").textContent.trim();
74        
75        const highlight = item.querySelector("strong");
76        const price = highlight.querySelector("span").textContent.trim();
77
78        flag = true;
79
80        return {
81            "buildingType" : buildingType,
82            "location" : location,
83            "price" : price
84        }
85    });
86
87    let mainImage = await page.$$eval(".gallery__main-img-inner", $list => {
88        const item = Array.from($list)[0];
89
90        const imageUrl = item.querySelector("img").src;
91
92        return { "imageUrl" : imageUrl }
93    });
94
95    let smallImages = await page.$$eval(".gallery__item--image", $list => {
96        const items = [];
97
98        $list.forEach($item => {
99            const imageContainer = $item.querySelector("a");
100            const imageUrl = imageContainer.querySelector("img").src;
101
102            items.push({ "imageUrl" : imageUrl });
103        });
104        
105        return items;
106    });
107
108    let images = [ mainImage ].concat(smallImages);
109
110    let propertyElList = await page.$$eval(".detail-information__data-item", $list => {
111        const items = [];
112
113        $list.forEach($item => {
114            const key = $item.querySelector("span:nth-child(1)").textContent.trim();
115            const value = $item.querySelector("span:nth-child(2)").textContent.trim();
116
117            items.push({ "key": key, "value" : value });
118        });
119        
120        return items;
121    });
122
123    let {
124        description 
125    } = await page.$$eval(".advert-description__text-inner-inner", $list => {
126        const item = Array.from($list)[0];
127
128        return { "description" : item.textContent.trim() };
129    });
130
131    await Apify.pushData({buildingType, location, price, "images" : images, "properties" : propertyElList, description });
132
133    log.info("[DETAIL]: Detail info done.");
134};

src/tools.js

1const Apify = require('apify');
2const { STATES, LAYOUTS, ADVERTTYPES, BUILDINGTYPES, OWNERSHIPTYPES } = require('./consts');
3
4const { utils: { log } } = Apify;
5
/**
 * Reads the actor input, validates the enumerated fields against the lookup
 * tables from ./consts and converts them to their numeric codes. Optional
 * numeric range fields that were not provided are turned into empty strings
 * so they can be interpolated into the search URL as empty parameters.
 *
 * @returns {Promise<object>} Object with `state`, `priceStart`, `priceEnd`,
 *     `layout`, `advertType`, `buildingType`, `areaFrom`, `areaTo`,
 *     `levelFrom`, `levelTo` and `ownershipType`.
 * @throws {Error} When the input is missing or when any enumerated field is
 *     not one of the names known to the corresponding lookup table.
 */
const getAndValidateInput = async () => {
    const input = await Apify.getInput();

    if (input === null || input === undefined || typeof input !== "object") {
        throw new Error("Missing actor input.");
    }

    // Looks `name` up among the table's OWN keys only, so inherited names such
    // as "toString" or "constructor" are rejected as invalid.
    const toCode = (table, name, errorMessage) => {
        if (Object.prototype.hasOwnProperty.call(table, name)) {
            return table[name];
        }
        throw new Error(errorMessage);
    };

    // Missing optional values (undefined or null) become an empty URL parameter.
    const orEmpty = (value) => ((value === undefined || value === null) ? "" : value);

    // Validation order matches the order in which errors were always reported.
    const state = toCode(STATES, input.state, "Invalid state.");
    const layout = toCode(LAYOUTS, input.layout, "Invalid layout.");
    const advertType = toCode(ADVERTTYPES, input.advertType, "Invalid advert type.");
    const buildingType = toCode(BUILDINGTYPES, input.buildingType, "Invalid building type.");
    const ownershipType = toCode(OWNERSHIPTYPES, input.ownershipType, "Invalid ownership type.");

    const validated = {
        state,
        priceStart: orEmpty(input.priceStart),
        priceEnd: orEmpty(input.priceEnd),
        layout,
        advertType,
        buildingType,
        areaFrom: orEmpty(input.areaFrom),
        areaTo: orEmpty(input.areaTo),
        levelFrom: orEmpty(input.levelFrom),
        levelTo: orEmpty(input.levelTo),
        ownershipType,
    };

    log.info("----- Input overview -----");
    // Keys are logged in the insertion order of `validated`.
    Object.entries(validated).forEach(([name, value]) => {
        log.info(`Search ${name}: ${value}`);
    });
    log.info("----- End of input overview -----");

    return validated;
};
98
99module.exports = {
100    getAndValidateInput
101};

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify"
3}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4node_modules
5
6apify_storage

Dockerfile

1# First, specify the base Docker image. You can read more about
2# the available images at https://sdk.apify.com/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-puppeteer-chrome:16
5
6# Second, copy just package.json and package-lock.json since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Since we do this after NPM install, quick build will be really fast
24# for most source file changes.
25COPY . ./
26
27# Optionally, specify how to launch the source code of your actor.
28# By default, Apify's base Docker images define the CMD instruction
29# that runs the Node.js source code using the command specified
30# in the "scripts.start" section of the package.json file.
31# In short, the instruction looks something like this:
32#
33# CMD npm start

INPUT_SCHEMA.json

1{
2  "title": "RealityMix properties",
3  "description": "Properties for flat scraping.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "state": {
8      "title": "State",
9      "type": "string",
10      "description": "The state in Czech",
11      "editor": "select",
12      "default": "Praha",
13      "enum": ["Praha", "Jihočeský kraj", "Jihomoravský kraj", "Karlovarský kraj", "Královéhradecký kraj", "Liberecký kraj", "Moravskoslezský kraj", "Olomoucký kraj", "Pardubický kraj", "Plzeňský kraj", "Středočeský kraj", "Ústecký kraj", "Vysočina", "Zlínský kraj"]
14    },
15    "priceStart": {
16      "title": "PriceRangeStart",
17      "type": "integer",
18      "description": "The lower bound of the price range.",
19      "editor": "number"
20    },
21    "priceEnd": {
22      "title": "PriceRangeEnd",
23      "type": "integer",
24      "description": "The upper bound of the price range.",
25      "editor": "number"
26    },
27    "layout": {
28      "title": "FlatLayout",
29      "type": "string",
30      "description": "The layout of the property.",
31      "editor": "select",
32      "default": "1+kk",
33      "enum": ["1+kk", "1+1", "2+kk", "2+1", "3+kk", "3+1", "4+kk", "5+kk", "5+1", "6+kk", "6+1", "7+kk", "7+1", "Atypický", "Jiný"]
34    },
35    "advertType": {
36      "title": "AdvertisementType",
37      "type": "string",
38      "description": "The type of the advertisement",
39      "editor": "select",
40      "default": "Vše",
41      "enum": ["Prodej", "Pronájem", "Vše"]
42    },
43    "buildingType": {
44      "title": "BuildingType",
45      "type": "string",
46      "description": "The type of the building",
47      "editor": "select",
48      "default": "Byty",
49      "enum": ["Byty"]
50    },
51    "areaFrom": {
52      "title": "AreaRangeFrom",
53      "type": "integer",
54      "description": "The lower bound of the m2 area.",
55      "editor": "number"
56    },
57    "areaTo": {
58      "title": "AreaRangeTo",
59      "type": "integer",
60      "description": "The upper bound of the m2 area.",
61      "editor": "number"
62    },
63    "levelFrom": {
64      "title": "FlatLevelRangeFrom",
65      "type": "integer",
66      "description": "The lower bound of the level on which the flat should be located.",
67      "editor": "number"
68    },
69    "levelTo": {
70      "title": "FlatLevelRangeTo",
71      "type": "integer",
72      "description": "The upper bound of the level on which the flat should be located.",
73      "editor": "number"
74    },
75    "ownershipType": {
76      "title": "OwnershipType",
77      "type": "string",
78      "description": "The type of the ownership.",
79      "editor": "select",
80      "default": "osobní vlastnictví",
81      "enum": ["osobní vlastnictví", "družstevní vlastnictví", "jiné vlastnictví"]
82    }
83  },
84  "required": [
85    "state",
86    "layout",
87    "advertType",
88    "buildingType",
89    "ownershipType"
90  ]
91}

apify.json

1{
2	"name": "realitymix-scraper",
3	"version": "0.0",
4	"buildTag": "latest",
5	"env": null,
6	"template": "project_puppeteer_crawler"
7}

main.js

1const Apify = require('apify');
2const { getAndValidateInput } = require('./src/tools');
3const { handleStart, handleList, handleDetail } = require('./src/routes');
4
5const { utils: { log } } = Apify;
6
// Actor entry point: builds the search URL from the validated input and runs a
// Puppeteer crawler that routes every opened page to its handler by label.
Apify.main(async () => {
    const filters = await getAndValidateInput();

    // [form field, key suffix, value] triples. The order and the empty
    // parameters are kept as-is so the generated URL stays stable.
    const queryFields = [
        ['adresa_kraj_id', '[]', filters.state],
        ['adresa_obec_id', '', ''],
        ['cena_mena', '', ''],
        ['cena_normalizovana__from', '', filters.priceStart],
        ['cena_normalizovana__to', '', filters.priceEnd],
        ['dispozice', '[]', filters.layout],
        ['exclusive', '', ''],
        ['fk_rk', '', ''],
        ['inzerat_typ', '', filters.advertType],
        ['nemovitost_typ', '[]', filters.buildingType],
        ['plocha__from', '', filters.areaFrom],
        ['plocha__to', '', filters.areaTo],
        ['podlazi_cislo__from', '', filters.levelFrom],
        ['podlazi_cislo__to', '', filters.levelTo],
        ['projekt_id', '', ''],
        ['search_in_city', '', ''],
        ['search_in_text', '', ''],
        ['stari_inzeratu', '', ''],
        ['stav_objektu', '', ''],
        ['top_nabidky', '', ''],
        ['vlastnictvi', '[]', filters.ownershipType],
    ];

    // Each key is emitted as `form%5B<field>%5D` (a percent-encoded
    // "form[<field>]") followed by the optional literal "[]" suffix.
    const queryString = queryFields
        .map(([field, suffix, value]) => `form%5B${field}%5D${suffix}=${value}`)
        .join('&');
    const startUrl = `https://realitymix.cz/vypis-nabidek/?${queryString}`;

    log.info(`Starting at url: ${startUrl}`);

    const requestList = await Apify.openRequestList('start-urls', [startUrl]);
    const requestQueue = await Apify.openRequestQueue();
    const proxyConfiguration = await Apify.createProxyConfiguration();

    // Requests labelled LIST or DETAIL go to their handlers; anything else
    // (including the unlabelled start URL) is treated as the start page.
    const routeByLabel = async (context) => {
        const { request } = context;
        const { url } = request;
        const { label } = request.userData;
        log.info('Page opened.', { label, url });

        if (label === 'LIST') {
            return handleList(context);
        }
        if (label === 'DETAIL') {
            return handleDetail(context);
        }
        return handleStart(context);
    };

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        requestQueue,
        proxyConfiguration,
        launchContext: {
            useChrome: false,
            stealth: true,
        },
        handlePageFunction: routeByLabel,
    });

    log.info('Starting the crawl.');
    await crawler.run();
    log.info('Crawl finished.');
});

package.json

1{
2	"name": "realitymix-scraper",
3	"version": "0.0.1",
4	"description": "This is a boilerplate of an Apify actor.",
5	"dependencies": {
6		"apify": "^2.0.7",
7		"puppeteer": "*"
8	},
9	"devDependencies": {
10		"@apify/eslint-config": "^0.1.3",
11		"eslint": "^7.0.0"
12	},
13	"scripts": {
14		"start": "node main.js",
15		"lint": "./node_modules/.bin/eslint ./src --ext .js,.jsx",
16		"lint:fix": "./node_modules/.bin/eslint ./src --ext .js,.jsx --fix",
17		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
18	},
19	"author": "It's not you it's me",
20	"license": "ISC"
21}
Developer
Maintained by Community