Bazaraki Scrapper avatar
Bazaraki Scrapper

Pricing

Pay per usage

Go to Store
Bazaraki Scrapper

Bazaraki Scrapper

Developed by

Александр Штерн

Александр Штерн

Maintained by Community

Scrapes the first page of Bazaraki real-estate listings

0.0 (0)

Pricing

Pay per usage

0

Total users

4

Monthly users

4

Runs succeeded

>99%

Last modified

24 days ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# NOTE(review): src/main.js uses CheerioCrawler (plain HTTP, no browser);
# a lighter apify/actor-node image would likely suffice — confirm before switching.
FROM apify/actor-node-puppeteer-chrome:20
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
"actorSpecification": 1,
"name": "my-actor",
"title": "Project Puppeteer Crawler JavaScript",
"description": "Crawlee and Puppeteer project in JavaScript.",
"version": "0.0",
"meta": {
"templateId": "js-crawlee-puppeteer-chrome"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "PuppeteerCrawler Template",
"type": "object",
"schemaVersion": 1,
"properties": {
"startUrls": {
"title": "Start URLs",
"type": "array",
"description": "URLs to start with.",
"editor": "requestListSources",
"prefill": [
{
"url": "https://apify.com"
}
]
}
}
}

src/main.js

import { Actor } from 'apify';
import { CheerioCrawler, Dataset } from 'crawlee';

await Actor.init();

// Base site URL used to resolve relative listing links.
const BASE_URL = 'https://www.bazaraki.com';

// Default entry point: newest apartments/flats for sale.
const DEFAULT_START_URLS = [
    'https://www.bazaraki.com/real-estate-for-sale/apartments-flats/?ordering=newest',
];

// Honor the `startUrls` field declared in .actor/input_schema.json;
// fall back to the previous hard-coded URL so existing runs behave the same.
const input = (await Actor.getInput()) ?? {};
const startUrls = input.startUrls?.length ? input.startUrls : DEFAULT_START_URLS;

const crawler = new CheerioCrawler({
    requestHandler: async ({ request, $, log }) => {
        log.info(`Парсим: ${request.url}`);

        // Collect all listings first, then push them in one awaited call.
        // (Previously Dataset.pushData was called inside .each() without
        // awaiting — floating promises that could be lost if the actor
        // exited before they settled.)
        const results = [];
        $('.js-item-listing').each((_, el) => {
            const link = $(el).find('a.advert__content-title');
            const title = link.text().trim() || null;
            const href = link.attr('href');
            // Only build a URL when a href is actually present; a bare
            // base-domain URL for a missing link would be meaningless.
            const url = href ? BASE_URL + href : null;
            const price = $(el)
                .find('a.advert__content-price._not-title span')
                .text()
                .replace(/\s+/g, ' ')
                .trim() || null;

            results.push({ title, url, price });
        });

        // Saves the scraped items into the actor's default dataset (output).
        await Dataset.pushData(results);
    },
});

await crawler.run(startUrls);
await Actor.exit();

src/routes.js

import { Dataset, createPuppeteerRouter } from 'crawlee';

// Router that dispatches Puppeteer requests by their label.
export const router = createPuppeteerRouter();

// Fallback handler: runs for unlabeled requests and enqueues every
// matching on-site link as a 'detail' request.
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    const enqueueOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(enqueueOptions);
});

// 'detail' handler: logs the page title and stores it with the loaded URL
// in the actor's default dataset.
router.addHandler('detail', async ({ request, page, log }) => {
    const pageTitle = await page.title();
    log.info(`${pageTitle}`, { url: request.loadedUrl });

    const record = {
        url: request.loadedUrl,
        title: pageTitle,
    };
    await Dataset.pushData(record);
});

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
"name": "crawlee-puppeteer-javascript",
"version": "0.0.1",
"type": "module",
"description": "This is an example of an Apify actor.",
"dependencies": {
"apify": "^3.2.6",
"crawlee": "^3.11.5",
"puppeteer": "*"
},
"devDependencies": {
"@apify/eslint-config": "^0.4.0",
"eslint": "^8.50.0"
},
"scripts": {
"start": "node src/main.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}