My Actor avatar
My Actor

Under maintenance

Pricing

Pay per usage

Go to Store
My Actor

My Actor

Under maintenance

Developed by

Emmanuel Monroy Franco

Maintained by Community

0.0 (0)

Pricing

Pay per usage

0

Monthly users

1

Runs succeeded

>99%

Last modified

a month ago

.actor/Dockerfile

# Base image: Node.js 20 with Chrome and the system libraries Puppeteer needs.
# src/main.js drives a real browser, which the slim apify/actor-node image
# cannot do because it ships without any browser.
# Available images: https://docs.apify.com/sdk/js/docs/guides/docker-images
FROM apify/actor-node-puppeteer-chrome:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
# The base image already runs as the non-root user "myuser", so the files
# are handed to that user instead of creating a separate account.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. The helper script shipped with the base image starts a
# virtual display (Xvfb) first, so a non-headless browser can also run.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor",
4    "title": "Yelp Search Scraper",
5    "description": "Scrape business names, addresses and ratings from Yelp search results for a given category and location.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-start"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2  "title": "Yelp Scraper Input",
3  "type": "object",
4  "schemaVersion": 1,
5  "properties": {
6    "busqueda": {
7      "type": "string",
8      "title": "Profesión o Categoría",
9      "description": "Ejemplo: restaurantes, abogados, veterinarias.",
10      "editor": "textfield"
11    },
12    "lugar": {
13      "type": "string",
14      "title": "Ciudad o Ubicación",
15      "description": "Ejemplo: Bogotá, Medellín, Cali.",
16      "editor": "textfield"
17    }
18  },
19  "required": ["busqueda", "lugar"]
20}

src/main.js

// Apify SDK v3 exposes the Actor lifecycle and storage helpers on `Actor`;
// browser launching lives in Crawlee. Both packages are declared in package.json.
const { Actor } = require('apify');
const { launchPuppeteer } = require('crawlee');

// Extra time (ms) given to the page after scrolling so late results can render.
// Adjust as needed.
const RESULTS_SETTLE_MS = 30000;

/**
 * Actor entry point.
 *
 * Reads `busqueda` (category) and `lugar` (location) from the Actor input,
 * opens the matching Yelp search page in Puppeteer, scrolls it, and pushes the
 * extracted `{ name, address, rating }` records to the default dataset. When
 * nothing is extracted, a single record with an `error` message is pushed.
 *
 * Throws an Error when either input field is missing.
 */
Actor.main(async () => {
    // getInput() resolves to null when the run has no input, so fall back to an
    // empty object and let the validation below produce the intended message.
    const input = (await Actor.getInput()) ?? {};
    const { busqueda, lugar } = input;
    if (!busqueda || !lugar) {
        throw new Error('Debes proporcionar "busqueda" y "lugar".');
    }

    // Build the Yelp search URL.
    // Format: https://www.yelp.com/search?find_desc=<busqueda>&find_loc=<lugar>
    const searchUrl = `https://www.yelp.com/search?find_desc=${encodeURIComponent(busqueda)}&find_loc=${encodeURIComponent(lugar)}`;
    console.log(`Navegando a: ${searchUrl}`);

    // Visible browser for local debugging, headless when running on the Apify platform.
    const browser = await launchPuppeteer({
        launchOptions: { headless: Actor.isAtHome() },
    });

    let results;
    try {
        const page = await browser.newPage();
        await page.goto(searchUrl, { waitUntil: 'networkidle2' });

        // Scroll to the bottom so lazily loaded results are requested.
        await autoScroll(page);

        // page.waitForTimeout() is not available in the pinned Puppeteer version,
        // so wait with a plain timer instead.
        await new Promise((resolve) => {
            setTimeout(resolve, RESULTS_SETTLE_MS);
        });

        // Keep a screenshot for debugging.
        await page.screenshot({ path: 'debug_yelp.png' });
        console.log('Screenshot guardado: debug_yelp.png');

        // Extract the data from Yelp; adjust the selectors to the current page structure.
        results = await page.evaluate(() => {
            const items = [];
            // Results are expected as <li> elements inside the results <ul>.
            const containers = document.querySelectorAll('ul.lemon--ul__373c0__1_cxs li');
            for (const li of containers) {
                // Only entries containing the business-name link are treated as results.
                const nameElement = li.querySelector('a.css-166la90');
                if (!nameElement) continue;

                const name = nameElement.innerText.trim();
                const addressElement = li.querySelector('address');
                const address = addressElement ? addressElement.innerText.trim() : 'No disponible';
                // The rating is read from the aria-label of the stars element.
                const ratingElement = li.querySelector('div.i-stars__373c0__1T6rz');
                const rating = ratingElement ? ratingElement.getAttribute('aria-label') : 'No disponible';
                items.push({ name, address, rating });
            }
            return items;
        });
    } finally {
        // Always release the browser, even when navigation or extraction fails.
        await browser.close();
    }

    console.log('Resultados extraídos:', results);

    // If nothing was extracted, push a record carrying the error message instead.
    if (results.length === 0) {
        await Actor.pushData({ error: "No se extrajeron resultados. Verifica los selectores o la carga de la página." });
    } else {
        await Actor.pushData(results);
    }
});
62
/**
 * Scrolls the page down in fixed steps until the accumulated scroll distance
 * reaches the bottom of the document, so lazily loaded content gets requested.
 *
 * @param {object} page - Puppeteer page (anything exposing `evaluate(fn, ...args)`).
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled on every step.
 * @param {number} [options.intervalMs=200] - Delay between steps in milliseconds.
 * @param {?number} [options.maxScrolls=null] - Optional upper bound on the number
 *   of steps; `null` means no bound. Useful on pages that keep growing.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 */
async function autoScroll(page, { distance = 100, intervalMs = 200, maxScrolls = null } = {}) {
    // The callback runs inside the page, so it cannot close over local
    // variables; the settings are passed in as a serializable argument.
    await page.evaluate(async (settings) => {
        await new Promise((resolve) => {
            let totalHeight = 0;
            let scrollCount = 0;
            const timer = setInterval(() => {
                // Read the height on every step: it may grow as content loads.
                const { scrollHeight } = document.body;
                window.scrollBy(0, settings.distance);
                totalHeight += settings.distance;
                scrollCount += 1;

                const reachedBottom = totalHeight >= scrollHeight - window.innerHeight;
                const reachedCap = settings.maxScrolls !== null && scrollCount >= settings.maxScrolls;
                if (reachedBottom || reachedCap) {
                    clearInterval(timer);
                    resolve();
                }
            }, settings.intervalMs);
        });
    }, { distance, intervalMs, maxScrolls });
}

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.gitignore

1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json

package.json

1{
2  "name": "yelp-scraper-actor",
3  "version": "1.0.0",
4  "description": "Actor de Apify para extraer resultados de Yelp usando Crawlee y Puppeteer",
5  "main": "src/main.js",
6  "scripts": {
7    "start": "node src/main.js"
8  },
9  "dependencies": {
10    "apify": "^3.3.2",
11    "crawlee": "^3.13.0",
12    "puppeteer": "^24.4.0"
13  },
14  "engines": {
15    "node": ">=16.0.0"
16  },
17  "license": "ISC"
18}

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.