My Actor
Deprecated
Pricing
Pay per usage
Go to Store
My Actor
Deprecated
0.0 (0)
Pricing
Pay per usage
0
Total users
1
Monthly users
1
Runs succeeded
>99%
Last modified
3 months ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# src/main.js launches a Puppeteer-controlled browser, so the image has to ship
# Chrome and its system libraries; the plain apify/actor-node image has no browser.
FROM apify/actor-node-puppeteer-chrome:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
# NOTE(review): this image is expected to already run as the non-root user
# "myuser", so files are chown-ed on copy instead of creating a new user — confirm.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. The Xvfb start script provides a virtual display, which the
# headful browser launched by src/main.js needs inside a container.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Yelp Scraper", "description": "Scrapes business names, addresses and ratings from Yelp search results for a given category and location.", "version": "0.0", "meta": { "templateId": "js-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Yelp Scraper Input", "type": "object", "schemaVersion": 1, "properties": { "busqueda": { "type": "string", "title": "Profesión o Categoría", "description": "Ejemplo: restaurantes, abogados, veterinarias.", "editor": "textfield" }, "lugar": { "type": "string", "title": "Ciudad o Ubicación", "description": "Ejemplo: Bogotá, Medellín, Cali.", "editor": "textfield" } }, "required": ["busqueda", "lugar"]}
src/main.js
1const Apify = require('apify');2
// Apify SDK v3 (the version pinned in package.json) exposes the Actor lifecycle
// and storage helpers on `Actor`; browser launching moved to Crawlee.
const { Actor } = Apify;
const { launchPuppeteer } = require('crawlee');

/**
 * Actor entry point: searches Yelp for the given category and location,
 * extracts name, address and rating for every listed business, and stores
 * the records in the default dataset.
 *
 * Input fields (see .actor/input_schema.json):
 *   - busqueda: profession or category to search for.
 *   - lugar: city or location to search in.
 *
 * Throws an Error when either input field is missing.
 */
Actor.main(async () => {
    // Read the input; fall back to an empty object so a missing input produces
    // the validation error below instead of a destructuring TypeError.
    const input = await Actor.getInput();
    const { busqueda, lugar } = input ?? {};
    if (!busqueda || !lugar) {
        throw new Error('Debes proporcionar "busqueda" y "lugar".');
    }

    // Build the Yelp search URL.
    // Format: https://www.yelp.com/search?find_desc=<busqueda>&find_loc=<lugar>
    const searchUrl = `https://www.yelp.com/search?find_desc=${encodeURIComponent(busqueda)}&find_loc=${encodeURIComponent(lugar)}`;
    console.log(`Navegando a: ${searchUrl}`);

    // Launch Puppeteer (headful for debugging; switch headless to true in production).
    // With Crawlee's launcher, browser options go under `launchOptions`.
    const browser = await launchPuppeteer({ launchOptions: { headless: false } });

    let results;
    try {
        const page = await browser.newPage();
        await page.goto(searchUrl, { waitUntil: 'networkidle2' });

        // Scroll through the page so that lazily loaded results get rendered.
        await autoScroll(page);

        // Extra wait for the remaining results to load (tune as needed).
        // page.waitForTimeout() no longer exists in the pinned Puppeteer version,
        // so a plain timer is used instead.
        await new Promise((resolve) => setTimeout(resolve, 30000));

        // Take a screenshot for debugging.
        await page.screenshot({ path: 'debug_yelp.png' });
        console.log('Screenshot guardado: debug_yelp.png');

        // Extract the data from Yelp.
        // NOTE(review): the class names below look build-generated; verify them
        // against the current Yelp markup.
        results = await page.evaluate(() => {
            const items = [];
            // Results are expected inside a <ul>, one <li> per business.
            const containers = document.querySelectorAll('ul.lemon--ul__373c0__1_cxs li');
            for (const li of containers) {
                // Only list items containing the business-name link are results.
                const nameElement = li.querySelector('a.css-166la90');
                if (!nameElement) {
                    continue;
                }
                const name = nameElement.innerText.trim();
                const addressElement = li.querySelector('address');
                const address = addressElement ? addressElement.innerText.trim() : 'No disponible';
                // The rating is read from the aria-label of the stars element.
                const ratingElement = li.querySelector('div.i-stars__373c0__1T6rz');
                const rating = ratingElement ? ratingElement.getAttribute('aria-label') : 'No disponible';
                items.push({ name, address, rating });
            }
            return items;
        });

        console.log('Resultados extraídos:', results);
    } finally {
        // Always release the browser, even when navigation or extraction fails.
        await browser.close();
    }

    // When nothing was extracted, store a record describing the problem instead.
    if (results.length === 0) {
        await Actor.pushData({ error: "No se extrajeron resultados. Verifica los selectores o la carga de la página." });
    } else {
        await Actor.pushData(results);
    }
});

/**
 * Scrolls the page down in fixed steps until the bottom is reached, so that
 * lazily loaded content gets rendered.
 *
 * Scrolling also stops after `maxSteps` steps, so a page whose height keeps
 * growing while it is scrolled cannot keep this function running forever.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)`, e.g. a Puppeteer page.
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per step.
 * @param {number} [options.delay=200] - Milliseconds between steps.
 * @param {number} [options.maxSteps=600] - Upper bound on the number of steps.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 */
async function autoScroll(page, { distance = 100, delay = 200, maxSteps = 600 } = {}) {
    await page.evaluate(
        // This callback runs in the page context, so it cannot close over local
        // variables; the settings are passed in as an argument instead.
        ({ distance: stepPx, delay: intervalMs, maxSteps: stepLimit }) => new Promise((resolve) => {
            let scrolledPx = 0;
            let steps = 0;
            const timer = setInterval(() => {
                // Re-read the height on every tick: it may grow as content loads.
                const { scrollHeight } = document.body;
                window.scrollBy(0, stepPx);
                scrolledPx += stepPx;
                steps += 1;
                const reachedBottom = scrolledPx >= scrollHeight - window.innerHeight;
                if (reachedBottom || steps >= stepLimit) {
                    clearInterval(timer);
                    resolve();
                }
            }, intervalMs);
        }),
        { distance, delay, maxSteps },
    );
}
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json
package.json
{
    "name": "yelp-scraper-actor",
    "version": "1.0.0",
    "description": "Actor de Apify para extraer resultados de Yelp usando Crawlee y Puppeteer",
    "main": "src/main.js",
    "scripts": {
        "start": "node src/main.js"
    },
    "dependencies": {
        "apify": "^3.3.2",
        "crawlee": "^3.13.0",
        "puppeteer": "^24.4.0"
    },
    "engines": {
        "node": ">=16.0.0"
    },
    "license": "ISC"
}