Google Maps Easy Scrape
Try for free
No credit card required
View all Actors
Google Maps Easy Scrape
mikepowers/google-maps-easy-scrape
Try for free
No credit card required
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version \
21 && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "my-actor-13",
4 "title": "Google Maps Easy Scrape",
5 "description": "Scrapes company name, rating, review count, category, address, website and phone for every place listed on a Google Maps search results URL.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-crawlee-playwright-chrome"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/input_schema.json
1{
2 "title": "Google Maps Easy Scrape input",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "googleMapsURL": {
7 "title": "Google Maps URL",
8 "type": "string",
9 "description": "The URL of the Google Maps search results to scrape.",
10 "editor": "textfield",
11 "prefill": "https://www.google.com/maps/search/gym/@44.3267641,-84.7358592,12.73z/data=!4m2!2m1!6e1?entry=ttu"
12 }
13 },
14 "required": ["googleMapsURL"]
15}
src/main.js
1import { chromium } from 'playwright';
2import { Actor } from 'apify';
3
/**
 * Actor entry point (batched variant).
 *
 * 1. Reads `googleMapsURL` from the Actor input.
 * 2. Opens the search results page and scrolls the results pane until the
 *    end-of-list marker appears (or scrolling stops making progress).
 * 3. Collects every place link, then visits the places in batches and pushes
 *    one dataset item per place.
 *
 * A failure on a single place is logged and skipped; it no longer aborts the
 * whole run.
 */
(async () => {
    // Actor.init() is asynchronous and must finish before other Actor calls.
    await Actor.init();

    const END_OF_LIST_TEXT = "You've reached the end of the list.";
    const PLACE_LINK_SELECTOR = 'a[href*="https://www.google.com/maps/place"]';
    const READY_SELECTOR = '[jstcache="3"]';
    const RESULTS_PANE_XPATH = '/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]';
    const SCROLL_STEP_PX = 500;
    // Give up scrolling when no new place link has appeared for this long.
    const SCROLL_STALL_TIMEOUT_MS = 30000;
    const BATCH_SIZE = 5;

    /**
     * Scrolls the results pane until the end-of-list text is present.
     * Stops early (with a warning) when the pane cannot be located or when
     * the number of place links has not changed for SCROLL_STALL_TIMEOUT_MS,
     * so the loop can never spin forever.
     */
    const scrollToEndOfResults = async (resultsPage) => {
        let lastLinkCount = -1;
        let lastProgressAt = Date.now();

        while (true) {
            const html = await resultsPage.content();
            if (html.includes(END_OF_LIST_TEXT)) {
                console.log("Reached the end of the list.");
                return;
            }

            const state = await resultsPage.evaluate(({ xpath, step, linkSelector }) => {
                const pane = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                if (!pane) {
                    return { paneFound: false, linkCount: 0 };
                }
                pane.scrollTop += step;
                return { paneFound: true, linkCount: document.querySelectorAll(linkSelector).length };
            }, { xpath: RESULTS_PANE_XPATH, step: SCROLL_STEP_PX, linkSelector: PLACE_LINK_SELECTOR });

            if (!state.paneFound) {
                console.warn('Results pane not found; continuing with the links already loaded.');
                return;
            }

            if (state.linkCount !== lastLinkCount) {
                lastLinkCount = state.linkCount;
                lastProgressAt = Date.now();
            } else if (Date.now() - lastProgressAt > SCROLL_STALL_TIMEOUT_MS) {
                console.warn('No new results appeared while scrolling; continuing with the links already loaded.');
                return;
            }
        }
    };

    /**
     * Opens one place page in a new tab, extracts its details and always
     * closes the tab, even when navigation or extraction fails.
     */
    const scrapePlace = async (browserContext, url) => {
        const placePage = await browserContext.newPage();
        try {
            await placePage.goto(url);
            await placePage.waitForSelector(READY_SELECTOR);

            // Runs inside the browser; everything it needs must be defined here.
            const details = await placePage.evaluate(() => {
                // Inner text of the first element matching a CSS selector, or ''.
                const getText = (selector) => {
                    const element = document.querySelector(selector);
                    return element ? element.innerText : '';
                };

                // href of the primary selector, falling back to a second selector.
                const getHref = (primarySelector, fallbackSelector) => {
                    const element = document.querySelector(primarySelector) || document.querySelector(fallbackSelector);
                    return element && element.href ? element.href : '';
                };

                // Inner text of the first node matching an XPath, or ''.
                const getTextFromXPath = (xpath) => {
                    const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                    return result.singleNodeValue ? result.singleNodeValue.innerText : '';
                };

                const companyName = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[1]/h1');
                const rating = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[1]/span[1]');
                // The review count is rendered as "(123)"; strip the parentheses.
                const numberReviews = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[2]/span/span').replace(/\(|\)/g, '');
                const category = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[2]/span/span/button');

                return {
                    company: companyName,
                    rating: rating,
                    reviews: numberReviews,
                    category: category,
                    address: getText('button[data-tooltip="Copy address"]'),
                    website: getHref('a[data-tooltip="Open website"]', 'a[data-tooltip="Open menu link"]'),
                    phone: getText('button[data-tooltip="Copy phone number"]')
                };
            });

            return { ...details, url };
        } finally {
            await placePage.close();
        }
    };

    // Fetch and validate the input before spending time on a browser launch.
    const input = await Actor.getInput();
    const googleMapsURL = input ? input.googleMapsURL : undefined;
    if (typeof googleMapsURL !== 'string' || googleMapsURL.trim() === '') {
        throw new Error('Input field "googleMapsURL" is required and must be a non-empty string.');
    }

    console.time("Execution Time");
    let browser;
    try {
        // Launch browser
        browser = await chromium.launch();
        const context = await browser.newContext();
        const page = await context.newPage();

        // Open the search results and load the complete list.
        await page.goto(googleMapsURL);
        await page.waitForSelector(READY_SELECTOR);
        await scrollToEndOfResults(page);

        // Extract URLs
        const urls = await page.evaluate(
            (linkSelector) => Array.from(document.querySelectorAll(linkSelector), (anchor) => anchor.href),
            PLACE_LINK_SELECTOR,
        );
        console.log(`Number of URLs extracted: ${urls.length}`);

        // Pull info for each place, BATCH_SIZE tabs at a time.
        const totalBatches = Math.ceil(urls.length / BATCH_SIZE);
        for (let start = 0; start < urls.length; start += BATCH_SIZE) {
            console.log(`Processing batch: ${start / BATCH_SIZE + 1}/${totalBatches}`);
            const batchUrls = urls.slice(start, start + BATCH_SIZE);

            // allSettled: one broken place page must not discard the rest of the batch.
            const outcomes = await Promise.allSettled(batchUrls.map((url) => scrapePlace(context, url)));

            for (let index = 0; index < outcomes.length; index += 1) {
                const outcome = outcomes[index];
                if (outcome.status === 'fulfilled') {
                    // Push data to Apify's dataset
                    await Actor.pushData(outcome.value);
                    console.log(`Data pushed for URL: ${outcome.value.url}`);
                } else {
                    console.error(`Error processing URL ${batchUrls[index]}: ${outcome.reason}`);
                }
            }
        }
    } finally {
        if (browser) {
            await browser.close();
        }
        // Report timing before Actor.exit(), which terminates the process.
        console.timeEnd("Execution Time");
    }

    await Actor.exit();
})();
src/main3.js
1import { chromium } from 'playwright';
2import { Actor } from 'apify';
3
/**
 * Actor entry point (parallel variant).
 *
 * Reads `googleMapsURL` from the Actor input, scrolls the search results
 * pane to load every result, collects the place links and then processes
 * them with `processUrl` in groups of `CONCURRENCY`, pushing one dataset
 * item per place. Errors on individual places are logged and skipped.
 */
(async () => {
    // Actor.init() is asynchronous and must finish before other Actor calls.
    await Actor.init();

    const END_OF_LIST_TEXT = "You've reached the end of the list.";
    const PLACE_LINK_SELECTOR = 'a[href*="https://www.google.com/maps/place"]';
    const RESULTS_PANE_XPATH = '/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]';
    const SCROLL_STEP_PX = 500;
    // Give up scrolling when no new place link has appeared for this long.
    const SCROLL_STALL_TIMEOUT_MS = 30000;
    // Maximum number of pages to process simultaneously
    const CONCURRENCY = 5;

    /**
     * Scrolls the results pane until the end-of-list text is present.
     * Stops early (with a warning) when the pane cannot be located or when
     * the number of place links has not changed for SCROLL_STALL_TIMEOUT_MS,
     * so the loop can never spin forever.
     */
    const scrollToEndOfResults = async (resultsPage) => {
        let lastLinkCount = -1;
        let lastProgressAt = Date.now();

        while (true) {
            const html = await resultsPage.content();
            if (html.includes(END_OF_LIST_TEXT)) {
                console.log("Reached the end of the list.");
                return;
            }

            const state = await resultsPage.evaluate(({ xpath, step, linkSelector }) => {
                const pane = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                if (!pane) {
                    return { paneFound: false, linkCount: 0 };
                }
                pane.scrollTop += step;
                return { paneFound: true, linkCount: document.querySelectorAll(linkSelector).length };
            }, { xpath: RESULTS_PANE_XPATH, step: SCROLL_STEP_PX, linkSelector: PLACE_LINK_SELECTOR });

            if (!state.paneFound) {
                console.warn('Results pane not found; continuing with the links already loaded.');
                return;
            }

            if (state.linkCount !== lastLinkCount) {
                lastLinkCount = state.linkCount;
                lastProgressAt = Date.now();
            } else if (Date.now() - lastProgressAt > SCROLL_STALL_TIMEOUT_MS) {
                console.warn('No new results appeared while scrolling; continuing with the links already loaded.');
                return;
            }
        }
    };

    // Fetch and validate the input before spending time on a browser launch.
    const input = await Actor.getInput();
    const googleMapsURL = input ? input.googleMapsURL : undefined;
    if (typeof googleMapsURL !== 'string' || googleMapsURL.trim() === '') {
        throw new Error('Input field "googleMapsURL" is required and must be a non-empty string.');
    }

    console.time("Execution Time");
    let browser;
    try {
        // Launch browser in headless mode for better performance
        browser = await chromium.launch({ headless: true });
        const context = await browser.newContext();
        const page = await context.newPage();

        // Enter URL and wait for DOM content to load
        await page.goto(googleMapsURL, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('[jstcache="3"]');

        // Scroll and extract URLs
        await scrollToEndOfResults(page);
        const urls = await page.evaluate(
            (linkSelector) => Array.from(document.querySelectorAll(linkSelector), (anchor) => anchor.href),
            PLACE_LINK_SELECTOR,
        );

        // The listing tab is no longer needed once the links are collected.
        await page.close();

        console.log(`Number of URLs extracted: ${urls.length}`);

        // Scrape one place and store it; never rejects, so a single bad
        // place cannot fail the surrounding Promise.all.
        const scrapeAndStore = async (url) => {
            try {
                const details = await processUrl(url, context);
                // Awaited so Actor.exit() cannot run before the write completes.
                await Actor.pushData(details);
                console.log(`Data pushed for URL: ${details.url}`);
            } catch (error) {
                console.error(`Error processing URL ${url}: ${error}`);
            }
        };

        // Process URLs in parallel, CONCURRENCY at a time.
        for (let start = 0; start < urls.length; start += CONCURRENCY) {
            const group = urls.slice(start, start + CONCURRENCY);
            await Promise.all(group.map((url) => scrapeAndStore(url)));
        }
    } finally {
        if (browser) {
            await browser.close();
        }
        console.timeEnd("Execution Time");
    }

    await Actor.exit();
})();
73
/**
 * Scrapes a single Google Maps place page.
 *
 * Opens `url` in a new page created from `context`, waits for the
 * `[jstcache="3"]` element, reads the place details from the DOM and closes
 * the page again. The page is closed even when navigation or extraction
 * throws, so failed places do not leave open tabs behind.
 *
 * @param {string} url - Place URL to open.
 * @param {object} context - Browser context; only `newPage()` is used here.
 * @returns {Promise<object>} Object with `company`, `rating`, `reviews`,
 *     `category`, `address`, `website`, `phone` (each '' when not found)
 *     plus the input `url`.
 */
async function processUrl(url, context) {
    const page = await context.newPage();

    try {
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('[jstcache="3"]');

        // Runs inside the browser; everything it needs must be defined here.
        const details = await page.evaluate(() => {
            // Inner text of the first element matching a CSS selector, or ''.
            const getText = (selector) => {
                const element = document.querySelector(selector);
                return element ? element.innerText : '';
            };

            // href of the primary selector, falling back to a second selector.
            const getHref = (primarySelector, fallbackSelector) => {
                const element = document.querySelector(primarySelector) || document.querySelector(fallbackSelector);
                return element && element.href ? element.href : '';
            };

            // Inner text of the first node matching an XPath, or ''.
            const getTextFromXPath = (xpath) => {
                const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                return result.singleNodeValue ? result.singleNodeValue.innerText : '';
            };

            const companyName = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[1]/h1');
            const rating = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[1]/span[1]');
            // The review count is rendered as "(123)"; strip the parentheses.
            const numberReviews = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[2]/span/span').replace(/\(|\)/g, '');
            const category = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[2]/span/span/button');

            return {
                company: companyName,
                rating: rating,
                reviews: numberReviews,
                category: category,
                address: getText('button[data-tooltip="Copy address"]'),
                website: getHref('a[data-tooltip="Open website"]', 'a[data-tooltip="Open menu link"]'),
                phone: getText('button[data-tooltip="Copy phone number"]')
            };
        });

        return { ...details, url };
    } finally {
        await page.close();
    }
}
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
package.json
1{
2 "name": "crawlee-playwright-javascript",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "Apify actor that scrapes business details from Google Maps search results using Playwright.",
6 "dependencies": {
7 "apify": "^3.1.10",
8 "crawlee": "^3.5.4",
9 "playwright": "*"
10 },
11 "devDependencies": {
12 "@apify/eslint-config": "^0.4.0",
13 "eslint": "^8.50.0"
14 },
15 "scripts": {
16 "start": "node src/main.js",
17 "lint": "eslint ./src --ext .js,.jsx",
18 "lint:fix": "eslint ./src --ext .js,.jsx --fix",
19 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
20 "postinstall": "npx crawlee install-playwright-browsers"
21 },
22 "author": "It's not you it's me",
23 "license": "ISC"
24}
Developer
Maintained by Community
Actor metrics
- 74 monthly users
- 13 stars
- 91.0% runs succeeded
- Created in Jan 2024
- Modified 10 months ago
Categories