Google Maps Easy Scrape avatar
Google Maps Easy Scrape

Pricing

Pay per usage

Go to Store
Google Maps Easy Scrape

Google Maps Easy Scrape

Developed by

Mike Powers

Mike Powers

Maintained by Community

5.0 (1)

Pricing

Pay per usage

22

Total users

1.3k

Monthly users

53

Runs succeeded

88%

Last modified

a year ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# This image ships Node 18 with Playwright + Chrome preinstalled.
FROM apify/actor-node-playwright-chrome:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache:
# this layer is only rebuilt when the dependency manifests change.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
# (|| true keeps the build going even if `npm list` exits non-zero;
# ~/.npm cache is removed to shrink the final layer).
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
"actorSpecification": 1,
"name": "my-actor-13",
"title": "Project Playwright Crawler JavaScript",
"description": "Crawlee and Playwright project in JavaScript.",
"version": "0.0",
"meta": {
"templateId": "js-crawlee-playwright-chrome"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "PlaywrightCrawler Template",
"type": "object",
"schemaVersion": 1,
"properties": {
"googleMapsURL": {
"title": "Google Maps URL",
"type": "string",
"description": "The URL of the Google Maps search results to scrape.",
"editor": "textfield",
"prefill": "https://www.google.com/maps/search/gym/@44.3267641,-84.7358592,12.73z/data=!4m2!2m1!6e1?entry=ttu"
}
},
"required": ["googleMapsURL"]
}

src/main.js

import { chromium } from 'playwright';
import { Actor } from 'apify';

// Scrapes business listings from a Google Maps search results page.
// Reads the search URL from Actor input, scrolls the results list until the
// end-of-list marker appears, collects all place URLs, then visits each place
// page in small batches to extract details and push them to the dataset.
(async () => {
    // Initialize the Apify SDK. Must be awaited so storages/env are ready
    // before Actor.getInput()/Actor.pushData() run (original bug: the
    // returned promise was not awaited).
    await Actor.init();

    // Fetch the input
    const input = await Actor.getInput();
    // Use the provided Google Maps URL from the input
    const googleMapsURL = input.googleMapsURL;

    // Launch browser
    console.time("Execution Time");
    const browser = await chromium.launch();
    const context = await browser.newContext();
    const page = await context.newPage();

    // Open the search results and wait for the page shell to render.
    await page.goto(googleMapsURL);
    await page.waitForSelector('[jstcache="3"]');

    let urls = [];

    // Scroll the results panel until Google signals the end of the list.
    while (true) {
        const pageContent = await page.content();
        if (pageContent.includes("You've reached the end of the list.")) {
            console.log("Reached the end of the list.");
            break;
        }
        await page.evaluate(() => {
            // NOTE(review): absolute XPath to the scrollable results panel —
            // brittle against Google Maps DOM changes; verify periodically.
            const scrollElement = document.evaluate('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            // Guard against a missing panel so evaluate() doesn't throw.
            if (scrollElement) scrollElement.scrollTop += 500;
        });
        // Brief pause so newly loaded results can render; the original loop
        // busy-spun with no delay between scrolls.
        await page.waitForTimeout(200);
    }

    // Extract all place URLs from the fully scrolled list.
    urls = await page.evaluate(() => {
        const elements = Array.from(document.querySelectorAll('a[href*="https://www.google.com/maps/place"]'));
        return elements.map((element) => element.href);
    });

    console.log(`Number of URLs extracted: ${urls.length}`);

    const batchSize = 5;

    // Visit each place page, `batchSize` concurrent tabs at a time.
    for (let i = 0; i < urls.length; i += batchSize) {
        console.log(`Processing batch: ${i / batchSize + 1}/${Math.ceil(urls.length / batchSize)}`);
        const batchUrls = urls.slice(i, i + batchSize);
        const batchData = await Promise.all(batchUrls.map(async (url) => {
            const detailPage = await context.newPage();
            try {
                await detailPage.goto(url);
                await detailPage.waitForSelector('[jstcache="3"]');

                // Selectors to pull the information
                const details = await detailPage.evaluate(() => {
                    // Text content of the first element matching a CSS selector.
                    const getText = (selector) => {
                        const element = document.querySelector(selector);
                        return element ? element.innerText : '';
                    };

                    // href of the primary selector, falling back to the secondary.
                    const getHref = (primarySelector, fallbackSelector) => {
                        let element = document.querySelector(primarySelector);
                        if (!element) {
                            element = document.querySelector(fallbackSelector);
                        }
                        return element && element.href ? element.href : '';
                    };

                    // Text content of the node located by an absolute XPath.
                    const getTextFromXPath = (xpath) => {
                        const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                        return result.singleNodeValue ? result.singleNodeValue.innerText : '';
                    };

                    // NOTE(review): absolute XPaths — brittle against DOM changes.
                    const companyName = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[1]/h1');
                    const rating = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[1]/span[1]');
                    let numberReviews = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[2]/span/span');
                    // Strip the surrounding parentheses, e.g. "(123)" -> "123".
                    numberReviews = numberReviews.replace(/\(|\)/g, '');
                    const category = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[2]/span/span/button');

                    return {
                        company: companyName,
                        rating: rating,
                        reviews: numberReviews,
                        category: category,
                        address: getText('button[data-tooltip="Copy address"]'),
                        website: getHref('a[data-tooltip="Open website"]', 'a[data-tooltip="Open menu link"]'),
                        phone: getText('button[data-tooltip="Copy phone number"]')
                    };
                });

                return { ...details, url };
            } finally {
                // Always close the tab, even if navigation/extraction threw
                // (the original leaked the page on error).
                await detailPage.close();
            }
        }));

        // Push data to Apify's dataset
        for (const item of batchData) {
            await Actor.pushData(item);
            console.log(`Data pushed for URL: ${item.url}`);
        }
    }

    // Report timing before shutdown: Actor.exit() terminates the process, so
    // the original's console.timeEnd() after it never ran. Also close the
    // browser, which the original leaked.
    console.timeEnd("Execution Time");
    await browser.close();
    await Actor.exit();
})();

src/main3.js

import { chromium } from 'playwright';
import { Actor } from 'apify';

// Entry point: scrapes a Google Maps search results page. Scrolls the list
// to the end, extracts all place URLs, then processes them with bounded
// concurrency via processUrl(), pushing each result to the dataset.
(async () => {
    // Await SDK initialization before any other Actor API call
    // (original bug: the returned promise was not awaited).
    await Actor.init();

    // Fetch the input
    const input = await Actor.getInput();
    // Use the provided Google Maps URL from the input
    const googleMapsURL = input.googleMapsURL;

    // Launch browser in headless mode for better performance
    console.time("Execution Time");
    const browser = await chromium.launch({ headless: true });
    const context = await browser.newContext();
    const page = await context.newPage();

    // Enter URL and wait for DOM content to load
    await page.goto(googleMapsURL, { waitUntil: 'domcontentloaded' });
    await page.waitForSelector('[jstcache="3"]');

    let urls = [];

    // Scroll the results panel until Google signals the end of the list.
    while (true) {
        const pageContent = await page.content();
        if (pageContent.includes("You've reached the end of the list.")) {
            console.log("Reached the end of the list.");
            break;
        }
        await page.evaluate(() => {
            // NOTE(review): absolute XPath to the scrollable results panel —
            // brittle against Google Maps DOM changes.
            const scrollElement = document.evaluate('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            // Guard against a missing panel so evaluate() doesn't throw.
            if (scrollElement) scrollElement.scrollTop += 500;
        });
        // Brief pause so newly loaded results can render; the original loop
        // busy-spun with no delay between scrolls.
        await page.waitForTimeout(200);
    }

    // Extract all place URLs from the fully scrolled list.
    urls = await page.evaluate(() => {
        const elements = Array.from(document.querySelectorAll('a[href*="https://www.google.com/maps/place"]'));
        return elements.map((element) => element.href);
    });

    await page.close();

    console.log(`Number of URLs extracted: ${urls.length}`);

    // Process URLs in parallel, at most `concurrency` pages at a time.
    const concurrency = 5;
    const promises = [];

    for (const url of urls) {
        const p = processUrl(url, context)
            // Return the pushData promise so the batch actually waits for the
            // write (original left it floating inside the .then callback).
            .then((details) => Actor.pushData(details)
                .then(() => console.log(`Data pushed for URL: ${details.url}`)))
            .catch((error) => {
                console.error(`Error processing URL ${url}: ${error}`);
            });

        promises.push(p);

        if (promises.length >= concurrency) {
            await Promise.all(promises);
            promises.length = 0; // Clear the array
        }
    }

    // Process any remaining promises
    await Promise.all(promises);
    console.timeEnd("Execution Time");
    // Close the browser before exiting (the original leaked it).
    await browser.close();
    await Actor.exit();
})();
73
/**
 * Opens `url` in a new tab of the shared browser context, extracts business
 * details from the Google Maps place page, and returns them merged with the
 * source URL.
 *
 * @param {string} url - Google Maps place URL to scrape.
 * @param {import('playwright').BrowserContext} context - shared browser context.
 * @returns {Promise<object>} { company, rating, reviews, category, address,
 *   website, phone, url } — missing fields come back as empty strings.
 * @throws Propagates navigation/timeout errors from Playwright.
 */
async function processUrl(url, context) {
    const page = await context.newPage();
    try {
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('[jstcache="3"]');

        // Selectors to pull the information
        const details = await page.evaluate(() => {
            // Text content of the first element matching a CSS selector.
            const getText = (selector) => {
                const element = document.querySelector(selector);
                return element ? element.innerText : '';
            };

            // href of the primary selector, falling back to the secondary.
            const getHref = (primarySelector, fallbackSelector) => {
                let element = document.querySelector(primarySelector);
                if (!element) {
                    element = document.querySelector(fallbackSelector);
                }
                return element && element.href ? element.href : '';
            };

            // Text content of the node located by an absolute XPath.
            const getTextFromXPath = (xpath) => {
                const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                return result.singleNodeValue ? result.singleNodeValue.innerText : '';
            };

            // NOTE(review): absolute XPaths — brittle against DOM changes.
            const companyName = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[1]/h1');
            const rating = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[1]/span[1]');
            let numberReviews = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[2]/span/span');
            // Strip the surrounding parentheses, e.g. "(123)" -> "123".
            numberReviews = numberReviews.replace(/\(|\)/g, '');
            const category = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[2]/span/span/button');

            return {
                company: companyName,
                rating: rating,
                reviews: numberReviews,
                category: category,
                address: getText('button[data-tooltip="Copy address"]'),
                website: getHref('a[data-tooltip="Open website"]', 'a[data-tooltip="Open menu link"]'),
                phone: getText('button[data-tooltip="Copy phone number"]')
            };
        });

        return { ...details, url };
    } finally {
        // Always close the tab, even when goto/waitForSelector/evaluate throws
        // (the original leaked the page on error, accumulating open tabs).
        await page.close();
    }
}

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
"name": "crawlee-playwright-javascript",
"version": "0.0.1",
"type": "module",
"description": "This is an example of an Apify actor.",
"dependencies": {
"apify": "^3.1.10",
"crawlee": "^3.5.4",
"playwright": "*"
},
"devDependencies": {
"@apify/eslint-config": "^0.4.0",
"eslint": "^8.50.0"
},
"scripts": {
"start": "node src/main.js",
"lint": "eslint ./src --ext .js,.jsx",
"lint:fix": "eslint ./src --ext .js,.jsx --fix",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
"postinstall": "npx crawlee install-playwright-browsers"
},
"author": "It's not you it's me",
"license": "ISC"
}