Google Maps Easy Scrape avatar
Google Maps Easy Scrape

Pricing

Pay per usage

Go to Store
Google Maps Easy Scrape

Google Maps Easy Scrape

Developed by

Mike Powers

Maintained by Community

5.0 (1)

Pricing

Pay per usage

21

Monthly users

68

Runs succeeded

87%

Last modified

a year ago

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# Node 18 image with Playwright + Chrome preinstalled for browser scraping.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./


# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-13",
4    "title": "Project Playwright Crawler JavaScript",
5    "description": "Crawlee and Playwright project in JavaScript.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-crawlee-playwright-chrome"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6      "googleMapsURL": {
7        "title": "Google Maps URL",
8        "type": "string",
9        "description": "The URL of the Google Maps search results to scrape.",
10        "editor": "textfield",
11        "prefill": "https://www.google.com/maps/search/gym/@44.3267641,-84.7358592,12.73z/data=!4m2!2m1!6e1?entry=ttu"
12      }
13  },
14  "required": ["googleMapsURL"]
15}

src/main.js

1import { chromium } from 'playwright';
2import { Actor } from 'apify';
3
/**
 * Scrapes a Google Maps search-results page: scrolls the results panel to
 * load every listing, collects each place's detail URL, then visits the
 * detail pages in small parallel batches and pushes the scraped records
 * (company, rating, reviews, category, address, website, phone, url) to
 * Apify's default dataset.
 */
(async () => {
    // Initialize the Apify SDK. Must be awaited so storages are ready
    // before input is read (the original fired it without await).
    await Actor.init();

    // Fetch the input
    const input = await Actor.getInput();
    // Use the provided Google Maps URL from the input
    const googleMapsURL = input.googleMapsURL;

    // Launch browser
    console.time("Execution Time");
    const browser = await chromium.launch();

    try {
        const context = await browser.newContext();
        const page = await context.newPage();

        // Enter URL and wait for the main results container.
        await page.goto(googleMapsURL);
        await page.waitForSelector('[jstcache="3"]');

        // Scroll the results panel until Google reports the end of the list.
        // The iteration cap guards against an infinite loop if the marker
        // text or the panel's XPath ever changes.
        const maxScrolls = 500;
        for (let scrolls = 0; scrolls < maxScrolls; scrolls++) {
            const pageContent = await page.content();
            if (pageContent.includes("You've reached the end of the list.")) {
                console.log("Reached the end of the list.");
                break;
            }
            await page.evaluate(() => {
                const scrollElement = document.evaluate('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                // Guard: the panel node may be absent if the DOM changed.
                if (scrollElement) scrollElement.scrollTop += 500;
            });
        }

        // Extract the detail-page URL of every listed place.
        const urls = await page.evaluate(() => {
            const elements = Array.from(document.querySelectorAll('a[href*="https://www.google.com/maps/place"]'));
            return elements.map((element) => element.href);
        });

        console.log(`Number of URLs extracted: ${urls.length}`);

        const batchSize = 5;

        // Pull info for each site, batchSize pages at a time.
        for (let i = 0; i < urls.length; i += batchSize) {
            console.log(`Processing batch: ${i / batchSize + 1}/${Math.ceil(urls.length / batchSize)}`);
            const batchUrls = urls.slice(i, i + batchSize);
            const batchData = await Promise.all(batchUrls.map(async (url) => {
                const detailPage = await context.newPage();
                try {
                    await detailPage.goto(url);
                    await detailPage.waitForSelector('[jstcache="3"]');

                    // Selectors to pull the information
                    const details = await detailPage.evaluate(() => {
                        // Text of the first element matching a CSS selector.
                        const getText = (selector) => {
                            const element = document.querySelector(selector);
                            return element ? element.innerText : '';
                        };

                        // href of the primary selector, else the fallback one.
                        const getHref = (primarySelector, fallbackSelector) => {
                            let element = document.querySelector(primarySelector);
                            if (!element) {
                                element = document.querySelector(fallbackSelector);
                            }
                            return element && element.href ? element.href : '';
                        };

                        // Text of the first node matching an XPath expression.
                        const getTextFromXPath = (xpath) => {
                            const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                            return result.singleNodeValue ? result.singleNodeValue.innerText : '';
                        };

                        const companyName = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[1]/h1');
                        const rating = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[1]/span[1]');
                        let numberReviews = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[2]/span/span');
                        // Strip the surrounding parentheses, e.g. "(68)" -> "68".
                        numberReviews = numberReviews.replace(/\(|\)/g, '');
                        const category = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[2]/span/span/button');

                        return {
                            company: companyName,
                            rating: rating,
                            reviews: numberReviews,
                            category: category,
                            address: getText('button[data-tooltip="Copy address"]'),
                            website: getHref('a[data-tooltip="Open website"]', 'a[data-tooltip="Open menu link"]'),
                            phone: getText('button[data-tooltip="Copy phone number"]')
                        };
                    });

                    return { ...details, url };
                } finally {
                    // Always release the tab, even if this URL failed to load.
                    await detailPage.close();
                }
            }));

            // Push data to Apify's dataset
            for (const item of batchData) {
                await Actor.pushData(item);
                console.log(`Data pushed for URL: ${item.url}`);
            }
        }
    } finally {
        // Close the browser (the original leaked it) and report timing here:
        // Actor.exit() terminates the process, so the original's
        // console.timeEnd placed after it never ran.
        await browser.close();
        console.timeEnd("Execution Time");
    }

    await Actor.exit();
})();

src/main3.js

1import { chromium } from 'playwright';
2import { Actor } from 'apify';
3
/**
 * Variant of the scraper that processes place URLs with a rolling pool of
 * up to `concurrency` detail pages instead of fixed batches. Scrapes the
 * search-results list, then delegates each URL to processUrl() and pushes
 * the result to Apify's default dataset.
 */
(async () => {
    // Initialize the Apify SDK; awaited so storages are ready before
    // input is read (the original fired it without await).
    await Actor.init();

    // Fetch the input
    const input = await Actor.getInput();
    // Use the provided Google Maps URL from the input
    const googleMapsURL = input.googleMapsURL;

    // Launch browser in headless mode for better performance
    console.time("Execution Time");
    const browser = await chromium.launch({ headless: true });

    try {
        const context = await browser.newContext();
        const page = await context.newPage();

        // Enter URL and wait for DOM content to load
        await page.goto(googleMapsURL, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('[jstcache="3"]');

        // Scroll until Google reports the end of the list. The iteration
        // cap guards against an infinite loop if the marker text or the
        // panel's XPath ever changes.
        const maxScrolls = 500;
        for (let scrolls = 0; scrolls < maxScrolls; scrolls++) {
            const pageContent = await page.content();
            if (pageContent.includes("You've reached the end of the list.")) {
                console.log("Reached the end of the list.");
                break;
            }
            await page.evaluate(() => {
                const scrollElement = document.evaluate('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                // Guard: the panel node may be absent if the DOM changed.
                if (scrollElement) scrollElement.scrollTop += 500;
            });
        }

        // Collect the detail-page URL of every listed place.
        const urls = await page.evaluate(() => {
            const elements = Array.from(document.querySelectorAll('a[href*="https://www.google.com/maps/place"]'));
            return elements.map((element) => element.href);
        });

        await page.close();

        console.log(`Number of URLs extracted: ${urls.length}`);

        // Process URLs in parallel, at most `concurrency` pages at a time.
        const concurrency = 5;
        let promises = [];

        for (const url of urls) {
            const p = processUrl(url, context)
                .then(async (details) => {
                    // Await the push so the dataset write is not left as a
                    // floating promise (the original fire-and-forgot it).
                    await Actor.pushData(details);
                    console.log(`Data pushed for URL: ${details.url}`);
                })
                .catch((error) => {
                    console.error(`Error processing URL ${url}: ${error}`);
                });

            promises.push(p);

            if (promises.length >= concurrency) {
                await Promise.all(promises);
                promises = []; // Start a fresh window
            }
        }

        // Process any remaining promises
        await Promise.all(promises);
    } finally {
        // Always release the browser (the original leaked it) before exit.
        await browser.close();
        console.timeEnd("Execution Time");
    }

    await Actor.exit();
})();
73
74// Function to process each URL
/**
 * Scrapes a single Google Maps place page.
 *
 * @param {string} url - Google Maps place URL to open.
 * @param {object} context - Playwright BrowserContext used to open a new tab.
 * @returns {Promise<object>} Scraped fields (company, rating, reviews,
 *     category, address, website, phone) plus the source `url`.
 * @throws Propagates navigation/evaluation errors; the caller logs them.
 */
async function processUrl(url, context) {
    const page = await context.newPage();
    try {
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('[jstcache="3"]');

        // Selectors to pull the information
        const details = await page.evaluate(() => {
            // Text of the first element matching a CSS selector.
            const getText = (selector) => {
                const element = document.querySelector(selector);
                return element ? element.innerText : '';
            };

            // href of the primary selector, else the fallback one.
            const getHref = (primarySelector, fallbackSelector) => {
                let element = document.querySelector(primarySelector);
                if (!element) {
                    element = document.querySelector(fallbackSelector);
                }
                return element && element.href ? element.href : '';
            };

            // Text of the first node matching an XPath expression.
            const getTextFromXPath = (xpath) => {
                const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                return result.singleNodeValue ? result.singleNodeValue.innerText : '';
            };

            const companyName = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[1]/h1');
            const rating = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[1]/span[1]');
            let numberReviews = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[2]/span/span');
            // Strip the surrounding parentheses, e.g. "(68)" -> "68".
            numberReviews = numberReviews.replace(/\(|\)/g, '');
            const category = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[2]/span/span/button');

            return {
                company: companyName,
                rating: rating,
                reviews: numberReviews,
                category: category,
                address: getText('button[data-tooltip="Copy address"]'),
                website: getHref('a[data-tooltip="Open website"]', 'a[data-tooltip="Open menu link"]'),
                phone: getText('button[data-tooltip="Copy phone number"]')
            };
        });

        return { ...details, url };
    } finally {
        // Always close the tab, even when goto/evaluate throws; otherwise
        // every failed URL leaks an open page for the life of the browser.
        await page.close();
    }
}

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-playwright-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of an Apify actor.",
6    "dependencies": {
7        "apify": "^3.1.10",
8        "crawlee": "^3.5.4",
9        "playwright": "*"
10    },
11    "devDependencies": {
12        "@apify/eslint-config": "^0.4.0",
13        "eslint": "^8.50.0"
14    },
15    "scripts": {
16        "start": "node src/main.js",
17        "lint": "eslint ./src --ext .js,.jsx",
18        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
19        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
20        "postinstall": "npx crawlee install-playwright-browsers"
21    },
22    "author": "It's not you it's me",
23    "license": "ISC"
24}

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.