Google Maps Easy Scrape avatar

Google Maps Easy Scrape

Try for free

No credit card required

View all Actors
Google Maps Easy Scrape

Google Maps Easy Scrape

mikepowers/google-maps-easy-scrape
Try for free

No credit card required

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-13",
4    "title": "Project Playwright Crawler JavaScript",
5    "description": "Crawlee and Playwright project in JavaScript.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-crawlee-playwright-chrome"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6      "googleMapsURL": {
7        "title": "Google Maps URL",
8        "type": "string",
9        "description": "The URL of the Google Maps search results to scrape.",
10        "editor": "textfield",
11        "prefill": "https://www.google.com/maps/search/gym/@44.3267641,-84.7358592,12.73z/data=!4m2!2m1!6e1?entry=ttu"
12      }
13  },
14  "required": ["googleMapsURL"]
15}

src/main.js

1import { chromium } from 'playwright';
2import { Actor } from 'apify';
3
/**
 * Actor entry point.
 *
 * 1. Reads the `googleMapsURL` input field.
 * 2. Opens the Google Maps search page and scrolls the results pane until the
 *    end-of-list banner shows up (or scrolling stops making progress).
 * 3. Collects every place link on the page and visits the links in small
 *    parallel batches, pushing one dataset item per place.
 *
 * A place that fails to load is logged and skipped so that one bad page does
 * not discard the rest of the run.
 */
(async () => {
    // Actor.init() is asynchronous; input and storage are only safe to use once it resolves.
    await Actor.init();

    // Fetch the input and use the provided Google Maps URL from it.
    const input = await Actor.getInput();
    const googleMapsURL = input?.googleMapsURL;
    if (!googleMapsURL) {
        await Actor.fail('Input field "googleMapsURL" is required.');
        return;
    }

    const READY_SELECTOR = '[jstcache="3"]';
    const PLACE_LINK_SELECTOR = 'a[href*="https://www.google.com/maps/place"]';
    const SCROLL_PANE_XPATH = '/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]';
    const END_OF_LIST_TEXT = "You've reached the end of the list.";
    const SCROLL_STEP_PX = 500;
    const SCROLL_PAUSE_MS = 250;
    // Give up after this many consecutive scrolls that changed nothing (~10 s).
    const MAX_STALLED_SCROLLS = 40;
    const batchSize = 5;

    const sleep = (ms) => new Promise((resolve) => { setTimeout(resolve, ms); });

    // Opens one place page, extracts its details and always closes the tab.
    const scrapePlace = async (context, url) => {
        const placePage = await context.newPage();
        try {
            await placePage.goto(url);
            await placePage.waitForSelector(READY_SELECTOR);

            // NOTE: this callback runs inside the browser, so it must not
            // reference any variable from the surrounding Node.js scope.
            const details = await placePage.evaluate(() => {
                // Function for text
                const getText = (selector) => {
                    const element = document.querySelector(selector);
                    return element ? element.innerText : '';
                };

                // Function for href
                const getHref = (primarySelector, fallbackSelector) => {
                    let element = document.querySelector(primarySelector);
                    if (!element) {
                        element = document.querySelector(fallbackSelector);
                    }
                    return element && element.href ? element.href : '';
                };

                // Function for xpath
                const getTextFromXPath = (xpath) => {
                    const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                    return result.singleNodeValue ? result.singleNodeValue.innerText : '';
                };

                const companyName = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[1]/h1');
                const rating = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[1]/span[1]');
                let numberReviews = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/span[2]/span/span');
                // The review count is rendered as "(123)"; strip the parentheses.
                numberReviews = numberReviews.replace(/\(|\)/g, '');
                const category = getTextFromXPath('/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[2]/span/span/button');

                return {
                    company: companyName,
                    rating: rating,
                    reviews: numberReviews,
                    category: category,
                    address: getText('button[data-tooltip="Copy address"]'),
                    website: getHref('a[data-tooltip="Open website"]', 'a[data-tooltip="Open menu link"]'),
                    phone: getText('button[data-tooltip="Copy phone number"]')
                };
            });

            return { ...details, url };
        } finally {
            // Close the tab even when navigation or extraction throws.
            await placePage.close();
        }
    };

    // Launch browser
    console.time("Execution Time");
    const browser = await chromium.launch();
    try {
        const context = await browser.newContext();
        const page = await context.newPage();

        // Enter URL
        await page.goto(googleMapsURL);
        await page.waitForSelector(READY_SELECTOR);

        // Scroll the results pane until Google reports the end of the list.
        let stalledScrolls = 0;
        let lastScrollState = '';
        while (true) {
            const pageContent = await page.content();
            if (pageContent.includes(END_OF_LIST_TEXT)) {
                console.log("Reached the end of the list.");
                break;
            }

            const scrollState = await page.evaluate(({ xpath, step }) => {
                const scrollElement = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                if (!scrollElement) {
                    return null;
                }
                scrollElement.scrollTop += step;
                return `${scrollElement.scrollTop}:${scrollElement.scrollHeight}`;
            }, { xpath: SCROLL_PANE_XPATH, step: SCROLL_STEP_PX });

            if (scrollState === null) {
                throw new Error('Could not locate the results list to scroll; the Google Maps page layout may have changed.');
            }

            // If neither the scroll offset nor the list height changes, no more
            // results are being loaded; stop instead of spinning forever.
            stalledScrolls = scrollState === lastScrollState ? stalledScrolls + 1 : 0;
            lastScrollState = scrollState;
            if (stalledScrolls >= MAX_STALLED_SCROLLS) {
                console.warn('Results list stopped growing before the end-of-list marker appeared; continuing with the results loaded so far.');
                break;
            }

            // Let the page load the next chunk of results before checking again.
            await sleep(SCROLL_PAUSE_MS);
        }

        // Extract URLs (de-duplicated so no place is scraped twice).
        const hrefs = await page.evaluate((selector) => {
            const elements = Array.from(document.querySelectorAll(selector));
            return elements.map((element) => element.href);
        }, PLACE_LINK_SELECTOR);
        const urls = [...new Set(hrefs)];
        await page.close();

        console.log(`Number of URLs extracted: ${urls.length}`);

        // Pull info for each site
        const totalBatches = Math.ceil(urls.length / batchSize);
        for (let i = 0; i < urls.length; i += batchSize) {
            console.log(`Processing batch: ${i / batchSize + 1}/${totalBatches}`);
            const batchUrls = urls.slice(i, i + batchSize);
            const outcomes = await Promise.allSettled(batchUrls.map((url) => scrapePlace(context, url)));

            // Push data to Apify's dataset
            for (const [index, outcome] of outcomes.entries()) {
                if (outcome.status === 'fulfilled') {
                    await Actor.pushData(outcome.value);
                    console.log(`Data pushed for URL: ${outcome.value.url}`);
                } else {
                    console.error(`Error processing URL ${batchUrls[index]}: ${outcome.reason}`);
                }
            }
        }
    } finally {
        await browser.close();
        // Must run before Actor.exit(), which terminates the process.
        console.timeEnd("Execution Time");
    }

    await Actor.exit();
})();

src/main3.js

1import { chromium } from 'playwright';
2import { Actor } from 'apify';
3
/**
 * Actor entry point (variant that delegates per-place scraping to processUrl).
 *
 * Reads the `googleMapsURL` input, scrolls the search results pane until the
 * end-of-list banner appears (or scrolling stops making progress), collects
 * the place links and processes them `concurrency` at a time. A place that
 * fails is logged and skipped.
 */
(async () => {
    // Actor.init() is asynchronous; input and storage are only safe to use once it resolves.
    await Actor.init();

    // Fetch the input and use the provided Google Maps URL from it.
    const input = await Actor.getInput();
    const googleMapsURL = input?.googleMapsURL;
    if (!googleMapsURL) {
        await Actor.fail('Input field "googleMapsURL" is required.');
        return;
    }

    const SCROLL_PANE_XPATH = '/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]';
    const END_OF_LIST_TEXT = "You've reached the end of the list.";
    const SCROLL_STEP_PX = 500;
    const SCROLL_PAUSE_MS = 250;
    // Give up after this many consecutive scrolls that changed nothing (~10 s).
    const MAX_STALLED_SCROLLS = 40;
    const concurrency = 5; // Maximum number of pages to process simultaneously

    const sleep = (ms) => new Promise((resolve) => { setTimeout(resolve, ms); });

    // Launch browser in headless mode for better performance
    console.time("Execution Time");
    const browser = await chromium.launch({ headless: true });
    try {
        const context = await browser.newContext();
        const page = await context.newPage();

        // Enter URL and wait for DOM content to load
        await page.goto(googleMapsURL, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('[jstcache="3"]');

        // Scroll the results pane until Google reports the end of the list.
        let stalledScrolls = 0;
        let lastScrollState = '';
        while (true) {
            const pageContent = await page.content();
            if (pageContent.includes(END_OF_LIST_TEXT)) {
                console.log("Reached the end of the list.");
                break;
            }

            const scrollState = await page.evaluate(({ xpath, step }) => {
                const scrollElement = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                if (!scrollElement) {
                    return null;
                }
                scrollElement.scrollTop += step;
                return `${scrollElement.scrollTop}:${scrollElement.scrollHeight}`;
            }, { xpath: SCROLL_PANE_XPATH, step: SCROLL_STEP_PX });

            if (scrollState === null) {
                throw new Error('Could not locate the results list to scroll; the Google Maps page layout may have changed.');
            }

            // If neither the scroll offset nor the list height changes, no more
            // results are being loaded; stop instead of spinning forever.
            stalledScrolls = scrollState === lastScrollState ? stalledScrolls + 1 : 0;
            lastScrollState = scrollState;
            if (stalledScrolls >= MAX_STALLED_SCROLLS) {
                console.warn('Results list stopped growing before the end-of-list marker appeared; continuing with the results loaded so far.');
                break;
            }

            // Let the page load the next chunk of results before checking again.
            await sleep(SCROLL_PAUSE_MS);
        }

        const hrefs = await page.evaluate(() => {
            const elements = Array.from(document.querySelectorAll('a[href*="https://www.google.com/maps/place"]'));
            return elements.map((element) => element.href);
        });
        // De-duplicate so no place is scraped twice.
        const urls = [...new Set(hrefs)];

        await page.close();

        console.log(`Number of URLs extracted: ${urls.length}`);

        // Process URLs in parallel, `concurrency` at a time.
        for (let i = 0; i < urls.length; i += concurrency) {
            const batch = urls.slice(i, i + concurrency);
            await Promise.all(batch.map(async (url) => {
                try {
                    const details = await processUrl(url, context);
                    // Await the write so the log line is truthful and a failed
                    // push is reported here rather than as an unhandled rejection.
                    await Actor.pushData(details);
                    console.log(`Data pushed for URL: ${details.url}`);
                } catch (error) {
                    console.error(`Error processing URL ${url}: ${error}`);
                }
            }));
        }
    } finally {
        await browser.close();
        console.timeEnd("Execution Time");
    }

    await Actor.exit();
})();
73
/**
 * Open a Google Maps place page in a new tab and extract its business details.
 *
 * @param {string} url - URL of the place page to visit.
 * @param {object} context - Object exposing `newPage()`; the caller passes the
 *     result of `browser.newContext()`.
 * @returns {Promise<object>} Object with `company`, `rating`, `reviews`,
 *     `category`, `address`, `website`, `phone` (each an empty string when the
 *     element is not found) plus the `url` that was visited.
 * @throws Re-throws any navigation or extraction error after closing the tab.
 */
async function processUrl(url, context) {
    const page = await context.newPage();
    try {
        await page.goto(url, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('[jstcache="3"]');

        // Selectors to pull the information.
        // NOTE: this callback runs inside the browser, so it must not reference
        // any variable from the surrounding Node.js scope.
        const details = await page.evaluate(() => {
            // Common ancestor of the header fields in the place panel.
            const PANEL = '/html/body/div[2]/div[3]/div[8]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div/div[1]';

            // Function for text
            const getText = (selector) => {
                const element = document.querySelector(selector);
                return element ? element.innerText : '';
            };

            // Function for href
            const getHref = (primarySelector, fallbackSelector) => {
                const element = document.querySelector(primarySelector)
                    || document.querySelector(fallbackSelector);
                return element && element.href ? element.href : '';
            };

            // Function for xpath
            const getTextFromXPath = (xpath) => {
                const result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
                return result.singleNodeValue ? result.singleNodeValue.innerText : '';
            };

            // The review count is rendered as "(123)"; strip the parentheses.
            const numberReviews = getTextFromXPath(`${PANEL}/div[2]/div/div[1]/div[2]/span[2]/span/span`)
                .replace(/\(|\)/g, '');

            return {
                company: getTextFromXPath(`${PANEL}/div[1]/h1`),
                rating: getTextFromXPath(`${PANEL}/div[2]/div/div[1]/div[2]/span[1]/span[1]`),
                reviews: numberReviews,
                category: getTextFromXPath(`${PANEL}/div[2]/div/div[2]/span/span/button`),
                address: getText('button[data-tooltip="Copy address"]'),
                website: getHref('a[data-tooltip="Open website"]', 'a[data-tooltip="Open menu link"]'),
                phone: getText('button[data-tooltip="Copy phone number"]')
            };
        });

        return { ...details, url };
    } finally {
        // Always release the tab, even when navigation or extraction fails,
        // so failed pages do not pile up in the browser context.
        await page.close();
    }
}

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-playwright-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of an Apify actor.",
6    "dependencies": {
7        "apify": "^3.1.10",
8        "crawlee": "^3.5.4",
9        "playwright": "*"
10    },
11    "devDependencies": {
12        "@apify/eslint-config": "^0.4.0",
13        "eslint": "^8.50.0"
14    },
15    "scripts": {
16        "start": "node src/main.js",
17        "lint": "eslint ./src --ext .js,.jsx",
18        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
19        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1",
20        "postinstall": "npx crawlee install-playwright-browsers"
21    },
22    "author": "It's not you it's me",
23    "license": "ISC"
24}
Developer
Maintained by Community
Actor metrics
  • 74 monthly users
  • 13 stars
  • 91.0% runs succeeded
  • Created in Jan 2024
  • Modified 10 months ago