DentalPlans.com Dentist Scraper 🦷 avatar
DentalPlans.com Dentist Scraper 🦷

Pricing

$19.99/month + usage

Go to Store
DentalPlans.com Dentist Scraper 🦷

DentalPlans.com Dentist Scraper 🦷

Developed by

EasyApi

Maintained by Community

Extract detailed dentist information from DentalPlans.com search results, including practice details, contact info, and appointment availability. Perfect for healthcare research, provider analysis, and dental market insights. 🦷

5.0 (1)

Pricing

$19.99/month + usage

1

Monthly users

3

Runs succeeded

>99%

Last modified

15 days ago

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-puppeteer-chrome
5
6# Check preinstalled packages
7RUN npm ls crawlee apify puppeteer playwright
8
9# Copy just package.json and package-lock.json
10# to speed up the build using Docker layer cache.
11COPY package*.json ./
12
13# Install NPM packages, skip optional and development dependencies to
14# keep the image small. Avoid logging too much and print the dependency
15# tree for debugging
16RUN npm --quiet set progress=false \
17    && npm install --omit=dev --omit=optional \
18    && echo "Installed NPM packages:" \
19    && (npm list --omit=dev --all || true) \
20    && echo "Node.js version:" \
21    && node --version \
22    && echo "NPM version:" \
23    && npm --version \
24    && rm -r ~/.npm
25
26# Next, copy the remaining files and directories with the source code.
27# Since we do this after the NPM install, rebuilds will be really fast
28# for most source file changes.
29COPY . ./
30
31
32# Run the image.
33CMD npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "dentalplans-com-dentist-scraper",
4    "title": "DentalPlans.com Dentist Scraper",
5    "description": "Extracts dentist records from DentalPlans.com search result pages.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-start"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "DentalPlans.com Dentist Scraper Input",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "searchUrls": {
7            "title": "Search URLs",
8            "type": "array",
9            "description": "List of DentalPlans.com search result URLs to scrape",
10            "editor": "stringList",
11            "default": ["https://www.dentalplans.com/dentist-search-results/?zip=90001&searchTerm="],
12            "uniqueItems": true
13        },
14        "maxItems": {
15            "title": "Maximum Items",
16            "type": "integer",
17            "description": "Maximum number of dentist records to scrape",
18            "default": 100,
19            "minimum": 1
20        },
21        "proxyConfiguration": {
22            "title": "Proxy Configuration",
23            "type": "object",
24            "description": "Proxy settings for making requests",
25            "default": {},
26            "editor": "proxy"
27        }
28    },
29    "required": ["searchUrls"]
30}

src/main.js

1import { Actor } from 'apify';
2import { PuppeteerCrawler } from 'crawlee';
3import puppeteerExtra from 'puppeteer-extra';
4import randomUseragent from 'random-useragent';
5
6import rebrowserPuppeteer from 'rebrowser-puppeteer-core'
7
8puppeteerExtra.use(rebrowserPuppeteer)
9// puppeteerExtra.use(stealthPlugin());
10
11// const redisClient = new Redis(process.env.REDIS_URL); // credentials must come from the environment, never from source
12// let sessionid = await redisClient.get('instagram-sessionid-cookie');
13
/**
 * Scrapes dentist records from DentalPlans.com search-result pages.
 *
 * The scraper opens every search URL in a Puppeteer-driven browser, listens
 * for the JSON responses of the site's dentist-search endpoint and pushes the
 * `dentists` entries they contain to the Actor dataset, up to `maxItems`
 * records per search URL.
 */
class DentalplansDentistScraper {
    /** URL fragment identifying responses of the dentist-search endpoint. */
    static SEARCH_API_PATH = '/proxy/dentist/v1/search/';

    /** Prefix prepended to each record's `dentistDetailUrl`. */
    static PROFILE_BASE_URL = 'https://www.dentalplans.com/';

    /** Number of records pushed to the dataset per `Actor.pushData` call. */
    static BATCH_SIZE = 50;

    /**
     * Resource types whose requests are aborted instead of continued.
     * Empty on purpose; candidates are 'image', 'media', 'font', 'texttrack',
     * 'stylesheet' and 'ping'.
     */
    static BLOCKED_RESOURCE_TYPES = [];

    /** Promises of response handlers that have not settled yet. */
    pendingResponses = new Set();

    /**
     * Crawls every search URL of the input, one crawler run per URL.
     *
     * @param {object} input - Actor input.
     * @param {string[]} input.searchUrls - Search-result URLs to open.
     * @param {number} [input.maxItems=Infinity] - Maximum records saved per search URL.
     * @param {object} [input.proxyConfiguration] - Passed to `Actor.createProxyConfiguration`.
     * @returns {Promise<void>}
     * @throws {Error} When `searchUrls` is missing or empty.
     */
    async run(input) {
        const { searchUrls, maxItems = Infinity, proxyConfiguration } = input ?? {};
        if (!Array.isArray(searchUrls) || searchUrls.length === 0) {
            throw new Error('Input must contain a non-empty "searchUrls" array.');
        }
        this.maxItems = maxItems;

        const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);

        for (const searchUrl of searchUrls) {
            // Read by processResponseData() to tag every saved record.
            this.searchUrl = searchUrl;

            const crawler = new PuppeteerCrawler({
                proxyConfiguration: proxyConfig,
                launchContext: {
                    launcher: puppeteerExtra,
                    launchOptions: {
                        headless: true,
                        args: this.#buildLaunchArgs(),
                    },
                },
                requestHandlerTimeoutSecs: 3600, // 1 hour
                navigationTimeoutSecs: 300, // 5 minutes
                preNavigationHooks: [
                    async ({ page }) => {
                        // Page-level timeouts: 5 minutes each.
                        page.setDefaultNavigationTimeout(300000);
                        page.setDefaultTimeout(300000);
                        // Listeners are attached before the crawler navigates so the
                        // first page load is captured and no second page.goto() is
                        // needed. The fresh array makes the maxItems limit per URL.
                        await this.setupInterceptors(page, []);
                    },
                ],
                requestHandler: async () => {
                    console.log(`Processing ${this.searchUrl}...`);
                    // Leave the page some time to issue its search requests.
                    await this.randomDelay(2000, 3000);
                    // Do not let the page close while responses are still being saved.
                    await Promise.allSettled([...this.pendingResponses]);
                },
            });

            await crawler.run([searchUrl]);
        }
    }

    /**
     * Sets a hard-coded `__Secure-ENID` cookie for `.google.com` on the page.
     * Not called by `run()`.
     *
     * NOTE(review): the cookie value is committed to source; consider loading
     * it from configuration instead.
     *
     * @param {object} page - Puppeteer page.
     * @returns {Promise<void>}
     */
    async setCookies(page) {
        const cookies = [
            {
                name: '__Secure-ENID',
                value: '25.SE=d-L9vS6M8r24MzgdqlRwPkwZ4XuLFnylPDuaGEeggzRxH7eCqUoC-FQmW_RkU9qFB1frSGan9ohnhJWzssbPggQub_gJCkNvv91vxYoy44oYgNXdZXTJuws_pKtdXhz7bPqxkLrLHVIXCBBiV1eY2ezXD0qJIByKpoFlCdMFqcQ6SUg6ZrMyrWE_omcoueLO-NOKhM0N1xG6yghXBVOZlZWwb8MkeNN1Ef5izW_yqbFxgrQgU4EDzHpjmPb-e93ySTJNBByYZVtugHaQyOJjG9NfaXftg97fOxEGw6WH3CTzgNX7dU2dsrim37epIaIg93Z5j4EcxImW_76hO1ulyFJySedL7yva2HtUUu8rxOS63rb_iifxWH2SO8MmSOlhg11rXon00umveuBjZNjhb_bHRYNqDwUl3wPlgZ7mhf5N9lGGwU9rRJ2bxgidWB1oxh82ut9BGplTQMy8mk_XzWCwlKhqDmNi9uOJCMn34uodH4fpCqYmhjYavGP7jtkOzmvAHaepAaO5RlGRseLeMq0k3_5E6p7bs8wnEc9yOcvi',
                domain: '.google.com',
                path: '/',
                expires: new Date('2026-03-07T17:04:30.711Z').getTime() / 1000,
                secure: true,
                sameSite: 'Lax',
            },
        ];

        // Set all cookies in parallel.
        await Promise.all(cookies.map((cookie) => page.setCookie(cookie)));
    }

    /**
     * Enables request interception on the page and registers two listeners:
     * one that continues (or aborts, for blocked resource types) every request,
     * and one that feeds matching search responses to `processResponseData`.
     *
     * @param {object} page - Puppeteer page.
     * @param {object[]} interceptedData - Array that collects the saved records.
     * @returns {Promise<void>}
     */
    async setupInterceptors(page, interceptedData) {
        await page.setRequestInterception(true);

        page.on('request', (request) => {
            const isBlocked = DentalplansDentistScraper.BLOCKED_RESOURCE_TYPES
                .includes(request.resourceType());
            const resolution = isBlocked ? request.abort() : request.continue();
            // The promise is not awaited by the emitter, so a rejection has to be
            // handled here to avoid an unhandled rejection.
            Promise.resolve(resolution).catch((error) => {
                console.warn(`Could not resolve intercepted request: ${error.message}`);
            });
        });

        page.on('response', (response) => {
            const task = this.#handleResponse(response, interceptedData);
            this.pendingResponses.add(task);
            task.finally(() => this.pendingResponses.delete(task));
        });
    }

    /**
     * Saves the `dentists` entries of one search response.
     *
     * Entries are pushed to the dataset in batches of `BATCH_SIZE`. Every saved
     * record is also appended to `interceptedData`, whose length is used to
     * enforce `this.maxItems`; once the limit is reached nothing more is saved.
     *
     * @param {object} responseBody - Parsed JSON body of a search response.
     * @param {object[]} interceptedData - Records saved so far; mutated in place.
     * @returns {Promise<void>}
     */
    async processResponseData(responseBody, interceptedData) {
        const { BATCH_SIZE, PROFILE_BASE_URL } = DentalplansDentistScraper;
        const limit = this.maxItems ?? Infinity;
        const dentists = Array.isArray(responseBody?.dentists) ? responseBody.dentists : [];

        for (let start = 0; start < dentists.length; start += BATCH_SIZE) {
            const remaining = limit - interceptedData.length;
            if (remaining <= 0) {
                break;
            }

            const records = dentists
                .slice(start, start + BATCH_SIZE)
                .filter(Boolean)
                .slice(0, remaining)
                .map((item) => ({
                    searchUrl: this.searchUrl,
                    ...item,
                    // The key name is kept as-is because it is part of the dataset output.
                    profiledentistDetailUrlUrl: `${PROFILE_BASE_URL}${item.dentistDetailUrl}`,
                }));

            if (records.length === 0) {
                continue;
            }

            // Reserve the slots before awaiting so that a response processed
            // concurrently sees the updated count.
            interceptedData.push(...records);
            console.log(`Saved ${records.length} items`);
            await Actor.pushData(records);
        }
    }

    /**
     * Tells whether a request targets the dentist-search endpoint.
     *
     * @param {string} url - Request URL.
     * @param {object} request - Puppeteer request; only `method()` is used.
     * @returns {Promise<boolean>} `true` for GET requests whose URL contains
     *     the search API path.
     */
    async shouldInterceptRequest(url, request) {
        return url.includes(DentalplansDentistScraper.SEARCH_API_PATH)
            && request.method() === 'GET';
    }

    /**
     * Clicks the element matching `a[data-testid="paginator-navigation-button"]`
     * if the page contains one. Not called by `run()`.
     *
     * @param {object} page - Puppeteer page.
     * @returns {Promise<void>}
     */
    async scrollPage(page) {
        await page.evaluate(() => {
            document.querySelector('a[data-testid="paginator-navigation-button"]')?.click();
        });
    }

    /**
     * Waits for a random whole number of milliseconds between `min` and `max`
     * (both inclusive).
     *
     * @param {number} [min=1000] - Lower bound in milliseconds.
     * @param {number} [max=3000] - Upper bound in milliseconds.
     * @returns {Promise<void>}
     */
    async randomDelay(min = 1000, max = 3000) {
        const delay = Math.floor(Math.random() * (max - min + 1) + min);
        await new Promise((resolve) => setTimeout(resolve, delay));
    }

    /**
     * Builds the Chrome command-line arguments for a crawler run.
     * The `--user-agent` flag is added only when a matching user agent is found,
     * so the browser never starts with a literal "null"/"undefined" user agent.
     *
     * @returns {string[]}
     */
    #buildLaunchArgs() {
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--disable-gpu',
            '--disable-geolocation',
            '--disable-notifications',
        ];

        const userAgent = randomUseragent.getRandom(
            (ua) => ua.browserName === 'Chrome' && Number.parseFloat(ua.browserVersion) >= 80,
        );
        if (userAgent) {
            args.push(`--user-agent=${userAgent}`);
        }
        return args;
    }

    /**
     * Processes one page response; never rejects, failures are logged.
     *
     * @param {object} response - Puppeteer response.
     * @param {object[]} interceptedData - Array that collects the saved records.
     * @returns {Promise<void>}
     */
    async #handleResponse(response, interceptedData) {
        let url = '';
        try {
            const request = response.request();
            url = request.url();
            if (!(await this.shouldInterceptRequest(url, request))) {
                return;
            }
            const responseBody = await response.json();
            await this.processResponseData(responseBody, interceptedData);
        } catch (error) {
            console.warn(`Failed to process response from ${url}: ${error.message}`);
        }
    }
}
269
270
// Entry point. Actor.main() initialises the SDK, runs the callback and exits
// the process when it settles, so no separate Actor.init() call is needed.
// Example input:
//   { "searchUrls": ["https://www.dentalplans.com/dentist-search-results/?zip=90001&searchTerm="], "maxItems": 130 }
await Actor.main(async () => {
    const input = await Actor.getInput();
    if (!input) {
        // Fail with a readable message instead of a destructuring TypeError.
        throw new Error('Actor input is missing; provide at least "searchUrls".');
    }

    const crawler = new DentalplansDentistScraper();
    await crawler.run(input);
});

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.gitignore

1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json

package.json

1{
2    "name": "js-scrape-single-page",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "Apify Actor that scrapes dentist records from DentalPlans.com search results.",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10		"apify": "^3.2.5",
11		"crawlee": "^3.11.5",
12		"puppeteer": "^23.5.0",
13		"puppeteer-core": "^23.5.0",
14		"puppeteer-extra": "^3.3.6",
15		"random-useragent": "^0.5.0",
16        "rebrowser-puppeteer": "^23.10.3",
17		"rebrowser-puppeteer-core": "^23.10.3"
18    },
19    "scripts": {
20        "start": "node ./src/main.js",
21        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
22    },
23    "author": "It's not you it's me",
24    "license": "ISC"
25}

Pricing

Pricing model

Rental 

To use this Actor, you have to pay a monthly rental fee to the developer. The rent is subtracted from your prepaid usage every month after the free trial period. You also pay for the Apify platform usage.

Free trial

2 hours

Price

$19.99