
DentalPlans.com Dentist Scraper 🦷
Pricing
$19.99/month + usage
Go to Store

DentalPlans.com Dentist Scraper 🦷
Extract detailed dentist information from DentalPlans.com search results, including practice details, contact info, and appointment availability. Perfect for healthcare research, provider analysis, and dental market insights. 🦷
5.0 (1)
Pricing
$19.99/month + usage
1
Monthly users
3
Runs succeeded
>99%
Last modified
15 days ago
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-puppeteer-chrome
5
6# Check preinstalled packages
7RUN npm ls crawlee apify puppeteer playwright
8
9# Copy just package.json and package-lock.json
10# to speed up the build using Docker layer cache.
11COPY package*.json ./
12
13# Install NPM packages, skip optional and development dependencies to
14# keep the image small. Avoid logging too much and print the dependency
15# tree for debugging
16RUN npm --quiet set progress=false \
17 && npm install --omit=dev --omit=optional \
18 && echo "Installed NPM packages:" \
19 && (npm list --omit=dev --all || true) \
20 && echo "Node.js version:" \
21 && node --version \
22 && echo "NPM version:" \
23 && npm --version \
24 && rm -r ~/.npm
25
26# Next, copy the remaining files and directories with the source code.
27# Since we do this after NPM install, quick build will be really fast
28# for most source file changes.
29COPY . ./
30
31
32# Run the image.
33CMD npm start --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "dentalplans-com-dentist-scraper",
4 "title": "DentalPlans.com Dentist Scraper",
5 "description": "Extracts dentist records from DentalPlans.com search result pages.",
6 "version": "0.0",
7 "meta": {
8 "templateId": "js-start"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile"
12}
.actor/input_schema.json
1{
2 "title": "DentalPlans.com Dentist Scraper Input",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "searchUrls": {
7 "title": "Search URLs",
8 "type": "array",
9 "description": "List of DentalPlans.com search result URLs to scrape",
10 "editor": "stringList",
11 "default": ["https://www.dentalplans.com/dentist-search-results/?zip=90001&searchTerm="],
12 "uniqueItems": true
13 },
14 "maxItems": {
15 "title": "Maximum Items",
16 "type": "integer",
17 "description": "Maximum number of dentist records to scrape",
18 "default": 100,
19 "minimum": 1
20 },
21 "proxyConfiguration": {
22 "title": "Proxy Configuration",
23 "type": "object",
24 "description": "Proxy settings for making requests",
25 "default": {},
26 "editor": "proxy"
27 }
28 },
29 "required": ["searchUrls"]
30}
src/main.js
1import { Actor } from 'apify';
2import { PuppeteerCrawler } from 'crawlee';
3import puppeteerExtra from 'puppeteer-extra';
4import randomUseragent from 'random-useragent';
5
6import rebrowserPuppeteer from 'rebrowser-puppeteer-core'
7
// Registers the imported rebrowser-puppeteer-core module with puppeteer-extra.
// NOTE(review): use() is normally given a plugin instance, while the value passed here is
// imported as a Puppeteer build — confirm this call has the intended effect (wrapping the
// module with puppeteer-extra's addExtra() may be what was meant).
8puppeteerExtra.use(rebrowserPuppeteer)
9// puppeteerExtra.use(stealthPlugin());
10
11// const redisClient = new Redis(process.env.REDIS_URL); // connection string with embedded credentials redacted; load it from an environment variable or secret store
12// let sessionid = await redisClient.get('instagram-sessionid-cookie');
13
/**
 * Scrapes dentist records from DentalPlans.com search result pages.
 *
 * Each search URL is opened in a Puppeteer-driven browser. JSON responses from
 * URLs containing `/proxy/dentist/v1/search/` are captured from the page's
 * network traffic and their `dentists` entries are pushed to the Actor dataset.
 */
class DentalplansDentistScraper {
    /**
     * Processes every search URL from the input, one crawler run per URL.
     *
     * @param {object} input Actor input.
     * @param {string[]} input.searchUrls Search result URLs to open.
     * @param {number} [input.maxItems=Infinity] Maximum number of records stored
     *   per processed search URL.
     * @param {object} [input.proxyConfiguration] Passed to `Actor.createProxyConfiguration`.
     * @returns {Promise<void>}
     * @throws {Error} When `searchUrls` is missing or empty.
     */
    async run(input) {
        const { searchUrls, maxItems, proxyConfiguration } = input ?? {};
        if (!Array.isArray(searchUrls) || searchUrls.length === 0) {
            throw new Error('Input must contain a non-empty "searchUrls" array.');
        }
        // `??` (rather than a destructuring default) also covers an explicit `null`.
        this.maxItems = maxItems ?? Infinity;

        const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);

        for (const searchUrl of searchUrls) {
            // Read by processResponseData() to tag every stored record.
            this.searchUrl = searchUrl;

            // getRandom() may find no match; only override the user agent when it did,
            // otherwise the browser would be launched with "--user-agent=null".
            const userAgent = randomUseragent.getRandom(
                (ua) => ua.browserName === 'Chrome' && parseFloat(ua.browserVersion) >= 80,
            );

            const crawler = new PuppeteerCrawler({
                proxyConfiguration: proxyConfig,
                launchContext: {
                    launcher: puppeteerExtra,
                    launchOptions: {
                        headless: true,
                        args: [
                            '--no-sandbox',
                            '--disable-setuid-sandbox',
                            '--disable-dev-shm-usage',
                            '--disable-accelerated-2d-canvas',
                            '--no-first-run',
                            '--no-zygote',
                            '--disable-gpu',
                            '--disable-geolocation',
                            '--disable-notifications',
                            ...(userAgent ? [`--user-agent=${userAgent}`] : []),
                        ],
                    },
                },
                requestHandlerTimeoutSecs: 3600, // 1 hour
                navigationTimeoutSecs: 300, // 5 minutes
                preNavigationHooks: [
                    async ({ page }) => {
                        // Page-level timeouts: 5 minutes each.
                        page.setDefaultNavigationTimeout(300000);
                        page.setDefaultTimeout(300000);
                    },
                ],
                requestHandler: async ({ page }) => {
                    console.log(`Processing ${searchUrl}...`);
                    const interceptedData = [];

                    const waitForPendingResponses = await this.setupInterceptors(page, interceptedData);

                    // The listeners are attached only now, so the page is (re)loaded here
                    // to make the search API responses observable.
                    await page.goto(searchUrl);
                    await this.randomDelay(2000, 3000);

                    // Let responses that are still being parsed/stored finish before the
                    // handler returns.
                    await waitForPendingResponses();

                    // NOTE(review): pagination via scrollPage() is not performed here, so
                    // only responses triggered by the page load itself are captured.
                },
            });

            await crawler.run([searchUrl]);
        }
    }

    /**
     * Sets a fixed list of cookies on the page.
     *
     * NOTE(review): the only cookie is a hard-coded value for `.google.com`, and
     * nothing in this class calls this method — confirm whether it is still needed.
     *
     * @param {object} page Puppeteer page.
     * @returns {Promise<void>}
     */
    async setCookies(page) {
        const cookies = [
            {
                name: '__Secure-ENID',
                value: '25.SE=d-L9vS6M8r24MzgdqlRwPkwZ4XuLFnylPDuaGEeggzRxH7eCqUoC-FQmW_RkU9qFB1frSGan9ohnhJWzssbPggQub_gJCkNvv91vxYoy44oYgNXdZXTJuws_pKtdXhz7bPqxkLrLHVIXCBBiV1eY2ezXD0qJIByKpoFlCdMFqcQ6SUg6ZrMyrWE_omcoueLO-NOKhM0N1xG6yghXBVOZlZWwb8MkeNN1Ef5izW_yqbFxgrQgU4EDzHpjmPb-e93ySTJNBByYZVtugHaQyOJjG9NfaXftg97fOxEGw6WH3CTzgNX7dU2dsrim37epIaIg93Z5j4EcxImW_76hO1ulyFJySedL7yva2HtUUu8rxOS63rb_iifxWH2SO8MmSOlhg11rXon00umveuBjZNjhb_bHRYNqDwUl3wPlgZ7mhf5N9lGGwU9rRJ2bxgidWB1oxh82ut9BGplTQMy8mk_XzWCwlKhqDmNi9uOJCMn34uodH4fpCqYmhjYavGP7jtkOzmvAHaepAaO5RlGRseLeMq0k3_5E6p7bs8wnEc9yOcvi',
                domain: '.google.com',
                path: '/',
                expires: new Date('2026-03-07T17:04:30.711Z').getTime() / 1000,
                secure: true,
                sameSite: 'Lax',
            },
        ];

        // Set all cookies in parallel.
        await Promise.all(cookies.map((cookie) => page.setCookie(cookie)));
    }

    /**
     * Enables request interception on the page and attaches the request and
     * response listeners that feed `interceptedData`.
     *
     * @param {object} page Puppeteer page.
     * @param {object[]} interceptedData Array that captured records are appended to.
     * @returns {Promise<() => Promise<void>>} Function that resolves once every
     *   response handler started so far has finished.
     */
    async setupInterceptors(page, interceptedData) {
        // Resource types to abort, e.g. 'image', 'media', 'font', 'texttrack',
        // 'stylesheet', 'ping'. Empty: every request is allowed through.
        const blockedResourceTypes = [];
        const inFlight = new Set();

        await page.setRequestInterception(true);

        page.on('request', (request) => {
            // Another listener may already have resolved this request.
            if (request.isInterceptResolutionHandled?.()) {
                return;
            }
            const outcome = blockedResourceTypes.includes(request.resourceType())
                ? request.abort()
                : request.continue();
            // Never leave the promise floating: a rejection here must not become an
            // unhandled rejection.
            Promise.resolve(outcome).catch((error) => {
                console.warn(`Could not resolve intercepted request: ${error?.message ?? error}`);
            });
        });

        page.on('response', (response) => {
            const task = this.#captureResponse(response, interceptedData)
                .finally(() => inFlight.delete(task));
            inFlight.add(task);
        });

        return async () => {
            // Loop because new responses may arrive while earlier ones are awaited.
            while (inFlight.size > 0) {
                await Promise.all(inFlight);
            }
        };
    }

    /**
     * Parses one response as JSON and stores its records if the request matches
     * the search API. Failures are logged and never thrown (best effort).
     *
     * @param {object} response Puppeteer HTTP response.
     * @param {object[]} interceptedData Array that captured records are appended to.
     * @returns {Promise<void>}
     */
    async #captureResponse(response, interceptedData) {
        try {
            const request = response.request();
            if (!(await this.shouldInterceptRequest(request.url(), request))) {
                return;
            }
            const responseBody = await response.json();
            await this.processResponseData(responseBody, interceptedData);
        } catch (error) {
            console.warn(`Skipping search response: ${error?.message ?? error}`);
        }
    }

    /**
     * Stores the `dentists` entries of a response body, 50 records per dataset
     * push, without exceeding `this.maxItems` records in `interceptedData`.
     *
     * Every record gets the current `searchUrl` and a
     * `profiledentistDetailUrlUrl` field built from its `dentistDetailUrl`.
     *
     * @param {object} responseBody Parsed JSON body of a search response.
     * @param {object[]} interceptedData Records captured so far; appended in place.
     * @returns {Promise<void>}
     */
    async processResponseData(responseBody, interceptedData) {
        const BATCH_SIZE = 50;
        const limit = this.maxItems ?? Infinity;
        const dentists = Array.isArray(responseBody?.dentists) ? responseBody.dentists : [];

        for (let start = 0; start < dentists.length; start += BATCH_SIZE) {
            // Re-read on every iteration: another response may have added records
            // while the previous push was awaited.
            const remaining = limit - interceptedData.length;
            if (remaining <= 0) {
                break;
            }

            // A fresh array per batch, so each record is pushed to the dataset once.
            const batch = dentists
                .slice(start, start + BATCH_SIZE)
                .filter(Boolean)
                .slice(0, remaining)
                .map((item) => ({
                    searchUrl: this.searchUrl,
                    ...item,
                    profiledentistDetailUrlUrl: `https://www.dentalplans.com/${item.dentistDetailUrl}`,
                }));

            if (batch.length === 0) {
                continue;
            }

            interceptedData.push(...batch);
            console.log(`Saved ${batch.length} items`);
            await Actor.pushData(batch);
        }
    }

    /**
     * Tells whether a request targets the dentist search API.
     *
     * @param {string} url Request URL.
     * @param {object} request Puppeteer HTTP request.
     * @returns {Promise<boolean>} True for GET requests whose URL contains
     *   `/proxy/dentist/v1/search/`.
     */
    async shouldInterceptRequest(url, request) {
        return url.includes('/proxy/dentist/v1/search/') && request.method() === 'GET';
    }

    /**
     * Clicks the paginator navigation button on the page, if one is present.
     *
     * @param {object} page Puppeteer page.
     * @returns {Promise<void>}
     */
    async scrollPage(page) {
        await page.evaluate(() => {
            document.querySelector('a[data-testid="paginator-navigation-button"]')?.click();
        });
    }

    /**
     * Waits for a random whole number of milliseconds between `min` and `max`
     * (both inclusive).
     *
     * @param {number} [min=1000]
     * @param {number} [max=3000]
     * @returns {Promise<void>}
     */
    async randomDelay(min = 1000, max = 3000) {
        const delay = Math.floor(Math.random() * (max - min + 1) + min);
        await new Promise((resolve) => setTimeout(resolve, delay));
    }
}
269
270
// Entry point. Actor.main() initializes the Actor before invoking the callback and
// exits it afterwards, so no separate Actor.init() call is needed; awaiting it keeps
// the returned promise from floating.
await Actor.main(async () => {
    const input = await Actor.getInput();
    const scraper = new DentalplansDentistScraper();
    await scraper.run(input);
});
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.gitignore
1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json
package.json
1{
2 "name": "js-scrape-single-page",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "Apify Actor that scrapes dentist listings from DentalPlans.com search results.",
6 "engines": {
7 "node": ">=18.0.0"
8 },
9 "dependencies": {
10 "apify": "^3.2.5",
11 "crawlee": "^3.11.5",
12 "puppeteer": "^23.5.0",
13 "puppeteer-core": "^23.5.0",
14 "puppeteer-extra": "^3.3.6",
15 "random-useragent": "^0.5.0",
16 "rebrowser-puppeteer": "^23.10.3",
17 "rebrowser-puppeteer-core": "^23.10.3"
18 },
19 "scripts": {
20 "start": "node ./src/main.js",
21 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
22 },
23 "author": "It's not you it's me",
24 "license": "ISC"
25}
Pricing
Pricing model
Rental: To use this Actor, you have to pay a monthly rental fee to the developer. The rent is subtracted from your prepaid usage every month after the free trial period. You also pay for the Apify platform usage.
Free trial
2 hours
Price
$19.99