
DentalPlans.com Dentist Scraper 🦷
Pricing
$19.99/month + usage
Go to Store

DentalPlans.com Dentist Scraper 🦷
Extract detailed dentist information from DentalPlans.com search results, including practice details, contact info, and appointment availability. Perfect for healthcare research, provider analysis, and dental market insights. 🦷
5.0 (1)
Pricing
$19.99/month + usage
1
Total users
3
Monthly users
3
Runs succeeded
>99%
Last modified
a month ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "dentalplans-com-dentist-scraper", "title": "DentalPlans.com Dentist Scraper", "description": "Extract detailed dentist information from DentalPlans.com search results, including practice details, contact info, and appointment availability.", "version": "0.0", "meta": { "templateId": "js-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "DentalPlans.com Dentist Scraper Input", "type": "object", "schemaVersion": 1, "properties": { "searchUrls": { "title": "Search URLs", "type": "array", "description": "List of DentalPlans.com search result URLs to scrape", "editor": "stringList", "default": ["https://www.dentalplans.com/dentist-search-results/?zip=90001&searchTerm="], "uniqueItems": true }, "maxItems": { "title": "Maximum Items", "type": "integer", "description": "Maximum number of dentist records to scrape", "default": 100, "minimum": 1 }, "proxyConfiguration": { "title": "Proxy Configuration", "type": "object", "description": "Proxy settings for making requests", "default": {}, "editor": "proxy" } }, "required": ["searchUrls"]}
src/main.js
1import { Actor } from 'apify';2import { PuppeteerCrawler } from 'crawlee';3import puppeteerExtra from 'puppeteer-extra';4import randomUseragent from 'random-useragent';5
6import rebrowserPuppeteer from 'rebrowser-puppeteer-core'7
8puppeteerExtra.use(rebrowserPuppeteer)9// puppeteerExtra.use(stealthPlugin());10
// SECURITY: a Redis connection string with embedded credentials used to be hard-coded here.
// Treat that credential as leaked: rotate it, and load any connection string from the environment.
// const redisClient = new Redis(process.env.REDIS_URL);
// let sessionid = await redisClient.get('instagram-sessionid-cookie');
/**
 * Scrapes dentist records from DentalPlans.com search-result pages.
 *
 * Each search URL is opened in a Puppeteer-driven browser. JSON responses of
 * the site's dentist-search endpoint are intercepted and their `dentists`
 * entries are pushed to the default Apify dataset.
 */
class DentalplansDentistScraper {
    /**
     * Runs one crawler per search URL, sequentially.
     *
     * @param {object} input - Actor input.
     * @param {string[]} input.searchUrls - Search-result URLs to open.
     * @param {number} [input.maxItems=Infinity] - Upper bound on the records
     *   saved per handled page (the counter lives inside the request handler).
     * @param {object} [input.proxyConfiguration] - Passed to
     *   `Actor.createProxyConfiguration`.
     * @returns {Promise<void>}
     * @throws {Error} When `searchUrls` is not an array.
     */
    async run(input) {
        const { searchUrls, maxItems, proxyConfiguration } = input ?? {};
        if (!Array.isArray(searchUrls)) {
            throw new Error('Input field "searchUrls" must be an array of URLs.');
        }
        // Stored on the instance so processResponseData can enforce the limit.
        this.maxItems = maxItems ?? Infinity;

        const proxyConfig = await Actor.createProxyConfiguration(proxyConfiguration);

        for (const searchUrl of searchUrls) {
            this.searchUrl = searchUrl;

            const launchArgs = [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--no-first-run',
                '--no-zygote',
                '--disable-gpu',
                '--disable-geolocation',
                '--disable-notifications',
            ];
            // Only override the user agent when the lookup actually found one;
            // otherwise Chrome would be started with the literal "--user-agent=null".
            const userAgent = randomUseragent.getRandom(
                (ua) => ua.browserName === 'Chrome' && parseFloat(ua.browserVersion) >= 80,
            );
            if (userAgent) {
                launchArgs.push(`--user-agent=${userAgent}`);
            }

            const crawler = new PuppeteerCrawler({
                proxyConfiguration: proxyConfig,
                launchContext: {
                    launcher: puppeteerExtra,
                    launchOptions: {
                        headless: true,
                        args: launchArgs,
                    },
                },
                requestHandlerTimeoutSecs: 3600, // 1 hour
                navigationTimeoutSecs: 300, // 5 minutes
                preNavigationHooks: [
                    async ({ page }) => {
                        // Page-level timeouts, 5 minutes each.
                        page.setDefaultNavigationTimeout(300000);
                        page.setDefaultTimeout(300000);
                    },
                ],
                requestHandler: async ({ page }) => {
                    console.log(`Processing ${this.searchUrl}...`);
                    const interceptedData = [];

                    await this.setupInterceptors(page, interceptedData);

                    // The interceptors are attached only now, so the page is
                    // loaded (again) here for its search responses to be seen.
                    await page.goto(searchUrl);

                    // Short grace period for the search responses to arrive.
                    await this.randomDelay(2000, 3000);

                    // TODO: pagination is not wired in; only responses triggered
                    // by this single page load are captured (see scrollPage).
                },
            });

            await crawler.run([searchUrl]);
        }
    }

    /**
     * Sets a hard-coded `__Secure-ENID` cookie for `.google.com` on the page.
     * Currently not invoked by `run`.
     *
     * NOTE(review): the cookie value is committed in source; consider loading
     * it from configuration instead.
     *
     * @param {object} page - Puppeteer page.
     * @returns {Promise<void>}
     */
    async setCookies(page) {
        const cookies = [
            {
                name: '__Secure-ENID',
                value: '25.SE=d-L9vS6M8r24MzgdqlRwPkwZ4XuLFnylPDuaGEeggzRxH7eCqUoC-FQmW_RkU9qFB1frSGan9ohnhJWzssbPggQub_gJCkNvv91vxYoy44oYgNXdZXTJuws_pKtdXhz7bPqxkLrLHVIXCBBiV1eY2ezXD0qJIByKpoFlCdMFqcQ6SUg6ZrMyrWE_omcoueLO-NOKhM0N1xG6yghXBVOZlZWwb8MkeNN1Ef5izW_yqbFxgrQgU4EDzHpjmPb-e93ySTJNBByYZVtugHaQyOJjG9NfaXftg97fOxEGw6WH3CTzgNX7dU2dsrim37epIaIg93Z5j4EcxImW_76hO1ulyFJySedL7yva2HtUUu8rxOS63rb_iifxWH2SO8MmSOlhg11rXon00umveuBjZNjhb_bHRYNqDwUl3wPlgZ7mhf5N9lGGwU9rRJ2bxgidWB1oxh82ut9BGplTQMy8mk_XzWCwlKhqDmNi9uOJCMn34uodH4fpCqYmhjYavGP7jtkOzmvAHaepAaO5RlGRseLeMq0k3_5E6p7bs8wnEc9yOcvi',
                domain: '.google.com',
                path: '/',
                expires: new Date('2026-03-07T17:04:30.711Z').getTime() / 1000,
                secure: true,
                sameSite: 'Lax',
            },
        ];

        // Set all cookies in parallel.
        await Promise.all(cookies.map((cookie) => page.setCookie(cookie)));
    }

    /**
     * Enables request interception on the page and registers two listeners:
     * one that resolves every outgoing request, and one that feeds matching
     * JSON responses into `processResponseData`.
     *
     * @param {object} page - Puppeteer page.
     * @param {object[]} interceptedData - Array that collects accepted records.
     * @returns {Promise<void>}
     */
    async setupInterceptors(page, interceptedData) {
        // Resource types to abort. Empty for now, so every request continues;
        // candidates: 'image', 'media', 'font', 'texttrack', 'stylesheet', 'ping'.
        const blockedResourceTypes = [];

        await page.setRequestInterception(true);

        page.on('request', async (request) => {
            // Another handler may already have resolved this request.
            if (request.isInterceptResolutionHandled?.()) {
                return;
            }
            try {
                if (blockedResourceTypes.includes(request.resourceType())) {
                    await request.abort();
                } else {
                    await request.continue();
                }
            } catch (error) {
                console.warn(`Failed to resolve intercepted request ${request.url()}: ${error.message}`);
            }
        });

        page.on('response', async (response) => {
            const request = response.request();
            const url = request.url();
            if (!(await this.shouldInterceptRequest(url, request))) {
                return;
            }
            try {
                const responseBody = await response.json();
                await this.processResponseData(responseBody, interceptedData);
            } catch (error) {
                // Best effort: a bad response must not stop the crawl, but it
                // should be visible in the log.
                console.warn(`Could not process search response ${url}: ${error.message}`);
            }
        });
    }

    /**
     * Converts the `dentists` array of a search response into dataset records
     * and pushes them in batches of 50, never letting `interceptedData` grow
     * beyond `this.maxItems`.
     *
     * Every record is pushed exactly once. Slots are reserved in
     * `interceptedData` before awaiting the push, so overlapping calls for
     * concurrent responses cannot exceed the limit together.
     *
     * @param {object} responseBody - Parsed JSON body; only `dentists` is read.
     * @param {object[]} interceptedData - Shared collector, mutated in place.
     * @returns {Promise<void>}
     */
    async processResponseData(responseBody, interceptedData) {
        const BATCH_SIZE = 50;
        const maxItems = this.maxItems ?? Infinity;
        const dentists = Array.isArray(responseBody?.dentists) ? responseBody.dentists : [];

        for (let start = 0; start < dentists.length; start += BATCH_SIZE) {
            const remaining = maxItems - interceptedData.length;
            if (remaining <= 0) {
                break;
            }

            const batch = dentists
                .slice(start, start + BATCH_SIZE)
                .filter(Boolean)
                .slice(0, remaining)
                .map((item) => ({
                    searchUrl: this.searchUrl,
                    ...item,
                    // Key spelling kept as-is: it is part of the emitted dataset schema.
                    profiledentistDetailUrlUrl: `https://www.dentalplans.com/${item.dentistDetailUrl}`,
                }));

            if (batch.length === 0) {
                continue;
            }

            interceptedData.push(...batch);
            console.log(`Saved ${batch.length} items`);
            await Actor.pushData(batch);
        }
    }

    /**
     * Tells whether a request targets the dentist-search endpoint.
     *
     * @param {string} url - Request URL.
     * @param {object} request - Puppeteer request; only `method()` is used.
     * @returns {Promise<boolean>} `true` for GET requests whose URL contains
     *   `/proxy/dentist/v1/search/`.
     */
    async shouldInterceptRequest(url, request) {
        return url.includes('/proxy/dentist/v1/search/') && request.method() === 'GET';
    }

    /**
     * Clicks the paginator navigation button, if one is present on the page.
     * Despite its name this method does not scroll.
     *
     * @param {object} page - Puppeteer page.
     * @returns {Promise<void>}
     */
    async scrollPage(page) {
        await page.evaluate(() => {
            document.querySelector('a[data-testid="paginator-navigation-button"]')?.click();
        });
    }

    /**
     * Waits for a random whole number of milliseconds in `[min, max]`.
     * Used to space out page interactions.
     *
     * @param {number} [min=1000] - Lower bound in milliseconds.
     * @param {number} [max=3000] - Upper bound in milliseconds.
     * @returns {Promise<void>}
     */
    async randomDelay(min = 1000, max = 3000) {
        const delay = Math.floor(Math.random() * (max - min + 1) + min);
        await new Promise((resolve) => setTimeout(resolve, delay));
    }
}
270
// Entry point. Actor.main() initializes the SDK, runs the callback and exits
// the process afterwards, so no separate Actor.init() call is needed.
Actor.main(async () => {
    const input = await Actor.getInput();
    const scraper = new DentalplansDentistScraper();
    await scraper.run(input);
});
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json
package.json
{ "name": "js-scrape-single-page", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "engines": { "node": ">=18.0.0" }, "dependencies": { "apify": "^3.2.5", "crawlee": "^3.11.5", "puppeteer": "^23.5.0", "puppeteer-core": "^23.5.0", "puppeteer-extra": "^3.3.6", "random-useragent": "^0.5.0", "rebrowser-puppeteer": "^23.10.3", "rebrowser-puppeteer-core": "^23.10.3" }, "scripts": { "start": "node ./src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}