
Linkedin Email Scraper
Pricing
$10.00/month + usage
Go to Store

Linkedin Email Scraper
Linkedin Email Scraper - Scrape emails from a specific LinkedIn profile using the Google search engine
1.0 (2)
Pricing
$10.00/month + usage
46
Total users
1.7k
Monthly users
229
Runs succeeded
>99%
Issues response
20 hours
Last modified
5 days ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "Linkedin Email Scraper", "title": "Linkedin Email Scraper", "description": "Scrap Emails from Linkedin social media profile using Search Engine", "version": "0.0", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "Linkedin Email Scraper", "views": { "titles": { "title": "Linkedin Email Scraper", "transformation": { "fields": [ "Email", "title", "Description", "Detail_Link" ] }, "display": { "component": "table", "properties": { "Email": { "label": "Email", "format": "text" }, "title": { "label": "Title", "format": "text" } , "Description": { "label": "Description", "format": "text" } , "Detail_Link": { "label": "Detail_Link", "format": "text" } } } } } } }}
.actor/input_schema.json
{ "title": "Linkedin Email Scraper", "type": "object", "schemaVersion": 1, "properties": { "Keyword": { "title": "Keyword", "description": "Keyword", "type": "string", "editor": "textfield", "prefill":"TestKeyword" }, "location": { "title": "Location(optional)", "description": "Location", "type": "string", "editor": "textfield" }, "social_network": { "title": "Social Network", "description": "Social Network", "type": "string", "editor": "select", "default": "linkedin.com/", "enum": ["linkedin.com/"], "enumTitles": ["Linkedin"] }, "Country": { "title": "Country", "description": "Country", "type": "string", "editor": "select", "default": "www", "enum": ["it","www","gm","fr","sp","uk","al","ag","ar","am","as","au","aj","bg","bo","be","bh","bk","br","bu","ca","ci","ch","co","cs","hr","cu","ez","dk","ec","eg","en","fi","gg","gr","hk","hu","ic","in","id","ir","iz","ei","is","jm","ja","ke","kn","ks","ku","lg","ly","ls","lh","lu","mc","mk","my","mt","mx","md","mn","mj","mo","np","nl","nz","ni","no","pk","we","pm","pa","pe","rp","pl","po","rq","qa","ro","rs","sm","sa","sg","ri","sn","lo","si","sf","sw","sz","sy","tw","th","ts","tu","ua","ae","uy","uz","ve"], "enumTitles": ["Italy","United States","Germany","France","Spain","United Kingdom","Albania","Algeria","Argentina","Armenia","Australia","Austria","Azerbaijan","Bangladesh","Belarus","Belgium","Belize","Bosnia and Herzegovina","Brazil","Bulgaria","Canada","Chile","China","Colombia","Costa Rica","Croatia","Cuba","Czechia","Denmark","Ecuador","Egypt","Estonia","Finland","Georgia","Greece","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Israel","Jamaica","Japan","Kenya","Korea","Korea, Republic of","Kuwait","Latvia","Libya","Liechtenstein","Lithuania","Luxembourg","Macao","Macedonia","Malaysia","Malta","Mexico","Moldova, Republic of","Monaco","Montenegro","Morocco","Nepal","Netherlands","New Zealand","Nigeria","Norway","Pakistan","Palestine, State 
of","Panama","Paraguay","Peru","Philippines","Poland","Portugal","Puerto Rico","Qatar","Romania","Russia","San Marino","Saudi Arabia","Senegal","Serbia","Singapore","Slovakia","Slovenia","South Africa","Sweden","Switzerland","Syrian Arab Republic","Taiwan","Thailand","Tunisia","Turkey","Ukraine","United Arab Emirates","Uruguay","Uzbekistan","Venezuela"] }, "Email_Type": { "title": "Email Type", "description": "Email Type", "type": "string", "editor": "select", "default": "0", "enum": ["0", "1"], "enumTitles": ["Popular Emails only(@gmail.com or @yahoo.com or @hotmail.com)", "Other Private Email"] }, "Other_Email_Type": { "title": "Other Email Type(Example @domain.com)", "description": "Other Email Type(Example @domain.com)", "type": "string", "editor": "textfield" }, "proxySettings": { "title": "Proxy configuration", "type": "object", "description": "Select proxies to be used by your crawler.", "editor": "proxy", "prefill": { "useApifyProxy": false } }
}, "required": ["Keyword","social_network","Email_Type","Country"]}
src/__main__.py
"""
This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
settings. The `main()` coroutine is then executed using `asyncio.run()`.

Feel free to modify this file to suit your specific needs.
"""

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Configure loggers: both the Apify client and the Apify SDK log through a
# single stream handler using the SDK's own log formatter.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

# Execute the Actor main coroutine (this module runs via "python3 -m src",
# so this call happens at import time of the package's __main__).
asyncio.run(main())
src/main.py
1"""2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.3
4Feel free to modify this file to suit your specific needs.5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:7https://docs.apify.com/sdk/python8"""9
10from urllib.parse import urljoin11from seleniumwire import webdriver12#from selenium.webdriver.chrome.options import Options as ChromeOptions13from selenium.webdriver.common.by import By14from selenium.webdriver.common.action_chains import ActionChains15from apify import Actor16import re17from urllib.parse import urljoin18from bs4 import BeautifulSoup19import requests20import requests.exceptions21import time22import undetected_chromedriver as uc23import random24
25# To run this Actor locally, you need to have the Selenium Chromedriver installed.26# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/27# When running on the Apify platform, it is already included in the Actor's Docker image.28
29
def scrape_contact_emails(link):
    """Best-effort scrape of an email address from a website.

    Fetches the homepage at ``link`` and looks for an email address whose
    domain matches the site's own domain. If none is found there, it tries
    to locate a "Contact" page link on the homepage and searches that page
    instead.

    Args:
        link: Full site URL, expected in the form ``"http://www.example.com/"``
            (the code splits on ``"."`` and assumes at least three parts).

    Returns:
        The shortest matching email address found, or ``""`` when nothing was
        found or the site could not be fetched.
    """
    try:
        # BUGFIX: add a timeout so a dead/slow host cannot hang the whole run,
        # and do not let network errors propagate into the caller's loop.
        res = requests.get(link, timeout=15)
    except requests.exceptions.RequestException:
        return ""

    domain_parts = link.split(".")
    if len(domain_parts) < 3:
        # BUGFIX: a URL like "http://example.com" has only two dot-parts and
        # previously caused an uncaught IndexError further down.
        return ""
    mailaddr = link
    soup = BeautifulSoup(res.text, "lxml")
    anchors = soup.find_all("a")
    contact_link = ''
    final_result = ""
    try:
        # Check if there is any email address in the homepage whose domain
        # matches the site's own domain; prefer the shortest match.
        emails = soup.find_all(text=re.compile('.*@' + domain_parts[1] + '.' + domain_parts[2].replace("/", "")))
        emails.sort(key=len)
        print(emails[0].replace("\n", ""))
        final_result = emails[0]
    except Exception:
        # No email on the homepage: search for a "Contact Us" page URL.
        # (Renamed the loop variable so it no longer shadows the `link` parameter.)
        try:
            flag = 0
            for anchor in anchors:
                href = anchor.get("href")
                if not href:
                    # BUGFIX: anchors without href used to raise TypeError and
                    # abort the whole search; skip them instead.
                    continue
                if "contact" in href or "Contact" in href or "CONTACT" in href \
                        or 'contact' in anchor.text or 'Contact' in anchor.text or 'CONTACT' in anchor.text:
                    if len(href) > 2 and flag < 2:
                        flag = flag + 1
                        contact_link = href
        except Exception:
            pass

        domain = domain_parts[0] + "." + domain_parts[1] + "." + domain_parts[2]
        if len(contact_link) < len(domain):
            # Heuristic: a short href is treated as a relative contact URL.
            domain = domain + contact_link.replace("/", "")
        else:
            # A long href is treated as an absolute contact-page URL.
            domain = contact_link

        try:
            # Check if there is any email address in the Contact Us page.
            res = requests.get(domain, timeout=15)
            soup = BeautifulSoup(res.text, "lxml")
            emails = soup.find_all(text=re.compile('.*@' + mailaddr[7:].replace("/", "")))
            emails.sort(key=len)
            try:
                print(emails[0].replace("\n", ""))
                final_result = emails[0]
                return final_result
            except Exception:
                pass
        except Exception:
            pass

    # BUGFIX: the original ended with `return ""`, discarding an email that the
    # homepage scan had already stored in `final_result`.
    return final_result
async def main() -> None:
    """
    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
    and it also enhances performance in the field of web scraping significantly.

    High-level flow:
      1. Read Actor input (keyword, location, social network, country, email type, proxy).
      2. Build a Google query like `<keyword> ( @gmail.com OR ... ) site:<cc>.linkedin.com/`.
      3. Launch undetected-chromedriver, open the query, wait out captcha pages.
      4. Extract emails from each result snippet (or via `scrape_contact_emails`)
         and push unique ones to the dataset; paginate until results stop.
    """
    async with Actor:
        # --- 1. Read the Actor input ------------------------------------
        actor_input = await Actor.get_input() or {}
        Keyword_val = actor_input.get('Keyword')
        location_val = actor_input.get('location')
        social_network_val = actor_input.get('social_network')
        Country_val = actor_input.get('Country')
        Email_Type_val = actor_input.get('Email_Type')
        Other_Email_Type_val = actor_input.get('Other_Email_Type')
        proxy_settings = actor_input.get('proxySettings')
        proxy_configuration = await Actor.create_proxy_configuration(actor_proxy_input=proxy_settings)
        Actor.log.info(proxy_configuration)
        proxyurl = ''
        # A proxy URL is only requested when the user actually configured one.
        if proxy_configuration and proxy_settings:
            proxyurl = await proxy_configuration.new_url()
            Actor.log.info(proxyurl)

        # Refuse to run without a real keyword ('TestKeyword' is the schema prefill).
        if not Keyword_val:
            Actor.log.info('Please insert keyword')
            await Actor.push_data({'Email': 'Please insert keyword'})
            await Actor.exit()
            return

        if Keyword_val=='TestKeyword':
            Actor.log.info('Please insert keyword')
            await Actor.push_data({'Email': 'Please insert Your Keyword'})
            await Actor.exit()
            return

        # Country codes and display names: parallel lists, same order as the
        # `Country` enum in .actor/input_schema.json.
        l1 = ["it","www","gm","fr","sp","uk","al","ag","ar","am","as","au","aj","bg","bo","be","bh","bk","br","bu","ca","ci","ch","co","cs","hr","cu","ez","dk","ec","eg","en","fi","gg","gr","hk","hu","ic","in","id","ir","iz","ei","is","jm","ja","ke","kn","ks","ku","lg","ly","ls","lh","lu","mc","mk","my","mt","mx","md","mn","mj","mo","np","nl","nz","ni","no","pk","we","pm","pa","pe","rp","pl","po","rq","qa","ro","rs","sm","sa","sg","ri","sn","lo","si","sf","sw","sz","sy","tw","th","ts","tu","ua","ae","uy","uz","ve"]
        l2= ["Italy","United States","Germany","France","Spain","United Kingdom","Albania","Algeria","Argentina","Armenia","Australia","Austria","Azerbaijan","Bangladesh","Belarus","Belgium","Belize","Bosnia and Herzegovina","Brazil","Bulgaria","Canada","Chile","China","Colombia","Costa Rica","Croatia","Cuba","Czechia","Denmark","Ecuador","Egypt","Estonia","Finland","Georgia","Greece","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Israel","Jamaica","Japan","Kenya","Korea","Korea, Republic of","Kuwait","Latvia","Libya","Liechtenstein","Lithuania","Luxembourg","Macao","Macedonia","Malaysia","Malta","Mexico","Moldova, Republic of","Monaco","Montenegro","Morocco","Nepal","Netherlands","New Zealand","Nigeria","Norway","Pakistan","Palestine, State of","Panama","Paraguay","Peru","Philippines","Poland","Portugal","Puerto Rico","Qatar","Romania","Russia","San Marino","Saudi Arabia","Senegal","Serbia","Singapore","Slovakia","Slovenia","South Africa","Sweden","Switzerland","Syrian Arab Republic","Taiwan","Thailand","Tunisia","Turkey","Ukraine","United Arab Emirates","Uruguay","Uzbekistan","Venezuela"]
        select_index=1
        select_country='United States'
        # Map the input country code to its display name (logging only).
        for count, ele in enumerate(l1):
            if(ele==Country_val):
                select_index=count
                break

        for count, ele in enumerate(l2):
            if(count==select_index):
                select_country=ele
                break

        print(select_country)

        # --- 2. Build the Google search query ---------------------------
        concatstring = ""
        concatstring = concatstring + Keyword_val
        # Default: restrict hits to the popular free mail providers.
        option = "( @gmail.com OR @hotmail.com OR @yahoo.com)";
        if Email_Type_val=="1":
            # "Other Private Email": the user must supply a domain like "@acme.com".
            if not Other_Email_Type_val:
                Actor.log.info('Please insert Email Type Domain')
                await Actor.push_data({'Email': 'Please insert Email Type Domain'})
                await Actor.exit()
                return
            if Other_Email_Type_val.find("@") > -1:
                option = " ( " + Other_Email_Type_val + " )"
            else:
                option = " ( @" + Other_Email_Type_val + " )"
        concatstring = concatstring + option
        if location_val:
            concatstring = concatstring+ " in "+ location_val

        if social_network_val:
            concatstring = concatstring + " site:"

        # LinkedIn/Pinterest use a country subdomain (e.g. it.linkedin.com).
        if social_network_val == "linkedin.com/" or social_network_val == "pinterest.com/" :
            concatstring = concatstring + Country_val + ".";

        # Amazon uses country TLDs instead of subdomains: remap the country
        # code to Amazon's domain suffix, then rewrite "amazon.com/".
        if social_network_val == "amazon.com/" :
            if Country_val=='gm':
                Country_val='de'
            elif Country_val=='sp':
                Country_val='es'
            elif Country_val=='fr':
                Country_val='fr'
            elif Country_val=='uk':
                Country_val='co.uk'
            elif Country_val=='as':
                Country_val='com.au'
            elif Country_val=='www':
                Country_val='com'
            elif Country_val=='in':
                Country_val='in'
            elif Country_val=='be':
                Country_val='com.be'
            elif Country_val=='br':
                Country_val='com.br'
            elif Country_val=='ca':
                Country_val='ca'
            elif Country_val=='ch':
                Country_val='cn'
            elif Country_val=='eg':
                Country_val='eg'
            elif Country_val=='it':
                Country_val='it'
            elif Country_val=='ja':
                Country_val='co.jp'
            elif Country_val=='mx':
                Country_val='com.mx'
            elif Country_val=='nl':
                Country_val='nl'
            elif Country_val=='pl':
                Country_val='pl'
            elif Country_val=='sa':
                Country_val='sa'
            elif Country_val=='sn':
                Country_val='sg'
            elif Country_val=='sw':
                Country_val='se'
            elif Country_val=='tu':
                Country_val='com.tr'
            elif Country_val=='ae':
                Country_val='ae'
            elif Country_val=='ae':
                # NOTE(review): duplicate of the branch above -- unreachable.
                Country_val='ae'
            else :
                # NOTE(review): `==` is a comparison, not an assignment, so for
                # unmapped codes Country_val keeps its original input value.
                Country_val=='com'

            social_network_val=social_network_val.replace('.com','.'+Country_val)

        concatstring = concatstring + "" + social_network_val + "";

        SearchEngine='Google'
        # Pool of realistic user agents; one is picked at random per run to
        # reduce the chance of Google bot detection.
        desktop_user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.139 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            # Firefox - Windows
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
            "Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
            # Edge - Windows
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36 Edg/123.0.2420.65",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.112 Safari/537.36 Edg/122.0.2365.66",
            # Opera - Windows
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 OPR/96.0.0.0",
            # Chrome - macOS
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            # Firefox - macOS
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13.5; rv:124.0) Gecko/20100101 Firefox/124.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
            # Safari - macOS
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
            # Chrome - Linux
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.184 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            # Firefox - Linux
            "Mozilla/5.0 (X11; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
            "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
            # Safari - iMac (intel)
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
            # Edge - macOS
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36 Edg/123.0.2420.65",
            # Opera - macOS
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/96.0.0.0"
        ]

        mobile_user_agents = [
            # Chrome - Android
            "Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.105 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.140 Mobile Safari/537.36",

            # Samsung Internet - Android
            "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S911B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/24.0 Chrome/123.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 12; SAMSUNG SM-A525F) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/23.0 Chrome/121.0.0.0 Mobile Safari/537.36",

            # Chrome - Xiaomi
            "Mozilla/5.0 (Linux; Android 13; M2101K6G) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.105 Mobile Safari/537.36",

            # Safari - iPhone
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",

            # Safari - iPad
            "Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPad; CPU OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",

            # Chrome - iPhone
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) CriOS/123.0.0.0 Mobile/15E148 Safari/604.1",

            # Opera - Android
            "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36 OPR/76.0.4017.123",

            # Edge - Android
            "Mozilla/5.0 (Linux; Android 13; Pixel 6 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36 EdgA/123.0.2420.64",

            # Firefox - Android
            "Mozilla/5.0 (Android 13; Mobile; rv:124.0) Gecko/124.0 Firefox/124.0",
            "Mozilla/5.0 (Android 12; Mobile; rv:122.0) Gecko/122.0 Firefox/122.0"
        ]

        all_user_agents = desktop_user_agents + mobile_user_agents
        random_user_agent = random.choice(all_user_agents)

        qry="https://google.com/search?q="+ concatstring
        Actor.log.info(concatstring)
        # --- 3. Launch a new (undetected) Selenium Chrome WebDriver ------
        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = uc.ChromeOptions()
        chrome_options.add_argument(f'user-agent={random_user_agent}')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        #chrome_options.add_extension('./NopeCHA-CAPTCHA-Solver.crx')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        # NOTE(review): duplicate of the --disable-blink-features flag above.
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")

        seleniumwire_options={}
        if(proxyurl):
            print('apply proxy')
            # NOTE(review): this flag looks malformed -- Chrome expects
            # "--proxy-server=<url>", not " - proxy-server=<url>"; confirm the
            # proxy is actually being applied.
            chrome_options.add_argument(f" - proxy-server={proxyurl}")

        driver = uc.Chrome(options=chrome_options, use_subprocess=False,version_main = 135)
        # Hide navigator.webdriver on every new document (anti-bot measure).
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })

        # Visit the test URL to check your proxy IP
        driver.get("https://httpbin.io/ip")
        # Select the body tag containing the current IP address
        ip_address = driver.find_element(By.TAG_NAME, "body").text
        # Print your current IP
        print(ip_address)
        try:
            driver.get(qry)
        except Exception as e:
            print(e)
            # NOTE(review): pushes the exception object itself (str(e) would be
            # cleaner), and execution continues after driver.quit() here.
            await Actor.push_data({'Email': e})
            driver.quit()

        # Wait out Google's "sorry/index" captcha page (max ~100 * 3 s).
        checkount=0
        try:
            while(driver.current_url.find("sorry/index")>-1 and checkount<100):
                print('captcha')
                print(driver.current_url)
                time.sleep(3)
                checkount=checkount+1
        except Exception as e:
            print(e)

        all_users = []  # emails already pushed, used for de-duplication
        start=0         # becomes 1 once at least one result page was parsed

        if(SearchEngine=='Yahoo'):
            # Yahoo branch is a stub; only Google is implemented.
            print('yahoo')
        else:
            try:
                # --- 4. Scrape result pages until pagination ends --------
                while True:
                    # Google EU consent iframe, if present.
                    consent = driver.find_elements(By.XPATH,"//iframe[contains(@src, 'consent.google.com')]")
                    if len(consent)>0 :
                        driver.switch_to.frame(consent[0]);
                        # NOTE(review): `By.id` is not a valid attribute (should be
                        # `By.ID`); this raises AttributeError when the consent
                        # iframe appears -- swallowed by the outer except below.
                        driver.find_element(By.id("id")).click();

                    # "Accept all" popup (YouTube-style consent button).
                    popup = driver.find_elements(By.CSS_SELECTOR,"button[class='yt-spec-button-shape-next yt-spec-button-shape-next--filled yt-spec-button-shape-next--mono yt-spec-button-shape-next--size-m']");
                    if (len(popup) > 0):
                        Actor.log.info("popup")
                        for n in range(0,len(popup)):
                            if (popup[n].text=="Accept all"):
                                popup[n].click();

                    # Google's own "I agree" consent button.
                    popup_accept = driver.find_elements(By.CSS_SELECTOR,"button[id='L2AGLb']");
                    if (len(popup_accept) > 0):
                        popup_accept[0].click();
                        time.sleep(1)

                    # Wait until the consent overlay disappears (max ~100 s).
                    checkconsent=0;
                    while len(driver.find_elements(By.CSS_SELECTOR,"div[class='HTjtHe'][style*='display: block']")) > 0 and checkconsent<100:
                        Actor.log.info('checkconsent '+str(checkconsent))
                        Actor.log.info("homeurl" + driver.current_url)
                        checkconsent=checkconsent+1
                        time.sleep(1)

                    # Organic result containers ("div.tF2Cxc" on classic SERPs).
                    AllClasses = driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc")
                    print(len(AllClasses))
                    #checkconsent_youtube=0;
                    #while len(driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]")) <= 0 and checkconsent_youtube<20:
                    #    Actor.log.info('checkconsent_youtube '+str(checkconsent_youtube))
                    #    Actor.log.info("homeurl" + driver.current_url)
                    #    checkconsent_youtube=checkconsent_youtube+1
                    #    time.sleep(1)
                    homeurl = driver.current_url
                    AllClasses = driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc")
                    # YouTube/Instagram SERPs use a different container class.
                    if social_network_val=="youtube.com/" or social_network_val=="instagram.com/":
                        AllClasses = driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]")
                    Actor.log.info("homeurl" + driver.current_url)

                    # Captcha check again; note `.index()` raises ValueError when
                    # "sorry/index" is absent, which is what ends this loop.
                    checkount=0
                    try:
                        while(driver.current_url.index("sorry/index")>-1 and checkount<100):
                            print('captcha')
                            time.sleep(3)
                            checkount=checkount+1
                    except Exception as e:
                        print(e)

                    # Fallback selector used by an alternative SERP layout.
                    if(len(AllClasses)==0):
                        print('check new attribut')
                        AllClasses = driver.find_elements(By.CSS_SELECTOR,"div.fP1Qef")
                        print(len(AllClasses))
                        print(driver.find_elements(By.CSS_SELECTOR,"div[id='main']"))

                    #res = requests.get(driver.current_url)
                    #await Actor.push_data({'Email': res.text})
                    if len(AllClasses)>0:
                        start=1
                        Actor.log.info("Result" + str(len(AllClasses)))
                        for gr in range(0,len(AllClasses)):
                            try:
                                BusinessName=""
                                DetailsLink=""
                                Email=""
                                Address=""
                                # Result title + link.
                                businessdetail = AllClasses[gr].find_elements(By.CSS_SELECTOR,"h3.LC20lb")
                                if len(businessdetail) > 0:
                                    BusinessName = businessdetail[0].text;
                                    DetailsLink = businessdetail[0].find_element(By.XPATH,"parent::*").get_attribute("href")
                                if not DetailsLink:
                                    businessdetailnew = AllClasses[gr].find_elements(By.CSS_SELECTOR,"div.TbwUpd")
                                    if len(businessdetailnew) > 0:
                                        # NOTE(review): find_elements returns a list, so
                                        # .get_attribute raises AttributeError here --
                                        # swallowed by the per-result except below.
                                        DetailsLink = businessdetailnew[0].find_elements(By.XPATH,"parent::*").get_attribute("href")
                                # Result snippet (commas/semicolons normalised to '-').
                                Ele_addressdetail = AllClasses[gr].find_elements(By.CSS_SELECTOR,"div.VwiC3b")
                                if len(Ele_addressdetail) > 0:
                                    Address = Ele_addressdetail[0].text.replace(";", "-").replace(",", "-");

                                # Extract email addresses from the whole result text;
                                # if several match, the LAST one wins.
                                alltext = AllClasses[gr].text
                                match = re.findall(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[a-zA-Z]+', alltext)
                                Actor.log.info('match email '+str(len(match)))
                                if len(match)>0:
                                    for i in match:
                                        Email=i
                                        Actor.log.info('email '+Email)
                                else:
                                    # No email in the snippet: look for a website URL
                                    # and crawl it for a contact email.
                                    # NOTE(review): doubled backslashes inside a raw
                                    # string match a literal backslash -- this pattern
                                    # likely never matches; confirm intent ('\b', '\S').
                                    match_website = re.findall(r'\\b(?:https?://|www\\.)\\S+\\b', BusinessName+Address)
                                    for i in match_website:
                                        Website = "http://www."+i;
                                        Actor.log.info('Website '+Website)
                                        Email=scrape_contact_emails(Website)
                                if Email :
                                    # De-duplicate against everything pushed so far.
                                    existindb=False
                                    #Actor.log.info('Email Store '+Email)
                                    if len(all_users)>0:
                                        for item in all_users:
                                            if item['Email'] == Email :
                                                existindb=True
                                                break
                                    if existindb==False:
                                        all_users.append({'Email': Email});
                                        await Actor.push_data({'Email': Email, 'title': BusinessName,'Description':alltext,'Detail_Link':DetailsLink})

                            except Exception as err:
                                Actor.log.info(f"Unexpected {err=}, {type(err)=}")

                        # --- 5. Pagination -------------------------------
                        if len(driver.find_elements(By.CSS_SELECTOR,"a#pnnext")) > 0:
                            # Classic "Next" link.
                            Actor.log.info('Click Next')
                            str_url = driver.current_url
                            #driver.find_elements(By.CSS_SELECTOR,"a#pnnext")[0].click();
                            driver.execute_script("arguments[0].click();", driver.find_elements(By.CSS_SELECTOR,"a#pnnext")[0]);
                            time.sleep(3)
                            while driver.current_url == str_url:
                                # NOTE(review): tryresult is reset to 0 every
                                # iteration, so the >20 bailout can never trigger --
                                # this loop can spin forever if the URL never changes.
                                tryresult = 0;
                                time.sleep(1);
                                tryresult = tryresult + 1;
                                if tryresult > 20:
                                    driver.quit();
                                    break;
                        else:
                            # Continuous-scroll SERP: click "More results" and wait
                            # for the result count to grow.
                            Actor.log.info('Click Next_1')
                            AllCOUNT_PREVIOUS = len(driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc"))
                            if social_network_val=="youtube.com/" or social_network_val=="instagram.com/":
                                AllCOUNT_PREVIOUS = len(driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]"))
                            Actor.log.info('Click Next_1_T1')
                            str_url = driver.current_url
                            action = ActionChains(driver)
                            # perform the operation
                            #action.move_to_element(element).click().perform()
                            tryresult = 0;
                            if len(driver.find_elements(By.CSS_SELECTOR,"span.RVQdVd")) > 0:
                                action = ActionChains(driver)
                                selectedlink1 = driver.find_element(By.CSS_SELECTOR,"span.RVQdVd")
                                action.move_to_element(selectedlink1).click().perform()

                            time.sleep(3)
                            Actor.log.info('Click Next_1_T2')
                            AllCOUNT_Now = len(driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc"))
                            if social_network_val=="youtube.com/":
                                AllCOUNT_Now = len(driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]"))

                            # Poll (max ~20 tries) until more results have loaded.
                            while (AllCOUNT_PREVIOUS == AllCOUNT_Now):
                                time.sleep(1)
                                tryresult = tryresult + 1;
                                Actor.log.info('Click Next_1_T3_tryresult '+str(tryresult))
                                if (tryresult > 20):
                                    driver.quit();
                                    break;

                                AllCOUNT_Now = len(driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc"))
                                if social_network_val=="youtube.com/" or social_network_val=="instagram.com/" :
                                    AllCOUNT_Now = len(driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]"))
                                print('AllCOUNT_PREVIOUS'+str(AllCOUNT_PREVIOUS))
                                print('AllCOUNT_Now'+str(AllCOUNT_Now))
                                if(AllCOUNT_PREVIOUS==AllCOUNT_Now):
                                    break
                                time.sleep(3)
                    else:
                        # No results at all: report once and stop.
                        if(start==0):
                            await Actor.push_data({'Email': 'No Data Found, Due to google not respond. May be proxy problem'})
                        driver.quit();
                        break;
                    print('done')
            except Exception:
                Actor.log.exception(f'Cannot extract data from .')

        driver.quit()
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
NopeCHA-CAPTCHA-Solver.crx
Downloadrequirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
selenium ~= 4.14.0
blinker ~= 1.7.0
selenium-wire ~= 5.1.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7
requests
undetected-chromedriver