1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10from urllib.parse import urljoin
11from seleniumwire import webdriver
12
13from selenium.webdriver.common.by import By
14from selenium.webdriver.common.action_chains import ActionChains
15from apify import Actor
16import re
17from urllib.parse import urljoin
18from bs4 import BeautifulSoup
19import requests
20import requests.exceptions
21import time
22import undetected_chromedriver as uc
23import random
24from .datascrapify import StartProcess, parse_proxy_url,on_aborting_event
25from apify_shared.consts import ActorEventTypes as Event
26import asyncio
27from urllib.parse import urlparse, urlsplit, urlunsplit, parse_qs, urlencode
28
29
30
31
32
33
def scrape_contact_emails(link):
    """Best-effort extraction of a contact e-mail address from *link*.

    Strategy:
      1. Fetch the page and search text nodes for ``...@<name>.<tld>`` built
         from the URL itself.  The shortest match is returned (usually the
         cleanest address).
      2. If nothing matches, follow a "contact"-looking link on the page and
         repeat the search there, this time matching ``...@<host>``.

    Returns the matched e-mail string, or ``""`` when nothing could be
    extracted.  All network / parse failures are swallowed so that one bad
    site never aborts the surrounding crawl.

    NOTE(review): the domain heuristics assume URLs shaped like
    ``http://www.<name>.<tld>/`` (callers build links with an ``http://``
    prefix); other shapes fall through to the ``""`` result.
    """
    mailaddr = link
    parts = link.split(".")  # e.g. ["http://www", "example", "com/"]
    contact_link = ''

    try:
        # timeout: never let a single slow site hang the whole run
        res = requests.get(link, timeout=30)
    except requests.exceptions.RequestException:
        return ""

    soup = BeautifulSoup(res.text, "lxml")
    anchors = soup.find_all("a")

    # --- pass 1: look for an address on the landing page itself -----------
    try:
        # re.escape: the domain text may contain regex metacharacters, and
        # '.' must match literally (the original pattern treated it as a
        # wildcard and could raise re.error on odd input).
        pattern = '.*@' + re.escape(parts[1]) + r'\.' + re.escape(parts[2].replace("/", ""))
        emails = soup.find_all(text=re.compile(pattern))
        emails.sort(key=len)
        print(emails[0].replace("\n", ""))
        # BUGFIX: the original fell through after a successful match and the
        # result was discarded (function ended with `return ""`).
        return emails[0]
    except (IndexError, re.error):
        pass

    # --- pass 2: locate a "contact" page to crawl -------------------------
    flag = 0
    for anchor in anchors:  # renamed: the original shadowed the `link` parameter
        href = anchor.get("href")
        if not href:
            # BUGFIX: an anchor without href used to raise TypeError and the
            # bare except aborted the whole scan.
            continue
        if 'contact' in href.lower() or 'contact' in anchor.text.lower():
            # keep at most the second qualifying link, mirroring the original
            # flag < 2 behaviour
            if len(href) > 2 and flag < 2:
                flag = flag + 1
                contact_link = href

    # Rebuild the base domain and resolve the (possibly relative) contact link.
    try:
        base = parts[0] + "." + parts[1] + "." + parts[2]
    except IndexError:
        return ""
    if len(contact_link) < len(base):
        target = base + contact_link.replace("/", "")
    else:
        target = contact_link

    try:
        res = requests.get(target, timeout=30)
        soup = BeautifulSoup(res.text, "lxml")
        # mailaddr[7:] strips the "http://" prefix, leaving the host part.
        emails = soup.find_all(text=re.compile('.*@' + re.escape(mailaddr[7:].replace("/", ""))))
        emails.sort(key=len)
        if emails:
            print(emails[0].replace("\n", ""))
            return emails[0]
    except Exception:
        # best-effort: any failure on the contact page means "no email found"
        pass

    return ""
83
async def main() -> None:
    """
    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
    and it also enhances performance in the field of web scraping significantly.

    Flow:
      1. Read and validate the actor input (keyword, country, email type, ...).
      2. Build a search query string (optionally scoped with `site:` and an
         e-mail-domain filter).
      3. Scrape Google result pages with `requests`, pushing de-duplicated
         e-mail addresses to the dataset.
      4. Repeat the same extraction against Yahoo search.
      5. Exit.  A legacy Selenium/undetected-chromedriver path follows the
         exit call; it is effectively dead code (see NOTE below) but is kept
         with its defects fixed.
    """
    async with Actor:

        # ------------------------------------------------------------ input
        actor_input = await Actor.get_input() or {}
        Keyword_val = actor_input.get('Keyword')
        location_val = actor_input.get('location')
        social_network_val = actor_input.get('social_network')
        Country_val = actor_input.get('Country')
        Email_Type_val = actor_input.get('Email_Type')
        Other_Email_Type_val = actor_input.get('Other_Email_Type')
        proxy_settings = actor_input.get('proxySettings')
        Limit_val = actor_input.get('Limit')

        # A Google-SERP group proxy is always created; when the user supplied
        # explicit proxy settings a fresh session URL is requested again.
        proxy_configuration = await Actor.create_proxy_configuration(groups=['GOOGLE_SERP'])
        proxyurl = await proxy_configuration.new_url()
        if proxy_configuration and proxy_settings:
            proxyurl = await proxy_configuration.new_url()

        if not Keyword_val:
            Actor.log.info('Please insert keyword')
            await Actor.push_data({'Email': 'Please insert keyword'})
            await Actor.exit()
            return

        if Keyword_val == 'TestKeyword':
            Actor.log.info('Please insert keyword')
            await Actor.push_data({'Email': 'Please insert Your Keyword'})
            await Actor.exit()
            return
        '''
        me = await Actor.apify_client.user('me').get()
        username=me["username"]
        isPaying='PayingUser'
        if(me["isPaying"]==False):
            isPaying='FreeUser'

        try:
            proxyurl=''
            USE_Proxy=False
            if proxyurl:
                USE_Proxy=True
            my_proxy_settings = parse_proxy_url(proxyurl)
            # Call the function from the imported module
            await StartProcess(
                "Apify_camp_LinkedinEmail_"+username+"_"+isPaying,
                "ALLINONE",
                Keyword_val,
                location_val,
                social_network_val,
                Country_val,
                "Google",
                USE_Proxy,
                my_proxy_settings
            )

        finally:
            # Code that always executes (e.g., cleanup)
            print("This block always runs, regardless of exceptions.")

        await Actor.exit();
        '''

        # -------------------------------------------- country code -> name map
        # l1 holds the input-schema country codes; l2 holds the matching
        # human-readable names (parallel lists, matched by index).
        l1 = ["it","www","gm","fr","sp","uk","al","ag","ar","am","as","au","aj","bg","bo","be","bh","bk","br","bu","ca","ci","ch","co","cs","hr","cu","ez","dk","ec","eg","en","fi","gg","gr","hk","hu","ic","in","id","ir","iz","ei","is","jm","ja","ke","kn","ks","ku","lg","ly","ls","lh","lu","mc","mk","my","mt","mx","md","mn","mj","mo","np","nl","nz","ni","no","pk","we","pm","pa","pe","rp","pl","po","rq","qa","ro","rs","sm","sa","sg","ri","sn","lo","si","sf","sw","sz","sy","tw","th","ts","tu","ua","ae","uy","uz","ve"]
        l2 = ["Italy","United States","Germany","France","Spain","United Kingdom","Albania","Algeria","Argentina","Armenia","Australia","Austria","Azerbaijan","Bangladesh","Belarus","Belgium","Belize","Bosnia and Herzegovina","Brazil","Bulgaria","Canada","Chile","China","Colombia","Costa Rica","Croatia","Cuba","Czechia","Denmark","Ecuador","Egypt","Estonia","Finland","Georgia","Greece","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Israel","Jamaica","Japan","Kenya","Korea","Korea, Republic of","Kuwait","Latvia","Libya","Liechtenstein","Lithuania","Luxembourg","Macao","Macedonia","Malaysia","Malta","Mexico","Moldova, Republic of","Monaco","Montenegro","Morocco","Nepal","Netherlands","New Zealand","Nigeria","Norway","Pakistan","Palestine, State of","Panama","Paraguay","Peru","Philippines","Poland","Portugal","Puerto Rico","Qatar","Romania","Russia","San Marino","Saudi Arabia","Senegal","Serbia","Singapore","Slovakia","Slovenia","South Africa","Sweden","Switzerland","Syrian Arab Republic","Taiwan","Thailand","Tunisia","Turkey","Ukraine","United Arab Emirates","Uruguay","Uzbekistan","Venezuela"]
        select_index = 1            # default index -> "United States"
        select_country = 'United States'
        for count, ele in enumerate(l1):
            if ele == Country_val:
                select_index = count
                break

        for count, ele in enumerate(l2):
            if count == select_index:
                select_country = ele
                break

        print(select_country)

        # -------------------------------------------------- build search query
        concatstring = ""
        concatstring = concatstring + Keyword_val
        # Default e-mail-domain filter; replaced when the user picked a
        # custom domain (Email_Type "1").
        option = "( @gmail.com OR @hotmail.com OR @yahoo.com)"
        if Email_Type_val == "1":
            if not Other_Email_Type_val:
                Actor.log.info('Please insert Email Type Domain')
                await Actor.push_data({'Email': 'Please insert Email Type Domain'})
                await Actor.exit()
                return
            if Other_Email_Type_val.find("@") > -1:
                option = " ( " + Other_Email_Type_val + " )"
            else:
                option = " ( @" + Other_Email_Type_val + " )"
        concatstring = concatstring + option
        if location_val:
            concatstring = concatstring + " in " + location_val

        if social_network_val:
            concatstring = concatstring + " site:"

            if social_network_val == "linkedin.com/" or social_network_val == "pinterest.com/":
                # country-specific subdomain, e.g. site:uk.linkedin.com/
                concatstring = concatstring + Country_val + "."

            if social_network_val == "amazon.com/":
                # Map internal country codes to Amazon marketplace TLDs.
                if Country_val == 'gm':
                    Country_val = 'de'
                elif Country_val == 'sp':
                    Country_val = 'es'
                elif Country_val == 'fr':
                    Country_val = 'fr'
                elif Country_val == 'uk':
                    Country_val = 'co.uk'
                elif Country_val == 'as':
                    Country_val = 'com.au'
                elif Country_val == 'www':
                    Country_val = 'com'
                elif Country_val == 'in':
                    Country_val = 'in'
                elif Country_val == 'be':
                    Country_val = 'com.be'
                elif Country_val == 'br':
                    Country_val = 'com.br'
                elif Country_val == 'ca':
                    Country_val = 'ca'
                elif Country_val == 'ch':
                    Country_val = 'cn'
                elif Country_val == 'eg':
                    Country_val = 'eg'
                elif Country_val == 'it':
                    Country_val = 'it'
                elif Country_val == 'ja':
                    Country_val = 'co.jp'
                elif Country_val == 'mx':
                    Country_val = 'com.mx'
                elif Country_val == 'nl':
                    Country_val = 'nl'
                elif Country_val == 'pl':
                    Country_val = 'pl'
                elif Country_val == 'sa':
                    Country_val = 'sa'
                elif Country_val == 'sn':
                    Country_val = 'sg'
                elif Country_val == 'sw':
                    Country_val = 'se'
                elif Country_val == 'tu':
                    Country_val = 'com.tr'
                elif Country_val == 'ae':
                    Country_val = 'ae'
                # (duplicate 'ae' branch removed)
                else:
                    # BUGFIX: was `Country_val == 'com'` — a no-op comparison,
                    # so unknown codes were never mapped to the default.
                    Country_val = 'com'

                social_network_val = social_network_val.replace('.com', '.' + Country_val)

            concatstring = concatstring + "" + social_network_val + ""

        SearchEngine = 'Google'
        desktop_user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.139 Safari/537.36",
                               "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",

                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
                               "Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0",
                               "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",

                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36 Edg/123.0.2420.65",
                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.112 Safari/537.36 Edg/122.0.2365.66",

                               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 OPR/96.0.0.0",

                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",

                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 13.5; rv:124.0) Gecko/20100101 Firefox/124.0",
                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",

                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",

                               "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.184 Safari/537.36",
                               "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",

                               "Mozilla/5.0 (X11; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
                               "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
                               "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",

                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",

                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36 Edg/123.0.2420.65",

                               "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/96.0.0.0"
                               ]

        mobile_user_agents = [

            "Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.105 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.140 Mobile Safari/537.36",

            "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S911B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/24.0 Chrome/123.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 12; SAMSUNG SM-A525F) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/23.0 Chrome/121.0.0.0 Mobile Safari/537.36",

            "Mozilla/5.0 (Linux; Android 13; M2101K6G) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.105 Mobile Safari/537.36",

            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",

            "Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPad; CPU OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",

            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) CriOS/123.0.0.0 Mobile/15E148 Safari/604.1",

            "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36 OPR/76.0.4017.123",

            "Mozilla/5.0 (Linux; Android 13; Pixel 6 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36 EdgA/123.0.2420.64",

            "Mozilla/5.0 (Android 13; Mobile; rv:124.0) Gecko/124.0 Firefox/124.0",
            "Mozilla/5.0 (Android 12; Mobile; rv:122.0) Gecko/122.0 Firefox/122.0"
        ]

        all_user_agents = desktop_user_agents + mobile_user_agents
        random_user_agent = random.choice(all_user_agents)

        qry = "https://google.com/search?q=" + concatstring
        Actor.log.info(concatstring)

        # ---------------------------------------------- Google (requests) pass
        all_users = []          # de-duplication store: [{'Email': ...}, ...]
        query = concatstring
        max_pages = 100
        results = []
        count_result = 0
        has_next = True
        for page in range(max_pages):
            try:
                start = page * 10
                Actor.log.info('Check Page ' + str(page))
                url = f"http://www.google.com/search?q={query}&num=10&hl=en&start={start}"

                proxies = None
                if proxyurl:
                    proxies = {'http': proxyurl, 'https': proxyurl}

                response = requests.get(
                    url,
                    proxies=proxies,
                    timeout=30,  # BUGFIX: no timeout could hang the run forever
                    headers={
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/120.0.0.0 Safari/537.36"
                    }
                )

                if response.status_code != 200:
                    Actor.log.warning(f"⚠️ Request failed: {response.status_code} {response.reason}")
                    continue

                soup = BeautifulSoup(response.text, "html.parser")
                result_blocks = soup.select("div.g, div.tF2Cxc")

                for j, block in enumerate(result_blocks):
                    Email = ''
                    title_el = block.select_one("h3")
                    link_el = block.select_one("a")
                    snippet_el = block.select_one(".VwiC3b, .IsZvec, .aCOpRe")

                    # BUGFIX: a block missing any of these elements used to raise
                    # AttributeError, which the outer bare except turned into a
                    # silent abort of the whole Google pass.
                    if title_el is None or link_el is None:
                        continue

                    title = title_el.get_text(strip=True)
                    url = link_el.get("href")
                    snippet = snippet_el.get_text(strip=True) if snippet_el else ''
                    print(title)
                    alltext = title + snippet
                    match = re.findall(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[a-zA-Z]+', alltext)
                    Actor.log.info('match email ' + str(len(match)))
                    if len(match) > 0:
                        for i in match:
                            Email = i
                            Actor.log.info('email ' + Email)
                    else:
                        # BUGFIX: pattern was r'\\b(?:https?://|www\\.)\\S+\\b' —
                        # doubled backslashes in a raw string match a literal
                        # "\b", so the website fallback never matched anything.
                        match_website = re.findall(r'\b(?:https?://|www\.)\S+\b', alltext)
                        for i in match_website:
                            # BUGFIX: unconditional "http://www." + i mangled
                            # matches that already carry a scheme.
                            Website = i if i.startswith("http") else "http://" + i
                            Actor.log.info('Website ' + Website)
                            # NOTE(review): blocking call inside the event loop
                            Email = scrape_contact_emails(Website)
                    if Email:
                        existindb = False

                        if len(all_users) > 0:
                            for item in all_users:
                                if item['Email'] == Email:
                                    existindb = True
                                    break

                        if existindb == False:
                            all_users.append({'Email': Email})
                            await Actor.push_data({'Email': Email, 'title': title, 'Description': alltext, 'Detail_Link': url})
                            count_result = count_result + 1
                            if (Limit_val):
                                if Limit_val != '0':
                                    if (count_result >= int(Limit_val)):
                                        has_next = False
                                        print('Limit Exceed')
                                        break

                if has_next == False:
                    break
                await asyncio.sleep(random.uniform(1.5, 3.0))
            except Exception as exc:
                # BUGFIX: was a silent bare `except: break`.
                Actor.log.warning(f'Google pass aborted: {exc}')
                break

        # --------------------------------------------------------- Yahoo pass
        print('Check in Yahoo Search Engine')
        query = concatstring.replace("OR", "or")

        for page in range(max_pages):
            try:
                if has_next == False:
                    break
                start = page * 10
                Actor.log.info('Check Page ' + str(page))

                url = "https://search.yahoo.com/search;_ylt=Awr.2lgGCQ9p1C0DekZXNyoA;_ylu=Y29sbwNncTEEcG9zAzEEdnRpZAMEc2VjA3BhZ2luYXRpb24-"
                params = {
                    "p": query,
                    "b": start,
                    "pz": 10,
                    "bct": 0,
                    "xargs": 0
                }

                split_url = urlsplit(url)
                query_str = urlencode(params)
                full_url = urlunsplit((split_url.scheme, split_url.netloc, split_url.path, query_str, ""))

                print(f"Fetching page {full_url}")

                proxies = None

                response = requests.get(
                    full_url,
                    proxies=proxies,
                    timeout=30,  # BUGFIX: missing timeout
                    headers={
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/120.0.0.0 Safari/537.36"
                    }
                )

                if response.status_code != 200:
                    Actor.log.warning(f"⚠️ Request failed: {response.status_code} {response.reason}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                search_results = soup.select("div.dd.algo.algo-sr")

                if not search_results:
                    # BUGFIX: was `log(...)` — an undefined name (NameError).
                    Actor.log.info("⚠️ No more results found, ending.")
                    break

                for r in search_results:
                    try:
                        Email = ''
                        Website = ''
                        Address = ''
                        BusinessName = ''
                        DetailsLink = ''
                        Category = ''
                        link_tag = r.select_one("h3.title")
                        link_tag1 = r.select_one("a")
                        if link_tag:
                            BusinessName = link_tag.get_text(strip=True)
                            DetailsLink = link_tag1.get("href")
                            Category = urlparse(DetailsLink).netloc

                        addr_tag = r.select_one("div.compText.aAbs")
                        if addr_tag:
                            Address = addr_tag.get_text(separator=" ").strip()

                        combined = r.get_text(strip=True)
                        print(combined)
                        match = re.findall(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[a-zA-Z]+', combined)
                        Actor.log.info('match email ' + str(len(match)))
                        if len(match) > 0:
                            for i in match:
                                Email = i
                                Actor.log.info('email ' + Email)
                        else:
                            # BUGFIX: same doubled-backslash regex as the Google pass.
                            match_website = re.findall(r'\b(?:https?://|www\.)\S+\b', combined)
                            for i in match_website:
                                Website = i if i.startswith("http") else "http://" + i
                                Actor.log.info('Website ' + Website)
                                Email = scrape_contact_emails(Website)
                        if Email:
                            existindb = False

                            if len(all_users) > 0:
                                for item in all_users:
                                    if item['Email'] == Email:
                                        existindb = True
                                        break

                            if existindb == False:
                                all_users.append({'Email': Email})
                                await Actor.push_data({'Email': Email, 'title': BusinessName, 'Description': combined, 'Detail_Link': DetailsLink})
                                count_result = count_result + 1
                                if (Limit_val):
                                    if Limit_val != '0':
                                        if (count_result >= int(Limit_val)):
                                            has_next = False
                                            print('Limit Exceed')
                                            break

                    except Exception as ex:
                        print(f"⚠️ Parse error: {ex}")
                        continue

                if has_next == False:
                    break
                await asyncio.sleep(random.uniform(1.5, 3.0))
            except Exception as exc:
                # BUGFIX: was a silent bare `except: break`.
                Actor.log.warning(f'Yahoo pass aborted: {exc}')
                break
        await Actor.exit()

        # NOTE(review): Actor.exit() above terminates the run, so everything
        # below (the legacy Selenium flow) is effectively dead code.  It is
        # kept, with its defects fixed, in case the exit call is ever removed.
        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = uc.ChromeOptions()
        chrome_options.add_argument(f'user-agent={random_user_agent}')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")

        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")

        seleniumwire_options = {}
        if (proxyurl):
            print('apply proxy')
            # BUGFIX: was " - proxy-server=..." — not a valid Chrome switch.
            chrome_options.add_argument(f'--proxy-server={proxyurl}')

        driver = uc.Chrome(options=chrome_options, use_subprocess=False, version_main=137)
        # Hide the navigator.webdriver automation flag before any page script runs.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
            })
            """
        })

        # Log the egress IP so proxy usage can be verified in the run log.
        driver.get("https://httpbin.io/ip")
        ip_address = driver.find_element(By.TAG_NAME, "body").text
        print(ip_address)
        try:
            driver.get(qry)
        except Exception as e:
            print(e)
            await Actor.push_data({'Email': e})
            driver.quit()

        checkount = 0
        try:
            # Wait (up to ~5 minutes) while Google shows its captcha page.
            while (driver.current_url.find("sorry/index") > -1 and checkount < 100):
                print('captcha')
                print(driver.current_url)
                time.sleep(3)
                checkount = checkount + 1
        except Exception as e:
            print(e)

        all_users = []
        start = 0

        if (SearchEngine == 'Yahoo'):
            print('yahoo')
        else:
            try:
                while True:
                    # --- consent / cookie popups -------------------------------
                    consent = driver.find_elements(By.XPATH, "//iframe[contains(@src, 'consent.google.com')]")
                    if len(consent) > 0:
                        driver.switch_to.frame(consent[0])
                        # BUGFIX: was By.id("id") — `By` has no `id` attribute;
                        # the locator is By.ID.  NOTE(review): the element id
                        # "id" looks like a placeholder — confirm the real
                        # consent-button id.
                        driver.find_element(By.ID, "id").click()

                    popup = driver.find_elements(By.CSS_SELECTOR, "button[class='yt-spec-button-shape-next yt-spec-button-shape-next--filled yt-spec-button-shape-next--mono yt-spec-button-shape-next--size-m']")
                    if (len(popup) > 0):
                        Actor.log.info("popup")
                        for n in range(0, len(popup)):
                            if (popup[n].text == "Accept all"):
                                popup[n].click()

                    popup_accept = driver.find_elements(By.CSS_SELECTOR, "button[id='L2AGLb']")
                    if (len(popup_accept) > 0):
                        popup_accept[0].click()
                        time.sleep(1)

                    checkconsent = 0
                    while len(driver.find_elements(By.CSS_SELECTOR, "div[class='HTjtHe'][style*='display: block']")) > 0 and checkconsent < 100:
                        Actor.log.info('checkconsent ' + str(checkconsent))
                        Actor.log.info("homeurl" + driver.current_url)
                        checkconsent = checkconsent + 1
                        time.sleep(1)

                    AllClasses = driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc")
                    print(len(AllClasses))

                    homeurl = driver.current_url
                    AllClasses = driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc")
                    if social_network_val == "youtube.com/" or social_network_val == "instagram.com/":
                        AllClasses = driver.find_elements(By.XPATH, "//div[contains(@class,'MjjYud')]")
                    Actor.log.info("homeurl" + driver.current_url)

                    checkount = 0
                    try:
                        # BUGFIX: was .index(...), which raises ValueError when
                        # absent; .find(...) matches the captcha loop above.
                        while (driver.current_url.find("sorry/index") > -1 and checkount < 100):
                            print('captcha')
                            time.sleep(3)
                            checkount = checkount + 1
                    except Exception as e:
                        print(e)

                    if (len(AllClasses) == 0):
                        print('check new attribut')
                        AllClasses = driver.find_elements(By.CSS_SELECTOR, "div.fP1Qef")
                        print(len(AllClasses))
                        print(driver.find_elements(By.CSS_SELECTOR, "div[id='main']"))

                    if len(AllClasses) > 0:
                        start = 1
                        Actor.log.info("Result" + str(len(AllClasses)))
                        for gr in range(0, len(AllClasses)):
                            try:
                                BusinessName = ""
                                DetailsLink = ""
                                Email = ""
                                Address = ""
                                businessdetail = AllClasses[gr].find_elements(By.CSS_SELECTOR, "h3.LC20lb")
                                if len(businessdetail) > 0:
                                    BusinessName = businessdetail[0].text
                                    DetailsLink = businessdetail[0].find_element(By.XPATH, "parent::*").get_attribute("href")
                                if not DetailsLink:
                                    businessdetailnew = AllClasses[gr].find_elements(By.CSS_SELECTOR, "div.TbwUpd")
                                    if len(businessdetailnew) > 0:
                                        # BUGFIX: was find_elements(...).get_attribute(...) —
                                        # get_attribute does not exist on a list.
                                        DetailsLink = businessdetailnew[0].find_element(By.XPATH, "parent::*").get_attribute("href")
                                Ele_addressdetail = AllClasses[gr].find_elements(By.CSS_SELECTOR, "div.VwiC3b")
                                if len(Ele_addressdetail) > 0:
                                    Address = Ele_addressdetail[0].text.replace(";", "-").replace(",", "-")

                                alltext = AllClasses[gr].text
                                match = re.findall(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[a-zA-Z]+', alltext)
                                Actor.log.info('match email ' + str(len(match)))
                                if len(match) > 0:
                                    for i in match:
                                        Email = i
                                        Actor.log.info('email ' + Email)
                                else:
                                    # BUGFIX: same doubled-backslash regex as above.
                                    match_website = re.findall(r'\b(?:https?://|www\.)\S+\b', BusinessName + Address)
                                    for i in match_website:
                                        Website = i if i.startswith("http") else "http://" + i
                                        Actor.log.info('Website ' + Website)
                                        Email = scrape_contact_emails(Website)
                                if Email:
                                    existindb = False

                                    if len(all_users) > 0:
                                        for item in all_users:
                                            if item['Email'] == Email:
                                                existindb = True
                                                break
                                    if existindb == False:
                                        all_users.append({'Email': Email})
                                        await Actor.push_data({'Email': Email, 'title': BusinessName, 'Description': alltext, 'Detail_Link': DetailsLink})

                            except Exception as err:
                                Actor.log.info(f"Unexpected {err=}, {type(err)=}")

                        # --- pagination ---------------------------------------
                        if len(driver.find_elements(By.CSS_SELECTOR, "a#pnnext")) > 0:
                            Actor.log.info('Click Next')
                            str_url = driver.current_url

                            driver.execute_script("arguments[0].click();", driver.find_elements(By.CSS_SELECTOR, "a#pnnext")[0])
                            time.sleep(3)
                            while driver.current_url == str_url:
                                tryresult = 0
                                time.sleep(1)
                                tryresult = tryresult + 1
                                if tryresult > 20:
                                    driver.quit()
                                    break
                        else:
                            # Infinite-scroll style results: click "More results"
                            # and wait for the result count to grow.
                            Actor.log.info('Click Next_1')
                            AllCOUNT_PREVIOUS = len(driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc"))
                            if social_network_val == "youtube.com/" or social_network_val == "instagram.com/":
                                AllCOUNT_PREVIOUS = len(driver.find_elements(By.XPATH, "//div[contains(@class,'MjjYud')]"))
                            Actor.log.info('Click Next_1_T1')
                            str_url = driver.current_url
                            action = ActionChains(driver)

                            tryresult = 0
                            if len(driver.find_elements(By.CSS_SELECTOR, "span.RVQdVd")) > 0:
                                action = ActionChains(driver)
                                selectedlink1 = driver.find_element(By.CSS_SELECTOR, "span.RVQdVd")
                                action.move_to_element(selectedlink1).click().perform()

                            time.sleep(3)
                            Actor.log.info('Click Next_1_T2')
                            AllCOUNT_Now = len(driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc"))
                            if social_network_val == "youtube.com/":
                                AllCOUNT_Now = len(driver.find_elements(By.XPATH, "//div[contains(@class,'MjjYud')]"))

                            while (AllCOUNT_PREVIOUS == AllCOUNT_Now):
                                time.sleep(1)
                                tryresult = tryresult + 1
                                Actor.log.info('Click Next_1_T3_tryresult ' + str(tryresult))
                                if (tryresult > 20):
                                    driver.quit()
                                    break

                                AllCOUNT_Now = len(driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc"))
                                if social_network_val == "youtube.com/" or social_network_val == "instagram.com/":
                                    AllCOUNT_Now = len(driver.find_elements(By.XPATH, "//div[contains(@class,'MjjYud')]"))
                            print('AllCOUNT_PREVIOUS' + str(AllCOUNT_PREVIOUS))
                            print('AllCOUNT_Now' + str(AllCOUNT_Now))
                            if (AllCOUNT_PREVIOUS == AllCOUNT_Now):
                                break
                            time.sleep(3)
                    else:
                        if (start == 0):
                            await Actor.push_data({'Email': 'No Data Found, Due to google not respond. May be proxy problem'})
                        driver.quit()
                        break
                print('done')
            except Exception:
                Actor.log.exception(f'Cannot extract data from .')

        driver.quit()