1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10from urllib.parse import urljoin
11
12from selenium.webdriver.common.by import By
13from selenium.webdriver.common.action_chains import ActionChains
14from apify import Actor
15import re
16from urllib.parse import urljoin
17from bs4 import BeautifulSoup
18import requests
19import requests.exceptions
20import time
21import undetected_chromedriver as uc
22import random
23from .datascrapify import StartProcess, parse_proxy_url,on_aborting_event
24from apify_shared.consts import ActorEventTypes as Event
25import asyncio
26from urllib.parse import urlparse, urlsplit, urlunsplit, parse_qs, urlencode
27
28
29
30
31
32
def scrape_contact_emails(link):
    """Best-effort scrape of a contact e-mail address from *link*.

    Strategy:
      1. Fetch the page and search its text for ``...@<domain>``.
      2. If nothing matches, follow up to two "contact" links found on the
         page and search the contact page as well.

    Args:
        link: Absolute URL of the site, expected to look like
            ``"http://www.example.com/"`` (three dot-separated parts).

    Returns:
        The shortest matching e-mail string found, or ``""`` on any failure
        (unreachable site, no match, unexpected URL shape).
    """
    domain = link.split(".")
    if len(domain) < 3:
        # URL does not have the expected http://www.example.com shape.
        return ""
    mailaddr = link
    try:
        # Timeout + catch so one dead site cannot hang or crash the crawl
        # (the original unguarded call propagated into the callers' loops).
        res = requests.get(link, timeout=15)
    except requests.exceptions.RequestException:
        return ""
    soup = BeautifulSoup(res.text, "lxml")
    anchors = soup.find_all("a")
    contact_link = ''
    final_result = ""
    try:
        # domain == ['http://www', 'example', 'com/'] -> match '...@example.com'.
        # re.escape so the dots are literal, not wildcards.
        pattern = '.*@' + re.escape(domain[1] + '.' + domain[2].replace("/", ""))
        emails = soup.find_all(text=re.compile(pattern))
        emails.sort(key=len)  # shortest match is usually the cleanest address
        print(emails[0].replace("\n", ""))
        final_result = emails[0]
    except IndexError:
        # No address on the landing page - look for a "contact" page link.
        try:
            found = 0
            for a in anchors:
                href = a.get("href") or ""  # href may be absent (None)
                if "contact" in href.lower() or 'contact' in a.text.lower():
                    if len(href) > 2 and found < 2:
                        found += 1
                        contact_link = href
        except Exception:
            pass

    domain = domain[0] + "." + domain[1] + "." + domain[2]
    if len(contact_link) < len(domain):
        # Relative link such as "/contact" - append it to the site root.
        domain = domain + contact_link.replace("/", "")
    else:
        domain = contact_link

    try:
        res = requests.get(domain, timeout=15)
        soup = BeautifulSoup(res.text, "lxml")
        emails = soup.find_all(text=re.compile('.*@' + re.escape(mailaddr[7:].replace("/", ""))))
        emails.sort(key=len)
        try:
            print(emails[0].replace("\n", ""))
            final_result = emails[0]
            return final_result
        except IndexError:
            pass
    except Exception:
        pass

    # BUGFIX: the original returned "" here unconditionally, discarding a
    # final_result already found on the landing page.
    return final_result
82
async def main() -> None:
    """
    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
    and it also enhances performance in the field of web scraping significantly.

    Flow:
      1. Read and validate the actor input.
      2. Build a search query from keyword / email-type / location / site filters.
      3. Page through Google, then Yahoo, SERPs over plain HTTP, extract e-mail
         addresses from titles/snippets (falling back to scraping the result's
         website), and push de-duplicated rows to the dataset until the record
         limit is reached.
    """
    async with Actor:
        # ------------------------------------------------------------------
        # Actor input
        # ------------------------------------------------------------------
        actor_input = await Actor.get_input() or {}
        Keyword_val = actor_input.get('Keyword')
        location_val = actor_input.get('location')
        social_network_val = actor_input.get('social_network')
        Country_val = actor_input.get('Country')
        Email_Type_val = actor_input.get('Email_Type')
        Other_Email_Type_val = actor_input.get('Other_Email_Type')
        proxy_settings = actor_input.get('proxySettings')
        Limit_val = actor_input.get('Limit')

        # Google-SERP proxy group; a fresh session URL is requested again when
        # the user supplied explicit proxy settings.
        proxy_configuration = await Actor.create_proxy_configuration(groups=['GOOGLE_SERP'])
        proxyurl = await proxy_configuration.new_url()
        if proxy_configuration and proxy_settings:
            proxyurl = await proxy_configuration.new_url()

        # ------------------------------------------------------------------
        # Validation
        # ------------------------------------------------------------------
        if not Keyword_val:
            Actor.log.info('Please insert keyword')
            await Actor.push_data({'Email': 'Please insert keyword'})
            await Actor.exit()
            return

        if Keyword_val == 'TestKeyword':
            Actor.log.info('Please insert keyword')
            await Actor.push_data({'Email': 'Please insert Your Keyword'})
            await Actor.exit()
            return

        # Non-paying users are capped at 20 records.
        me = await Actor.apify_client.user('me').get()
        username = me["username"]
        isPaying = 'PayingUser'
        if not me["isPaying"]:
            isPaying = 'FreeUser'
            Limit_val = 20
            await Actor.push_data({'Email': 'Free User Get only 20 Records'})

        # Internal two-letter country codes and their display names (parallel lists).
        l1 = ["it","www","gm","fr","sp","uk","al","ag","ar","am","as","au","aj","bg","bo","be","bh","bk","br","bu","ca","ci","ch","co","cs","hr","cu","ez","dk","ec","eg","en","fi","gg","gr","hk","hu","ic","in","id","ir","iz","ei","is","jm","ja","ke","kn","ks","ku","lg","ly","ls","lh","lu","mc","mk","my","mt","mx","md","mn","mj","mo","np","nl","nz","ni","no","pk","we","pm","pa","pe","rp","pl","po","rq","qa","ro","rs","sm","sa","sg","ri","sn","lo","si","sf","sw","sz","sy","tw","th","ts","tu","ua","ae","uy","uz","ve"]
        l2 = ["Italy","United States","Germany","France","Spain","United Kingdom","Albania","Algeria","Argentina","Armenia","Australia","Austria","Azerbaijan","Bangladesh","Belarus","Belgium","Belize","Bosnia and Herzegovina","Brazil","Bulgaria","Canada","Chile","China","Colombia","Costa Rica","Croatia","Cuba","Czechia","Denmark","Ecuador","Egypt","Estonia","Finland","Georgia","Greece","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Israel","Jamaica","Japan","Kenya","Korea","Korea, Republic of","Kuwait","Latvia","Libya","Liechtenstein","Lithuania","Luxembourg","Macao","Macedonia","Malaysia","Malta","Mexico","Moldova, Republic of","Monaco","Montenegro","Morocco","Nepal","Netherlands","New Zealand","Nigeria","Norway","Pakistan","Palestine, State of","Panama","Paraguay","Peru","Philippines","Poland","Portugal","Puerto Rico","Qatar","Romania","Russia","San Marino","Saudi Arabia","Senegal","Serbia","Singapore","Slovakia","Slovenia","South Africa","Sweden","Switzerland","Syrian Arab Republic","Taiwan","Thailand","Tunisia","Turkey","Ukraine","United Arab Emirates","Uruguay","Uzbekistan","Venezuela"]
        # Display name for logging only; default matches the original
        # fallback (index 1 -> "United States").
        select_country = l2[l1.index(Country_val)] if Country_val in l1 else 'United States'
        print(select_country)

        # ------------------------------------------------------------------
        # Build the search query
        # ------------------------------------------------------------------
        concatstring = Keyword_val
        option = "( @gmail.com OR @hotmail.com OR @yahoo.com)"
        if Email_Type_val == "1":
            # "1" means a custom e-mail domain must be supplied by the user.
            if not Other_Email_Type_val:
                Actor.log.info('Please insert Email Type Domain')
                await Actor.push_data({'Email': 'Please insert Email Type Domain'})
                await Actor.exit()
                return
            if Other_Email_Type_val.find("@") > -1:
                option = " ( " + Other_Email_Type_val + " )"
            else:
                option = " ( @" + Other_Email_Type_val + " )"
        concatstring = concatstring + option
        if location_val:
            concatstring = concatstring + " in " + location_val

        if social_network_val:
            concatstring = concatstring + " site:"

            # linkedin/pinterest use country sub-domains (e.g. uk.linkedin.com).
            if social_network_val == "linkedin.com/" or social_network_val == "pinterest.com/":
                concatstring = concatstring + Country_val + "."

            if social_network_val == "amazon.com/":
                # Translate the internal country code to Amazon's marketplace TLD.
                amazon_tld = {
                    'gm': 'de', 'sp': 'es', 'fr': 'fr', 'uk': 'co.uk', 'as': 'com.au',
                    'www': 'com', 'in': 'in', 'be': 'com.be', 'br': 'com.br',
                    'ca': 'ca', 'ch': 'cn', 'eg': 'eg', 'it': 'it', 'ja': 'co.jp',
                    'mx': 'com.mx', 'nl': 'nl', 'pl': 'pl', 'sa': 'sa', 'sn': 'sg',
                    'sw': 'se', 'tu': 'com.tr', 'ae': 'ae',
                }
                # BUGFIX: the original fallback was `Country_val == 'com'`, a
                # no-op comparison; the intent was to default to 'com'.
                Country_val = amazon_tld.get(Country_val, 'com')
                social_network_val = social_network_val.replace('.com', '.' + Country_val)

            concatstring = concatstring + social_network_val

        Actor.log.info(concatstring)

        # Map internal country codes to Google's `gl` (geolocation) parameter.
        gl_map = {
            "it": "it", "www": "us", "gm": "de", "fr": "fr", "sp": "es",
            "uk": "uk", "al": "al", "ag": "dz", "ar": "ar", "am": "am",
            "as": "au", "au": "at", "aj": "az", "bg": "bd", "bo": "by",
            "be": "be", "bh": "bz", "bk": "ba", "br": "br", "bu": "bg",
            "ca": "ca", "ci": "cl", "ch": "cn", "co": "co", "cs": "cr",
            "hr": "hr", "cu": "cu", "ez": "ez", "dk": "dk", "ec": "ec",
            "eg": "eg", "en": "ee", "fi": "fi", "gg": "ge", "gr": "gr",
            "hk": "hk", "hu": "hu", "ic": "is", "in": "in", "id": "id",
            "ir": "ir", "iz": "iq", "ei": "ie", "is": "il", "jm": "jm",
            "ja": "jp", "ke": "ke", "kn": "kp", "ks": "kr", "ku": "kw",
            "lg": "lv", "ly": "ly", "ls": "li", "lh": "lt", "lu": "lu",
            "mc": "mo", "mk": "mk", "my": "my", "mt": "mt", "mx": "mx",
            "md": "md", "mn": "mc", "mj": "me", "mo": "ma", "np": "np",
            "nl": "nl", "nz": "nz", "ni": "ng", "no": "no", "pk": "pk",
            "we": "ps", "pm": "pa", "pa": "py", "pe": "pe", "rp": "ph",
            "pl": "pl", "po": "pt", "rq": "pr", "qa": "qa", "ro": "ro",
            "rs": "ru", "sm": "sm", "sa": "sa", "sg": "sn", "ri": "rs",
            "sn": "sg", "lo": "sk", "si": "si", "sf": "za", "sw": "se",
            "sz": "ch", "sy": "sy", "tw": "tw", "th": "th", "ts": "tn",
            "tu": "tu", "ua": "ua", "ae": "ae", "uy": "uy", "uz": "uz",
            "ve": "ve", "vn": "vn", "lk": "lk",
        }
        gl = gl_map.get(Country_val, Country_val)

        all_users = []          # pushed rows, used only for e-mail de-duplication
        query = concatstring
        max_pages = 100         # hard upper bound on SERP pages per engine
        count_result = 0
        has_next = True

        # Hoisted, compiled once for both engines.
        email_re = re.compile(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[a-zA-Z]+')
        # BUGFIX: the original pattern was double-escaped inside a raw string
        # (r'\\b...' matches a literal backslash), so the website fallback
        # could never match anything.
        website_re = re.compile(r'\b(?:https?://|www\.)\S+\b')

        # ------------------------------------------------------------------
        # 1) Google search
        # ------------------------------------------------------------------
        for page in range(max_pages):
            try:
                start = page * 10
                Actor.log.info('Check Page ' + str(page))
                url = f"http://www.google.com/search?q={query}&num=10&hl=en&start={start}&gl={gl}"

                proxies = None
                if proxyurl:
                    proxies = {'http': proxyurl, 'https': proxyurl}

                response = requests.get(
                    url,
                    proxies=proxies,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/120.0.0.0 Safari/537.36"
                    }
                )

                if response.status_code != 200:
                    Actor.log.warning(f"⚠️ Request failed: {response.status_code} {response.reason}")
                    continue

                soup = BeautifulSoup(response.text, "html.parser")
                result_blocks = soup.select("div.g, div.tF2Cxc")

                for block in result_blocks:
                    Email = ''
                    title_el = block.select_one("h3")
                    link_el = block.select_one("a")
                    snippet_el = block.select_one(".VwiC3b, .IsZvec, .aCOpRe")

                    # BUGFIX: skip malformed result blocks - previously a
                    # missing element raised AttributeError which the outer
                    # bare `except` turned into "stop all remaining pages".
                    if title_el is None or link_el is None:
                        continue

                    title = title_el.get_text(strip=True)
                    url = link_el.get("href")
                    snippet = snippet_el.get_text(strip=True) if snippet_el else ''
                    print(title)
                    alltext = title + snippet

                    match = email_re.findall(alltext)
                    Actor.log.info('match email ' + str(len(match)))
                    if len(match) > 0:
                        for i in match:   # keep the last match, as before
                            Email = i
                        Actor.log.info('email ' + Email)
                    else:
                        # No inline address - try scraping the result's site.
                        for i in website_re.findall(alltext):
                            Website = "http://www." + i
                            Actor.log.info('Website ' + Website)
                            Email = scrape_contact_emails(Website)

                    if Email:
                        # De-duplicate against everything pushed so far.
                        if not any(item['Email'] == Email for item in all_users):
                            all_users.append({'Email': Email})
                            await Actor.push_data({'Email': Email, 'title': title, 'Description': alltext, 'Detail_Link': url})
                            count_result = count_result + 1
                            if Limit_val and Limit_val != '0' and count_result >= int(Limit_val):
                                has_next = False
                                print('Limit Exceed')
                                break

                if has_next == False:
                    break
                # Polite, jittered delay between result pages.
                await asyncio.sleep(random.uniform(1.5, 3.0))
            except Exception:
                # BUGFIX: was a silent bare `except:` - log before stopping
                # Google pagination so failures are diagnosable.
                Actor.log.exception('Google page scrape failed')
                break

        # ------------------------------------------------------------------
        # 2) Yahoo search (runs with whatever quota is left)
        # ------------------------------------------------------------------
        print('Check in Yahoo Search Engine')
        query = concatstring.replace("OR", "or")

        for page in range(max_pages):
            try:
                if has_next == False:
                    break
                start = page * 10
                Actor.log.info('Check Page ' + str(page))

                url = "https://search.yahoo.com/search;_ylt=Awr.2lgGCQ9p1C0DekZXNyoA;_ylu=Y29sbwNncTEEcG9zAzEEdnRpZAMEc2VjA3BhZ2luYXRpb24-"
                params = {
                    "p": query,
                    "b": start,
                    "pz": 10,
                    "bct": 0,
                    "xargs": 0
                }

                split_url = urlsplit(url)
                query_str = urlencode(params)
                full_url = urlunsplit((split_url.scheme, split_url.netloc, split_url.path, query_str, ""))

                print(f"Fetching page {full_url}")

                proxies = None  # Yahoo is fetched without a proxy

                response = requests.get(
                    full_url,
                    proxies=proxies,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/120.0.0.0 Safari/537.36"
                    }
                )

                if response.status_code != 200:
                    Actor.log.warning(f"⚠️ Request failed: {response.status_code} {response.reason}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                search_results = soup.select("div.dd.algo.algo-sr")

                if not search_results:
                    # BUGFIX: `log(...)` was undefined (NameError, silently
                    # swallowed) - use the actor logger instead.
                    Actor.log.warning("⚠️ No more results found, ending.")
                    break

                for r in search_results:
                    try:
                        Email = ''
                        Website = ''
                        Address = ''
                        BusinessName = ''
                        DetailsLink = ''
                        Category = ''

                        link_tag = r.select_one("h3.title")
                        link_tag1 = r.select_one("a")
                        if link_tag and link_tag1:
                            BusinessName = link_tag.get_text(strip=True)
                            DetailsLink = link_tag1.get("href")
                            Category = urlparse(DetailsLink).netloc

                        addr_tag = r.select_one("div.compText.aAbs")
                        if addr_tag:
                            Address = addr_tag.get_text(separator=" ").strip()

                        combined = r.get_text(strip=True)
                        print(combined)
                        match = email_re.findall(combined)
                        Actor.log.info('match email ' + str(len(match)))
                        if len(match) > 0:
                            for i in match:
                                Email = i
                            Actor.log.info('email ' + Email)
                        else:
                            for i in website_re.findall(combined):
                                Website = "http://www." + i
                                Actor.log.info('Website ' + Website)
                                Email = scrape_contact_emails(Website)

                        if Email:
                            if not any(item['Email'] == Email for item in all_users):
                                all_users.append({'Email': Email})
                                await Actor.push_data({'Email': Email, 'title': BusinessName, 'Description': combined, 'Detail_Link': DetailsLink})
                                count_result = count_result + 1
                                if Limit_val and Limit_val != '0' and count_result >= int(Limit_val):
                                    has_next = False
                                    print('Limit Exceed')
                                    break
                    except Exception as ex:
                        print(f"⚠️ Parse error: {ex}")
                        continue

                if has_next == False:
                    break
                await asyncio.sleep(random.uniform(1.5, 3.0))
            except Exception:
                Actor.log.exception('Yahoo page scrape failed')
                break

        await Actor.exit()
        # NOTE(review): the original file continued past this point with an
        # undetected-chromedriver / Selenium fallback. That code was
        # unreachable (`Actor.exit()` terminates the run) and referenced a
        # `driver` object that was never instantiated, so it has been removed
        # along with its now-unused user-agent lists.