1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10from urllib.parse import urljoin
11
12from selenium.webdriver.common.by import By
13from selenium.webdriver.common.action_chains import ActionChains
14from apify import Actor
15import re
16from urllib.parse import urljoin
17from bs4 import BeautifulSoup
18import requests
19import requests.exceptions
20import time
21import undetected_chromedriver as uc
22import random
23from .datascrapify import StartProcess, parse_proxy_url,on_aborting_event
24from apify_shared.consts import ActorEventTypes as Event
25import asyncio
26from urllib.parse import urlparse, urlsplit, urlunsplit, parse_qs, urlencode
27
28
29
30
31
32
def scrape_contact_emails(link):
    """Best-effort extraction of a contact email address from a website.

    Fetches `link` and searches the page text for an address at the site's own
    domain.  If nothing is found on the homepage, follows a "contact" link (if
    any) and searches that page instead.

    :param link: absolute homepage URL, e.g. ``http://www.example.com/``.
    :return: the shortest matching address found, or ``""`` on any failure.
    """
    # Network failures must not abort the caller's scraping loop.
    try:
        res = requests.get(link, timeout=15)
    except requests.exceptions.RequestException:
        return ""

    # "http://www.example.com/" -> ["http://www", "example", "com/"]
    domain_parts = link.split(".")
    if len(domain_parts) < 3:
        # BUG FIX: too few dot-separated pieces used to raise an uncaught
        # IndexError below; bail out instead.
        return ""

    soup = BeautifulSoup(res.text, "lxml")

    # First pass: any text node containing "@<domain>.<tld>" on the homepage.
    try:
        emails = soup.find_all(
            text=re.compile('.*@' + domain_parts[1] + '.' + domain_parts[2].replace("/", "")))
        emails.sort(key=len)  # shortest hit carries the least surrounding noise
        if emails:
            print(emails[0].replace("\n", ""))
            # BUG FIX: the original fell through after a hit and usually
            # overwrote this result with "" after re-scanning the contact page.
            return emails[0]
    except re.error:
        pass

    # Second pass: find a link that looks like a contact page (keep the last of
    # at most two candidates, as the original did).
    contact_link = ''
    flag = 0
    for a in soup.find_all("a"):
        href = a.get("href") or ''  # BUG FIX: anchors without href crashed the scan
        if "contact" in href.lower() or "contact" in a.text.lower():
            if len(href) > 2 and flag < 2:
                flag = flag + 1
                contact_link = href

    base = domain_parts[0] + "." + domain_parts[1] + "." + domain_parts[2]
    if len(contact_link) < len(base):
        # A short value is treated as a relative path and appended to the base
        # URL (which still ends in "/", so no separator is needed).
        target = base + contact_link.replace("/", "")
    else:
        target = contact_link

    # Search the contact page for "@<host>" (link minus its "http://" scheme).
    try:
        res = requests.get(target, timeout=15)
        soup = BeautifulSoup(res.text, "lxml")
        emails = soup.find_all(text=re.compile('.*@' + link[7:].replace("/", "")))
        emails.sort(key=len)
        if emails:
            print(emails[0].replace("\n", ""))
            return emails[0]
    except Exception:
        # Best-effort: any failure on the secondary fetch means "no email".
        pass

    return ""
82
async def main() -> None:
    """
    The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
    out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
    and it also enhances performance in the field of web scraping significantly.

    Pipeline:
      1. Read and validate the actor input.
      2. Build a search query (keyword + email-domain filter + location + site: filter).
      3. Page through Google and Yahoo HTML result pages with `requests`,
         extracting email addresses from titles/snippets (falling back to
         scraping linked websites via `scrape_contact_emails`).
      4. Push each unique email to the Apify dataset, honouring the optional limit.
    """
    async with Actor:

        actor_input = await Actor.get_input() or {}
        Keyword_val = actor_input.get('Keyword')
        location_val = actor_input.get('location')
        social_network_val = actor_input.get('social_network')
        Country_val = actor_input.get('Country')
        Email_Type_val = actor_input.get('Email_Type')
        Other_Email_Type_val = actor_input.get('Other_Email_Type')
        proxy_settings = actor_input.get('proxySettings')
        Limit_val = actor_input.get('Limit')

        # Always acquire a Google-SERP proxy URL; request a fresh one when the
        # user supplied explicit proxy settings (mirrors the original behaviour).
        proxy_configuration = await Actor.create_proxy_configuration(groups=['GOOGLE_SERP'])
        proxyurl = await proxy_configuration.new_url()
        if proxy_configuration and proxy_settings:
            proxyurl = await proxy_configuration.new_url()

        # --- input validation -------------------------------------------------
        if not Keyword_val:
            Actor.log.info('Please insert keyword')
            await Actor.push_data({'Email': 'Please insert keyword'})
            await Actor.exit()
            return

        if Keyword_val == 'TestKeyword':
            Actor.log.info('Please insert keyword')
            await Actor.push_data({'Email': 'Please insert Your Keyword'})
            await Actor.exit()
            return

        # Country code (GEC-style) -> display-name lookup tables; parallel lists.
        l1 = ["it","www","gm","fr","sp","uk","al","ag","ar","am","as","au","aj","bg","bo","be","bh","bk","br","bu","ca","ci","ch","co","cs","hr","cu","ez","dk","ec","eg","en","fi","gg","gr","hk","hu","ic","in","id","ir","iz","ei","is","jm","ja","ke","kn","ks","ku","lg","ly","ls","lh","lu","mc","mk","my","mt","mx","md","mn","mj","mo","np","nl","nz","ni","no","pk","we","pm","pa","pe","rp","pl","po","rq","qa","ro","rs","sm","sa","sg","ri","sn","lo","si","sf","sw","sz","sy","tw","th","ts","tu","ua","ae","uy","uz","ve"]
        l2 = ["Italy","United States","Germany","France","Spain","United Kingdom","Albania","Algeria","Argentina","Armenia","Australia","Austria","Azerbaijan","Bangladesh","Belarus","Belgium","Belize","Bosnia and Herzegovina","Brazil","Bulgaria","Canada","Chile","China","Colombia","Costa Rica","Croatia","Cuba","Czechia","Denmark","Ecuador","Egypt","Estonia","Finland","Georgia","Greece","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Israel","Jamaica","Japan","Kenya","Korea","Korea, Republic of","Kuwait","Latvia","Libya","Liechtenstein","Lithuania","Luxembourg","Macao","Macedonia","Malaysia","Malta","Mexico","Moldova, Republic of","Monaco","Montenegro","Morocco","Nepal","Netherlands","New Zealand","Nigeria","Norway","Pakistan","Palestine, State of","Panama","Paraguay","Peru","Philippines","Poland","Portugal","Puerto Rico","Qatar","Romania","Russia","San Marino","Saudi Arabia","Senegal","Serbia","Singapore","Slovakia","Slovenia","South Africa","Sweden","Switzerland","Syrian Arab Republic","Taiwan","Thailand","Tunisia","Turkey","Ukraine","United Arab Emirates","Uruguay","Uzbekistan","Venezuela"]

        # Resolve the display name; unknown codes fall back to 'United States'.
        select_country = 'United States'
        if Country_val in l1:
            idx = l1.index(Country_val)
            if idx < len(l2):
                select_country = l2[idx]
        print(select_country)

        # --- query construction ----------------------------------------------
        concatstring = Keyword_val
        # Default email-domain filter; replaced below when a custom domain is given.
        option = "( @gmail.com OR @hotmail.com OR @yahoo.com)"
        if Email_Type_val == "1":
            if not Other_Email_Type_val:
                Actor.log.info('Please insert Email Type Domain')
                await Actor.push_data({'Email': 'Please insert Email Type Domain'})
                await Actor.exit()
                return
            if Other_Email_Type_val.find("@") > -1:
                option = " ( " + Other_Email_Type_val + " )"
            else:
                option = " ( @" + Other_Email_Type_val + " )"
        concatstring = concatstring + option
        if location_val:
            concatstring = concatstring + " in " + location_val

        if social_network_val:
            concatstring = concatstring + " site:"

            # linkedin/pinterest use country-code subdomains (e.g. uk.linkedin.com).
            if social_network_val == "linkedin.com/" or social_network_val == "pinterest.com/":
                concatstring = concatstring + Country_val + "."

            if social_network_val == "amazon.com/":
                # Translate the input country code into Amazon's marketplace TLD.
                amazon_tld = {
                    'gm': 'de', 'sp': 'es', 'fr': 'fr', 'uk': 'co.uk', 'as': 'com.au',
                    'www': 'com', 'in': 'in', 'be': 'com.be', 'br': 'com.br', 'ca': 'ca',
                    'ch': 'cn', 'eg': 'eg', 'it': 'it', 'ja': 'co.jp', 'mx': 'com.mx',
                    'nl': 'nl', 'pl': 'pl', 'sa': 'sa', 'sn': 'sg', 'sw': 'se',
                    'tu': 'com.tr', 'ae': 'ae',
                }
                # BUG FIX: the original `else` branch used `==` (a no-op comparison)
                # instead of assignment, leaving unknown codes unmapped; default 'com'.
                Country_val = amazon_tld.get(Country_val, 'com')
                social_network_val = social_network_val.replace('.com', '.' + Country_val)

            concatstring = concatstring + social_network_val

        SearchEngine = 'Google'
        desktop_user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.139 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
            "Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36 Edg/123.0.2420.65",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.112 Safari/537.36 Edg/122.0.2365.66",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 OPR/96.0.0.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13.5; rv:124.0) Gecko/20100101 Firefox/124.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.184 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
            "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36 Edg/123.0.2420.65",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/96.0.0.0"
        ]

        mobile_user_agents = [
            "Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.105 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.140 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S911B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/24.0 Chrome/123.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 12; SAMSUNG SM-A525F) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/23.0 Chrome/121.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 13; M2101K6G) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.105 Mobile Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPad; CPU OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) CriOS/123.0.0.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36 OPR/76.0.4017.123",
            "Mozilla/5.0 (Linux; Android 13; Pixel 6 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36 EdgA/123.0.2420.64",
            "Mozilla/5.0 (Android 13; Mobile; rv:124.0) Gecko/124.0 Firefox/124.0",
            "Mozilla/5.0 (Android 12; Mobile; rv:122.0) Gecko/122.0 Firefox/122.0"
        ]

        # A random UA is only consumed by the Selenium fallback further below.
        all_user_agents = desktop_user_agents + mobile_user_agents
        random_user_agent = random.choice(all_user_agents)

        qry = "https://google.com/search?q=" + concatstring
        Actor.log.info(concatstring)

        # Input country code -> Google `gl` (geolocation) parameter.
        gl_map = {
            "it": "it", "www": "us", "gm": "de", "fr": "fr", "sp": "es",
            "uk": "uk", "al": "al", "ag": "dz", "ar": "ar", "am": "am",
            "as": "au", "au": "at", "aj": "az", "bg": "bd", "bo": "by",
            "be": "be", "bh": "bz", "bk": "ba", "br": "br", "bu": "bg",
            "ca": "ca", "ci": "cl", "ch": "cn", "co": "co", "cs": "cr",
            "hr": "hr", "cu": "cu", "ez": "ez", "dk": "dk", "ec": "ec",
            "eg": "eg", "en": "ee", "fi": "fi", "gg": "ge", "gr": "gr",
            "hk": "hk", "hu": "hu", "ic": "is", "in": "in", "id": "id",
            "ir": "ir", "iz": "iq", "ei": "ie", "is": "il", "jm": "jm",
            "ja": "jp", "ke": "ke", "kn": "kp", "ks": "kr", "ku": "kw",
            "lg": "lv", "ly": "ly", "ls": "li", "lh": "lt", "lu": "lu",
            "mc": "mo", "mk": "mk", "my": "my", "mt": "mt", "mx": "mx",
            "md": "md", "mn": "mc", "mj": "me", "mo": "ma", "np": "np",
            "nl": "nl", "nz": "nz", "ni": "ng", "no": "no", "pk": "pk",
            "we": "ps", "pm": "pa", "pa": "py", "pe": "pe", "rp": "ph",
            "pl": "pl", "po": "pt", "rq": "pr", "qa": "qa", "ro": "ro",
            "rs": "ru", "sm": "sm", "sa": "sa", "sg": "sn", "ri": "rs",
            "sn": "sg", "lo": "sk", "si": "si", "sf": "za", "sw": "se",
            "sz": "ch", "sy": "sy", "tw": "tw", "th": "th", "ts": "tn",
            "tu": "tu", "ua": "ua", "ae": "ae", "uy": "uy", "uz": "uz",
            "ve": "ve", "vn": "vn", "lk": "lk",
        }
        gl = gl_map.get(Country_val, Country_val)

        # --- shared scraping state -------------------------------------------
        all_users = []          # emails already pushed (dedupe store)
        query = concatstring
        max_pages = 100
        count_result = 0
        has_next = True
        email_re = re.compile(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[a-zA-Z]+')
        # BUG FIX: the original website pattern was double-escaped inside a raw
        # string (r'\\b...'), so it could never match anything.
        website_re = re.compile(r'\b(?:https?://|www\.)\S+\b')
        request_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36"
        }

        # --- Google HTML result pages ----------------------------------------
        for page in range(max_pages):
            try:
                start = page * 10
                Actor.log.info('Check Page ' + str(page))
                url = f"http://www.google.com/search?q={query}&num=10&hl=en&start={start}&gl={gl}"

                proxies = None
                if proxyurl:
                    proxies = {'http': proxyurl, 'https': proxyurl}

                response = requests.get(url, proxies=proxies, headers=request_headers, timeout=30)

                if response.status_code != 200:
                    Actor.log.warning(f"⚠️ Request failed: {response.status_code} {response.reason}")
                    continue

                soup = BeautifulSoup(response.text, "html.parser")
                result_blocks = soup.select("div.g, div.tF2Cxc")

                for block in result_blocks:
                    Email = ''
                    title_el = block.select_one("h3")
                    link_el = block.select_one("a")
                    snippet_el = block.select_one(".VwiC3b, .IsZvec, .aCOpRe")

                    # BUG FIX: skip malformed/sponsored blocks instead of raising
                    # AttributeError, which aborted the whole pagination loop.
                    if title_el is None or link_el is None or snippet_el is None:
                        continue

                    title = title_el.get_text(strip=True)
                    result_url = link_el.get("href")
                    snippet = snippet_el.get_text(strip=True)
                    print(title)
                    alltext = title + snippet
                    match = email_re.findall(alltext)
                    Actor.log.info('match email ' + str(len(match)))
                    if len(match) > 0:
                        for i in match:
                            Email = i  # keep the last address found (original behaviour)
                        Actor.log.info('email ' + Email)
                    else:
                        # No inline email: try to scrape one from any website URL
                        # mentioned in the result text.
                        for site in website_re.findall(alltext):
                            # BUG FIX: only prepend a scheme when it is missing,
                            # instead of blindly prefixing "http://www.".
                            Website = site if site.startswith('http') else 'http://' + site
                            Actor.log.info('Website ' + Website)
                            Email = scrape_contact_emails(Website)

                    if Email:
                        existindb = any(item['Email'] == Email for item in all_users)
                        if not existindb:
                            all_users.append({'Email': Email})
                            await Actor.push_data({'Email': Email, 'title': title, 'Description': alltext, 'Detail_Link': result_url})
                            count_result = count_result + 1
                            if Limit_val and Limit_val != '0' and count_result >= int(Limit_val):
                                has_next = False
                                print('Limit Exceed')
                                break

                if not has_next:
                    break
                # Jitter between pages to look less like a bot.
                await asyncio.sleep(random.uniform(1.5, 3.0))
            except Exception:
                # BUG FIX: was a silent bare `except: break`; log why we stopped.
                Actor.log.exception('Google pagination stopped')
                break

        # --- Yahoo HTML result pages -----------------------------------------
        print('Check in Yahoo Search Engine')
        query = concatstring.replace("OR", "or")  # Yahoo prefers lowercase operator

        for page in range(max_pages):
            try:
                if not has_next:
                    break
                start = page * 10
                Actor.log.info('Check Page ' + str(page))

                url = "https://search.yahoo.com/search;_ylt=Awr.2lgGCQ9p1C0DekZXNyoA;_ylu=Y29sbwNncTEEcG9zAzEEdnRpZAMEc2VjA3BhZ2luYXRpb24-"
                params = {
                    "p": query,
                    "b": start,
                    "pz": 10,
                    "bct": 0,
                    "xargs": 0
                }

                split_url = urlsplit(url)
                query_str = urlencode(params)
                full_url = urlunsplit((split_url.scheme, split_url.netloc, split_url.path, query_str, ""))

                print(f"Fetching page {full_url}")

                proxies = None

                response = requests.get(full_url, proxies=proxies, headers=request_headers, timeout=30)

                if response.status_code != 200:
                    Actor.log.warning(f"⚠️ Request failed: {response.status_code} {response.reason}")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                search_results = soup.select("div.dd.algo.algo-sr")

                if not search_results:
                    # BUG FIX: `log(...)` was an undefined name (NameError).
                    Actor.log.info("⚠️ No more results found, ending.")
                    break

                for r in search_results:
                    try:
                        Email = ''
                        Website = ''
                        Address = ''
                        BusinessName = ''
                        DetailsLink = ''
                        Category = ''
                        link_tag = r.select_one("h3.title")
                        link_tag1 = r.select_one("a")
                        if link_tag:
                            BusinessName = link_tag.get_text(strip=True)
                            DetailsLink = link_tag1.get("href")
                            Category = urlparse(DetailsLink).netloc

                        addr_tag = r.select_one("div.compText.aAbs")
                        if addr_tag:
                            Address = addr_tag.get_text(separator=" ").strip()

                        combined = r.get_text(strip=True)
                        print(combined)
                        match = email_re.findall(combined)
                        Actor.log.info('match email ' + str(len(match)))
                        if len(match) > 0:
                            for i in match:
                                Email = i
                            Actor.log.info('email ' + Email)
                        else:
                            for site in website_re.findall(combined):
                                Website = site if site.startswith('http') else 'http://' + site
                                Actor.log.info('Website ' + Website)
                                Email = scrape_contact_emails(Website)

                        if Email:
                            existindb = any(item['Email'] == Email for item in all_users)
                            if not existindb:
                                all_users.append({'Email': Email})
                                await Actor.push_data({'Email': Email, 'title': BusinessName, 'Description': combined, 'Detail_Link': DetailsLink})
                                count_result = count_result + 1
                                if Limit_val and Limit_val != '0' and count_result >= int(Limit_val):
                                    has_next = False
                                    print('Limit Exceed')
                                    break
                    except Exception as ex:
                        print(f"⚠️ Parse error: {ex}")
                        continue

                if not has_next:
                    break
                await asyncio.sleep(random.uniform(1.5, 3.0))
            except Exception:
                Actor.log.exception('Yahoo pagination stopped')
                break

        await Actor.exit()

        # NOTE(review): `Actor.exit()` above normally terminates the actor, so the
        # Selenium fallback below is effectively unreachable dead code.  It is kept
        # (with its crashes fixed) in case the early exit is removed again.
        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = uc.ChromeOptions()
        chrome_options.add_argument(f'user-agent={random_user_agent}')
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        if proxyurl:
            print('apply proxy')
            # BUG FIX: the original flag was malformed (" - proxy-server=...").
            chrome_options.add_argument(f'--proxy-server={proxyurl}')

        # BUG FIX: `driver` was used below without ever being instantiated.
        driver = uc.Chrome(options=chrome_options)

        # Hide the `navigator.webdriver` automation flag from anti-bot scripts.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
            """
        })

        # Log the egress IP so proxy usage can be verified from the run log.
        driver.get("https://httpbin.io/ip")
        ip_address = driver.find_element(By.TAG_NAME, "body").text
        print(ip_address)

        try:
            driver.get(qry)
        except Exception as e:
            print(e)
            await Actor.push_data({'Email': e})
            driver.quit()

        checkount = 0
        try:
            # Wait (up to ~5 minutes) while Google shows its captcha interstitial.
            while driver.current_url.find("sorry/index") > -1 and checkount < 100:
                print('captcha')
                print(driver.current_url)
                time.sleep(3)
                checkount = checkount + 1
        except Exception as e:
            print(e)

        all_users = []
        start = 0

        if SearchEngine == 'Yahoo':
            print('yahoo')
        else:
            try:
                while True:
                    # Dismiss Google consent iframe if present.
                    consent = driver.find_elements(By.XPATH, "//iframe[contains(@src, 'consent.google.com')]")
                    if len(consent) > 0:
                        driver.switch_to.frame(consent[0])
                        # BUG FIX: `By.id(...)` does not exist; use the By.ID locator.
                        driver.find_element(By.ID, "id").click()

                    # Dismiss YouTube-style "Accept all" popup.
                    popup = driver.find_elements(By.CSS_SELECTOR, "button[class='yt-spec-button-shape-next yt-spec-button-shape-next--filled yt-spec-button-shape-next--mono yt-spec-button-shape-next--size-m']")
                    if len(popup) > 0:
                        Actor.log.info("popup")
                        for n in range(0, len(popup)):
                            if popup[n].text == "Accept all":
                                popup[n].click()

                    # Google's own consent button.
                    popup_accept = driver.find_elements(By.CSS_SELECTOR, "button[id='L2AGLb']")
                    if len(popup_accept) > 0:
                        popup_accept[0].click()
                        time.sleep(1)

                    checkconsent = 0
                    while len(driver.find_elements(By.CSS_SELECTOR, "div[class='HTjtHe'][style*='display: block']")) > 0 and checkconsent < 100:
                        Actor.log.info('checkconsent ' + str(checkconsent))
                        Actor.log.info("homeurl" + driver.current_url)
                        checkconsent = checkconsent + 1
                        time.sleep(1)

                    AllClasses = driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc")
                    print(len(AllClasses))

                    homeurl = driver.current_url
                    AllClasses = driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc")
                    if social_network_val == "youtube.com/" or social_network_val == "instagram.com/":
                        AllClasses = driver.find_elements(By.XPATH, "//div[contains(@class,'MjjYud')]")
                    Actor.log.info("homeurl" + driver.current_url)

                    checkount = 0
                    try:
                        # BUG FIX: `.index()` raises ValueError when absent, which
                        # contradicts the `> -1` test; `.find()` returns -1 instead.
                        while driver.current_url.find("sorry/index") > -1 and checkount < 100:
                            print('captcha')
                            time.sleep(3)
                            checkount = checkount + 1
                    except Exception as e:
                        print(e)

                    if len(AllClasses) == 0:
                        print('check new attribut')
                        AllClasses = driver.find_elements(By.CSS_SELECTOR, "div.fP1Qef")
                        print(len(AllClasses))
                        print(driver.find_elements(By.CSS_SELECTOR, "div[id='main']"))

                    if len(AllClasses) > 0:
                        start = 1
                        Actor.log.info("Result" + str(len(AllClasses)))
                        for gr in range(0, len(AllClasses)):
                            try:
                                BusinessName = ""
                                DetailsLink = ""
                                Email = ""
                                Address = ""
                                businessdetail = AllClasses[gr].find_elements(By.CSS_SELECTOR, "h3.LC20lb")
                                if len(businessdetail) > 0:
                                    BusinessName = businessdetail[0].text
                                    DetailsLink = businessdetail[0].find_element(By.XPATH, "parent::*").get_attribute("href")
                                if not DetailsLink:
                                    businessdetailnew = AllClasses[gr].find_elements(By.CSS_SELECTOR, "div.TbwUpd")
                                    if len(businessdetailnew) > 0:
                                        # BUG FIX: `find_elements` returns a list, which has
                                        # no `.get_attribute`; use the singular lookup.
                                        DetailsLink = businessdetailnew[0].find_element(By.XPATH, "parent::*").get_attribute("href")
                                Ele_addressdetail = AllClasses[gr].find_elements(By.CSS_SELECTOR, "div.VwiC3b")
                                if len(Ele_addressdetail) > 0:
                                    Address = Ele_addressdetail[0].text.replace(";", "-").replace(",", "-")

                                alltext = AllClasses[gr].text
                                match = email_re.findall(alltext)
                                Actor.log.info('match email ' + str(len(match)))
                                if len(match) > 0:
                                    for i in match:
                                        Email = i
                                    Actor.log.info('email ' + Email)
                                else:
                                    for site in website_re.findall(BusinessName + Address):
                                        Website = site if site.startswith('http') else 'http://' + site
                                        Actor.log.info('Website ' + Website)
                                        Email = scrape_contact_emails(Website)

                                if Email:
                                    existindb = any(item['Email'] == Email for item in all_users)
                                    if not existindb:
                                        all_users.append({'Email': Email})
                                        await Actor.push_data({'Email': Email, 'title': BusinessName, 'Description': alltext, 'Detail_Link': DetailsLink})
                            except Exception as err:
                                Actor.log.info(f"Unexpected {err=}, {type(err)=}")

                        if len(driver.find_elements(By.CSS_SELECTOR, "a#pnnext")) > 0:
                            # Classic "Next" pagination.
                            Actor.log.info('Click Next')
                            str_url = driver.current_url
                            driver.execute_script("arguments[0].click();", driver.find_elements(By.CSS_SELECTOR, "a#pnnext")[0])
                            time.sleep(3)
                            # BUG FIX: the retry counter was reset to 0 on every
                            # iteration, defeating the 20-attempt bail-out.
                            tryresult = 0
                            while driver.current_url == str_url:
                                time.sleep(1)
                                tryresult = tryresult + 1
                                if tryresult > 20:
                                    driver.quit()
                                    break
                        else:
                            # "More results" style (continuous / mobile) pagination.
                            Actor.log.info('Click Next_1')
                            AllCOUNT_PREVIOUS = len(driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc"))
                            if social_network_val == "youtube.com/" or social_network_val == "instagram.com/":
                                AllCOUNT_PREVIOUS = len(driver.find_elements(By.XPATH, "//div[contains(@class,'MjjYud')]"))
                            Actor.log.info('Click Next_1_T1')
                            str_url = driver.current_url
                            action = ActionChains(driver)

                            tryresult = 0
                            if len(driver.find_elements(By.CSS_SELECTOR, "span.RVQdVd")) > 0:
                                action = ActionChains(driver)
                                selectedlink1 = driver.find_element(By.CSS_SELECTOR, "span.RVQdVd")
                                action.move_to_element(selectedlink1).click().perform()

                            time.sleep(3)
                            Actor.log.info('Click Next_1_T2')
                            AllCOUNT_Now = len(driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc"))
                            if social_network_val == "youtube.com/":
                                AllCOUNT_Now = len(driver.find_elements(By.XPATH, "//div[contains(@class,'MjjYud')]"))

                            # Wait until more results are loaded (count changes).
                            while AllCOUNT_PREVIOUS == AllCOUNT_Now:
                                time.sleep(1)
                                tryresult = tryresult + 1
                                Actor.log.info('Click Next_1_T3_tryresult ' + str(tryresult))
                                if tryresult > 20:
                                    driver.quit()
                                    break
                                AllCOUNT_Now = len(driver.find_elements(By.CSS_SELECTOR, "div.tF2Cxc"))
                                if social_network_val == "youtube.com/" or social_network_val == "instagram.com/":
                                    AllCOUNT_Now = len(driver.find_elements(By.XPATH, "//div[contains(@class,'MjjYud')]"))

                            print('AllCOUNT_PREVIOUS' + str(AllCOUNT_PREVIOUS))
                            print('AllCOUNT_Now' + str(AllCOUNT_Now))
                            if AllCOUNT_PREVIOUS == AllCOUNT_Now:
                                # No new results loaded after all retries: stop.
                                break
                            time.sleep(3)
                    else:
                        if start == 0:
                            await Actor.push_data({'Email': 'No Data Found, Due to google not respond. May be proxy problem'})
                        driver.quit()
                        break
                print('done')
            except Exception:
                Actor.log.exception(f'Cannot extract data from .')

        driver.quit()