Linkedin Email Scraper

Pricing: $10.00/month + usage

Developed by bhansalisoft
Maintained by Community

Linkedin Email Scraper scrapes email addresses from specific LinkedIn profiles using the Google search engine.

Rating: 1.0 (2)

Total users: 1.7k
Monthly users: 229
Runs succeeded: >99%
Issues response: 20 hours
Last modified: 5 days ago
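How it works: the Actor asks Google for LinkedIn pages that match your keyword and contain an email address, then parses the result snippets (see src/main.py below). The following is a minimal sketch of how the search query is assembled for the default LinkedIn setting; the build_query helper is ours, purely for illustration, not part of the Actor:

def build_query(keyword, location=None, country="www", email_domain=None):
    # Default: restrict hits to popular free-mail addresses (mirrors src/main.py).
    if email_domain:
        option = " ( @" + email_domain + " )"
    else:
        option = "( @gmail.com OR @hotmail.com OR @yahoo.com)"
    q = keyword + option
    if location:
        q = q + " in " + location
    # "www" selects google.com / the United States; other codes pick a country subdomain.
    q = q + " site:" + country + "." + "linkedin.com/"
    return "https://google.com/search?q=" + q

# build_query("plumber", "Miami") ->
# 'https://google.com/search?q=plumber( @gmail.com OR @hotmail.com OR @yahoo.com) in Miami site:www.linkedin.com/'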

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds will be fast
# for most source-file changes.
COPY . ./
# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .
# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "Linkedin Email Scraper",
    "title": "Linkedin Email Scraper",
    "description": "Scrape emails from LinkedIn social media profiles using a search engine",
    "version": "0.0",
    "meta": {
        "templateId": "python-selenium"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Linkedin Email Scraper",
            "views": {
                "titles": {
                    "title": "Linkedin Email Scraper",
                    "transformation": {
                        "fields": [
                            "Email",
                            "title",
                            "Description",
                            "Detail_Link"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "Email": {
                                "label": "Email",
                                "format": "text"
                            },
                            "title": {
                                "label": "Title",
                                "format": "text"
                            },
                            "Description": {
                                "label": "Description",
                                "format": "text"
                            },
                            "Detail_Link": {
                                "label": "Detail_Link",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}

.actor/input_schema.json

{
    "title": "Linkedin Email Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "Keyword": {
            "title": "Keyword",
            "description": "Keyword to search for",
            "type": "string",
            "editor": "textfield",
            "prefill": "TestKeyword"
        },
        "location": {
            "title": "Location (optional)",
            "description": "Location",
            "type": "string",
            "editor": "textfield"
        },
        "social_network": {
            "title": "Social Network",
            "description": "Social network",
            "type": "string",
            "editor": "select",
            "default": "linkedin.com/",
            "enum": ["linkedin.com/"],
            "enumTitles": ["Linkedin"]
        },
        "Country": {
            "title": "Country",
            "description": "Country",
            "type": "string",
            "editor": "select",
            "default": "www",
            "enum": ["it","www","gm","fr","sp","uk","al","ag","ar","am","as","au","aj","bg","bo","be","bh","bk","br","bu","ca","ci","ch","co","cs","hr","cu","ez","dk","ec","eg","en","fi","gg","gr","hk","hu","ic","in","id","ir","iz","ei","is","jm","ja","ke","kn","ks","ku","lg","ly","ls","lh","lu","mc","mk","my","mt","mx","md","mn","mj","mo","np","nl","nz","ni","no","pk","we","pm","pa","pe","rp","pl","po","rq","qa","ro","rs","sm","sa","sg","ri","sn","lo","si","sf","sw","sz","sy","tw","th","ts","tu","ua","ae","uy","uz","ve"],
            "enumTitles": ["Italy","United States","Germany","France","Spain","United Kingdom","Albania","Algeria","Argentina","Armenia","Australia","Austria","Azerbaijan","Bangladesh","Belarus","Belgium","Belize","Bosnia and Herzegovina","Brazil","Bulgaria","Canada","Chile","China","Colombia","Costa Rica","Croatia","Cuba","Czechia","Denmark","Ecuador","Egypt","Estonia","Finland","Georgia","Greece","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Israel","Jamaica","Japan","Kenya","Korea","Korea, Republic of","Kuwait","Latvia","Libya","Liechtenstein","Lithuania","Luxembourg","Macao","Macedonia","Malaysia","Malta","Mexico","Moldova, Republic of","Monaco","Montenegro","Morocco","Nepal","Netherlands","New Zealand","Nigeria","Norway","Pakistan","Palestine, State of","Panama","Paraguay","Peru","Philippines","Poland","Portugal","Puerto Rico","Qatar","Romania","Russia","San Marino","Saudi Arabia","Senegal","Serbia","Singapore","Slovakia","Slovenia","South Africa","Sweden","Switzerland","Syrian Arab Republic","Taiwan","Thailand","Tunisia","Turkey","Ukraine","United Arab Emirates","Uruguay","Uzbekistan","Venezuela"]
        },
        "Email_Type": {
            "title": "Email Type",
            "description": "Email type",
            "type": "string",
            "editor": "select",
            "default": "0",
            "enum": ["0", "1"],
            "enumTitles": ["Popular emails only (@gmail.com, @yahoo.com or @hotmail.com)", "Other private email"]
        },
        "Other_Email_Type": {
            "title": "Other Email Type (example: @domain.com)",
            "description": "Other email type (example: @domain.com)",
            "type": "string",
            "editor": "textfield"
        },
        "proxySettings": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Select proxies to be used by your crawler.",
            "editor": "proxy",
            "prefill": { "useApifyProxy": false }
        }
    },
    "required": ["Keyword","social_network","Email_Type","Country"]
}
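For reference, a run input satisfying this schema might look like the following, started through the apify-client Python package (pip install apify-client). The token and Actor ID are placeholders, the field values just examples:

from apify_client import ApifyClient

client = ApifyClient("<YOUR_APIFY_TOKEN>")

run_input = {
    "Keyword": "real estate agent",         # required
    "location": "Chicago",                  # optional free text
    "social_network": "linkedin.com/",      # required; the only enum value
    "Country": "www",                       # required; "www" = United States
    "Email_Type": "0",                      # "0" = popular free-mail domains only
    "proxySettings": {"useApifyProxy": True},
}

# Replace the slug with the Actor's real ID from the store page.
run = client.actor("bhansalisoft/linkedin-email-scraper").call(run_input=run_input)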

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())

src/main.py

1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10from urllib.parse import urljoin
11from seleniumwire import webdriver
12#from selenium.webdriver.chrome.options import Options as ChromeOptions
13from selenium.webdriver.common.by import By
14from selenium.webdriver.common.action_chains import ActionChains
15from apify import Actor
16import re
17from urllib.parse import urljoin
18from bs4 import BeautifulSoup
19import requests
20import requests.exceptions
21import time
22import undetected_chromedriver as uc
23import random
24
25# To run this Actor locally, you need to have the Selenium Chromedriver installed.
26# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
27# When running on the Apify platform, it is already included in the Actor's Docker image.
28
29
30def scrape_contact_emails(link):
31 res = requests.get(link)
32 domain = link.split(".")
33 mailaddr = link
34 soup = BeautifulSoup(res.text,"lxml")
35 links = soup.find_all("a")
36 contact_link = ''
37 final_result = ""
38 try:
39 # Check if there is any email address in the homepage.
40 emails = soup.find_all(text=re.compile('.*@'+domain[1]+'.'+domain[2].replace("/","")))
41 emails.sort(key=len)
42 print(emails[0].replace("\n",""))
43 final_result = emails[0]
44 except:
45 # Searching for Contact Us Page's url.
46 try:
47 flag = 0
48 for link in links:
49 if "contact" in link.get("href") or "Contact" in link.get("href") or "CONTACT" in link.get("href") or 'contact' in link.text or 'Contact' in link.text or 'CONTACT' in link.text:
50 if len(link.get("href"))>2 and flag<2:
51 flag = flag + 1
52 contact_link = link.get("href")
53
54 except:
55 pass
56
57 domain = domain[0]+"."+domain[1]+"."+domain[2]
58 if(len(contact_link)<len(domain)):
59 domain = domain+contact_link.replace("/","")
60 else:
61 domain = contact_link
62
63 try:
64 # Check if there is any email address in the Contact Us Page.
65 res = requests.get(domain)
66 soup = BeautifulSoup(res.text,"lxml")
67 emails = soup.find_all(text=re.compile('.*@'+mailaddr[7:].replace("/","")))
68 emails.sort(key=len)
69 try:
70 print(emails[0].replace("\n",""))
71 final_result = emails[0]
72 return final_result
73 except:
74 pass
75 except Exception as e:
76 pass
77
78 return ""
79
80async def main() -> None:
81 """
82 The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function
83 out of it, it will not work. Asynchronous execution is required for communication with Apify platform,
84 and it also enhances performance in the field of web scraping significantly.
85 """
86 async with Actor:
87 # Read the Actor input
88 actor_input = await Actor.get_input() or {}
89 Keyword_val = actor_input.get('Keyword')
90 location_val = actor_input.get('location')
91 social_network_val = actor_input.get('social_network')
92 Country_val = actor_input.get('Country')
93 Email_Type_val = actor_input.get('Email_Type')
94 Other_Email_Type_val = actor_input.get('Other_Email_Type')
95 proxy_settings = actor_input.get('proxySettings')
96 proxy_configuration = await Actor.create_proxy_configuration(actor_proxy_input=proxy_settings)
97 Actor.log.info(proxy_configuration)
98 proxyurl = ''
99 if proxy_configuration and proxy_settings:
100 proxyurl =await proxy_configuration.new_url()
101 Actor.log.info(proxyurl)
102
103
104 if not Keyword_val:
105 Actor.log.info('Please insert keyword')
106 await Actor.push_data({'Email': 'Please insert keyword'})
107 await Actor.exit()
108 return
109
110 if Keyword_val=='TestKeyword':
111 Actor.log.info('Please insert keyword')
112 await Actor.push_data({'Email': 'Please insert Your Keyword'})
113 await Actor.exit()
114 return
115
116 l1 = ["it","www","gm","fr","sp","uk","al","ag","ar","am","as","au","aj","bg","bo","be","bh","bk","br","bu","ca","ci","ch","co","cs","hr","cu","ez","dk","ec","eg","en","fi","gg","gr","hk","hu","ic","in","id","ir","iz","ei","is","jm","ja","ke","kn","ks","ku","lg","ly","ls","lh","lu","mc","mk","my","mt","mx","md","mn","mj","mo","np","nl","nz","ni","no","pk","we","pm","pa","pe","rp","pl","po","rq","qa","ro","rs","sm","sa","sg","ri","sn","lo","si","sf","sw","sz","sy","tw","th","ts","tu","ua","ae","uy","uz","ve"]
117 l2= ["Italy","United States","Germany","France","Spain","United Kingdom","Albania","Algeria","Argentina","Armenia","Australia","Austria","Azerbaijan","Bangladesh","Belarus","Belgium","Belize","Bosnia and Herzegovina","Brazil","Bulgaria","Canada","Chile","China","Colombia","Costa Rica","Croatia","Cuba","Czechia","Denmark","Ecuador","Egypt","Estonia","Finland","Georgia","Greece","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Israel","Jamaica","Japan","Kenya","Korea","Korea, Republic of","Kuwait","Latvia","Libya","Liechtenstein","Lithuania","Luxembourg","Macao","Macedonia","Malaysia","Malta","Mexico","Moldova, Republic of","Monaco","Montenegro","Morocco","Nepal","Netherlands","New Zealand","Nigeria","Norway","Pakistan","Palestine, State of","Panama","Paraguay","Peru","Philippines","Poland","Portugal","Puerto Rico","Qatar","Romania","Russia","San Marino","Saudi Arabia","Senegal","Serbia","Singapore","Slovakia","Slovenia","South Africa","Sweden","Switzerland","Syrian Arab Republic","Taiwan","Thailand","Tunisia","Turkey","Ukraine","United Arab Emirates","Uruguay","Uzbekistan","Venezuela"]
118 select_index=1
119 select_country='United States'
120 for count, ele in enumerate(l1):
121 if(ele==Country_val):
122 select_index=count
123 break
124 #print(count)
125 #print(ele)
126
127 for count, ele in enumerate(l2):
128 if(count==select_index):
129 select_country=ele
130 break
131
132 print(select_country)
133
134 concatstring = ""
135 concatstring = concatstring + Keyword_val
136 option = "( @gmail.com OR @hotmail.com OR @yahoo.com)";
137 if Email_Type_val=="1":
138 if not Other_Email_Type_val:
139 Actor.log.info('Please insert Email Type Domain')
140 await Actor.push_data({'Email': 'Please insert Email Type Domain'})
141 await Actor.exit()
142 return
143 if Other_Email_Type_val.find("@") > -1:
144 option = " ( " + Other_Email_Type_val + " )"
145 else:
146 option = " ( @" + Other_Email_Type_val + " )"
147 concatstring = concatstring + option
148 if location_val:
149 concatstring = concatstring+ " in "+ location_val
150
151
152
153 if social_network_val:
154 concatstring = concatstring + " site:"
155
156 if social_network_val == "linkedin.com/" or social_network_val == "pinterest.com/" :
157 concatstring = concatstring + Country_val + ".";
158
159
160
161
162 if social_network_val == "amazon.com/" :
163 if Country_val=='gm':
164 Country_val='de'
165 elif Country_val=='sp':
166 Country_val='es'
167 elif Country_val=='fr':
168 Country_val='fr'
169 elif Country_val=='uk':
170 Country_val='co.uk'
171 elif Country_val=='as':
172 Country_val='com.au'
173 elif Country_val=='www':
174 Country_val='com'
175 elif Country_val=='in':
176 Country_val='in'
177 elif Country_val=='be':
178 Country_val='com.be'
179 elif Country_val=='br':
180 Country_val='com.br'
181 elif Country_val=='ca':
182 Country_val='ca'
183 elif Country_val=='ch':
184 Country_val='cn'
185 elif Country_val=='eg':
186 Country_val='eg'
187 elif Country_val=='it':
188 Country_val='it'
189 elif Country_val=='ja':
190 Country_val='co.jp'
191 elif Country_val=='mx':
192 Country_val='com.mx'
193 elif Country_val=='nl':
194 Country_val='nl'
195 elif Country_val=='pl':
196 Country_val='pl'
197 elif Country_val=='sa':
198 Country_val='sa'
199 elif Country_val=='sn':
200 Country_val='sg'
201 elif Country_val=='sw':
202 Country_val='se'
203 elif Country_val=='tu':
204 Country_val='com.tr'
205 elif Country_val=='ae':
206 Country_val='ae'
207 elif Country_val=='ae':
208 Country_val='ae'
209 else :
210 Country_val=='com'
211
212 social_network_val=social_network_val.replace('.com','.'+Country_val)
213
214
215 concatstring = concatstring + "" + social_network_val + "";
216
217 SearchEngine='Google'
218 desktop_user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
219 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
220 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.139 Safari/537.36",
221 "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
222 # Firefox - Windows
223 "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
224 "Mozilla/5.0 (Windows NT 10.0; rv:124.0) Gecko/20100101 Firefox/124.0",
225 "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
226 # Edge - Windows
227 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36 Edg/123.0.2420.65",
228 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.112 Safari/537.36 Edg/122.0.2365.66",
229 # Opera - Windows
230 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 OPR/96.0.0.0",
231 # Chrome - macOS
232 "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.129 Safari/537.36",
233 "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_6_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
234 # Firefox - macOS
235 "Mozilla/5.0 (Macintosh; Intel Mac OS X 13.5; rv:124.0) Gecko/20100101 Firefox/124.0",
236 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0",
237 # Safari - macOS
238 "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
239 "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
240 # Chrome - Linux
241 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.184 Safari/537.36",
242 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
243 # Firefox - Linux
244 "Mozilla/5.0 (X11; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
245 "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0",
246 "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
247 # Safari - iMac (intel)
248 "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
249 # Edge - macOS
250 "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36 Edg/123.0.2420.65",
251 # Opera - macOS
252 "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/96.0.0.0"
253 ]
254
255 mobile_user_agents = [
256 # Chrome - Android
257 "Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.105 Mobile Safari/537.36",
258 "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
259 "Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.140 Mobile Safari/537.36",
260
261 # Samsung Internet - Android
262 "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S911B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/24.0 Chrome/123.0.0.0 Mobile Safari/537.36",
263 "Mozilla/5.0 (Linux; Android 12; SAMSUNG SM-A525F) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/23.0 Chrome/121.0.0.0 Mobile Safari/537.36",
264
265 # Chrome - Xiaomi
266 "Mozilla/5.0 (Linux; Android 13; M2101K6G) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.105 Mobile Safari/537.36",
267
268 # Safari - iPhone
269 "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
270 "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
271 "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
272
273 # Safari - iPad
274 "Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
275 "Mozilla/5.0 (iPad; CPU OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
276
277 # Chrome - iPhone
278 "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) CriOS/123.0.0.0 Mobile/15E148 Safari/604.1",
279
280 # Opera - Android
281 "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36 OPR/76.0.4017.123",
282
283 # Edge - Android
284 "Mozilla/5.0 (Linux; Android 13; Pixel 6 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36 EdgA/123.0.2420.64",
285
286 # Firefox - Android
287 "Mozilla/5.0 (Android 13; Mobile; rv:124.0) Gecko/124.0 Firefox/124.0",
288 "Mozilla/5.0 (Android 12; Mobile; rv:122.0) Gecko/122.0 Firefox/122.0"
289]
290
291
292 all_user_agents = desktop_user_agents + mobile_user_agents
293 random_user_agent = random.choice(all_user_agents)
294
295
296 qry="https://google.com/search?q="+ concatstring
297 Actor.log.info(concatstring)
298 # Launch a new Selenium Chrome WebDriver
299 Actor.log.info('Launching Chrome WebDriver...')
300 chrome_options = uc.ChromeOptions()
301 chrome_options.add_argument(f'user-agent={random_user_agent}')
302 chrome_options.add_argument("--disable-blink-features=AutomationControlled")
303 #chrome_options.add_extension('./NopeCHA-CAPTCHA-Solver.crx')
304 #chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36")
305 chrome_options.add_argument('--no-sandbox')
306 chrome_options.add_argument('--disable-dev-shm-usage')
307 chrome_options.add_argument("--disable-blink-features=AutomationControlled")
308 #options.add_experimental_option("useAutomationExtension", False)
309 # chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
310 #chrome_options.add_experimental_option('useAutomationExtension', False)
311
312 seleniumwire_options={}
313 if(proxyurl):
314 print('apply proxy')
315 chrome_options.add_argument(f" - proxy-server={proxyurl}")
316 #seleniumwire_options = {'proxy': { 'https': proxyurl,'https': proxyurl} }
317 # Initialize SeleniumAuthenticatedProxy
318 #proxy_helper = SeleniumAuthenticatedProxy(proxy_url=proxyurl)
319 # Enrich Chrome options with proxy authentication
320 #proxy_helper.enrich_chrome_options(chrome_options)
321
322 #print(seleniumwire_options)
323 #driver = webdriver.Chrome(options=chrome_options,seleniumwire_options=seleniumwire_options)
324 driver = uc.Chrome(options=chrome_options, use_subprocess=False,version_main = 135)
325 driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
326 "source": """
327 Object.defineProperty(navigator, 'webdriver', {
328 get: () => undefined
329 })
330 """
331 })
332
333 #driver.get('https://nopecha.com/setup#aqoz34cu2ob3qmhn')
334 #time.sleep(3)
335 #print("key : "+driver.execute_script("return document.querySelectorAll('body')[0].innerText"))
336
337 # Visit the test URL to check your proxy IP
338 driver.get("https://httpbin.io/ip")
339 # Select the body tag containing the current IP address
340 ip_address = driver.find_element(By.TAG_NAME, "body").text
341 # Print your current IP
342 print(ip_address)
343 try:
344 driver.get(qry)
345 except Exception as e:
346 print(e)
347 await Actor.push_data({'Email': e})
348 driver.quit()
349
350 checkount=0
351 try:
352 while(driver.current_url.find("sorry/index")>-1 and checkount<100):
353 print('captcha')
354 print(driver.current_url)
355 time.sleep(3)
356 checkount=checkount+1
357 except Exception as e:
358 print(e)
359
360 all_users = []
361 start=0
362
363 if(SearchEngine=='Yahoo'):
364 print('yahoo')
365 else:
366 try:
367 while True:
368 consent = driver.find_elements(By.XPATH,"//iframe[contains(@src, 'consent.google.com')]")
369 if len(consent)>0 :
370 driver.switch_to.frame(consent[0]);
371 driver.find_element(By.id("id")).click();
372
373 popup = driver.find_elements(By.CSS_SELECTOR,"button[class='yt-spec-button-shape-next yt-spec-button-shape-next--filled yt-spec-button-shape-next--mono yt-spec-button-shape-next--size-m']");
374 if (len(popup) > 0):
375 Actor.log.info("popup")
376 for n in range(0,len(popup)):
377 if (popup[n].text=="Accept all"):
378 popup[n].click();
379
380 popup_accept = driver.find_elements(By.CSS_SELECTOR,"button[id='L2AGLb']");
381 if (len(popup_accept) > 0):
382 popup_accept[0].click();
383 time.sleep(1)
384
385 checkconsent=0;
386 while len(driver.find_elements(By.CSS_SELECTOR,"div[class='HTjtHe'][style*='display: block']")) > 0 and checkconsent<100:
387 Actor.log.info('checkconsent '+str(checkconsent))
388 Actor.log.info("homeurl" + driver.current_url)
389 checkconsent=checkconsent+1
390 time.sleep(1)
391
392 AllClasses = driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc")
393 print(len(AllClasses))
394 #checkconsent_youtube=0;
395 #while len(driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]")) <= 0 and checkconsent_youtube<20:
396 #Actor.log.info('checkconsent_youtube '+str(checkconsent_youtube))
397 #Actor.log.info("homeurl" + driver.current_url)
398 #checkconsent_youtube=checkconsent_youtube+1
399 #time.sleep(1)
400 homeurl = driver.current_url
401 AllClasses = driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc")
402 if social_network_val=="youtube.com/" or social_network_val=="instagram.com/":
403 AllClasses = driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]")
404 Actor.log.info("homeurl" + driver.current_url)
405
406 checkount=0
407 try:
408 while(driver.current_url.index("sorry/index")>-1 and checkount<100):
409 print('captcha')
410 time.sleep(3)
411 checkount=checkount+1
412 except Exception as e:
413 print(e)
414
415 if(len(AllClasses)==0):
416 print('check new attribut')
417 AllClasses = driver.find_elements(By.CSS_SELECTOR,"div.fP1Qef")
418 print(len(AllClasses))
419 print(driver.find_elements(By.CSS_SELECTOR,"div[id='main']"))
420
421 #res = requests.get(driver.current_url)
422 #await Actor.push_data({'Email': res.text})
423 if len(AllClasses)>0:
424 start=1
425 Actor.log.info("Result" + str(len(AllClasses)))
426 for gr in range(0,len(AllClasses)):
427 try:
428 BusinessName=""
429 DetailsLink=""
430 Email=""
431 Address=""
432 businessdetail = AllClasses[gr].find_elements(By.CSS_SELECTOR,"h3.LC20lb")
433 if len(businessdetail) > 0:
434 BusinessName = businessdetail[0].text;
435 DetailsLink = businessdetail[0].find_element(By.XPATH,"parent::*").get_attribute("href")
436 if not DetailsLink:
437 businessdetailnew = AllClasses[gr].find_elements(By.CSS_SELECTOR,"div.TbwUpd")
438 if len(businessdetailnew) > 0:
439 DetailsLink = businessdetailnew[0].find_elements(By.XPATH,"parent::*").get_attribute("href")
440 Ele_addressdetail = AllClasses[gr].find_elements(By.CSS_SELECTOR,"div.VwiC3b")
441 if len(Ele_addressdetail) > 0:
442 Address = Ele_addressdetail[0].text.replace(";", "-").replace(",", "-");
443
444 alltext = AllClasses[gr].text
445 match = re.findall(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[a-zA-Z]+', alltext)
446 Actor.log.info('match email '+str(len(match)))
447 if len(match)>0:
448 for i in match:
449 Email=i
450 Actor.log.info('email '+Email)
451 else:
452 match_website = re.findall(r'\\b(?:https?://|www\\.)\\S+\\b', BusinessName+Address)
453 for i in match_website:
454 Website = "http://www."+i;
455 Actor.log.info('Website '+Website)
456 Email=scrape_contact_emails(Website)
457 if Email :
458 existindb=False
459 #Actor.log.info('Email Store '+Email)
460 if len(all_users)>0:
461 for item in all_users:
462 if item['Email'] == Email :
463 existindb=True
464 break
465 if existindb==False:
466 all_users.append({'Email': Email});
467 await Actor.push_data({'Email': Email, 'title': BusinessName,'Description':alltext,'Detail_Link':DetailsLink})
468
469
470 except Exception as err:
471 Actor.log.info(f"Unexpected {err=}, {type(err)=}")
472
473 if len(driver.find_elements(By.CSS_SELECTOR,"a#pnnext")) > 0:
474 Actor.log.info('Click Next')
475 str_url = driver.current_url
476 #driver.find_elements(By.CSS_SELECTOR,"a#pnnext")[0].click();
477 driver.execute_script("arguments[0].click();", driver.find_elements(By.CSS_SELECTOR,"a#pnnext")[0]);
478 time.sleep(3)
479 while driver.current_url == str_url:
480 tryresult = 0;
481 time.sleep(1);
482 tryresult = tryresult + 1;
483 if tryresult > 20:
484 driver.quit();
485 break;
486 else:
487 Actor.log.info('Click Next_1')
488 AllCOUNT_PREVIOUS = len(driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc"))
489 if social_network_val=="youtube.com/" or social_network_val=="instagram.com/":
490 AllCOUNT_PREVIOUS = len(driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]"))
491 Actor.log.info('Click Next_1_T1')
492 str_url = driver.current_url
493 action = ActionChains(driver)
494 # perform the operation
495 #action.move_to_element(element).click().perform()
496 tryresult = 0;
497 if len(driver.find_elements(By.CSS_SELECTOR,"span.RVQdVd")) > 0:
498 action = ActionChains(driver)
499 selectedlink1 = driver.find_element(By.CSS_SELECTOR,"span.RVQdVd")
500 action.move_to_element(selectedlink1).click().perform()
501
502 time.sleep(3)
503 Actor.log.info('Click Next_1_T2')
504 AllCOUNT_Now = len(driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc"))
505 if social_network_val=="youtube.com/":
506 AllCOUNT_Now = len(driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]"))
507
508 while (AllCOUNT_PREVIOUS == AllCOUNT_Now):
509 time.sleep(1)
510 tryresult = tryresult + 1;
511 Actor.log.info('Click Next_1_T3_tryresult '+str(tryresult))
512 if (tryresult > 20):
513 driver.quit();
514 break;
515
516 AllCOUNT_Now = len(driver.find_elements(By.CSS_SELECTOR,"div.tF2Cxc"))
517 if social_network_val=="youtube.com/" or social_network_val=="instagram.com/" :
518 AllCOUNT_Now = len(driver.find_elements(By.XPATH,"//div[contains(@class,'MjjYud')]"))
519 print('AllCOUNT_PREVIOUS'+str(AllCOUNT_PREVIOUS))
520 print('AllCOUNT_Now'+str(AllCOUNT_Now))
521 if(AllCOUNT_PREVIOUS==AllCOUNT_Now):
522 break
523 time.sleep(3)
524 else:
525 if(start==0):
526 await Actor.push_data({'Email': 'No Data Found, Due to google not respond. May be proxy problem'})
527 driver.quit();
528 break;
529 print('done')
530 except Exception:
531 Actor.log.exception(f'Cannot extract data from .')
532
533 driver.quit()
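Once a run finishes, each scraped record lands in the default dataset with the Email, title, Description, and Detail_Link fields defined in .actor/actor.json. One possible way to read the results back with apify-client (same placeholder token and Actor ID as in the input example above):

from apify_client import ApifyClient

client = ApifyClient("<YOUR_APIFY_TOKEN>")

# Iterate over the dataset of the Actor's most recent run.
for item in client.actor("bhansalisoft/linkedin-email-scraper").last_run().dataset().iterate_items():
    print(item.get("Email"), item.get("title"), item.get("Detail_Link"))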

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
.venv
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store
apify_storage
storage
.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg
__pycache__
.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache
.scrapy
*.log

NopeCHA-CAPTCHA-Solver.crx


requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
selenium ~= 4.14.0
blinker ~= 1.7.0
selenium-wire ~= 5.1.0
beautifulsoup4 ~= 4.12.2
lxml  # parser backend for BeautifulSoup(res.text, "lxml") in src/main.py
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7
requests
undetected-chromedriver