1import random
2import json
3import logging
4import asyncio
5from bs4 import BeautifulSoup
6from httpx import AsyncClient
7from apify import Actor
8
9
# Send records emitted through the standard ``logging`` module (ERROR and
# above) to a local file, one timestamped line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.ERROR,
    filename='error_log.txt',
)


# Process-wide memo of search results: company name -> list of result URLs.
CACHE_RESULTS = dict()
18
class Config:
    """
    Static settings for the directory search: credential pools, search
    defaults, and the User-Agent strings used to vary outgoing requests.

    Every pool holds unique entries only. Callers draw from them with
    ``random.choice``, so a repeated entry would skew the draw toward that
    value and make "switch to another key" more likely to re-pick the same key.
    """

    # NOTE(review): these credentials are committed in source. They should be
    # rotated and supplied through Actor input or environment variables.
    # Google API keys; one is drawn at random per search.
    API_KEYS = [
        "AIzaSyDbEXo40_A2tLsQaWQmSvaF6SpJrqW34K0",
        "AIzaSyDlToEKJqB2speXspWoYHP-fFsrMrby3gE",
        "AIzaSyDSSfXv7fTCiB_xjCPdI9v43KCqF9LztZ8",
        "AIzaSyAyKJyMXOVzsYAl-d8Nqjappzt_KBHdVm0",
        "AIzaSyDNVzGNZ4cT7eWwjQD6NmppTZ-AuqzzTk0",
        "AIzaSyDTuJCMcTYgFKMcjB71zQuFY7Q0ZbuP218",
        "AIzaSyC4FDovZCwUI0BqAyPIGhiHMXnA9qQl7lg",
        "AIzaSyCbUxyyg8J_DUoKeYgCuAE3CwD9TKLFVWQ",
        "AIzaSyBruaD3zSc-g0jF-6b_Qu08KU3cjoKwivg",
        "AIzaSyBBKYswGbKlLtJtRWDx428JtIU4KxT4ui0",
        "AIzaSyDB3xuXOdjAIrtyolAyJt8LfxU5pOpSApw",
        "AIzaSyDi3BWSNNFu5hZnF5Wtq8GxUMnYu2Zk5gg",
        "AIzaSyAJP5AVdYOgvMuYoT2XEM0MjPrAYxnB8ns",
    ]

    # Custom Search Engine identifiers (the ``cx`` request parameter).
    CSE_IDS = [
        "02671555df2224350", "45f738cd2cb484722", "248848c670d0f4f8c",
        "3457c8f927de246f4", "70781f2a9a68c49ac", "d1df5a600b5b94ec3",
        "f40e16c48afe34e3c", "a6072bc1dcf8c4d79", "60c9873c261204718",
        "e26ede76286a544e8", "d4e2ebf258b944f91", "c481c3a651fe142c9",
        "14fe45fe39ab2419a", "654800b8f8639452b", "74e2699945a994c1d",
        "051f685c02e0d44ae", "133d48265e2e44adc", "64a279f8504804cbb",
        "b35926315544d4e47", "212b86f2ce1f74dd1", "76c72f14d9dfe42ab",
        "424836067803a4784", "130dd7fa414ee47d6", "c0d5232b82a734c46",
        "93240ef1d6303432e", "b3603eb617bb94dfd", "f62f696dee17f4064",
        "c14eb8d6211ac4f92", "70372df0a719c4528", "0486eb80373414435",
        "b276e986bd43d4b2c", "46deaab41c5824eff", "f1c7d963e6bc143ad",
        "610360fa85f1a493e", "13184172f1a814248", "0508abdbfea8f436b",
        "a4d8c57ef19ae4020", "d42b9df0610ce4413", "e4a4b7263a9734bad",
        "d06afca341bf34d7e", "92b3e7b373110446b", "03f3112c9093d4271",
        "42ecb107930934e17", "3168c34a9ed4f4baf", "9368d0baf14994fa7",
        "a1f97ec8e8a094f83", "6142dfd71e5994a6f", "c731eb764bccb49f1",
        "823cc36126bef4897", "51ecbae620e074ab9", "b2ed19f0318b74759",
        "a38270c5e616041ae", "d43515117a00d493f",
    ]

    # Value sent as the ``gl`` request parameter of API searches.
    DEFAULT_REGION = "za"
    EXPORT_FORMATS = ["json", "csv", "excel", "html"]
    # Terms OR-ed together after the quoted company name in the API query.
    DEFAULT_QUERY_TERMS = ["company", "business", "directory", "profile"]

    # Browser User-Agent strings; one is drawn at random per HTTP request.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Mozilla/5.0 (Windows NT 6.3; rv:40.0) Gecko/20100101 Firefox/40.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Mozilla/5.0 (Windows NT 6.1; rv:42.0) Gecko/20100101 Firefox/42.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:48.0) Gecko/20100101 Firefox/48.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    ]
95
def validate_api_keys() -> bool:
    """
    Report whether both credential pools on ``Config`` are non-empty.

    Returns True when ``Config.API_KEYS`` and ``Config.CSE_IDS`` each hold at
    least one entry. Otherwise logs an error through the Actor logger and
    returns False.
    """
    credentials_present = bool(Config.API_KEYS) and bool(Config.CSE_IDS)
    if credentials_present:
        return True

    Actor.log.error("Error: No API keys or CSE IDs available.")
    return False
101
async def scrape_website(company_name: str, client: AsyncClient) -> list:
    """
    Scrape a Google results page for links when the API returned nothing.

    Args:
        company_name: Company to look up; searched together with the words
            "business directory".
        client: Open httpx client used to issue the request.

    Returns:
        Every ``href`` on the page that contains the substring "http", in
        document order. An empty list when the response status is not 200 or
        when any exception is raised.
    """
    try:
        # Let httpx build the query string so company names containing spaces,
        # '&', '#', '+', or non-ASCII characters are encoded instead of
        # corrupting the URL.
        response = await client.get(
            "https://www.google.com/search",
            params={"q": f"{company_name} business directory"},
            headers={'User-Agent': random.choice(Config.USER_AGENTS)},
        )
        if response.status_code != 200:
            Actor.log.error(f"Error fetching the page for {company_name}.")
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        # Substring test, not a prefix test: relative hrefs that merely embed
        # an "http..." target are kept as well.
        return [
            anchor['href']
            for anchor in soup.find_all("a", href=True)
            if "http" in anchor['href']
        ]
    except Exception as e:
        # Deliberately broad: this is a best-effort fallback and must never
        # abort the surrounding search.
        logging.error(f"Error scraping website: {e}")
        return []
121
async def search_company_for_directory(company_name: str, client: AsyncClient) -> tuple:
    """
    Searches for the company's business directory using Google Custom Search API.
    Falls back to web scraping if no API results are found.

    Args:
        company_name: Company to search for; used verbatim as the cache key.
        client: Open httpx client used for all requests.

    Returns:
        A ``(links, status)`` tuple. ``status`` is "Success" when paging
        finished normally (or on a cache hit); otherwise it is an "Error: ..."
        string and ``links`` holds whatever was collected before the failure.
        Only successful lookups are cached.
    """
    if company_name in CACHE_RESULTS:
        Actor.log.info(f"Cache hit for '{company_name}'.")
        return CACHE_RESULTS[company_name], "Success"

    if not validate_api_keys():
        return [], "Error: Missing API keys"

    # Shuffled pool of distinct keys: each key is tried at most once per call,
    # so a fully exhausted quota ends the search instead of retrying forever.
    key_pool = list(dict.fromkeys(Config.API_KEYS))
    random.shuffle(key_pool)
    api_key = key_pool.pop()
    cse_id = random.choice(Config.CSE_IDS)
    query = f'"{company_name}" {" OR ".join(Config.DEFAULT_QUERY_TERMS)}'
    results = []

    # The API serves 10 items per page and never more than 100 per query;
    # asking for a page past that limit is answered with an HTTP error.
    page_size = 10
    last_start = 100 - page_size + 1
    start_index = 1

    while start_index <= last_start:
        try:
            # Passing params lets httpx encode the quoted query safely.
            response = await client.get(
                "https://www.googleapis.com/customsearch/v1",
                params={
                    "q": query,
                    "key": api_key,
                    "cx": cse_id,
                    "start": start_index,
                    "gl": Config.DEFAULT_REGION,
                },
                headers={'User-Agent': random.choice(Config.USER_AGENTS)},
            )
            if response.status_code == 200:
                items = response.json().get("items", [])
                if not items:
                    break
                results.extend(item["link"] for item in items)
                start_index += page_size
                await asyncio.sleep(1)
            elif response.status_code in (403, 429):
                if not key_pool:
                    logging.error(
                        f"HTTP {response.status_code}: all API keys exhausted"
                    )
                    return results, f"Error: {response.status_code}"
                Actor.log.warning("API quota exceeded. Switching API key...")
                await asyncio.sleep(2)
                api_key = key_pool.pop()
            else:
                logging.error(f"HTTP {response.status_code}: {response.text}")
                return results, f"Error: {response.status_code}"
        except Exception as e:
            logging.error(f"Exception occurred: {e}")
            return results, "Error: Exception encountered"

    if not results:
        Actor.log.info(f"No API results found for '{company_name}', attempting web scraping...")
        results = await scrape_website(company_name, client)

    CACHE_RESULTS[company_name] = results
    return results, "Success"
173
async def bulk_search(companies: list, client: AsyncClient) -> dict:
    """
    Processes a list of companies and retrieves search results for each.

    Companies are searched one at a time, in the order given.

    Args:
        companies: Company names to look up.
        client: Open httpx client shared by every lookup.

    Returns:
        Dict mapping each company name to its list of result links. A failed
        lookup still gets an entry holding whatever links were collected.
    """
    all_results = {}
    for company in companies:
        results, status = await search_company_for_directory(company, client)
        if status != "Success":
            # Surface the failure; otherwise it is indistinguishable from a
            # company that genuinely has no results.
            Actor.log.warning(f"Search for '{company}' finished with status: {status}")
        all_results[company] = results
    return all_results
183
async def main() -> None:
    """
    Actor entry point: read input, search every company, push the results.

    The ``companies`` input may be a list or a comma-separated string. When it
    is missing, of another type, or empty after normalization, an error is
    logged and nothing is pushed.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        companies = actor_input.get("companies")
        if isinstance(companies, str):
            companies = [c.strip() for c in companies.split(",") if c.strip()]

        # Validate after normalization so a blank or comma-only string is
        # rejected the same way as an empty list.
        if not isinstance(companies, list) or not companies:
            Actor.log.error("No valid 'companies' provided in input.")
            return

        async with AsyncClient() as client:
            Actor.log.info("Starting bulk search for companies.")
            results = await bulk_search(companies, client)
            Actor.log.info("Bulk search completed.")

        await Actor.push_data(results)
        Actor.log.info("Results have been pushed to the dataset.")
204
# Run the actor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())