
DirScrape
Under maintenance
Try for free
No credit card required
Go to Store
This Actor is under maintenance.
This Actor may be unreliable while under maintenance. Would you like to try a similar Actor instead?
See alternative Actors
DirScrape
chow0x00/my-actor
Try for free
No credit card required
DirScrape is an Apify Actor for company directory searches. Provide a company name and it fetches profiles, contacts, and links via the Google Custom Search API with a web scraping fallback. It rotates API keys, logs errors, caches results, and exports data in JSON, CSV, Excel, and HTML.
Developer
Maintained by Community
Actor Metrics
6 Monthly users
No reviews yet
No bookmarks yet
>99% runs succeeded
Created in Feb 2025
Modified a month ago
Categories
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.12

# Second, copy just requirements.txt into the Actor image.
# It should be the only file that affects the dependency install in the next step,
# so copying it on its own speeds up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Also print the installed Python version, the pip version,
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to check that the Actor's Python code compiles.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "DirScrape",
    "description": "Finds business-directory links for company names via the Google Custom Search API, with a Google web-search scraping fallback.",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Search business directories for companies",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "companies": {
            "title": "Company names",
            "type": "array",
            "description": "Names of the companies to look up in business directories.",
            "editor": "stringList",
            "prefill": ["Apify"]
        }
    },
    "required": ["companies"]
}
src/__main__.py
1import asyncio
2
3from .main import main
4
# Execute the Actor entry point: run the async main() coroutine to completion
# on a fresh event loop when the package is started with "python3 -m src".
asyncio.run(main())
src/main.py
1import random
2import json
3import logging
4import asyncio
5from bs4 import BeautifulSoup
6from httpx import AsyncClient
7from apify import Actor
8
# Set up logging.
# Configures the root logger, so the bare logging.error(...) calls in this
# module append ERROR-and-above records to error_log.txt (a relative path).
# Messages sent through Actor.log are not configured here.
logging.basicConfig(
    filename='error_log.txt',
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Global cache for results.
# Maps a company name to the list of links found for it; it is filled by
# search_company_for_directory() after a successful lookup and lives only in
# this module's memory.
CACHE_RESULTS = {}
18
def _unique(values):
    """Return ``values`` as a list with duplicates dropped, first occurrence kept."""
    return list(dict.fromkeys(values))


def _credentials(env_var, fallback):
    """Return the comma-separated list stored in ``env_var``, else ``fallback``.

    Blank entries are ignored and duplicates are removed either way, so every
    entry has the same chance of being picked by ``random.choice``.
    """
    import os  # local import: keeps the module's import block untouched

    configured = [part.strip() for part in os.environ.get(env_var, "").split(",")]
    return _unique([part for part in configured if part] or fallback)


class Config:
    """Static settings for the directory search.

    Attributes:
        API_KEYS: Google API keys; one is picked at random per search and
            swapped for another after an HTTP 403/429 response.
        CSE_IDS: Custom Search Engine ids; one is picked at random per search.
        DEFAULT_REGION: Value sent as the ``gl`` query parameter.
        EXPORT_FORMATS: Names of export formats (not referenced by the code
            visible in this file).
        DEFAULT_QUERY_TERMS: Terms OR-ed together after the quoted company name.
        USER_AGENTS: User-Agent strings; one is picked at random per request.
    """

    # SECURITY: these credentials are committed in source. Supply them through
    # the GOOGLE_API_KEYS / GOOGLE_CSE_IDS environment variables
    # (comma-separated) instead, and rotate the literals below out of the
    # repository. The literals remain only as a backward-compatible fallback.
    API_KEYS = _credentials("GOOGLE_API_KEYS", [
        "AIzaSyDbEXo40_A2tLsQaWQmSvaF6SpJrqW34K0",
        "AIzaSyDlToEKJqB2speXspWoYHP-fFsrMrby3gE",
        "AIzaSyDSSfXv7fTCiB_xjCPdI9v43KCqF9LztZ8",
        "AIzaSyAyKJyMXOVzsYAl-d8Nqjappzt_KBHdVm0",
        "AIzaSyDNVzGNZ4cT7eWwjQD6NmppTZ-AuqzzTk0",
        "AIzaSyDTuJCMcTYgFKMcjB71zQuFY7Q0ZbuP218",
        "AIzaSyC4FDovZCwUI0BqAyPIGhiHMXnA9qQl7lg",
        "AIzaSyCbUxyyg8J_DUoKeYgCuAE3CwD9TKLFVWQ",
        "AIzaSyDNVzGNZ4cT7eWwjQD6NmppTZ-AuqzzTk0",
        "AIzaSyDTuJCMcTYgFKMcjB71zQuFY7Q0ZbuP218",
        "AIzaSyBruaD3zSc-g0jF-6b_Qu08KU3cjoKwivg",
        "AIzaSyBBKYswGbKlLtJtRWDx428JtIU4KxT4ui0",
        "AIzaSyDB3xuXOdjAIrtyolAyJt8LfxU5pOpSApw",
        "AIzaSyDNVzGNZ4cT7eWwjQD6NmppTZ-AuqzzTk0",
        "AIzaSyDi3BWSNNFu5hZnF5Wtq8GxUMnYu2Zk5gg",
        "AIzaSyAJP5AVdYOgvMuYoT2XEM0MjPrAYxnB8ns",
    ])
    CSE_IDS = _credentials("GOOGLE_CSE_IDS", [
        "02671555df2224350", "45f738cd2cb484722", "248848c670d0f4f8c",
        "3457c8f927de246f4", "70781f2a9a68c49ac", "d1df5a600b5b94ec3",
        "f40e16c48afe34e3c", "a6072bc1dcf8c4d79", "60c9873c261204718",
        "e26ede76286a544e8", "d4e2ebf258b944f91", "c481c3a651fe142c9",
        "14fe45fe39ab2419a", "654800b8f8639452b", "74e2699945a994c1d",
        "051f685c02e0d44ae", "133d48265e2e44adc", "64a279f8504804cbb",
        "b35926315544d4e47", "212b86f2ce1f74dd1", "76c72f14d9dfe42ab",
        "424836067803a4784", "130dd7fa414ee47d6", "c0d5232b82a734c46",
        "93240ef1d6303432e", "b3603eb617bb94dfd", "f62f696dee17f4064",
        "c14eb8d6211ac4f92", "70372df0a719c4528", "0486eb80373414435",
        "b276e986bd43d4b2c", "46deaab41c5824eff", "f1c7d963e6bc143ad",
        "610360fa85f1a493e", "13184172f1a814248", "0508abdbfea8f436b",
        "a4d8c57ef19ae4020", "d42b9df0610ce4413", "e4a4b7263a9734bad",
        "d06afca341bf34d7e", "92b3e7b373110446b", "03f3112c9093d4271",
        "42ecb107930934e17", "3168c34a9ed4f4baf", "3168c34a9ed4f4baf",
        "9368d0baf14994fa7", "a1f97ec8e8a094f83", "6142dfd71e5994a6f",
        "c731eb764bccb49f1", "823cc36126bef4897", "51ecbae620e074ab9",
        "b2ed19f0318b74759", "a38270c5e616041ae", "d43515117a00d493f",
    ])
    DEFAULT_REGION = "za"
    EXPORT_FORMATS = ["json", "csv", "excel", "html"]
    DEFAULT_QUERY_TERMS = ["company", "business", "directory", "profile"]

    # A list of user agents for rotation
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Mozilla/5.0 (Windows NT 6.3; rv:40.0) Gecko/20100101 Firefox/40.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Mozilla/5.0 (Windows NT 6.3; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Mozilla/5.0 (Windows NT 6.1; rv:42.0) Gecko/20100101 Firefox/42.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:48.0) Gecko/20100101 Firefox/48.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0",
    ]
95
def validate_api_keys() -> bool:
    """Report whether both credential pools in ``Config`` are non-empty.

    Returns:
        ``True`` when ``Config.API_KEYS`` and ``Config.CSE_IDS`` both hold at
        least one entry. Otherwise logs an error through ``Actor.log`` and
        returns ``False``.
    """
    credentials_present = bool(Config.API_KEYS) and bool(Config.CSE_IDS)
    if credentials_present:
        return True

    Actor.log.error("Error: No API keys or CSE IDs available.")
    return False
101
async def scrape_website(company_name: str, client: AsyncClient) -> list:
    """Scrape a Google web search for links related to a company.

    Used as the fallback when the Custom Search API yields no results.

    Args:
        company_name: Company to search for; " business directory" is
            appended to form the query.
        client: Shared HTTP client used to issue the request.

    Returns:
        Every ``href`` on the result page that contains "http", in document
        order. An empty list is returned on a non-200 response or on any
        exception (the function never raises).
    """
    try:
        # Pass the query through ``params`` so httpx URL-encodes it; names
        # containing "&", "#", "+" or non-ASCII text would otherwise corrupt
        # the query string.
        response = await client.get(
            "https://www.google.com/search",
            params={"q": f"{company_name} business directory"},
            headers={'User-Agent': random.choice(Config.USER_AGENTS)},
        )
        if response.status_code != 200:
            Actor.log.error(f"Error fetching the page for {company_name}.")
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        # Substring test rather than startswith(): presumably meant to keep
        # relative redirect links that embed the target URL - confirm.
        return [
            link['href']
            for link in soup.find_all("a", href=True)
            if "http" in link['href']
        ]
    except Exception as e:
        # Best-effort fallback: record the failure and report "no links".
        logging.error(f"Error scraping website: {e}")
        return []
121
async def search_company_for_directory(company_name: str, client: AsyncClient) -> tuple:
    """Find directory links for a company via the Google Custom Search API.

    Results are paged ten at a time. When the API yields nothing at all, the
    function falls back to ``scrape_website``. Successful lookups are stored
    in ``CACHE_RESULTS`` and served from there on later calls.

    Args:
        company_name: Company to search for; it is quoted in the query and
            followed by ``Config.DEFAULT_QUERY_TERMS`` joined with " OR ".
        client: Shared HTTP client used for all requests.

    Returns:
        A ``(links, status)`` tuple. ``status`` is "Success", or an
        "Error: ..." string, in which case ``links`` holds whatever was
        collected before the failure and nothing is cached. The function
        does not raise.
    """
    if company_name in CACHE_RESULTS:
        Actor.log.info(f"Cache hit for '{company_name}'.")
        return CACHE_RESULTS[company_name], "Success"

    if not validate_api_keys():
        return [], "Error: Missing API keys"

    api_key = random.choice(Config.API_KEYS)
    cse_id = random.choice(Config.CSE_IDS)
    query = f'"{company_name}" {" OR ".join(Config.DEFAULT_QUERY_TERMS)}'
    results = []
    start_index = 1
    # The Custom Search JSON API serves at most 100 results per query, so 91
    # is the last valid start for a 10-item page. Requesting beyond it yields
    # an HTTP error that would turn a successful search into a failure.
    max_start_index = 91
    # Keys already used for this search, so rotation terminates once every
    # key has been tried instead of retrying forever.
    tried_keys = {api_key}

    while start_index <= max_start_index:
        try:
            # ``params`` lets httpx URL-encode the query (quotes, spaces, "&").
            response = await client.get(
                "https://www.googleapis.com/customsearch/v1",
                params={
                    "q": query,
                    "key": api_key,
                    "cx": cse_id,
                    "start": start_index,
                    "gl": Config.DEFAULT_REGION,
                },
                headers={'User-Agent': random.choice(Config.USER_AGENTS)},
            )
            if response.status_code == 200:
                items = response.json().get("items", [])
                if not items:
                    break
                results.extend(item["link"] for item in items if "link" in item)
                start_index += 10
                await asyncio.sleep(1)  # pause between pages
            elif response.status_code in (403, 429):
                untried_keys = [key for key in Config.API_KEYS if key not in tried_keys]
                if not untried_keys:
                    logging.error(
                        f"HTTP {response.status_code}: all API keys rejected for '{company_name}'."
                    )
                    return results, f"Error: {response.status_code}"
                Actor.log.warning("API quota exceeded. Switching API key...")
                await asyncio.sleep(2)
                api_key = random.choice(untried_keys)
                tried_keys.add(api_key)
            else:
                logging.error(f"HTTP {response.status_code}: {response.text}")
                return results, f"Error: {response.status_code}"
        except Exception as e:
            # Network and JSON-decoding failures end the search but are
            # reported through the status string rather than raised.
            logging.error(f"Exception occurred: {e}")
            return results, "Error: Exception encountered"

    if not results:
        Actor.log.info(f"No API results found for '{company_name}', attempting web scraping...")
        results = await scrape_website(company_name, client)

    CACHE_RESULTS[company_name] = results
    return results, "Success"
173
async def bulk_search(companies: list, client: AsyncClient) -> dict:
    """Look up every company in turn and collect the links found for each.

    Args:
        companies: Company names to search for, processed sequentially.
        client: Shared HTTP client passed to each search.

    Returns:
        A dict mapping each company name to its list of links. A company
        whose search failed is still present, with whatever links were
        collected before the failure.
    """
    all_results = {}
    for company in companies:
        results, status = await search_company_for_directory(company, client)
        if status != "Success":
            # Surface failures; otherwise they are indistinguishable from a
            # company that simply has no results.
            Actor.log.warning(f"Search for '{company}' finished with status '{status}'.")
        all_results[company] = results
    return all_results
183
async def main() -> None:
    """Actor entry point: read the input, search each company, store the results.

    The input JSON is expected to contain a "companies" key holding either a
    list of names or a comma-separated string. Names are stripped, and blank
    or non-string entries are dropped. When no usable name remains, an error
    is logged and the run ends without searching.
    """
    async with Actor:
        # Retrieve the input provided to the Actor.
        actor_input = await Actor.get_input() or {}
        companies = actor_input.get("companies")
        if isinstance(companies, str):
            companies = companies.split(",")
        if isinstance(companies, list):
            companies = [c.strip() for c in companies if isinstance(c, str) and c.strip()]

        # Validate after normalising, so a blank or comma-only string is
        # rejected just like an empty list.
        if not isinstance(companies, list) or not companies:
            Actor.log.error("No valid 'companies' provided in input.")
            return

        async with AsyncClient() as client:
            Actor.log.info("Starting bulk search for companies.")
            results = await bulk_search(companies, client)
            Actor.log.info("Bulk search completed.")

        # Push the results to the Apify dataset
        await Actor.push_data(results)
        Actor.log.info("Results have been pushed to the dataset.")
204
# Allow running this module directly as a script; src/__main__.py calls
# main() the same way when the package is started with "python3 -m src".
if __name__ == "__main__":
    asyncio.run(main())
.dockerignore
1.git
2.mise.toml
3.nvim.lua
4storage
5
6# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
7
8# Byte-compiled / optimized / DLL files
9__pycache__/
10*.py[cod]
11*$py.class
12
13# C extensions
14*.so
15
16# Distribution / packaging
17.Python
18build/
19develop-eggs/
20dist/
21downloads/
22eggs/
23.eggs/
24lib/
25lib64/
26parts/
27sdist/
28var/
29wheels/
30share/python-wheels/
31*.egg-info/
32.installed.cfg
33*.egg
34MANIFEST
35
36# PyInstaller
37# Usually these files are written by a python script from a template
38# before PyInstaller builds the exe, so as to inject date/other infos into it.
39*.manifest
40*.spec
41
42# Installer logs
43pip-log.txt
44pip-delete-this-directory.txt
45
46# Unit test / coverage reports
47htmlcov/
48.tox/
49.nox/
50.coverage
51.coverage.*
52.cache
53nosetests.xml
54coverage.xml
55*.cover
56*.py,cover
57.hypothesis/
58.pytest_cache/
59cover/
60
61# Translations
62*.mo
63*.pot
64
65# Django stuff:
66*.log
67local_settings.py
68db.sqlite3
69db.sqlite3-journal
70
71# Flask stuff:
72instance/
73.webassets-cache
74
75# Scrapy stuff:
76.scrapy
77
78# Sphinx documentation
79docs/_build/
80
81# PyBuilder
82.pybuilder/
83target/
84
85# Jupyter Notebook
86.ipynb_checkpoints
87
88# IPython
89profile_default/
90ipython_config.py
91
92# pyenv
93# For a library or package, you might want to ignore these files since the code is
94# intended to run in multiple environments; otherwise, check them in:
95.python-version
96
97# pdm
98# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
99#pdm.lock
100# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
101# in version control.
102# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
103.pdm.toml
104.pdm-python
105.pdm-build/
106
107# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
108__pypackages__/
109
110# Celery stuff
111celerybeat-schedule
112celerybeat.pid
113
114# SageMath parsed files
115*.sage.py
116
117# Environments
118.env
119.venv
120env/
121venv/
122ENV/
123env.bak/
124venv.bak/
125
126# Spyder project settings
127.spyderproject
128.spyproject
129
130# Rope project settings
131.ropeproject
132
133# mkdocs documentation
134/site
135
136# mypy
137.mypy_cache/
138.dmypy.json
139dmypy.json
140
141# Pyre type checker
142.pyre/
143
144# pytype static type analyzer
145.pytype/
146
147# Cython debug symbols
148cython_debug/
149
150# PyCharm
151# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
152# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
153# and can be added to the global gitignore or merged into this file. For a more nuclear
154# option (not recommended) you can uncomment the following to ignore the entire idea folder.
155.idea/
.gitignore
1.mise.toml
2.nvim.lua
3storage
4
5# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
6
7# Byte-compiled / optimized / DLL files
8__pycache__/
9*.py[cod]
10*$py.class
11
12# C extensions
13*.so
14
15# Distribution / packaging
16.Python
17build/
18develop-eggs/
19dist/
20downloads/
21eggs/
22.eggs/
23lib/
24lib64/
25parts/
26sdist/
27var/
28wheels/
29share/python-wheels/
30*.egg-info/
31.installed.cfg
32*.egg
33MANIFEST
34
35# PyInstaller
36# Usually these files are written by a python script from a template
37# before PyInstaller builds the exe, so as to inject date/other infos into it.
38*.manifest
39*.spec
40
41# Installer logs
42pip-log.txt
43pip-delete-this-directory.txt
44
45# Unit test / coverage reports
46htmlcov/
47.tox/
48.nox/
49.coverage
50.coverage.*
51.cache
52nosetests.xml
53coverage.xml
54*.cover
55*.py,cover
56.hypothesis/
57.pytest_cache/
58cover/
59
60# Translations
61*.mo
62*.pot
63
64# Django stuff:
65*.log
66local_settings.py
67db.sqlite3
68db.sqlite3-journal
69
70# Flask stuff:
71instance/
72.webassets-cache
73
74# Scrapy stuff:
75.scrapy
76
77# Sphinx documentation
78docs/_build/
79
80# PyBuilder
81.pybuilder/
82target/
83
84# Jupyter Notebook
85.ipynb_checkpoints
86
87# IPython
88profile_default/
89ipython_config.py
90
91# pyenv
92# For a library or package, you might want to ignore these files since the code is
93# intended to run in multiple environments; otherwise, check them in:
94.python-version
95
96# pdm
97# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
98#pdm.lock
99# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
100# in version control.
101# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
102.pdm.toml
103.pdm-python
104.pdm-build/
105
106# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
107__pypackages__/
108
109# Celery stuff
110celerybeat-schedule
111celerybeat.pid
112
113# SageMath parsed files
114*.sage.py
115
116# Environments
117.env
118.venv
119env/
120venv/
121ENV/
122env.bak/
123venv.bak/
124
125# Spyder project settings
126.spyderproject
127.spyproject
128
129# Rope project settings
130.ropeproject
131
132# mkdocs documentation
133/site
134
135# mypy
136.mypy_cache/
137.dmypy.json
138dmypy.json
139
140# Pyre type checker
141.pyre/
142
143# pytype static type analyzer
144.pytype/
145
146# Cython debug symbols
147cython_debug/
148
149# PyCharm
150# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
151# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
152# and can be added to the global gitignore or merged into this file. For a more nuclear
153# option (not recommended) you can uncomment the following to ignore the entire idea folder.
154.idea/
requirements.txt
1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify < 3.0
5beautifulsoup4[lxml]
6httpx
7types-beautifulsoup4