
DirScrape
Deprecated
Pricing
Pay per usage
Go to Store

DirScrape
Deprecated
DirScrape is an Apify Actor for company directory searches. Provide a company name and it fetches profiles, contacts, and links via the Google Custom Search API with a web scraping fallback. It rotates API keys, logs errors, caches results, and exports data in JSON, CSV, Excel, and HTML.
0.0 (0)
Pricing
Pay per usage
0
Total users
7
Monthly users
7
Runs succeeded
>99%
Last modified
5 months ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.12

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "DirScrape: company directory search", "description": "Search company directories via the Google Custom Search API with a web-scraping fallback.", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "python-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Search company directories", "type": "object", "schemaVersion": 1, "properties": { "companies": { "title": "Companies", "type": "string", "description": "Company name(s) to search for. Provide a single name or a comma-separated list.", "editor": "textfield", "prefill": "Apify" } }, "required": ["companies"]}
src/__main__.py
"""Package entry point: runs the Actor when invoked as `python -m src`."""
import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
src/main.py
1import random2import json3import logging4import asyncio5from bs4 import BeautifulSoup6from httpx import AsyncClient7from apify import Actor8
9# Set up logging10logging.basicConfig(11 filename='error_log.txt',12 level=logging.ERROR,13 format='%(asctime)s - %(levelname)s - %(message)s'14)15
16# Global cache for results17CACHE_RESULTS = {}18
class Config:
    """Static configuration: credential pools, search defaults, User-Agent pool.

    NOTE(security): the API keys and CSE IDs below are hard-coded in source and
    therefore effectively published. They should be revoked and supplied via
    Actor input or environment variables instead; they are kept here only to
    preserve the existing interface.
    """

    # Google API keys, rotated at random on quota errors.
    # Duplicate entries removed — they silently biased random.choice().
    API_KEYS = [
        "AIzaSyDbEXo40_A2tLsQaWQmSvaF6SpJrqW34K0",
        "AIzaSyDlToEKJqB2speXspWoYHP-fFsrMrby3gE",
        "AIzaSyDSSfXv7fTCiB_xjCPdI9v43KCqF9LztZ8",
        "AIzaSyAyKJyMXOVzsYAl-d8Nqjappzt_KBHdVm0",
        "AIzaSyDNVzGNZ4cT7eWwjQD6NmppTZ-AuqzzTk0",
        "AIzaSyDTuJCMcTYgFKMcjB71zQuFY7Q0ZbuP218",
        "AIzaSyC4FDovZCwUI0BqAyPIGhiHMXnA9qQl7lg",
        "AIzaSyCbUxyyg8J_DUoKeYgCuAE3CwD9TKLFVWQ",
        "AIzaSyBruaD3zSc-g0jF-6b_Qu08KU3cjoKwivg",
        "AIzaSyBBKYswGbKlLtJtRWDx428JtIU4KxT4ui0",
        "AIzaSyDB3xuXOdjAIrtyolAyJt8LfxU5pOpSApw",
        "AIzaSyDi3BWSNNFu5hZnF5Wtq8GxUMnYu2Zk5gg",
        "AIzaSyAJP5AVdYOgvMuYoT2XEM0MjPrAYxnB8ns",
    ]

    # Google Programmable Search Engine IDs, paired with API_KEYS at random.
    # One duplicate entry removed.
    CSE_IDS = [
        "02671555df2224350", "45f738cd2cb484722", "248848c670d0f4f8c",
        "3457c8f927de246f4", "70781f2a9a68c49ac", "d1df5a600b5b94ec3",
        "f40e16c48afe34e3c", "a6072bc1dcf8c4d79", "60c9873c261204718",
        "e26ede76286a544e8", "d4e2ebf258b944f91", "c481c3a651fe142c9",
        "14fe45fe39ab2419a", "654800b8f8639452b", "74e2699945a994c1d",
        "051f685c02e0d44ae", "133d48265e2e44adc", "64a279f8504804cbb",
        "b35926315544d4e47", "212b86f2ce1f74dd1", "76c72f14d9dfe42ab",
        "424836067803a4784", "130dd7fa414ee47d6", "c0d5232b82a734c46",
        "93240ef1d6303432e", "b3603eb617bb94dfd", "f62f696dee17f4064",
        "c14eb8d6211ac4f92", "70372df0a719c4528", "0486eb80373414435",
        "b276e986bd43d4b2c", "46deaab41c5824eff", "f1c7d963e6bc143ad",
        "610360fa85f1a493e", "13184172f1a814248", "0508abdbfea8f436b",
        "a4d8c57ef19ae4020", "d42b9df0610ce4413", "e4a4b7263a9734bad",
        "d06afca341bf34d7e", "92b3e7b373110446b", "03f3112c9093d4271",
        "42ecb107930934e17", "3168c34a9ed4f4baf", "9368d0baf14994fa7",
        "a1f97ec8e8a094f83", "6142dfd71e5994a6f", "c731eb764bccb49f1",
        "823cc36126bef4897", "51ecbae620e074ab9", "b2ed19f0318b74759",
        "a38270c5e616041ae", "d43515117a00d493f",
    ]

    # Two-letter geolocation code passed to the CSE API as the `gl` parameter.
    DEFAULT_REGION = "za"
    # Export formats the Actor advertises.
    EXPORT_FORMATS = ["json", "csv", "excel", "html"]
    # Terms OR-ed together with the company name when building the API query.
    DEFAULT_QUERY_TERMS = ["company", "business", "directory", "profile"]

    # A list of user agents for rotation (one per request, chosen at random).
    # Duplicate entries removed.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:61.0) Gecko/20100101 Firefox/61.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Mozilla/5.0 (Windows NT 6.3; rv:40.0) Gecko/20100101 Firefox/40.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Mozilla/5.0 (Windows NT 6.1; rv:42.0) Gecko/20100101 Firefox/42.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; rv:48.0) Gecko/20100101 Firefox/48.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    ]
def validate_api_keys() -> bool:
    """Return True when both the API-key and CSE-ID pools are non-empty."""
    have_credentials = bool(Config.API_KEYS) and bool(Config.CSE_IDS)
    if not have_credentials:
        Actor.log.error("Error: No API keys or CSE IDs available.")
    return have_credentials
async def scrape_website(company_name: str, client: AsyncClient) -> list:
    """
    Scrape the website directly for company details if no API results are found.

    Args:
        company_name: Company to look up on Google's HTML results page.
        client: Shared httpx AsyncClient used for the request.

    Returns:
        A list of URL strings harvested from anchor tags; empty on any failure.
    """
    try:
        # Bug fix: let httpx build and percent-encode the query string.
        # The original interpolated the raw company name into the URL, which
        # produced malformed requests for names containing '&', '#', etc.
        params = {'q': f'{company_name} business directory'}
        headers = {'User-Agent': random.choice(Config.USER_AGENTS)}
        response = await client.get('https://www.google.com/search',
                                    params=params, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            links = soup.find_all("a", href=True)
            # Keep any href embedding an http(s) URL; Google result links are
            # often of the form /url?q=https://..., so substring match is wanted.
            urls = [link['href'] for link in links if "http" in link['href']]
            return urls
        else:
            Actor.log.error(f"Error fetching the page for {company_name}.")
            return []
    except Exception as e:
        # Best-effort fallback: log and return empty rather than crash the run.
        logging.error(f"Error scraping website: {e}")
        return []
async def search_company_for_directory(company_name: str, client: AsyncClient) -> tuple:
    """
    Searches for the company's business directory using Google Custom Search API.
    Falls back to web scraping if no API results are found.

    Args:
        company_name: Company to search for.
        client: Shared httpx AsyncClient.

    Returns:
        (results, status): a list of result URLs and a status string
        ("Success" or "Error: ...").
    """
    global CACHE_RESULTS

    if company_name in CACHE_RESULTS:
        Actor.log.info(f"Cache hit for '{company_name}'.")
        return CACHE_RESULTS[company_name], "Success"

    if not validate_api_keys():
        return [], "Error: Missing API keys"

    api_key = random.choice(Config.API_KEYS)
    cse_id = random.choice(Config.CSE_IDS)
    query = f'"{company_name}" {" OR ".join(Config.DEFAULT_QUERY_TERMS)}'
    results = []
    start_index = 1
    quota_retries = 0
    # Bug fix: the original `while True` looped forever when every key was
    # over quota (403/429 never advanced start_index or broke out).
    max_quota_retries = 10
    # The Custom Search API serves at most 100 results (start 1..91);
    # paging past that only burns quota on error responses.
    max_start_index = 91

    while start_index <= max_start_index:
        try:
            # Bug fix: pass parameters via `params=` so httpx percent-encodes
            # the query; the original interpolated raw text into the URL.
            params = {
                'q': query,
                'key': api_key,
                'cx': cse_id,
                'start': start_index,
                'gl': Config.DEFAULT_REGION,
            }
            headers = {'User-Agent': random.choice(Config.USER_AGENTS)}
            response = await client.get('https://www.googleapis.com/customsearch/v1',
                                        params=params, headers=headers)
            if response.status_code == 200:
                data = response.json()
                items = data.get("items", [])
                if not items:
                    break
                results.extend(item["link"] for item in items)
                start_index += 10
                await asyncio.sleep(1)  # polite pacing between result pages
            elif response.status_code in [403, 429]:
                quota_retries += 1
                if quota_retries > max_quota_retries:
                    logging.error("API quota exhausted after repeated retries.")
                    return results, "Error: API quota exhausted"
                Actor.log.warning("API quota exceeded. Switching API key...")
                await asyncio.sleep(2)
                # Bug fix: rotate the CSE ID too — the original rotated only
                # the key, so a throttled engine ID was retried indefinitely.
                api_key = random.choice(Config.API_KEYS)
                cse_id = random.choice(Config.CSE_IDS)
            else:
                logging.error(f"HTTP {response.status_code}: {response.text}")
                return results, f"Error: {response.status_code}"
        except Exception as e:
            logging.error(f"Exception occurred: {e}")
            return results, "Error: Exception encountered"

    if not results:
        Actor.log.info(f"No API results found for '{company_name}', attempting web scraping...")
        results = await scrape_website(company_name, client)

    CACHE_RESULTS[company_name] = results
    return results, "Success"
async def bulk_search(companies: list, client: AsyncClient) -> dict:
    """
    Processes a list of companies and retrieves search results for each.

    Returns a dict mapping each company name to its list of result URLs.
    """
    collected = {}
    for name in companies:
        # Per-company status is not surfaced here; only the URLs are kept.
        urls, _status = await search_company_for_directory(name, client)
        collected[name] = urls
    return collected
async def main() -> None:
    """Actor entry point: read input, run the bulk search, push results."""
    async with Actor:
        # Retrieve the input provided to the Actor.
        # Expected input JSON should include a key "companies" which is either a list or a comma‐separated string.
        actor_input = await Actor.get_input() or {}
        companies = actor_input.get("companies")
        if isinstance(companies, str):
            # Split a comma-separated string, dropping empty fragments.
            companies = [c.strip() for c in companies.split(",") if c.strip()]
        elif not isinstance(companies, list) or not companies:
            Actor.log.error("No valid 'companies' provided in input.")
            return

        async with AsyncClient() as client:
            Actor.log.info("Starting bulk search for companies.")
            search_results = await bulk_search(companies, client)
            Actor.log.info("Bulk search completed.")

            # Push the results to the Apify dataset
            await Actor.push_data(search_results)
            Actor.log.info("Results have been pushed to the dataset.")
# Allow running this module directly, outside the `python -m src` entry point.
if __name__ == "__main__":
    asyncio.run(main())
.dockerignore
.git.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
.gitignore
.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
beautifulsoup4[lxml]
httpx
types-beautifulsoup4