Taxdeed_Scraper avatar
Taxdeed_Scraper

Under maintenance

Pricing

Pay per usage

Go to Store
Taxdeed_Scraper

Taxdeed_Scraper

Under maintenance

Developed by

Ishara Samarathunga

Ishara Samarathunga

Maintained by Community

Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.

0.0 (0)

Pricing

Pay per usage

0

Total users

3

Monthly users

3

Runs succeeded

>99%

Last modified

18 days ago

.actor/Dockerfile

# Use the official Apify Python + Playwright image as base
FROM apify/actor-python-playwright:3.13

# Copy only requirements.txt first so the dependency layer below is rebuilt
# only when requirements.txt changes, not on every source edit
COPY requirements.txt ./

# Install Python packages and log versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Install Playwright system dependencies and the browser binary.
# The Actor only launches Chromium (playwright.chromium.launch in src/main.py),
# so restrict both commands to chromium instead of fetching every engine.
RUN playwright install-deps chromium && \
    playwright install chromium

# Copy all source files into the container
COPY . ./

# Precompile Python code for early error detection (optional but recommended)
RUN python3 -m compileall -q .

# Run the `src` package as the Actor entry point (executes src/__main__.py)
CMD ["python3", "-m", "src"]

.actor/actor.json

{
"actorSpecification": 1,
"name": "county-taxdeed-scraper",
"title": "Duval & Clay County Tax Deed Scraper",
"description": "Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.",
"version": "0.0",
"buildTag": "latest",
"meta": {
"templateId": "python-playwright"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
"title": "Duval & Clay County Tax Deed Scraper",
"type": "object",
"schemaVersion": 1,
"properties": {
"start_urls": {
"title": "Start URLs",
"type": "array",
"description": "URLs to start scraping from (Duval/Clay Clerk sites)",
"prefill": [
{ "url": "https://taxdeed.duvalclerk.com/" },
{ "url": "https://landmark.clayclerk.com/TaxDeed/" }
],
"editor": "requestListSources"
},
"max_depth": {
"title": "Maximum depth",
"type": "integer",
"description": "Maximum link depth the scraper should follow (currently unused by the scraper; reserved for future use)",
"default": 1
}
},
"required": ["start_urls"]
}

src/__init__.py

1

src/__main__.py

"""Package entry point: runs the Actor when executed as ``python -m src``."""

import asyncio

from .main import main

# Guard the call so that merely importing this module (for example by
# tooling) does not start the Actor. Under ``python -m src`` the module
# name is still "__main__", so the normal launch path is unchanged.
if __name__ == "__main__":
    # Execute the Actor entry point.
    asyncio.run(main())

src/main.py

1"""Apify Actor for scraping Duval and Clay County Tax Deed sales data."""
2
3from __future__ import annotations
4
5import time
6from urllib.parse import urljoin
7
8from apify import Actor
9from playwright.async_api import async_playwright
10
11
def _extract_state(address: str) -> str:
    """Return the state token of *address*, or "N/A" when it cannot be found.

    Assumes the text after the last comma looks like "<STATE> <ZIP>": the
    second-to-last whitespace-separated token is taken as the state.
    """
    parts = address.split(",")
    if len(parts) < 2:
        return "N/A"
    tokens = parts[-1].split()
    return tokens[-2] if len(tokens) >= 2 else "N/A"


async def _run_search(page, base_url: str) -> None:
    """Open *base_url*, submit the sale-date search and filter the grid by "SALE"."""
    await page.goto(base_url)

    # --- Select date options and search ---
    await page.select_option("#SearchSaleDateFrom", index=3)
    await page.select_option("#SearchSaleDateTo", index=0)
    await page.click("#tabs-9 button")

    # Filter for "SALE" status
    status_input = page.locator("#gs_Status")
    await status_input.click()
    await status_input.fill("")
    await status_input.type("SALE")
    # Fixed pause to give the grid time to refresh after typing the filter.
    await page.wait_for_timeout(2000)


async def _collect_sale_row_ids(page) -> list[str]:
    """Walk every page of the result grid and return the ids of "SALE" rows.

    All ids are gathered before any detail page is opened, so the grid's
    pager position is never lost to a navigation in between.
    """
    try:
        total_pages = int((await page.text_content("#sp_1_pager") or "").strip())
    except Exception:
        # Best effort: without a readable page count, scan the current page only.
        Actor.log.warning("Could not read the page count; assuming a single page.")
        total_pages = 1

    row_ids: list[str] = []
    for current_page in range(1, total_pages + 1):
        Actor.log.info(f"Processing page {current_page} of {total_pages}")

        for row in await page.locator("tr[role='row'][id]").all():
            row_id = await row.get_attribute("id")
            # Only rows with a purely numeric id are treated as data rows.
            if not (row_id and row_id.isdigit()):
                continue
            status = await row.locator("td[aria-describedby='TaxDeed_Status']").text_content()
            # text_content() may return None; treat that as "not SALE".
            if status and status.strip() == "SALE":
                row_ids.append(row_id)

        if current_page < total_pages:
            await page.click("#next_pager")
            await page.wait_for_timeout(3000)

    return row_ids


async def _scrape_details(page, details_url: str) -> dict[str, str]:
    """Open *details_url* and return its address, parcel id and opening bid.

    Fields that are not found on the page are reported as "N/A".
    """
    await page.goto(details_url)
    await page.wait_for_timeout(2000)

    detail_data: dict[str, str] = {}
    for row in await page.locator("tr:has(td b)").all():
        try:
            key = await row.locator("td:nth-child(1) b").text_content()
            value = await row.locator("td:nth-child(2)").text_content()
        except Exception as exc:
            # Best effort: a row that does not have the label/value shape is
            # skipped, but the reason is logged instead of silently dropped.
            Actor.log.debug(f"Skipping unreadable detail row on {details_url}: {exc}")
            continue

        key = (key or "").strip()
        value = (value or "").strip()

        if key in ("Property Address", "Parcel ID"):
            detail_data[key] = value
        elif key in ("Opening Bid", "Base Bid"):
            # Both labels are stored under the single "Opening Bid" field.
            detail_data["Opening Bid"] = value

    for field in ("Property Address", "Parcel ID", "Opening Bid"):
        detail_data.setdefault(field, "N/A")

    return detail_data


async def main() -> None:
    """Main entry point for the Apify Actor.

    For each start URL, runs the sale-date search, collects the ids of all
    grid rows with status "SALE" across every result page, opens each row's
    details page, and finally pushes the de-duplicated records to the
    default dataset.

    Start URLs are read from the ``start_urls`` Actor input (a list of
    ``{"url": ...}`` objects); when none are supplied the Duval and Clay
    County sites are used.
    """
    async with Actor:
        Actor.log.info("Starting the Duval Tax Deed scraper...")

        default_urls = [
            "https://taxdeed.duvalclerk.com/",
            "https://landmark.clayclerk.com/TaxDeed/",
        ]
        actor_input = await Actor.get_input() or {}
        base_urls = [
            source["url"]
            for source in actor_input.get("start_urls") or []
            if isinstance(source, dict) and source.get("url")
        ] or default_urls

        data_list = []

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(
                headless=Actor.config.headless,
                args=["--disable-gpu"],
            )
            try:
                context = await browser.new_context()
                page = await context.new_page()

                for base_url in base_urls:
                    # Any URL that is not a Clay Clerk URL is handled as Duval.
                    is_clay = "clayclerk" in base_url
                    county = "Clay County" if is_clay else "Duval County"

                    await _run_search(page, base_url)
                    row_ids = await _collect_sale_row_ids(page)

                    for row_id in row_ids:
                        if is_clay:
                            details_url = f"https://landmark.clayclerk.com/TaxDeed/Home/Details?id={row_id}"
                        else:
                            details_url = f"https://taxdeed.duvalclerk.com/Home/Details?id={row_id}"

                        Actor.log.info(f"Visiting details URL: {details_url}")
                        detail_data = await _scrape_details(page, details_url)

                        # Add county and state to the record
                        detail_data["County"] = county
                        detail_data["State"] = _extract_state(detail_data["Property Address"])

                        data_list.append(detail_data)
            finally:
                # Close the browser even when scraping fails part-way through.
                await browser.close()

        # Deduplicate based on 'Parcel ID' and 'Property Address',
        # keeping the first record seen for each pair.
        unique_items = {}
        for item in data_list:
            unique_items.setdefault((item.get("Parcel ID"), item.get("Property Address")), item)
        deduped_data = list(unique_items.values())

        # Push data to Apify dataset
        if deduped_data:
            await Actor.push_data(deduped_data)
            Actor.log.info(f"Successfully pushed {len(deduped_data)} unique items to the default dataset.")
        else:
            Actor.log.warning("No data extracted after deduplication.")

src/py.typed

.dockerignore

.git
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Zed editor
# Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out
# to share Project Settings within a team
.zed

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify < 3.0
5playwright