
Taxdeed_Scraper
Under maintenance
Pricing
Pay per usage
Go to Store

Taxdeed_Scraper
Under maintenance
Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.
0.0 (0)
Pricing
Pay per usage
0
Total users
3
Monthly users
3
Runs succeeded
>99%
Last modified
18 days ago
.actor/Dockerfile
# Base image: official Apify Python + Playwright runtime.
FROM apify/actor-python-playwright:3.13

# Copy requirements.txt alone first so dependency installation caches as its own layer.
COPY requirements.txt ./

# Install Python packages, logging tool versions for easier build debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Install Playwright's OS-level dependencies and browser binaries.
RUN playwright install-deps && \
    playwright install

# Copy the remaining source files into the image.
COPY . ./

# Byte-compile all sources so syntax errors fail the build, not the run.
RUN python3 -m compileall -q .

# Run the Actor package as the container's default command.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "county-taxdeed-scraper", "title": "Duval & Clay County Tax Deed Scraper", "description": "Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "python-playwright" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Duval & Clay County Tax Deed Scraper", "type": "object", "schemaVersion": 1, "properties": { "start_urls": { "title": "Start URLs", "type": "array", "description": "URLs to start scraping from (Duval/Clay Clerk sites)", "prefill": [ { "url": "https://taxdeed.duvalclerk.com/" }, { "url": "https://landmark.clayclerk.com/TaxDeed/" } ], "editor": "requestListSources" }, "max_depth": { "title": "Maximum depth", "type": "integer", "description": "Depth to which the scraper should follow links (not used in current logic but may be useful for future)", "default": 1 } }, "required": ["start_urls"]}
src/__init__.py
1
src/__main__.py
"""Package entry point: drives the Actor's async ``main`` under asyncio."""

import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
src/main.py
"""Apify Actor for scraping Duval and Clay County Tax Deed sales data."""

from __future__ import annotations

from apify import Actor
from playwright.async_api import async_playwright

# Portals to scrape.  Detail pages live under <base>/Home/Details?id=<row id>.
DUVAL_BASE_URL = "https://taxdeed.duvalclerk.com/"
CLAY_BASE_URL = "https://landmark.clayclerk.com/TaxDeed/"


def _extract_state(address: str) -> str:
    """Return the state token from an address string, or "N/A".

    The sites format addresses as "street, CITY ST ZIP", so the state is the
    second-to-last whitespace token of the last comma-separated part.

    :param address: Raw "Property Address" value scraped from a detail page.
    :return: The state abbreviation, or "N/A" when the format does not match.
    """
    if not address:
        return "N/A"
    parts = address.split(",")
    if len(parts) < 2:
        return "N/A"
    tokens = parts[-1].strip().split()
    if len(tokens) < 2:
        return "N/A"
    return tokens[-2].strip()


async def _run_search(page) -> None:
    """Submit the sale-date search and filter the grid to Status == "SALE".

    Leaves the results grid on page 1 of the filtered results.
    """
    await page.select_option("#SearchSaleDateFrom", index=3)
    await page.select_option("#SearchSaleDateTo", index=0)
    await page.click("#tabs-9 button")
    status_input = page.locator("#gs_Status")
    await status_input.click()
    await status_input.fill("")
    await status_input.type("SALE")
    # Give the grid time to re-render after the status filter is applied.
    await page.wait_for_timeout(2000)


async def _collect_sale_row_ids(page) -> list[str]:
    """Return numeric row ids of grid rows whose Status cell reads "SALE"."""
    row_ids: list[str] = []
    for row in await page.locator("tr[role='row'][id]").all():
        row_id = await row.get_attribute("id")
        if not (row_id and row_id.isdigit()):
            continue
        status = await row.locator(
            "td[aria-describedby='TaxDeed_Status']"
        ).text_content()
        # Guard against text_content() returning None (empty cell); the
        # original code would have crashed on .strip() here.
        if status and status.strip() == "SALE":
            row_ids.append(row_id)
    return row_ids


async def _scrape_detail(page, details_url: str) -> dict:
    """Visit a tax-deed detail page and extract the fields of interest.

    :param page: Playwright page to navigate with.
    :param details_url: Absolute URL of the Home/Details page for one sale.
    :return: Dict that always contains "Property Address", "Parcel ID" and
        "Opening Bid" keys ("N/A" when a field is missing on the page).
    """
    await page.goto(details_url)
    await page.wait_for_timeout(2000)

    detail_data: dict = {}
    for row in await page.locator("tr:has(td b)").all():
        try:
            key = (await row.locator("td:nth-child(1) b").text_content()).strip()
            value = (await row.locator("td:nth-child(2)").text_content()).strip()
        except Exception:
            # Rows without the expected two-cell <td><b>label</b></td><td>value</td>
            # layout (or with empty cells) are skipped, as in the original code.
            continue
        if key in ("Property Address", "Parcel ID"):
            detail_data[key] = value
        elif key in ("Opening Bid", "Base Bid"):
            # Clay labels the same concept "Base Bid"; normalise the key.
            detail_data["Opening Bid"] = value

    for field in ("Property Address", "Parcel ID", "Opening Bid"):
        detail_data.setdefault(field, "N/A")
    return detail_data


async def main() -> None:
    """Main entry point for the Apify Actor.

    Scrapes "SALE"-status tax-deed records from the Duval and Clay County
    Clerk portals, deduplicates them on (Parcel ID, Property Address), and
    pushes the result to the default dataset.
    """
    async with Actor:
        Actor.log.info("Starting the Duval Tax Deed scraper...")

        base_urls = [DUVAL_BASE_URL, CLAY_BASE_URL]
        data_list: list[dict] = []

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(
                headless=Actor.config.headless,
                args=["--disable-gpu"],
            )
            context = await browser.new_context()
            page = await context.new_page()

            for base_url in base_urls:
                await page.goto(base_url)
                await _run_search(page)

                # The pager label holds the total page count; fall back to a
                # single page when it is absent or not numeric.
                try:
                    total_pages = int(
                        (await page.text_content("#sp_1_pager")).strip()
                    )
                except (AttributeError, TypeError, ValueError):
                    total_pages = 1

                for current_page in range(1, total_pages + 1):
                    Actor.log.info(
                        f"Processing page {current_page} of {total_pages}"
                    )
                    row_ids = await _collect_sale_row_ids(page)

                    for row_id in row_ids:
                        if "clayclerk" in base_url:
                            details_url = (
                                "https://landmark.clayclerk.com/TaxDeed"
                                f"/Home/Details?id={row_id}"
                            )
                        else:
                            details_url = (
                                f"https://taxdeed.duvalclerk.com/Home/Details?id={row_id}"
                            )

                        Actor.log.info(f"Visiting details URL: {details_url}")
                        detail_data = await _scrape_detail(page, details_url)
                        detail_data["County"] = (
                            "Clay County" if "clayclerk" in base_url else "Duval County"
                        )
                        detail_data["State"] = _extract_state(
                            detail_data.get("Property Address", "")
                        )
                        data_list.append(detail_data)

                    if current_page < total_pages:
                        # Visiting detail pages navigated away from the grid,
                        # so re-run the search before paging forward.  The
                        # search always lands on page 1, so we must click
                        # "next" `current_page` times to reach the next
                        # unscraped page.  (The original code clicked "next"
                        # only once, which re-scraped page 2 forever whenever
                        # more than two pages existed.)
                        await page.goto(base_url)
                        await _run_search(page)
                        for _ in range(current_page):
                            await page.click("#next_pager")
                            await page.wait_for_timeout(3000)

            await browser.close()

        # Deduplicate on (Parcel ID, Property Address), preserving first-seen order.
        seen: set[tuple] = set()
        deduped_data: list[dict] = []
        for item in data_list:
            key = (item.get("Parcel ID"), item.get("Property Address"))
            if key not in seen:
                seen.add(key)
                deduped_data.append(item)

        # Push data to the default Apify dataset.
        if deduped_data:
            await Actor.push_data(deduped_data)
            Actor.log.info(
                f"Successfully pushed {len(deduped_data)} unique items to the default dataset."
            )
        else:
            Actor.log.warning("No data extracted after deduplication.")
src/py.typed
.dockerignore
.git.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
.gitignore
.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
# Zed editor# Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out# to share Project Settings within a team.zed
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
playwright