
Taxdeed_Scraper
Under maintenance
Pricing
Pay per usage
Go to Store

Taxdeed_Scraper
Under maintenance
Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.
0.0 (0)
Pricing
Pay per usage
0
Total users
3
Monthly users
3
Runs succeeded
>99%
Last modified
18 days ago
.actor/Dockerfile
# Base image: official Apify Python + Playwright runtime.
FROM apify/actor-python-playwright:3.13

# Copy requirements.txt alone first so dependency installation caches as its own layer.
COPY requirements.txt ./

# Install Python packages, logging tool versions for easier build debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Install Playwright's OS-level dependencies and browser binaries.
RUN playwright install-deps && \
    playwright install

# Copy the remaining source files into the image.
COPY . ./

# Byte-compile all sources so syntax errors fail the build, not the run.
RUN python3 -m compileall -q .

# Run the Actor package as the container's default command.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "county-taxdeed-scraper", "title": "Duval & Clay County Tax Deed Scraper", "description": "Scrapes tax deed sale data from Duval and Clay County Clerk websites using Playwright and Apify SDK.", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "python-playwright" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Duval & Clay County Tax Deed Scraper", "type": "object", "schemaVersion": 1, "properties": { "start_urls": { "title": "Start URLs", "type": "array", "description": "URLs to start scraping from (Duval/Clay Clerk sites)", "prefill": [ { "url": "https://taxdeed.duvalclerk.com/" }, { "url": "https://landmark.clayclerk.com/TaxDeed/" } ], "editor": "requestListSources" }, "max_depth": { "title": "Maximum depth", "type": "integer", "description": "Depth to which the scraper should follow links (not used in current logic but may be useful for future)", "default": 1 } }, "required": ["start_urls"]}
src/__init__.py
1
src/__main__.py
"""Package entry point: drives the Actor's async ``main`` under asyncio."""

import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
src/main.py
"""Apify Actor for scraping Duval and Clay County Tax Deed sales data."""

from __future__ import annotations

from apify import Actor
from playwright.async_api import async_playwright

# Portals to scrape.  Detail pages live under <base>/Home/Details?id=<row id>.
DUVAL_BASE_URL = "https://taxdeed.duvalclerk.com/"
CLAY_BASE_URL = "https://landmark.clayclerk.com/TaxDeed/"


def _extract_state(address: str) -> str:
    """Return the state token from an address string, or "N/A".

    The sites format addresses as "street, CITY ST ZIP", so the state is the
    second-to-last whitespace token of the last comma-separated part.

    :param address: Raw "Property Address" value scraped from a detail page.
    :return: The state abbreviation, or "N/A" when the format does not match.
    """
    if not address:
        return "N/A"
    parts = address.split(",")
    if len(parts) < 2:
        return "N/A"
    tokens = parts[-1].strip().split()
    if len(tokens) < 2:
        return "N/A"
    return tokens[-2].strip()


async def _run_search(page) -> None:
    """Submit the sale-date search and filter the grid to Status == "SALE".

    Leaves the results grid on page 1 of the filtered results.
    """
    await page.select_option("#SearchSaleDateFrom", index=3)
    await page.select_option("#SearchSaleDateTo", index=0)
    await page.click("#tabs-9 button")
    status_input = page.locator("#gs_Status")
    await status_input.click()
    await status_input.fill("")
    await status_input.type("SALE")
    # Give the grid time to re-render after the status filter is applied.
    await page.wait_for_timeout(2000)


async def _collect_sale_row_ids(page) -> list[str]:
    """Return numeric row ids of grid rows whose Status cell reads "SALE"."""
    row_ids: list[str] = []
    for row in await page.locator("tr[role='row'][id]").all():
        row_id = await row.get_attribute("id")
        if not (row_id and row_id.isdigit()):
            continue
        status = await row.locator(
            "td[aria-describedby='TaxDeed_Status']"
        ).text_content()
        # Guard against text_content() returning None (empty cell); the
        # original code would have crashed on .strip() here.
        if status and status.strip() == "SALE":
            row_ids.append(row_id)
    return row_ids


async def _scrape_detail(page, details_url: str) -> dict:
    """Visit a tax-deed detail page and extract the fields of interest.

    :param page: Playwright page to navigate with.
    :param details_url: Absolute URL of the Home/Details page for one sale.
    :return: Dict that always contains "Property Address", "Parcel ID" and
        "Opening Bid" keys ("N/A" when a field is missing on the page).
    """
    await page.goto(details_url)
    await page.wait_for_timeout(2000)

    detail_data: dict = {}
    for row in await page.locator("tr:has(td b)").all():
        try:
            key = (await row.locator("td:nth-child(1) b").text_content()).strip()
            value = (await row.locator("td:nth-child(2)").text_content()).strip()
        except Exception:
            # Rows without the expected two-cell <td><b>label</b></td><td>value</td>
            # layout (or with empty cells) are skipped, as in the original code.
            continue
        if key in ("Property Address", "Parcel ID"):
            detail_data[key] = value
        elif key in ("Opening Bid", "Base Bid"):
            # Clay labels the same concept "Base Bid"; normalise the key.
            detail_data["Opening Bid"] = value

    for field in ("Property Address", "Parcel ID", "Opening Bid"):
        detail_data.setdefault(field, "N/A")
    return detail_data


async def main() -> None:
    """Main entry point for the Apify Actor.

    Scrapes "SALE"-status tax-deed records from the Duval and Clay County
    Clerk portals, deduplicates them on (Parcel ID, Property Address), and
    pushes the result to the default dataset.
    """
    async with Actor:
        Actor.log.info("Starting the Duval Tax Deed scraper...")

        base_urls = [DUVAL_BASE_URL, CLAY_BASE_URL]
        data_list: list[dict] = []

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(
                headless=Actor.config.headless,
                args=["--disable-gpu"],
            )
            context = await browser.new_context()
            page = await context.new_page()

            for base_url in base_urls:
                await page.goto(base_url)
                await _run_search(page)

                # The pager label holds the total page count; fall back to a
                # single page when it is absent or not numeric.
                try:
                    total_pages = int(
                        (await page.text_content("#sp_1_pager")).strip()
                    )
                except (AttributeError, TypeError, ValueError):
                    total_pages = 1

                for current_page in range(1, total_pages + 1):
                    Actor.log.info(
                        f"Processing page {current_page} of {total_pages}"
                    )
                    row_ids = await _collect_sale_row_ids(page)

                    for row_id in row_ids:
                        if "clayclerk" in base_url:
                            details_url = (
                                "https://landmark.clayclerk.com/TaxDeed"
                                f"/Home/Details?id={row_id}"
                            )
                        else:
                            details_url = (
                                f"https://taxdeed.duvalclerk.com/Home/Details?id={row_id}"
                            )

                        Actor.log.info(f"Visiting details URL: {details_url}")
                        detail_data = await _scrape_detail(page, details_url)
                        detail_data["County"] = (
                            "Clay County" if "clayclerk" in base_url else "Duval County"
                        )
                        detail_data["State"] = _extract_state(
                            detail_data.get("Property Address", "")
                        )
                        data_list.append(detail_data)

                    if current_page < total_pages:
                        # Visiting detail pages navigated away from the grid,
                        # so re-run the search before paging forward.  The
                        # search always lands on page 1, so we must click
                        # "next" `current_page` times to reach the next
                        # unscraped page.  (The original code clicked "next"
                        # only once, which re-scraped page 2 forever whenever
                        # more than two pages existed.)
                        await page.goto(base_url)
                        await _run_search(page)
                        for _ in range(current_page):
                            await page.click("#next_pager")
                            await page.wait_for_timeout(3000)

            await browser.close()

        # Deduplicate on (Parcel ID, Property Address), preserving first-seen order.
        seen: set[tuple] = set()
        deduped_data: list[dict] = []
        for item in data_list:
            key = (item.get("Parcel ID"), item.get("Property Address"))
            if key not in seen:
                seen.add(key)
                deduped_data.append(item)

        # Push data to the default Apify dataset.
        if deduped_data:
            await Actor.push_data(deduped_data)
            Actor.log.info(
                f"Successfully pushed {len(deduped_data)} unique items to the default dataset."
            )
        else:
            Actor.log.warning("No data extracted after deduplication.")
src/py.typed
.dockerignore
.git.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
.gitignore
.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
# Zed editor# Ignores the folder created when setting Project Settings in the Zed editor. Can be commented out# to share Project Settings within a team.zed
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
playwright