1"""Apify Actor for scraping Duval and Clay County Tax Deed sales data."""
2
3from __future__ import annotations
4
5import time
6from urllib.parse import urljoin
7
8from apify import Actor
9from playwright.async_api import async_playwright
10
11
12async def main() -> None:
13 """Main entry point for the Apify Actor."""
    async with Actor:
        Actor.log.info("Starting the Duval and Clay County Tax Deed scraper...")

        # Both clerk sites run the same tax deed search application, so a
        # single scraping routine covers both counties.
        base_urls = [
            "https://taxdeed.duvalclerk.com/",
            "https://landmark.clayclerk.com/TaxDeed/",
        ]

        data_list = []

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(
                headless=Actor.config.headless,
                args=["--disable-gpu"],
            )
            context = await browser.new_context()
            page = await context.new_page()

            for base_url in base_urls:
                await page.goto(base_url)

                # Select the sale-date range and submit the search form.
                await page.select_option("#SearchSaleDateFrom", index=3)
                await page.select_option("#SearchSaleDateTo", index=0)
                await page.click("#tabs-9 button")

                # Filter the results grid by typing "SALE" into the status
                # column; typing (rather than filling) fires the key events
                # the grid's client-side filter listens for.
                status_input = page.locator("#gs_Status")
                await status_input.click()
                await status_input.fill("")
                await status_input.type("SALE")
                await page.wait_for_timeout(2000)

                # The pager element reports the total page count; fall back
                # to a single page if it is missing or not numeric.
                try:
                    total_pages = int((await page.text_content("#sp_1_pager") or "").strip())
                except Exception:
                    total_pages = 1

                for current_page in range(1, total_pages + 1):
                    Actor.log.info(f"Processing page {current_page} of {total_pages}")

                    # Collect the ids of the grid rows whose status is "SALE".
                    rows = await page.locator("tr[role='row'][id]").all()
                    filtered_row_ids = []
                    for row in rows:
                        row_id = await row.get_attribute("id")
                        if row_id and row_id.isdigit():
                            status = await row.locator(
                                "td[aria-describedby='TaxDeed_Status']"
                            ).text_content()
                            if status and status.strip() == "SALE":
                                filtered_row_ids.append(row_id)

                    # Each grid row id maps directly to a details page URL.
                    for row_id in filtered_row_ids:
                        if "clayclerk" in base_url:
                            details_url = f"https://landmark.clayclerk.com/TaxDeed/Home/Details?id={row_id}"
                        else:
                            details_url = f"https://taxdeed.duvalclerk.com/Home/Details?id={row_id}"

                        Actor.log.info(f"Visiting details URL: {details_url}")
                        await page.goto(details_url)
                        await page.wait_for_timeout(2000)

                        # The details page is a table of <b>label</b>/value
                        # rows; keep only the fields of interest.
                        detail_data = {}
                        rows_detail = await page.locator("tr:has(td b)").all()
                        for detail_row in rows_detail:
                            try:
                                key = (await detail_row.locator("td:nth-child(1) b").text_content() or "").strip()
                                value = (await detail_row.locator("td:nth-child(2)").text_content() or "").strip()
                                if key in ("Property Address", "Parcel ID"):
                                    detail_data[key] = value
                                elif key in ("Opening Bid", "Base Bid"):
                                    # The two sites label the bid differently;
                                    # normalize both to "Opening Bid".
                                    detail_data["Opening Bid"] = value
                            except Exception:
                                continue

                        # Guarantee the three expected fields exist.
                        for field in ("Property Address", "Parcel ID", "Opening Bid"):
                            detail_data.setdefault(field, "N/A")

                        # Addresses look like "123 Main St, Jacksonville FL 32202",
                        # so the state is the second-to-last whitespace token
                        # of the last comma-separated part.
                        state = "N/A"
                        address = detail_data.get("Property Address", "")
                        if address:
                            parts = address.split(",")
                            if len(parts) >= 2:
                                state_parts = parts[-1].strip().split()
                                if len(state_parts) >= 2:
                                    state = state_parts[-2]

                        detail_data["County"] = "Clay County" if "clayclerk" in base_url else "Duval County"
                        detail_data["State"] = state

                        data_list.append(detail_data)

                    # Visiting the details pages navigated away from the
                    # results grid, so re-run the search before paging. The
                    # grid reopens on page 1, so click "next" once per page
                    # already processed to land on the next unvisited page.
                    if current_page < total_pages:
                        await page.goto(base_url)
                        await page.select_option("#SearchSaleDateFrom", index=3)
                        await page.select_option("#SearchSaleDateTo", index=0)
                        await page.click("#tabs-9 button")
                        await status_input.click()
                        await status_input.fill("")
                        await status_input.type("SALE")
                        await page.wait_for_timeout(2000)
                        for _ in range(current_page):
                            await page.click("#next_pager")
                            await page.wait_for_timeout(3000)

            await browser.close()

        # The same parcel can surface on more than one pass over the grid,
        # so de-duplicate on (Parcel ID, Property Address) before pushing.
        seen = set()
        deduped_data = []
        for item in data_list:
            key = (item.get("Parcel ID"), item.get("Property Address"))
            if key not in seen:
                seen.add(key)
                deduped_data.append(item)

        if deduped_data:
            await Actor.push_data(deduped_data)
            Actor.log.info(f"Successfully pushed {len(deduped_data)} unique items to the default dataset.")
        else:
            Actor.log.warning("No data extracted after deduplication.")
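

# In the standard Apify Python template, main() is awaited from src/__main__.py,
# so this module needs no entry-point guard on the platform. Below is a minimal
# sketch for running the file directly, assuming a local environment where the
# Apify SDK can initialize with its default (local) storage:
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())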