1"""Federal Register Search — Apify Actor.
2
3Searches the official US Federal Register API (federalregister.gov) for
4documents: rules, proposed rules, notices, and presidential documents
5(executive orders, proclamations). Returns clean, structured JSON.
6
The Federal Register API is well-documented and uses a `conditions[...]`
query-parameter style. We send only core documented parameters and rely on the
API's default field set (no custom `fields[]` selector) to avoid 400 responses.
10
11API docs: https://www.federalregister.gov/developers/documentation/api/v1
12"""
13
14from __future__ import annotations
15
16import asyncio
17from urllib.parse import urlencode
18
19import httpx
20from apify import Actor
21
# Endpoint of the Federal Register document search (JSON response).
API = "https://www.federalregister.gov/api/v1/documents.json"


# Display label for each document-type code.
DOC_TYPE_LABELS = dict(
    RULE="Final Rule",
    PRORULE="Proposed Rule",
    NOTICE="Notice",
    PRESDOCU="Presidential Document",
)
# Valid document-type codes, derived from the label table so both stay in sync.
DOC_TYPES = set(DOC_TYPE_LABELS)
32
33
def transform(doc: dict) -> dict:
    """Flatten one Federal Register API document into an output record.

    Values are copied from *doc* with ``dict.get``, so any field missing from
    the payload is emitted as ``None``. Two keys are derived rather than
    copied: ``agencies`` is reduced to a list of agency names, and
    ``typeLabel`` is looked up in ``DOC_TYPE_LABELS`` (falling back to the raw
    ``type`` value when there is no matching label).
    """
    # Keep only the non-empty "name" of dict-shaped agency entries.
    names = []
    for entry in doc.get("agencies") or []:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name")
        if name:
            names.append(name)

    kind = doc.get("type")
    derived = {
        "typeLabel": DOC_TYPE_LABELS.get(kind, kind),
        "agencies": names,
    }

    # (output key, source key) pairs in output order; a source key of None
    # marks a value taken from `derived` instead of straight from the payload.
    layout = (
        ("documentNumber", "document_number"),
        ("title", "title"),
        ("type", "type"),
        ("typeLabel", None),
        ("abstract", "abstract"),
        ("action", "action"),
        ("agencies", None),
        ("publicationDate", "publication_date"),
        ("effectiveDate", "effective_on"),
        ("commentsCloseDate", "comments_close_on"),
        ("citation", "citation"),
        ("president", "president"),
        ("presidentialDocumentType", "presidential_document_type"),
        ("executiveOrderNumber", "executive_order_number"),
        ("signingDate", "signing_date"),
        ("url", "html_url"),
        ("pdfUrl", "pdf_url"),
    )
    return {
        out: derived[out] if src is None else doc.get(src)
        for out, src in layout
    }
60
61
def build_url(actor_input: dict) -> str:
    """Build the Federal Register search URL for the given Actor input.

    Only core documented query parameters are sent and no custom ``fields[]``
    selector is used: sending unknown/over-specified params is the main cause
    of a 400 from this API, so the request is kept conservative.

    NOTE(review): transform() reads fields such as ``effective_on`` and
    ``signing_date``; confirm that the API's default field set really includes
    them, otherwise those output values will always be ``None``.

    Recognised input keys (all optional):
        searchTerm: free-text query, sent as ``conditions[term]``.
        documentTypes: a list of codes from ``DOC_TYPES`` (case-insensitive).
            A single code or a comma-separated string is also accepted.
            Unknown codes are dropped and duplicates are sent once.
        agency: sent as ``conditions[agencies][]``.
        publishedAfter / publishedBefore: sent unvalidated as the ``gte`` /
            ``lte`` bounds of ``conditions[publication_date]``.
        order: ``newest``, ``oldest`` or ``relevance`` (case-insensitive);
            anything else falls back to ``newest``.

    Returns:
        The request URL with ``per_page=100`` and no ``page`` parameter.
    """

    def text(key: str) -> str:
        # Falsy values (None, "", 0) mean "not set"; anything else is coerced
        # to str so a non-string input cannot crash on .strip().
        return str(actor_input.get(key) or "").strip()

    params: list[tuple[str, str]] = []

    term = text("searchTerm")
    if term:
        params.append(("conditions[term]", term))

    # Accept a bare/comma-separated string or scalar as well as a list;
    # iterating a plain string would otherwise yield single characters and
    # silently drop the type filter.
    raw_types = actor_input.get("documentTypes") or []
    if isinstance(raw_types, str):
        raw_types = raw_types.split(",")
    elif not isinstance(raw_types, (list, tuple, set, frozenset)):
        raw_types = [raw_types]
    seen: set[str] = set()
    for raw in raw_types:
        code = str(raw).strip().upper()
        if code in DOC_TYPES and code not in seen:
            seen.add(code)
            params.append(("conditions[type][]", code))

    agency = text("agency")
    if agency:
        params.append(("conditions[agencies][]", agency))

    start = text("publishedAfter")
    if start:
        params.append(("conditions[publication_date][gte]", start))
    end = text("publishedBefore")
    if end:
        params.append(("conditions[publication_date][lte]", end))

    # Normalise case/whitespace so e.g. "Relevance" is honoured rather than
    # silently replaced by the default.
    order = str(actor_input.get("order") or "newest").strip().lower()
    if order not in ("newest", "oldest", "relevance"):
        order = "newest"
    params.append(("order", order))

    params.append(("per_page", "100"))

    return f"{API}?{urlencode(params)}"
105
106
async def _fetch_page(
    client: "httpx.AsyncClient", url: str, attempts: int = 3
) -> "dict | None":
    """Fetch one result page and return its decoded JSON object.

    Transport errors, 5xx responses, 408/429 and undecodable bodies are
    retried up to *attempts* times with a growing delay (2s, 4s, ...). Other
    4xx responses are not retried, because repeating the same request cannot
    succeed.

    Returns:
        The JSON payload as a dict, or ``None`` if the page could not be
        fetched or the payload is not a JSON object.
    """
    for attempt in range(1, attempts + 1):
        try:
            resp = await client.get(url)
            resp.raise_for_status()
            payload = resp.json()
        except httpx.HTTPStatusError as exc:
            Actor.log.warning(f"Attempt {attempt} failed: {exc}")
            status = exc.response.status_code
            if 400 <= status < 500 and status not in (408, 429):
                # Client error (e.g. malformed date): retrying is pointless.
                return None
        except (httpx.HTTPError, ValueError) as exc:
            Actor.log.warning(f"Attempt {attempt} failed: {exc}")
        else:
            if isinstance(payload, dict):
                return payload
            Actor.log.warning(
                f"Attempt {attempt} failed: unexpected JSON payload "
                f"of type {type(payload).__name__}"
            )
            return None
        if attempt < attempts:
            await asyncio.sleep(attempt * 2)
    return None


async def main() -> None:
    """Run the Actor: search the Federal Register and push the results.

    Reads the Actor input, builds the search URL with build_url(), then walks
    the result pages, pushing transformed documents to the dataset until
    ``maxItems`` (default 100) documents were pushed, the API runs out of
    results, or the page cap is reached. A page that cannot be fetched stops
    the run; documents pushed before that point are kept.
    """
    page_size = 100  # must match the per_page value sent by build_url()
    max_pages = 100  # hard cap on the number of requests per run

    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Tolerate null / non-numeric maxItems instead of crashing the run.
        raw_max = actor_input.get("maxItems", 100)
        try:
            max_items = int(raw_max)
        except (TypeError, ValueError, OverflowError):
            Actor.log.warning(f"Invalid maxItems {raw_max!r}; using 100.")
            max_items = 100

        base_url = build_url(actor_input)
        Actor.log.info(f"Federal Register search: {base_url}")

        pushed = 0
        page = 1
        total = None

        async with httpx.AsyncClient(
            timeout=40.0,
            headers={"User-Agent": "scrapeworks-federal-register/0.1", "Accept": "application/json"},
        ) as client:
            while pushed < max_items and page <= max_pages:
                sep = "&" if "?" in base_url else "?"
                url = f"{base_url}{sep}page={page}"

                data = await _fetch_page(client, url)
                if data is None:
                    Actor.log.error("Failed to fetch Federal Register data.")
                    break

                if total is None:
                    total = data.get("count")
                    Actor.log.info(f"Federal Register reports {total} matching documents.")

                results = data.get("results") or []
                if not results:
                    break

                # Never push more than the remaining quota.
                batch = [transform(d) for d in results[: max_items - pushed]]
                if batch:
                    await Actor.push_data(batch)
                    pushed += len(batch)
                    Actor.log.info(f"Pushed {pushed}/{max_items} documents.")

                # Stop once everything the API reported has been pushed; the
                # count is only trusted when it is an integer.
                if isinstance(total, int) and pushed >= min(total, max_items):
                    break
                # A short page means this was the last one.
                if len(results) < page_size:
                    break
                page += 1
                await asyncio.sleep(1)  # pause between page requests

        Actor.log.info(f"Done. Returned {pushed} documents.")