1"""Danish Annual Reports (CVR Regnskaber) — Apify Actor.
2
3Returns the index of published annual reports (regnskaber) for Danish companies
4from the official Erhvervsstyrelsen distribution endpoint. For each filing it
5returns the accounting period, publication timestamp, document type, and direct
6links to the PDF and machine-readable XBRL documents.
7
8This reads only stable, top-level index fields (it does NOT parse figures out of
9the XBRL), so it is robust and maintenance-free. No API key required.
10"""
11
12from __future__ import annotations
13
14import httpx
15from apify import Actor
16
# Search endpoint that main() POSTs the JSON query body from build_query() to.
SEARCH_URL = "http://distribution.virk.dk/offentliggoerelser/_search"
# Upper bound applied to the requested "maxItems" in build_query(); the clamped
# value is what gets sent as the query's "size".
MAX_SIZE = 3000
19
20
21def _get(d, *path):
22 """Safe nested getter; returns None if any step is missing."""
23 cur = d
24 for key in path:
25 if not isinstance(cur, dict):
26 return None
27 cur = cur.get(key)
28 return cur
29
30
def transform(source: dict) -> dict:
    """Flatten one offentliggoerelse (``_source``) into a clean filing record.

    Field paths confirmed against the official endpoint and two independent
    open-source clients (cvrminer, XBRL-AI).

    Args:
        source: The ``_source`` dict of a single search hit.

    Returns:
        A flat dict with the keys ``cvrNumber``, ``offentliggoerelsestype``,
        ``documentType``, ``periodStart``, ``periodEnd``, ``publishedAt``,
        ``lastUpdated``, ``isCorrection``, ``regNumber``, ``caseNumber``,
        ``loadId``, ``loadedAt``, ``pdfUrl`` and ``xbrlUrl``. Any value that
        is absent from ``source`` is ``None``.
    """
    period = _get(source, "regnskab", "regnskabsperiode")
    if not isinstance(period, dict):
        # Missing or malformed period: fall back to an empty mapping so the
        # record is still emitted (with null dates) instead of raising.
        period = {}

    pdf_url = None
    xml_url = None
    doc_type = None
    for dok in source.get("dokumenter") or []:
        if not isinstance(dok, dict):
            continue
        mime = dok.get("dokumentMimeType")
        # Keep the first usable URL per MIME type; the document type comes
        # from whichever accepted PDF/XML document supplies one first.
        if mime == "application/pdf" and not pdf_url:
            pdf_url = dok.get("dokumentUrl")
            doc_type = doc_type or dok.get("dokumentType")
        elif mime == "application/xml" and not xml_url:
            xml_url = dok.get("dokumentUrl")
            doc_type = doc_type or dok.get("dokumentType")

    return {
        "cvrNumber": source.get("cvrNummer"),
        "offentliggoerelsestype": source.get("offentliggoerelsestype"),
        "documentType": doc_type,
        "periodStart": period.get("startDato"),
        "periodEnd": period.get("slutDato"),
        "publishedAt": source.get("offentliggoerelsesTidspunkt"),
        "lastUpdated": source.get("sidstOpdateret"),
        "isCorrection": source.get("omgoerelse"),
        "regNumber": source.get("regNummer"),
        "caseNumber": source.get("sagsNummer"),
        "loadId": source.get("indlaesningsId"),
        "loadedAt": source.get("indlaesningsTidspunkt"),
        "pdfUrl": pdf_url,
        "xbrlUrl": xml_url,
    }
69
70
def build_query(actor_input: dict) -> tuple[dict, int]:
    """Build the ElasticSearch query body and resolved size.

    Args:
        actor_input: Actor input mapping. The keys read are ``cvrNumber``,
            ``fromDate``, ``toDate`` and ``maxItems``; all are optional.

    Returns:
        A ``(body, max_items)`` tuple. ``body`` is the JSON-serialisable
        query; ``max_items`` is the requested item count clamped to
        ``1..MAX_SIZE`` (also stored as ``body["size"]``).
    """
    must = []

    cvr = actor_input.get("cvrNumber")
    if cvr is not None and str(cvr).strip():
        # Tolerate formatted input such as "DK 24 25 67 90": keep digits only.
        digits = "".join(ch for ch in str(cvr) if ch.isdigit())
        if digits:
            must.append({"term": {"cvrNummer": int(digits)}})

    # Coerce through str() so a non-string bound (e.g. a bare year supplied
    # as a JSON number) is used instead of crashing on .strip().
    from_date = str(actor_input.get("fromDate") or "").strip()
    to_date = str(actor_input.get("toDate") or "").strip()
    rng = {}
    if from_date:
        rng["from"] = from_date
    if to_date:
        rng["to"] = to_date
    if rng:
        # NOTE(review): this field name carries an "offentliggoerelse." prefix
        # while the term filters do not -- presumably what the endpoint
        # expects; confirm against the index mapping.
        must.append({"range": {"offentliggoerelse.offentliggoerelsesTidspunkt": rng}})

    try:
        max_items = int(actor_input.get("maxItems") or 100)
    except (ValueError, TypeError, OverflowError):
        # Unparseable values (including infinity) fall back to the default.
        max_items = 100
    max_items = max(1, min(max_items, MAX_SIZE))

    must.append({"term": {"offentliggoerelsestype": "regnskab"}})

    body = {"query": {"bool": {"must": must}}, "size": max_items}
    return body, max_items
103
104
async def main() -> None:
    """Run the Actor: query the endpoint once and push the flattened filings.

    Reads the Actor input, builds the query with ``build_query``, POSTs it to
    ``SEARCH_URL`` and pushes one ``transform``-ed record per hit to the
    dataset. Request failures are logged rather than raised, in which case no
    records are pushed.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}

        has_filter = (
            actor_input.get("cvrNumber")
            or actor_input.get("fromDate")
            or actor_input.get("toDate")
        )
        if not has_filter:
            # No filter supplied: fall back to a fixed example CVR number so
            # the run does not issue an unfiltered query.
            actor_input = {**actor_input, "cvrNumber": "24256790"}

        body, max_items = build_query(actor_input)
        Actor.log.info(f"Querying annual reports (cvr={actor_input.get('cvrNumber')}, "
                       f"from={actor_input.get('fromDate')}, to={actor_input.get('toDate')}, size={max_items}).")

        headers = {
            "Accept": "application/json; charset=utf-8",
            "Content-Type": "application/json",
            "User-Agent": "apify-danish-annual-reports (+https://apify.com)",
        }

        hits = []
        try:
            async with httpx.AsyncClient(timeout=60, headers=headers) as client:
                resp = await client.post(SEARCH_URL, json=body)
                resp.raise_for_status()
                data = resp.json()
                hits = _get(data, "hits", "hits") or []
                total = _get(data, "hits", "total")
                if isinstance(total, dict):
                    # Newer Elasticsearch versions report the total as
                    # {"value": n, "relation": ...}; log just the number.
                    total = total.get("value")
                Actor.log.info(f"Found {total if total is not None else len(hits)} matching filings; processing {len(hits)}.")
        except httpx.HTTPStatusError as exc:
            Actor.log.error(f"Erhvervsstyrelsen endpoint returned HTTP {exc.response.status_code}.")
        except Exception as exc:
            # Top-level boundary: log the failure and finish with zero records.
            Actor.log.error(f"Failed to fetch annual reports: {exc}")

        # Skip malformed hits, then push everything in a single call instead
        # of one dataset request per record.
        records = []
        for hit in hits:
            source = hit.get("_source") if isinstance(hit, dict) else None
            if isinstance(source, dict):
                records.append(transform(source))
        if records:
            await Actor.push_data(records)

        Actor.log.info(f"Done. Returned {len(records)} filing(s).")