1"""Sitemap to URL List — Apify Actor.
2
3Give it a sitemap URL (or several) and it returns a clean list of every URL
4inside, with lastmod / priority / changefreq metadata. Handles the three
5flavors of real-world sitemaps:
6
7 1. Regular sitemap (<urlset> with <url> entries)
8 2. Sitemap index (<sitemapindex> -> list of nested sitemap URLs, recursed)
9 3. Gzipped sitemaps (.xml.gz, automatically decompressed)
10
11Also extracts the image:loc URLs when present (Google image sitemap extension).
12"""
13
14from __future__ import annotations
15
import asyncio
import gzip
import io
import zlib
from xml.etree import ElementTree as ET
20
21import httpx
22from apify import Actor
23
24
25SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
26IMAGE_NS = "http://www.google.com/schemas/sitemap-image/1.1"
27
28
def _localname(tag: str) -> str:
    """Return *tag* without its leading ``{namespace}`` qualifier.

    ElementTree spells qualified names as ``{uri}local``; everything up to and
    including the last ``}`` is discarded. A tag with no ``}`` is returned
    unchanged.
    """
    _, _, local = tag.rpartition("}")
    return local
31
32
def _text(el, name: str) -> str | None:
    """Return the stripped text of the first usable child of *el* named *name*.

    Only direct children are inspected, and they are matched by local name
    (namespace ignored). Children whose text is missing or whitespace-only are
    skipped, so a later sibling with the same name can still supply the value.

    Returns:
        The stripped text, or ``None`` when no matching child has any.
    """
    candidates = (node.text for node in el if _localname(node.tag) == name)
    for raw in candidates:
        if not raw:
            continue
        cleaned = raw.strip()
        if cleaned:
            return cleaned
    return None
40
41
def parse_sitemap(xml_text: str) -> tuple[str, list[dict]]:
    """Parse sitemap XML into its root kind and a list of entries.

    Elements are matched by local name, so the document may use any namespace
    prefix (or none).

    Args:
        xml_text: The sitemap document as text.

    Returns:
        A ``(kind, items)`` pair where ``kind`` is the root element's local
        name:

        * ``"urlset"`` - one dict per ``<url>`` that has a non-empty ``<loc>``,
          with keys ``url``, ``lastModified``, ``changeFrequency``,
          ``priority`` (each ``None`` when absent) and ``images`` (list of
          non-empty ``<image><loc>`` values).
        * ``"sitemapindex"`` - one dict per ``<sitemap>`` that has a non-empty
          ``<loc>``, with keys ``sitemap`` and ``lastModified``.
        * anything else - the root's local name with an empty list.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_text* is not well-formed XML.
    """
    # NOTE(review): xml_text comes from remote servers. The stdlib docs warn
    # that xml.etree is not hardened against maliciously constructed XML;
    # consider a hardened parser if hostile sitemaps are a concern.
    root = ET.fromstring(xml_text)
    root_name = _localname(root.tag)

    if root_name == "urlset":
        urls = []
        for url_el in root:
            if _localname(url_el.tag) != "url":
                continue
            loc = _text(url_el, "loc")
            if not loc:
                continue

            images = []
            for child in url_el:
                if _localname(child.tag) != "image":
                    continue
                for grand in child:
                    if _localname(grand.tag) != "loc":
                        continue
                    # Strip before testing so a whitespace-only <image:loc>
                    # is dropped instead of being recorded as "".
                    image_loc = (grand.text or "").strip()
                    if image_loc:
                        images.append(image_loc)

            urls.append({
                "url": loc,
                "lastModified": _text(url_el, "lastmod"),
                "changeFrequency": _text(url_el, "changefreq"),
                "priority": _text(url_el, "priority"),
                "images": images,
            })
        return "urlset", urls

    if root_name == "sitemapindex":
        sub = []
        for sm_el in root:
            if _localname(sm_el.tag) != "sitemap":
                continue
            loc = _text(sm_el, "loc")
            if loc:
                sub.append({"sitemap": loc, "lastModified": _text(sm_el, "lastmod")})
        return "sitemapindex", sub

    # Unknown root element (e.g. an HTML error page that happens to be
    # well-formed): report the name so the caller can log it.
    return root_name, []
83
84
async def fetch_sitemap(client: httpx.AsyncClient, url: str, log) -> str | None:
    """Fetch a sitemap and return its body as text, gunzipping when needed.

    Up to three GET attempts are made. After a failed attempt that raised
    ``httpx.HTTPError`` the coroutine sleeps ``attempt * 2`` seconds before
    retrying (no sleep after the last attempt).

    Gzip is detected from the payload's magic bytes only, not from the URL
    suffix. A ``.gz`` URL may already arrive decompressed (for example when
    the HTTP layer has undone a ``Content-Encoding: gzip``), and treating such
    a body as gzip would discard a perfectly valid sitemap.

    Args:
        client: HTTP client used to issue the request.
        url: Sitemap URL to download.
        log: Logger-like object; only ``warning(msg)`` is called on it.

    Returns:
        The body decoded as UTF-8 with undecodable bytes replaced, or ``None``
        when every attempt failed or a gzip payload could not be expanded.
    """
    gzip_magic = b"\x1f\x8b"
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        try:
            resp = await client.get(url)
            resp.raise_for_status()
        except httpx.HTTPError as exc:
            log.warning(f"Fetch attempt {attempt} failed for {url}: {exc}")
            if attempt < max_attempts:
                await asyncio.sleep(attempt * 2)
            continue

        content = resp.content
        if content[:2] == gzip_magic:
            try:
                content = gzip.decompress(content)
            except (OSError, EOFError, zlib.error) as exc:
                # OSError covers gzip.BadGzipFile (bad header / CRC);
                # EOFError is raised for truncated streams and zlib.error for
                # corrupt deflate data. A broken payload will not improve on
                # retry, so give up immediately.
                log.warning(f"Failed to gunzip {url}: {exc}")
                return None
        return content.decode("utf-8", errors="replace")

    return None
106
107
def _int_option(options: dict, key: str, default: int) -> int:
    """Read an integer option, treating a missing or null value as *default*."""
    value = options.get(key)
    return default if value is None else int(value)


async def main() -> None:
    """Actor entry point: expand the configured sitemaps into dataset records.

    Input keys read from the Actor input:

    * ``sitemapUrls`` - list of sitemap URLs; a single string is also accepted.
    * ``followIndexes`` - whether nested sitemaps listed in a sitemap index
      are queued (default ``True``).
    * ``maxUrls`` - stop once this many URL records were pushed (default 50000).
    * ``maxNestedSitemaps`` - stop once this many sitemaps were successfully
      parsed (default 50).

    Sitemaps are processed breadth-first from a FIFO queue. Each pushed record
    is the dict produced by ``parse_sitemap`` plus ``sourceSitemap`` (the
    sitemap it came from) and ``sourceRoot`` (the seed URL that led to it).
    Sitemaps that cannot be fetched or parsed are logged and skipped; they do
    not count towards ``maxNestedSitemaps``.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}

        raw_seeds = actor_input.get("sitemapUrls") or []
        if isinstance(raw_seeds, str):
            # A bare string would otherwise be iterated character by character.
            raw_seeds = [raw_seeds]
        seed_urls = [u.strip() for u in raw_seeds if u and u.strip()]

        follow_indexes = bool(actor_input.get("followIndexes", True))
        # An explicit null in the input must fall back to the default rather
        # than crash on int(None).
        max_urls = _int_option(actor_input, "maxUrls", 50000)
        max_nested_sitemaps = _int_option(actor_input, "maxNestedSitemaps", 50)

        if not seed_urls:
            Actor.log.warning("No sitemap URLs provided.")
            await Actor.push_data([])
            return

        Actor.log.info(f"Processing {len(seed_urls)} sitemap(s). Follow indexes: {follow_indexes}.")

        # Queue of (sitemap URL, seed URL it descends from).
        to_process: list[tuple[str, str]] = [(u, u) for u in seed_urls]
        processed_sitemaps = 0
        pushed_urls = 0
        seen_sitemaps: set[str] = set()

        async with httpx.AsyncClient(
            timeout=40.0, follow_redirects=True,
            headers={"User-Agent": "scrapeworks-sitemap-to-urls/0.1", "Accept": "application/xml, text/xml, */*"},
        ) as client:
            while to_process and pushed_urls < max_urls and processed_sitemaps < max_nested_sitemaps:
                sm_url, source_root = to_process.pop(0)
                # Guards against duplicate seeds and indexes that reference
                # the same (or each other's) sitemap.
                if sm_url in seen_sitemaps:
                    continue
                seen_sitemaps.add(sm_url)

                xml = await fetch_sitemap(client, sm_url, Actor.log)
                if xml is None:
                    Actor.log.warning(f" Skipping unreachable {sm_url}")
                    continue

                try:
                    kind, items = parse_sitemap(xml)
                except ET.ParseError as exc:
                    Actor.log.warning(f" Could not parse {sm_url}: {exc}")
                    continue

                processed_sitemaps += 1
                Actor.log.info(f" {sm_url}: {kind} with {len(items)} entries")

                if kind == "urlset":
                    # Trim the batch so the total never exceeds max_urls.
                    remaining = max_urls - pushed_urls
                    batch = items[:remaining]
                    for rec in batch:
                        rec["sourceSitemap"] = sm_url
                        rec["sourceRoot"] = source_root
                    if batch:
                        await Actor.push_data(batch)
                        pushed_urls += len(batch)
                elif kind == "sitemapindex":
                    if follow_indexes:
                        for entry in items:
                            sub_url = entry.get("sitemap")
                            if sub_url:
                                to_process.append((sub_url, source_root))

        Actor.log.info(f"Done. Returned {pushed_urls} URLs from {processed_sitemaps} sitemap(s).")