1"""URL Metadata & OpenGraph Extractor — Apify Actor.
2
3Give it a list of URLs and it fetches each page and extracts a clean, unified
4set of metadata: title, description, preview image, site name, favicon,
5canonical URL, author, published date, and type — reconciling OpenGraph,
6Twitter Card, standard <meta> tags, and HTML fallbacks in priority order.
7
8The value is the reconciliation: real pages scatter this info across og:*,
9twitter:*, name="description", <title>, etc. This returns one tidy record per
10URL regardless of which tags a given site happens to use.
11
12Parsing is done with the standard library + regex (no heavyweight HTML deps),
13which keeps the actor light and fast for metadata-only extraction.
14"""
15
16from __future__ import annotations
17
18import asyncio
19import html
20import re
21from urllib.parse import urljoin, urlparse
22
23import httpx
24from apify import Actor
25
26
27
28
29
30_META_TAG = re.compile(r"<meta\b[^>]*>", re.I)
31_TITLE_TAG = re.compile(r"<title[^>]*>(.*?)</title>", re.I | re.S)
32_LINK_TAG = re.compile(r"<link\b[^>]*>", re.I)
33_ATTR = re.compile(r"""(\w[\w:-]*)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))""")
34
35
36def _attrs(tag: str) -> dict:
37 out = {}
38 for m in _ATTR.finditer(tag):
39 key = m.group(1).lower()
40 val = m.group(2) if m.group(2) is not None else (m.group(3) if m.group(3) is not None else m.group(4))
41 out[key] = html.unescape(val.strip()) if val else ""
42 return out
43
44
45def parse_metadata(page_html: str, base_url: str) -> dict:
46 """Extract and reconcile metadata from a page's HTML."""
47 og: dict = {}
48 tw: dict = {}
49 meta: dict = {}
50
51 for tag in _META_TAG.findall(page_html):
52 a = _attrs(tag)
53 content = a.get("content")
54 if content is None:
55 continue
56 prop = a.get("property", "").lower()
57 name = a.get("name", "").lower()
58 if prop.startswith("og:"):
59 og[prop[3:]] = content
60 elif prop.startswith("article:") or prop.startswith("profile:"):
61 og[prop] = content
62 elif name.startswith("twitter:"):
63 tw[name[8:]] = content
64 elif name:
65 meta[name] = content
66
67
68 title_tag = _TITLE_TAG.search(page_html)
69 html_title = html.unescape(title_tag.group(1).strip()) if title_tag else None
70
71
72 canonical = None
73 favicon = None
74 for tag in _LINK_TAG.findall(page_html):
75 a = _attrs(tag)
76 rel = a.get("rel", "").lower()
77 href = a.get("href")
78 if not href:
79 continue
80 if "canonical" in rel and canonical is None:
81 canonical = urljoin(base_url, href)
82 if "icon" in rel and favicon is None:
83 favicon = urljoin(base_url, href)
84
85
86 if favicon is None:
87 parsed = urlparse(base_url)
88 if parsed.scheme and parsed.netloc:
89 favicon = f"{parsed.scheme}://{parsed.netloc}/favicon.ico"
90
91 def pick(*vals):
92 for v in vals:
93 if v:
94 return v
95 return None
96
97 image = pick(og.get("image"), tw.get("image"), tw.get("image:src"))
98 if image:
99 image = urljoin(base_url, image)
100
101 return {
102 "title": pick(og.get("title"), tw.get("title"), meta.get("title"), html_title),
103 "description": pick(og.get("description"), tw.get("description"), meta.get("description")),
104 "image": image,
105 "siteName": pick(og.get("site_name"), tw.get("site")),
106 "type": og.get("type"),
107 "canonicalUrl": canonical,
108 "favicon": favicon,
109 "author": pick(meta.get("author"), og.get("article:author")),
110 "publishedDate": pick(og.get("article:published_time"), meta.get("date")),
111 "themeColor": meta.get("theme-color"),
112 "twitterCard": tw.get("card"),
113 "keywords": meta.get("keywords"),
114 }
115
116
async def fetch_page(
    client: httpx.AsyncClient,
    url: str,
    log,
    *,
    max_attempts: int = 3,
    max_chars: int = 500_000,
) -> tuple[str | None, int | None, str | None]:
    """Fetch a page and return ``(html, status_code, final_url)``.

    Args:
        client: HTTP client used to issue the GET request.
        url: Address to fetch.
        log: Logger-like object; only ``log.warning`` is called.
        max_attempts: How many times to try when the request itself fails.
        max_chars: Upper bound on the number of characters of body returned.

    Returns:
        ``(text, status, final_url)`` on success. ``(None, status, final_url)``
        when the server answers with status >= 400 or declares a content type
        that is neither HTML nor XML. ``(None, None, url)`` when the URL is
        invalid or every attempt raised an HTTP error.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            resp = await client.get(url)
            status = resp.status_code
            if status >= 400:
                return None, status, str(resp.url)
            # Media types are case-insensitive; normalise before matching.
            ctype = resp.headers.get("content-type", "").lower()
            # A missing content-type is given the benefit of the doubt.
            if ctype and "html" not in ctype and "xml" not in ctype:
                return None, status, str(resp.url)
            return resp.text[:max_chars], status, str(resp.url)
        except httpx.InvalidURL as exc:
            # Not a subclass of httpx.HTTPError, and retrying cannot help:
            # report the URL as failed instead of aborting the whole run.
            log.warning(f"Invalid URL {url}: {exc}")
            return None, None, url
        except httpx.HTTPError as exc:
            log.warning(f"Fetch attempt {attempt} failed for {url}: {exc}")
            if attempt < max_attempts:
                # Linear back-off: 2s, 4s, ...
                await asyncio.sleep(attempt * 2)
    return None, None, url
137
138
async def main() -> None:
    """Actor entry point.

    Reads the ``urls`` list from the actor input, fetches every page in turn
    and pushes one dataset record per URL: the reconciled metadata on success,
    or an error record when the page could not be fetched or is not HTML.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        raw_entries = actor_input.get("urls", []) or []
        urls = [entry.strip() for entry in raw_entries if entry and entry.strip()]

        if not urls:
            Actor.log.warning("No URLs provided.")
            await Actor.push_data([])
            return

        # Entries without an explicit http(s) scheme are assumed to be HTTPS.
        has_scheme = re.compile(r"^https?://", re.I)
        normalized = [
            entry if has_scheme.match(entry) else "https://" + entry
            for entry in urls
        ]

        Actor.log.info(f"Extracting metadata from {len(normalized)} URL(s).")

        request_headers = {
            "User-Agent": "Mozilla/5.0 (compatible; scrapeworks-url-metadata/0.1; +https://apify.com/scrapeworks)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        }
        async with httpx.AsyncClient(
            timeout=30.0,
            follow_redirects=True,
            headers=request_headers,
        ) as client:
            for url in normalized:
                page_html, status, final_url = await fetch_page(client, url, Actor.log)

                if page_html is not None:
                    # Relative links resolve against the post-redirect URL when known.
                    md = parse_metadata(page_html, final_url or url)
                    record = {
                        "url": url,
                        "finalUrl": final_url,
                        "success": True,
                        "statusCode": status,
                        **md,
                    }
                    await Actor.push_data([record])
                    Actor.log.info(f"  {url}: '{md.get('title') or '(no title)'}'")
                    continue

                failure = {
                    "url": url,
                    "finalUrl": final_url,
                    "success": False,
                    "statusCode": status,
                    "error": "Could not fetch or not an HTML page",
                }
                await Actor.push_data([failure])
                Actor.log.info(f"  {url}: failed (status {status})")

        Actor.log.info("Done.")