1"""PyPI Package Lookup & Health — Apify Actor.
2
3Give it a list of Python package names (or paste a requirements.txt), and it
4returns clean, structured data for each from the official PyPI JSON API, plus a
5computed activity signal (active / slowing / stale / dormant) from release
6recency. Reliable (pure JSON API, per-package) — unlike scraping PyPI's HTML
7search page.
8"""
9
10from __future__ import annotations
11
12import asyncio
13import re
14from datetime import datetime, timezone
15
16import httpx
17from apify import Actor
18
# Base URL of the PyPI JSON API; fetch_package builds "<PYPI>/<name>/json" from it.
PYPI = "https://pypi.org/pypi"
20
# Leading distribution-name token of a requirement line: starts with an
# alphanumeric, then any run of alphanumerics, ".", "_" or "-".
_REQ_LINE = re.compile(r"^\s*([A-Za-z0-9][A-Za-z0-9._-]*)")


def parse_requirements(text: str) -> list[str]:
    """Extract package names from requirements.txt content.

    Skipped lines:
      * blank lines and ``#`` comments;
      * pip options (anything starting with ``-``, e.g. ``-r``, ``-e``,
        ``--hash``);
      * bare URLs, VCS references and similar non-name entries
        (``git+...``, ``hg+...``, ``https://...``, ``file:...``).

    Args:
        text: Raw contents of a requirements file.

    Returns:
        Package names in file order, as written (no normalisation, no
        de-duplication). Extras, version specifiers, markers and
        ``name @ url`` targets are dropped.
    """
    names: list[str] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith(("#", "-")):
            continue
        match = _REQ_LINE.match(line)
        if match is None:
            continue
        # A "+" or ":" glued to the leading token marks a VCS prefix, a URL
        # scheme or a drive letter ("git+", "hg+", "https:", "file:", "C:"),
        # not a package name. The tuple (not a string) matters: an empty
        # slice at end-of-line must not count as a hit.
        if line[match.end():match.end() + 1] in ("+", ":"):
            continue
        names.append(match.group(1))
    return names
37
38
def _parse_dt(value: str | None) -> datetime | None:
    """Parse an ISO-8601 timestamp, or return None if it is missing or invalid.

    Any "Z" in the text is rewritten to "+00:00" before the string is handed
    to ``datetime.fromisoformat``. Falsy input, unparseable text and values
    without a ``replace`` method all yield None.
    """
    parsed: datetime | None = None
    if value:
        try:
            normalized = value.replace("Z", "+00:00")
            parsed = datetime.fromisoformat(normalized)
        except (ValueError, AttributeError):
            parsed = None
    return parsed
46
47
def compute_activity(last_release_iso: str | None, now: datetime) -> tuple[str, int | None]:
    """Classify a package's activity from the age of its latest release.

    Args:
        last_release_iso: ISO-8601 timestamp of the latest release, or None.
        now: Reference instant to measure the age against. A naive value is
            interpreted as UTC, the same way a naive release timestamp is.

    Returns:
        ``(level, days)`` where ``days`` is the whole number of days between
        the release and ``now`` and ``level`` is:

        * ``"active"``  - at most 180 days
        * ``"slowing"`` - at most 365 days
        * ``"stale"``   - at most 730 days
        * ``"dormant"`` - anything older

        ``("unknown", None)`` when the timestamp is missing or unparseable.
    """
    released = _parse_dt(last_release_iso)
    if released is None:
        return "unknown", None

    # Subtracting a naive datetime from an aware one raises TypeError, so both
    # operands are pinned to UTC when they carry no timezone of their own.
    if released.tzinfo is None:
        released = released.replace(tzinfo=timezone.utc)
    if now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)

    # int() truncates toward zero, so a release stamped slightly in the
    # future still lands on day 0 rather than day -1.
    days = int((now - released).total_seconds() / 86400)

    for limit, level in ((180, "active"), (365, "slowing"), (730, "stale")):
        if days <= limit:
            return level, days
    return "dormant", days
64
65
def transform(data: dict, now: datetime) -> dict:
    """Flatten one PyPI JSON API payload into an output record.

    Args:
        data: Decoded JSON for a single package; the ``info`` and
            ``releases`` keys are read, and either may be missing or null.
        now: Reference instant forwarded to ``compute_activity``.

    Returns:
        A dict with the keys ``name``, ``summary``, ``version``,
        ``lastReleaseDate``, ``activityLevel``, ``daysSinceRelease``,
        ``yanked``, ``license``, ``requiresPython``, ``author``,
        ``dependencyCount``, ``releaseCount``, ``homepage``,
        ``documentation``, ``repository``, ``keywords`` (at most 20) and
        ``url``. Fields without data are None; ``url`` is None when the
        payload carries no package name.
    """
    # `or {}` (instead of a .get default) also covers keys that are present
    # but explicitly null, matching how project_urls / requires_dist are read.
    info = data.get("info") or {}
    releases = data.get("releases") or {}
    name = info.get("name")
    version = info.get("version")

    # Release date and yanked flag are taken from the first file listed for
    # the current version; a version with no files yields None / False.
    files = releases.get(version) or []
    first_file = files[0] if files else {}
    last_upload = first_file.get("upload_time_iso_8601")
    yanked = bool(first_file.get("yanked"))

    urls = info.get("project_urls") or {}
    repo_labels = ("source", "repository", "code", "github")
    # First project URL whose label looks like a repository link or whose
    # target is on github.com.
    repo = next(
        (
            link
            for label, link in urls.items()
            if label.lower() in repo_labels or "github.com" in (link or "")
        ),
        None,
    )

    activity, days = compute_activity(last_upload, now)
    requires = info.get("requires_dist") or []
    # Keywords may be comma- or space-separated; split() already discards
    # empty and whitespace-only pieces.
    keywords = (info.get("keywords") or "").replace(",", " ").split()[:20]

    return {
        "name": name,
        "summary": (info.get("summary") or "").strip() or None,
        "version": version,
        "lastReleaseDate": last_upload,
        "activityLevel": activity,
        "daysSinceRelease": days,
        "yanked": yanked,
        "license": info.get("license_expression") or info.get("license") or None,
        "requiresPython": info.get("requires_python"),
        "author": info.get("author") or info.get("author_email"),
        "dependencyCount": len(requires),
        "releaseCount": len(releases),
        "homepage": info.get("home_page") or urls.get("Homepage"),
        "documentation": urls.get("Documentation"),
        "repository": repo,
        "keywords": keywords,
        # Without a name there is no project page to link to.
        "url": f"https://pypi.org/project/{name}/" if name else None,
    }
103
104
async def fetch_package(client: httpx.AsyncClient, name: str, log) -> dict | None:
    """Fetch one package's metadata from the PyPI JSON API.

    Args:
        client: Open ``httpx.AsyncClient`` used for the request.
        name: Package name as supplied by the user.
        log: Logger-like object; only ``log.warning(message)`` is called.

    Returns:
        The decoded JSON payload, or None when PyPI answers 404 or when all
        three attempts fail (HTTP error, transport error or invalid JSON).
        Failed attempts are logged; the function waits 2 s and then 4 s
        between attempts.
    """
    from urllib.parse import quote

    # The name comes from user input: percent-encode it so characters such as
    # "/", "?", "#" or ".." cannot change which path is requested. Valid
    # package names contain only unreserved characters and pass through as-is.
    url = f"{PYPI}/{quote(name, safe='')}/json"

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            resp = await client.get(url)
            if resp.status_code == 404:
                return None
            resp.raise_for_status()
            return resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            # ValueError covers a response body that is not valid JSON.
            log.warning(f"Fetch attempt {attempt} failed for {name}: {exc}")
            if attempt < max_attempts:
                await asyncio.sleep(attempt * 2)
    return None
118
119
def _unique_package_names(names) -> list[str]:
    """Strip names, drop blanks and de-duplicate, keeping first-seen spelling and order."""
    seen: set[str] = set()
    unique: list[str] = []
    for raw in names:
        name = raw.strip()
        if not name:
            continue
        # PEP 503 normalisation: runs of "-", "_" and "." are equivalent and
        # case is ignored, so "zope.interface" == "Zope_Interface".
        key = re.sub(r"[-_.]+", "-", name).lower()
        if key not in seen:
            seen.add(key)
            unique.append(name)
    return unique


async def main() -> None:
    """Actor entry point: look up every requested package and push one record each.

    Input keys read: ``packages`` (list of names) and ``requirementsText``
    (requirements.txt content). Names from both are merged and
    de-duplicated. For each name a record is pushed to the dataset: the
    transformed metadata with ``found: True``, or a ``found: False`` stub
    when ``fetch_package`` returns None.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}

        packages = actor_input.get("packages") or []
        # A bare string would otherwise be exploded into single characters
        # by list(); treat it as one package name.
        if isinstance(packages, str):
            packages = [packages]
        names: list[str] = list(packages)

        manifest = actor_input.get("requirementsText", "") or ""
        if manifest.strip():
            names += parse_requirements(manifest)

        names = _unique_package_names(names)

        if not names:
            Actor.log.warning("No packages provided. Add 'packages' or paste a requirements.txt into 'requirementsText'.")
            await Actor.push_data([])
            return

        Actor.log.info(f"Looking up {len(names)} PyPI package(s).")
        # One timestamp for the whole run so every record's age is measured
        # against the same instant.
        now = datetime.now(timezone.utc)

        async with httpx.AsyncClient(
            timeout=30.0,
            headers={"User-Agent": "scrapeworks-pypi-lookup/0.1", "Accept": "application/json"},
        ) as client:
            for name in names:
                data = await fetch_package(client, name, Actor.log)
                if data is None:
                    # NOTE: fetch_package returns None both for a 404 and
                    # after exhausting its retries, so this stub also covers
                    # fetch failures.
                    await Actor.push_data([{"name": name, "found": False,
                                            "error": "Package not found on PyPI"}])
                    Actor.log.info(f" {name}: not found")
                    continue
                rec = transform(data, now)
                rec["found"] = True
                await Actor.push_data([rec])
                Actor.log.info(f" {rec['name']} ({rec['version']}): {rec['activityLevel']}")

        Actor.log.info("Done.")